8164888: Intrinsify fused mac operations on SPARC


Reviewed-by: kvn
Patric Hedlin 2017-06-27 15:50:09 +02:00 committed by Nils Eliasson
parent 6a9aa18f63
commit 431063deba
10 changed files with 218 additions and 45 deletions

View File

@@ -52,8 +52,16 @@ int AbstractInterpreter::BasicType_as_index(BasicType type) {
return i;
}
// These should never be compiled, since if they were, the interpreter
// would prefer the compiled version to the intrinsic version.
bool AbstractInterpreter::can_be_compiled(methodHandle m) {
// No special entry points that preclude compilation
switch (method_kind(m)) {
case Interpreter::java_lang_math_fmaD:
case Interpreter::java_lang_math_fmaF:
return false;
default:
break;
}
return true;
}
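
(Editorial sketch, not part of the patch: Math.fma must return a * b + c rounded exactly once, which is what the SPARC FMAf instruction provides and what makes the intrinsic entry worth protecting here. std::fma stands in for the hardware instruction below; the inputs are chosen so the fused and double-rounded results differ.)

#include <cfloat>
#include <cmath>
#include <cstdio>

int main() {
  double a = 1.0 + DBL_EPSILON;       // 1 + 2^-52, so a*a = 1 + 2^-51 + 2^-104
  double p = a * a;                   // plain product rounds away the 2^-104 term
  double fused = std::fma(a, a, -p);  // single rounding keeps it: 0x1p-104
  double plain = a * a - p;           // two roundings lose it: 0.0
  std::printf("fused residual = %a, plain residual = %a\n", fused, plain);
  return 0;
}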

View File

@@ -628,6 +628,9 @@ class Assembler : public AbstractAssembler {
// CRC32C instruction supported only on certain processors
static void crc32c_only() { assert(VM_Version::has_crc32c(), "This instruction only works on SPARC with CRC32C"); }
// FMAf instructions supported only on certain processors
static void fmaf_only() { assert(VM_Version::has_fmaf(), "This instruction only works on SPARC with FMAf"); }
// instruction only in VIS1
static void vis1_only() { assert(VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); }
@@ -923,6 +926,10 @@ class Assembler : public AbstractAssembler {
inline void fsqrt(FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d);
// fmaf instructions.
inline void fmadd(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d);
// pp 165
inline void flush(Register s1, Register s2);

View File

@@ -355,6 +355,11 @@ inline void Assembler::fsqrt(FloatRegisterImpl::Width w, FloatRegister s, FloatR
emit_int32(op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x28 + w) | fs2(s, w));
}
inline void Assembler::fmadd(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d) {
fmaf_only();
emit_int32(op(arith_op) | fd(d, w) | op3(stpartialf_op3) | fs1(s1, w) | fs3(s3, w) | op5(w) | fs2(s2, w));
}
inline void Assembler::flush(Register s1, Register s2) {
emit_int32(op(arith_op) | op3(flush_op3) | rs1(s1) | rs2(s2));
}
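
(As a cross-check on the emit above; an editorial sketch, not part of the patch. Per the SPARC architecture manual, the FMAf word packs op=2 in bits 31:30, rd in 29:25, op3=0x37 in 24:19 — the same constant HotSpot's opcode tables name stpartialf_op3 — rs1 in 18:14, rs3 in 13:9, op5 in 8:5 with 1 = FMADDs and 2 = FMADDd matching FloatRegisterImpl::S/D, and rs2 in 4:0. Register arguments below are raw 5-bit encodings; real double registers go through HotSpot's register-encoding helpers.)

#include <cstdint>
#include <cstdio>

static uint32_t fmadd_word(uint32_t w, uint32_t rs1, uint32_t rs2,
                           uint32_t rs3, uint32_t rd) {
  return (2u << 30)      // op: arithmetic
       | (rd << 25)      // destination register
       | (0x37u << 19)   // op3: FMAf group
       | (rs1 << 14)
       | (rs3 << 9)
       | (w << 5)        // op5: width selects FMADDs (1) / FMADDd (2)
       | rs2;
}

int main() {
  // fmadds %f1, %f2, %f3, %f0
  std::printf("fmadds %%f1,%%f2,%%f3,%%f0 = 0x%08x\n", fmadd_word(1, 1, 2, 3, 0));
  return 0;
}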

View File

@@ -440,6 +440,31 @@ void LIR_Assembler::klass2reg_with_patching(Register reg, CodeEmitInfo *info) {
}
void LIR_Assembler::emit_op3(LIR_Op3* op) {
switch (op->code()) {
case lir_idiv:
case lir_irem: // Both idiv & irem are handled after the switch (below).
break;
case lir_fmaf:
__ fmadd(FloatRegisterImpl::S,
op->in_opr1()->as_float_reg(),
op->in_opr2()->as_float_reg(),
op->in_opr3()->as_float_reg(),
op->result_opr()->as_float_reg());
return;
case lir_fmad:
__ fmadd(FloatRegisterImpl::D,
op->in_opr1()->as_double_reg(),
op->in_opr2()->as_double_reg(),
op->in_opr3()->as_double_reg(),
op->result_opr()->as_double_reg());
return;
default:
ShouldNotReachHere();
break;
}
// Handle idiv & irem:
Register Rdividend = op->in_opr1()->as_register();
Register Rdivisor = noreg;
Register Rscratch = op->in_opr3()->as_register();

View File

@@ -953,7 +953,29 @@ void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
}
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
fatal("FMA intrinsic is not implemented on this platform");
assert(x->number_of_arguments() == 3, "wrong type");
assert(UseFMA, "Needs FMA instructions support.");
LIRItem a(x->argument_at(0), this);
LIRItem b(x->argument_at(1), this);
LIRItem c(x->argument_at(2), this);
a.load_item();
b.load_item();
c.load_item();
LIR_Opr ina = a.result();
LIR_Opr inb = b.result();
LIR_Opr inc = c.result();
LIR_Opr res = rlock_result(x);
switch (x->id()) {
case vmIntrinsics::_fmaF: __ fmaf(ina, inb, inc, res); break;
case vmIntrinsics::_fmaD: __ fmad(ina, inb, inc, res); break;
default:
ShouldNotReachHere();
break;
}
}
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {

View File

@@ -33,7 +33,7 @@
define_pd_global(bool, BackgroundCompilation, true);
define_pd_global(bool, CICompileOSR, true);
define_pd_global(bool, InlineIntrinsics, false);
define_pd_global(bool, InlineIntrinsics, true);
define_pd_global(bool, PreferInterpreterNativeStubs, false);
define_pd_global(bool, ProfileTraps, true);
define_pd_global(bool, UseOnStackReplacement, true);

View File

@@ -2627,6 +2627,33 @@ enc_class fsqrtd (dflt_reg dst, dflt_reg src) %{
__ fsqrt(FloatRegisterImpl::D, Fsrc, Fdst);
%}
enc_class fmadds (sflt_reg dst, sflt_reg a, sflt_reg b, sflt_reg c) %{
MacroAssembler _masm(&cbuf);
FloatRegister Frd = reg_to_SingleFloatRegister_object($dst$$reg);
FloatRegister Fra = reg_to_SingleFloatRegister_object($a$$reg);
FloatRegister Frb = reg_to_SingleFloatRegister_object($b$$reg);
FloatRegister Frc = reg_to_SingleFloatRegister_object($c$$reg);
__ fmadd(FloatRegisterImpl::S, Fra, Frb, Frc, Frd);
%}
enc_class fmaddd (dflt_reg dst, dflt_reg a, dflt_reg b, dflt_reg c) %{
MacroAssembler _masm(&cbuf);
FloatRegister Frd = reg_to_DoubleFloatRegister_object($dst$$reg);
FloatRegister Fra = reg_to_DoubleFloatRegister_object($a$$reg);
FloatRegister Frb = reg_to_DoubleFloatRegister_object($b$$reg);
FloatRegister Frc = reg_to_DoubleFloatRegister_object($c$$reg);
__ fmadd(FloatRegisterImpl::D, Fra, Frb, Frc, Frd);
%}
enc_class fmovs (dflt_reg dst, dflt_reg src) %{
MacroAssembler _masm(&cbuf);
@@ -4540,6 +4567,26 @@ pipe_class fdivD_reg_reg(regD dst, regD src1, regD src2) %{
FDIV : C(17);
%}
// Fused floating-point multiply-add float.
pipe_class fmaF_regx4(regF dst, regF src1, regF src2, regF src3) %{
single_instruction;
dst : X(write);
src1 : E(read);
src2 : E(read);
src3 : E(read);
FM : R;
%}
// Fused floating-point multiply-add double.
pipe_class fmaD_regx4(regD dst, regD src1, regD src2, regD src3) %{
single_instruction;
dst : X(write);
src1 : E(read);
src2 : E(read);
src3 : E(read);
FM : R;
%}
// Floating Point Move/Negate/Abs Float
pipe_class faddF_reg(regF dst, regF src) %{
single_instruction;
@@ -7531,6 +7578,24 @@ instruct sqrtD_reg_reg(regD dst, regD src) %{
ins_pipe(fdivD_reg_reg);
%}
// Single precision fused floating-point multiply-add (d = a * b + c).
instruct fmaF_regx4(regF dst, regF a, regF b, regF c) %{
predicate(UseFMA);
match(Set dst (FmaF c (Binary a b)));
format %{ "fmadds $a,$b,$c,$dst\t# $dst = $a * $b + $c" %}
ins_encode(fmadds(dst, a, b, c));
ins_pipe(fmaF_regx4);
%}
// Double precision fused floating-point multiply-add (d = a * b + c).
instruct fmaD_regx4(regD dst, regD a, regD b, regD c) %{
predicate(UseFMA);
match(Set dst (FmaD c (Binary a b)));
format %{ "fmaddd $a,$b,$c,$dst\t# $dst = $a * $b + $c" %}
ins_encode(fmaddd(dst, a, b, c));
ins_pipe(fmaD_regx4);
%}
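
(A note on the operand order in the match rules above; editorial. The ideal Fma node takes the addend first and packs the product operands in a Binary pair, so (FmaF c (Binary a b)) denotes dst = a * b + c, exactly as the format strings spell out. A trivial C++ sketch of that shape:)

#include <cstdio>

// Mirrors the matcher rule: c is the first input; a and b arrive as a
// Binary pair. The value computed is a * b + c.
struct Binary { double a, b; };
static double fma_node(double c, Binary ab) {
  return ab.a * ab.b + c;  // a single fused instruction on FMAf hardware
}

int main() {
  Binary ab = {2.0, 3.0};
  std::printf("%.1f\n", fma_node(1.0, ab));  // 2 * 3 + 1 = 7
  return 0;
}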
//----------Logical Instructions-----------------------------------------------
// And Instructions
// Register And

View File

@@ -153,13 +153,12 @@ address TemplateInterpreterGenerator::generate_slow_signature_handler() {
__ delayed()->srl( G4_scratch, 2, G4_scratch );
__ bind(NextArg);
}
__ bind(done);
__ ret();
__ delayed()->
restore(O0, 0, Lscratch); // caller's Lscratch gets the result handler
__ delayed()->restore(O0, 0, Lscratch); // caller's Lscratch gets the result handler
return entry;
}
@@ -177,7 +176,6 @@ void TemplateInterpreterGenerator::generate_counter_overflow(Label& Lcontinue) {
// returns verified_entry_point or NULL
// we ignore it in any case
__ ba_short(Lcontinue);
}
@@ -196,7 +194,6 @@ address TemplateInterpreterGenerator::generate_abstract_entry(void) {
// the call_VM checks for exception, so we should never return here.
__ should_not_reach_here();
return entry;
}
void TemplateInterpreterGenerator::save_native_result(void) {
@@ -474,7 +471,6 @@ void TemplateInterpreterGenerator::generate_counter_incr(Label* overflow, Label*
__ delayed()->nop();
__ bind(done);
}
}
// Allocate monitor and lock method (asm interpreter)
@@ -590,7 +586,7 @@ void TemplateInterpreterGenerator::generate_stack_overflow_check(Register Rframe
// pop parameters from the caller's stack by adjusting Lesp
// set O0 to Lesp
// compute X = (max_locals - num_parameters)
// bump SP up by X to accomadate the extra locals
// bump SP up by X to accommodate the extra locals
// compute X = max_expression_stack
// + vm_local_words
// + 16 words of register save area
@@ -688,7 +684,7 @@ void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) {
// 1) Increase caller's SP by the extra local space needed:
// (check for overflow)
// Efficient implementation of xload/xstore bytecodes requires
// that arguments and non-argument locals are in a contigously
// that arguments and non-argument locals are in a contiguously
// addressable memory block => non-argument locals must be
// allocated in the caller's frame.
//
@@ -782,7 +778,7 @@ void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) {
__ sub(Gframe_size, Glocals_size, Gframe_size);
//
// bump SP to accomodate the extra locals
// bump SP to accommodate the extra locals
//
__ sub(SP, Glocals_size, SP);
}
@@ -810,9 +806,9 @@ void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) {
Register mirror = LcpoolCache;
__ load_mirror(mirror, Lmethod);
__ st_ptr(mirror, FP, (frame::interpreter_frame_mirror_offset * wordSize) + STACK_BIAS);
__ get_constant_pool_cache( LcpoolCache ); // set LcpoolCache
__ get_constant_pool_cache(LcpoolCache); // set LcpoolCache
__ sub(FP, rounded_vm_local_words * BytesPerWord, Lmonitors ); // set Lmonitors
__ add( Lmonitors, STACK_BIAS, Lmonitors ); // Account for 64 bit stack bias
__ add(Lmonitors, STACK_BIAS, Lmonitors); // Account for 64 bit stack bias
__ sub(Lmonitors, BytesPerWord, Lesp); // set Lesp
// setup interpreter activation registers
@@ -984,7 +980,7 @@ address TemplateInterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractI
__ ldx( Gargs, 16, buf);
__ lduw(Gargs, 24, crc);
__ add(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE), buf); // account for the header size
__ add(buf ,offset, buf);
__ add(buf, offset, buf);
}
// Call the crc32 kernel
@@ -1057,8 +1053,58 @@ address TemplateInterpreterGenerator::generate_CRC32C_updateBytes_entry(Abstract
return NULL;
}
// Not supported
address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKind kind) {
/* Math routines only partially supported.
*
* Providing support for fma (float/double) only.
*/
address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKind kind)
{
if (!InlineIntrinsics) return NULL; // Generate a vanilla entry
address entry = __ pc();
switch (kind) {
case Interpreter::java_lang_math_fmaF:
if (UseFMA) {
// float .fma(float a, float b, float c)
const FloatRegister ra = F1;
const FloatRegister rb = F2;
const FloatRegister rc = F3;
const FloatRegister rd = F0; // Result.
__ ldf(FloatRegisterImpl::S, Gargs, 0, rc);
__ ldf(FloatRegisterImpl::S, Gargs, 8, rb);
__ ldf(FloatRegisterImpl::S, Gargs, 16, ra);
__ fmadd(FloatRegisterImpl::S, ra, rb, rc, rd);
__ retl(); // Result in F0 (rd).
__ delayed()->mov(O5_savedSP, SP);
return entry;
}
break;
case Interpreter::java_lang_math_fmaD:
if (UseFMA) {
// double .fma(double a, double b, double c)
const FloatRegister ra = F2; // D1
const FloatRegister rb = F4; // D2
const FloatRegister rc = F6; // D3
const FloatRegister rd = F0; // D0 Result.
__ ldf(FloatRegisterImpl::D, Gargs, 0, rc);
__ ldf(FloatRegisterImpl::D, Gargs, 16, rb);
__ ldf(FloatRegisterImpl::D, Gargs, 32, ra);
__ fmadd(FloatRegisterImpl::D, ra, rb, rc, rd);
__ retl(); // Result in D0 (rd).
__ delayed()->mov(O5_savedSP, SP);
return entry;
}
break;
default:
break;
}
return NULL;
}
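
(The Gargs offsets above follow from the interpreter's argument layout. An editorial sketch, assuming 8-byte expression-stack slots on 64-bit SPARC with the last argument, c, at offset 0, one slot per float and two per double — which reproduces the 0/8/16 and 0/16/32 offsets in the ldf calls:)

#include <cstdio>

int main() {
  const int slot = 8;  // assumed bytes per interpreter stack slot (64-bit)
  // fma(float a, float b, float c): c was pushed last, so it sits lowest
  std::printf("float:  c=%d b=%d a=%d\n", 0 * slot, 1 * slot, 2 * slot);
  // fma(double a, double b, double c): each double spans two slots
  std::printf("double: c=%d b=%d a=%d\n", 0 * slot, 2 * slot, 4 * slot);
  return 0;
}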
@@ -1071,7 +1117,7 @@ void TemplateInterpreterGenerator::bang_stack_shadow_pages(bool native_call) {
// Doing the banging earlier fails if the caller frame is not an interpreter
// frame.
// (Also, the exception throwing code expects to unlock any synchronized
// method receiever, so do the banging after locking the receiver.)
// method receiver, so do the banging after locking the receiver.)
// Bang each page in the shadow zone. We can't assume it's been done for
// an interpreter frame with greater than a page of locals, so each page
@@ -1112,8 +1158,7 @@ address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) {
// rethink these assertions - they can be simplified and shared (gri 2/25/2000)
#ifdef ASSERT
__ ld(G5_method, Method::access_flags_offset(), Gtmp1);
{
Label L;
{ Label L;
__ btst(JVM_ACC_NATIVE, Gtmp1);
__ br(Assembler::notZero, false, Assembler::pt, L);
__ delayed()->nop();
@@ -1362,7 +1407,7 @@ address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) {
// didn't see any synchronization in progress, and escapes.
__ set(_thread_in_native_trans, G3_scratch);
__ st(G3_scratch, thread_state);
if(os::is_MP()) {
if (os::is_MP()) {
if (UseMembar) {
// Force this write out before the read below
__ membar(Assembler::StoreLoad);
@@ -1425,8 +1470,7 @@ address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) {
// If we have an oop result store it where it will be safe for any further gc
// until we return now that we've released the handle it might be protected by
{
Label no_oop, store_result;
{ Label no_oop, store_result;
__ set((intptr_t)AbstractInterpreter::result_handler(T_OBJECT), G3_scratch);
__ cmp_and_brx_short(G3_scratch, Lscratch, Assembler::notEqual, Assembler::pt, no_oop);
@@ -1484,8 +1528,7 @@ address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) {
// dispose of return address and remove activation
#ifdef ASSERT
{
Label ok;
{ Label ok;
__ cmp_and_brx_short(I5_savedSP, FP, Assembler::greaterEqualUnsigned, Assembler::pt, ok);
__ stop("bad I5_savedSP value");
__ should_not_reach_here();
@@ -1495,15 +1538,12 @@ address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) {
__ jmp(Lscratch, 0);
__ delayed()->nop();
if (inc_counter) {
// handle invocation counter overflow
__ bind(invocation_counter_overflow);
generate_counter_overflow(Lcontinue);
}
return entry;
}
@@ -1533,8 +1573,7 @@ address TemplateInterpreterGenerator::generate_normal_entry(bool synchronized) {
// rethink these assertions - they can be simplified and shared (gri 2/25/2000)
#ifdef ASSERT
__ ld(G5_method, Method::access_flags_offset(), Gtmp1);
{
Label L;
{ Label L;
__ btst(JVM_ACC_NATIVE, Gtmp1);
__ br(Assembler::zero, false, Assembler::pt, L);
__ delayed()->nop();
@@ -1666,7 +1705,6 @@ address TemplateInterpreterGenerator::generate_normal_entry(bool synchronized) {
generate_counter_overflow(Lcontinue);
}
return entry;
}
@@ -1786,8 +1824,7 @@ void TemplateInterpreterGenerator::generate_throw_exception() {
}
#if INCLUDE_JVMTI
{
Label L_done;
{ Label L_done;
__ ldub(Address(Lbcp, 0), G1_scratch); // Load current bytecode
__ cmp_and_br_short(G1_scratch, Bytecodes::_invokestatic, Assembler::notEqual, Assembler::pn, L_done);
@@ -1827,7 +1864,7 @@ void TemplateInterpreterGenerator::generate_throw_exception() {
__ get_vm_result(Oexception);
__ verify_oop(Oexception);
const int return_reg_adjustment = frame::pc_return_offset;
Address issuing_pc_addr(I7, return_reg_adjustment);
// We are done with this activation frame; find out where to go next.

View File

@@ -317,7 +317,11 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
if (UseFMA) {
if (has_fmaf()) {
if (FLAG_IS_DEFAULT(UseFMA)) {
UseFMA = true;
}
} else if (UseFMA) {
warning("FMA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseFMA, false);
}
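
(The control flow above is the usual capability gate: default the flag on when the CPU supports the feature, warn and override an explicit request when it does not. A standalone editorial sketch with stand-in variables, not HotSpot's FLAG_* macros or the VM_Version API:)

#include <cstdio>

static bool cpu_has_fmaf    = false;  // stand-in for VM_Version::has_fmaf()
static bool use_fma         = true;   // stand-in for the UseFMA flag value
static bool flag_is_default = false;  // stand-in for FLAG_IS_DEFAULT(UseFMA)

int main() {
  if (cpu_has_fmaf) {
    if (flag_is_default) {
      use_fma = true;   // capable CPU: opt in by default
    }
  } else if (use_fma) {
    std::puts("warning: FMA instructions are not available on this CPU");
    use_fma = false;    // incapable CPU: override the explicit request
  }
  std::printf("UseFMA = %s\n", use_fma ? "true" : "false");
  return 0;
}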

View File

@@ -248,7 +248,7 @@ class StubRoutines: AllStatic {
static jint verify_oop_count() { return _verify_oop_count; }
static jint* verify_oop_count_addr() { return &_verify_oop_count; }
// a subroutine for debugging the GC
static address verify_oop_subroutine_entry_address() { return (address)&_verify_oop_subroutine_entry; }
static address catch_exception_entry() { return _catch_exception_entry; }
@@ -335,8 +335,8 @@ class StubRoutines: AllStatic {
static address checkcast_arraycopy(bool dest_uninitialized = false) {
return dest_uninitialized ? _checkcast_arraycopy_uninit : _checkcast_arraycopy;
}
static address unsafe_arraycopy() { return _unsafe_arraycopy; }
static address generic_arraycopy() { return _generic_arraycopy; }
static address jbyte_fill() { return _jbyte_fill; }
static address jshort_fill() { return _jshort_fill; }
@@ -349,8 +349,8 @@ class StubRoutines: AllStatic {
static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; }
static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; }
static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; }
static address counterMode_AESCrypt() { return _counterMode_AESCrypt; }
static address ghash_processBlocks() { return _ghash_processBlocks; }
static address sha1_implCompress() { return _sha1_implCompress; }
static address sha1_implCompressMB() { return _sha1_implCompressMB; }
@@ -366,9 +366,9 @@ class StubRoutines: AllStatic {
static address updateBytesCRC32C() { return _updateBytesCRC32C; }
static address updateBytesAdler32() { return _updateBytesAdler32; }
static address multiplyToLen() {return _multiplyToLen; }
static address squareToLen() {return _squareToLen; }
static address mulAdd() {return _mulAdd; }
static address multiplyToLen() { return _multiplyToLen; }
static address squareToLen() { return _squareToLen; }
static address mulAdd() { return _mulAdd; }
static address montgomeryMultiply() { return _montgomeryMultiply; }
static address montgomerySquare() { return _montgomerySquare; }
@@ -376,7 +376,7 @@ class StubRoutines: AllStatic {
static address dexp() { return _dexp; }
static address dlog() { return _dlog; }
static address dlog10() { return _dlog10; }
static address dpow() { return _dpow; }
static address dsin() { return _dsin; }
static address dcos() { return _dcos; }
@@ -387,7 +387,7 @@ class StubRoutines: AllStatic {
static address select_fill_function(BasicType t, bool aligned, const char* &name);
static address zero_aligned_words() { return _zero_aligned_words; }
static double intrinsic_log10(double d) {
assert(_intrinsic_log10 != NULL, "must be defined");