8145688: Update for x86 pow in the math lib

Optimizes Math.pow() for 64-bit and 32-bit x86 architectures using the Intel LIBM implementation.

Reviewed-by: kvn
This commit is contained in:
Vivek R Deshpande 2015-12-23 21:09:50 -08:00 committed by Vladimir Kozlov
parent bc04deac15
commit 453650389f
27 changed files with 3714 additions and 691 deletions

View File

@ -772,6 +772,7 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
case 0x55: // andnps
case 0x56: // orps
case 0x57: // xorps
case 0x58: // addpd
case 0x59: // mulpd
case 0x6E: // movd
case 0x7E: // movd
@ -3363,6 +3364,7 @@ void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
emit_int8(imm8);
}
// The encoding for pextrw is SSE2 to support the LIBM implementation.
void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse2(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
@ -4361,6 +4363,17 @@ void Assembler::addpd(XMMRegister dst, XMMRegister src) {
emit_int8((unsigned char)(0xC0 | encode));
}
// Emits the memory-source form of ADDPD (add packed floating-point values):
// dst = dst + [src]. The instruction is encoded with the 0x66 SIMD prefix in
// the 0x0F opcode map, followed by opcode byte 0x58 and the operand bytes.
//   dst - XMM register that is both the first source and the destination
//   src - memory operand holding the second source
void Assembler::addpd(XMMRegister dst, Address src) {
// The SSE2 check is wrapped in NOT_LP64, i.e. it is only asserted on non-LP64 builds.
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionMark im(this);
// 128-bit vector length; rex_w follows EVEX availability.
InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
// Memory operand is described as an EVEX_FV tuple with a 64-bit input size.
attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
// dst is passed for both register arguments of the prefix helper.
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x58);
emit_operand(dst, src);
}
void Assembler::addps(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);

View File

@ -1791,6 +1791,7 @@ private:
// Add Packed Floating-Point Values
void addpd(XMMRegister dst, XMMRegister src);
void addpd(XMMRegister dst, Address src);
void addps(XMMRegister dst, XMMRegister src);
void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

View File

@ -2381,9 +2381,6 @@ void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, L
// Should consider not saving rbx, if not necessary
__ trigfunc('t', op->as_Op2()->fpu_stack_size());
break;
case lir_pow :
__ pow_with_fallback(op->as_Op2()->fpu_stack_size());
break;
default : ShouldNotReachHere();
}
} else {

View File

@ -810,7 +810,8 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
if (x->id() == vmIntrinsics::_dexp || x->id() == vmIntrinsics::_dlog) {
if (x->id() == vmIntrinsics::_dexp || x->id() == vmIntrinsics::_dlog ||
x->id() == vmIntrinsics::_dpow) {
do_LibmIntrinsic(x);
return;
}
@ -824,7 +825,6 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dcos:
case vmIntrinsics::_dtan:
case vmIntrinsics::_dlog10:
case vmIntrinsics::_dpow:
use_fpu = true;
}
} else {
@ -874,7 +874,6 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dcos: __ cos (calc_input, calc_result, tmp1, tmp2); break;
case vmIntrinsics::_dtan: __ tan (calc_input, calc_result, tmp1, tmp2); break;
case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1); break;
case vmIntrinsics::_dpow: __ pow (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
default: ShouldNotReachHere();
}
@ -890,11 +889,25 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
LIR_Opr calc_result = rlock_result(x);
LIR_Opr result_reg = result_register_for(x->type());
CallingConvention* cc = NULL;
if (x->id() == vmIntrinsics::_dpow) {
LIRItem value1(x->argument_at(1), this);
value1.set_destroys_register();
BasicTypeList signature(2);
signature.append(T_DOUBLE);
signature.append(T_DOUBLE);
cc = frame_map()->c_calling_convention(&signature);
value.load_item_force(cc->at(0));
value1.load_item_force(cc->at(1));
} else {
BasicTypeList signature(1);
signature.append(T_DOUBLE);
CallingConvention* cc = frame_map()->c_calling_convention(&signature);
cc = frame_map()->c_calling_convention(&signature);
value.load_item_force(cc->at(0));
}
#ifndef _LP64
LIR_Opr tmp = FrameMap::fpu0_double_opr;
@ -915,6 +928,14 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
__ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dlog), getThreadTemp(), result_reg, cc->args());
}
break;
case vmIntrinsics::_dpow:
if (VM_Version::supports_sse2()) {
__ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
}
else {
__ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), getThreadTemp(), result_reg, cc->args());
}
break;
default: ShouldNotReachHere();
}
#else
@ -925,6 +946,9 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dlog:
__ call_runtime_leaf(StubRoutines::dlog(), getThreadTemp(), result_reg, cc->args());
break;
case vmIntrinsics::_dpow:
__ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
break;
}
#endif
__ move(result_reg, calc_result);

View File

@ -840,53 +840,6 @@ void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
break;
}
case lir_pow: {
// pow needs two temporary fpu stack slots, so there are two temporary
// registers (stored in tmp1 and tmp2 of the operation).
// the stack allocator must guarantee that the stack slots are really free,
// otherwise there might be a stack overflow.
assert(left->is_fpu_register(), "must be");
assert(right->is_fpu_register(), "must be");
assert(res->is_fpu_register(), "must be");
assert(op2->tmp1_opr()->is_fpu_register(), "tmp1 is the first temporary register");
assert(op2->tmp2_opr()->is_fpu_register(), "tmp2 is the second temporary register");
assert(fpu_num(left) != fpu_num(right) && fpu_num(left) != fpu_num(op2->tmp1_opr()) && fpu_num(left) != fpu_num(op2->tmp2_opr()) && fpu_num(left) != fpu_num(res), "need distinct temp registers");
assert(fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(right) != fpu_num(op2->tmp2_opr()) && fpu_num(right) != fpu_num(res), "need distinct temp registers");
assert(fpu_num(op2->tmp1_opr()) != fpu_num(op2->tmp2_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
assert(fpu_num(op2->tmp2_opr()) != fpu_num(res), "need distinct temp registers");
insert_free_if_dead(op2->tmp1_opr());
insert_free_if_dead(op2->tmp2_opr());
// Must bring both operands to top of stack with following operand ordering:
// * fpu stack before pow: ... right left
// * fpu stack after pow: ... left
insert_free_if_dead(res, right);
if (tos_offset(right) != 1) {
insert_exchange(right);
insert_exchange(1);
}
insert_exchange(left);
assert(tos_offset(right) == 1, "check");
assert(tos_offset(left) == 0, "check");
new_left = to_fpu_stack_top(left);
new_right = to_fpu_stack(right);
op2->set_fpu_stack_size(sim()->stack_size());
assert(sim()->stack_size() <= 6, "at least two stack slots must be free");
sim()->pop();
do_rename(right, res);
new_res = to_fpu_stack_top(res);
break;
}
default: {
assert(false, "missed a fpu-operation");
}

View File

@ -149,10 +149,15 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
break;
case Interpreter::java_lang_math_pow:
__ fld_d(Address(rsp, 3*wordSize)); // second argument
__ pow_with_fallback(0);
// Store to stack to convert 80bit precision back to 64bits
__ push_fTOS();
__ pop_fTOS();
__ subptr(rsp, 4 * wordSize);
__ fstp_d(Address(rsp, 0));
__ fstp_d(Address(rsp, 2 * wordSize));
if (VM_Version::supports_sse2()) {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dpow())));
} else {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::dpow)));
}
__ addptr(rsp, 4 * wordSize);
break;
case Interpreter::java_lang_math_exp:
__ subptr(rsp, 2*wordSize);

View File

@ -255,6 +255,10 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
} else if (kind == Interpreter::java_lang_math_log) {
__ movdbl(xmm0, Address(rsp, wordSize));
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dlog())));
} else if (kind == Interpreter::java_lang_math_pow) {
__ movdbl(xmm1, Address(rsp, wordSize));
__ movdbl(xmm0, Address(rsp, 3 * wordSize));
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dpow())));
} else {
__ fld_d(Address(rsp, wordSize));
switch (kind) {
@ -273,11 +277,6 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
case Interpreter::java_lang_math_log10:
__ flog10();
break;
case Interpreter::java_lang_math_pow:
__ fld_d(Address(rsp, 3*wordSize)); // second argument (one
// empty stack slot)
__ pow_with_fallback(0);
break;
default :
ShouldNotReachHere();
}

View File

@ -3060,50 +3060,6 @@ void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
}
}
// Computes 2^X for the value X on top of the x87 stack and leaves the result
// on top of the stack. X is split into int(X) and X-int(X): the fractional
// part goes through f2xm1, while 2^int(X) is built by hand as the bit pattern
// of a double and multiplied in. When int(X)+1023 is not a usable biased
// exponent, the hand-built factor is replaced by a NaN encoding, so the final
// result is NaN. Temporarily reserves sizeof(jdouble) bytes on the thread's
// stack as scratch space.
void MacroAssembler::pow_exp_core_encoding() {
// kills rax, rcx, rdx
subptr(rsp,sizeof(jdouble));
// computes 2^X. Stack: X ...
// f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
// keep it on the thread's stack to compute 2^int(X) later
// then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
// final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
fld_s(0); // Stack: X X ...
frndint(); // Stack: int(X) X ...
fsuba(1); // Stack: int(X) X-int(X) ...
fistp_s(Address(rsp,0)); // move int(X) as integer to thread's stack. Stack: X-int(X) ...
f2xm1(); // Stack: 2^(X-int(X))-1 ...
fld1(); // Stack: 1 2^(X-int(X))-1 ...
faddp(1); // Stack: 2^(X-int(X))
// computes 2^(int(X)): add exponent bias (1023) to int(X), then
// shift int(X)+1023 to exponent position.
// Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
// bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
// values so detect them and set result to NaN.
movl(rax,Address(rsp,0));
movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
addl(rax, 1023);
movl(rdx,rax);
// rax becomes the high 32-bit word of the double: biased exponent moved to bit 20 and up.
shll(rax,20);
// Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
addl(rdx,1);
// Check that 1 < int(X)+1023+1 < 2048
// in 3 steps:
// 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
// 2- int(X)+1023+1 != 0
// 3- int(X)+1023+1 != 1
// Do 2- first because addl just updated the flags.
cmov32(Assembler::equal,rax,rcx);
cmpl(rdx,1);
cmov32(Assembler::equal,rax,rcx);
testl(rdx,rcx);
cmov32(Assembler::notEqual,rax,rcx);
// Assemble the 64-bit factor in the scratch slot: high word from rax, low word zero.
movl(Address(rsp,4),rax);
movl(Address(rsp,0),0);
fmul_d(Address(rsp,0)); // Stack: 2^X ...
addptr(rsp,sizeof(jdouble));
}
void MacroAssembler::increase_precision() {
subptr(rsp, BytesPerWord);
fnstcw(Address(rsp, 0));
@ -3119,194 +3075,6 @@ void MacroAssembler::restore_precision() {
addptr(rsp, BytesPerWord);
}
// Inline x87 power sequence: fyl2x followed by pow_exp_core_encoding, run
// between increase_precision() and restore_precision(). pow_or_exp invokes it
// with X on top of the x87 stack and Y directly below it. There is no
// fallback in here; an unusable result shows up as NaN.
void MacroAssembler::fast_pow() {
// computes X^Y = 2^(Y * log2(X))
// if fast computation is not possible, result is NaN. Requires
// fallback from user of this macro.
// increase precision for intermediate steps of the computation
BLOCK_COMMENT("fast_pow {");
increase_precision();
fyl2x(); // Stack: (Y*log2(X)) ...
pow_exp_core_encoding(); // Stack: 2^(Y*log2(X)) ...
restore_precision();
BLOCK_COMMENT("} fast_pow");
}
// Emits the x87 code for X^Y with a runtime fallback. On entry X is on top of
// the x87 stack with Y below it; on exit the result is in F-TOS.
//
// Paths, in the order they are tried:
//   * Y == 2: the result is X*X.
//   * not routed to x_negative: fast_pow on copies of X and Y; a NaN result
//     goes to the slow case.
//   * x_negative: Y must equal int(Y), otherwise the slow case is taken.
//     fast_pow runs on abs(X) and the result is negated when int(Y) is odd.
//   * slow case: runtime call to SharedRuntime::dpow through
//     fp_runtime_fallback.
//
// num_fpu_regs_in_use selects how the duplicated arguments are discarded
// (fxch/fpop when it is > 0, ffree otherwise) and is forwarded to
// fp_runtime_fallback.
void MacroAssembler::pow_or_exp(int num_fpu_regs_in_use) {
// kills rax, rcx, rdx
// pow and exp needs 2 extra registers on the fpu stack.
Label slow_case, done;
Register tmp = noreg;
if (!VM_Version::supports_cmov()) {
// fcmp needs a temporary so preserve rdx,
tmp = rdx;
}
Register tmp2 = rax;
Register tmp3 = rcx;
// Stack: X Y
Label x_negative, y_not_2;
static double two = 2.0;
ExternalAddress two_addr((address)&two);
// constant maybe too far on 64 bit
lea(tmp2, two_addr);
fld_d(Address(tmp2, 0)); // Stack: 2 X Y
fcmp(tmp, 2, true, false); // Stack: X Y
jcc(Assembler::parity, y_not_2);
jcc(Assembler::notEqual, y_not_2);
fxch(); fpop(); // Stack: X
fmul(0); // Stack: X*X
jmp(done);
bind(y_not_2);
fldz(); // Stack: 0 X Y
fcmp(tmp, 1, true, false); // Stack: X Y
jcc(Assembler::above, x_negative);
// X >= 0
fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
fld_s(1); // Stack: X Y X Y
fast_pow(); // Stack: X^Y X Y
fcmp(tmp, 0, false, false); // Stack: X^Y X Y
// X^Y not equal to itself: X^Y is NaN go to slow case.
jcc(Assembler::parity, slow_case);
// get rid of duplicate arguments. Stack: X^Y
if (num_fpu_regs_in_use > 0) {
fxch(); fpop();
fxch(); fpop();
} else {
ffree(2);
ffree(1);
}
jmp(done);
// X <= 0
bind(x_negative);
fld_s(1); // Stack: Y X Y
frndint(); // Stack: int(Y) X Y
fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
jcc(Assembler::notEqual, slow_case);
subptr(rsp, 8);
// For X^Y, when X < 0, Y has to be an integer and the final
// result depends on whether it's odd or even. We just checked
// that int(Y) == Y. We move int(Y) to gp registers as a 64 bit
// integer to test its parity. If int(Y) is huge and doesn't fit
// in the 64 bit integer range, the integer indefinite value will
// end up in the gp registers. Huge numbers are all even, the
// integer indefinite number is even so it's fine.
#ifdef ASSERT
// Let's check we don't end up with an integer indefinite number
// when not expected. First test for huge numbers: check whether
// int(Y)+1 == int(Y) which is true for very large numbers and
// those are all even. A 64 bit integer is guaranteed to not
// overflow for numbers where y+1 != y (when precision is set to
// double precision).
Label y_not_huge;
fld1(); // Stack: 1 int(Y) X Y
fadd(1); // Stack: 1+int(Y) int(Y) X Y
#ifdef _LP64
// trip to memory to force the precision down from double extended
// precision
fstp_d(Address(rsp, 0));
fld_d(Address(rsp, 0));
#endif
fcmp(tmp, 1, true, false); // Stack: int(Y) X Y
#endif
// move int(Y) as 64 bit integer to thread's stack
fistp_d(Address(rsp,0)); // Stack: X Y
#ifdef ASSERT
jcc(Assembler::notEqual, y_not_huge);
// Y is huge so we know it's even. It may not fit in a 64 bit
// integer and we don't want the debug code below to see the
// integer indefinite value so overwrite int(Y) on the thread's
// stack with 0.
movl(Address(rsp, 0), 0);
movl(Address(rsp, 4), 0);
bind(y_not_huge);
#endif
fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
fld_s(1); // Stack: X Y X Y
fabs(); // Stack: abs(X) Y X Y
fast_pow(); // Stack: abs(X)^Y X Y
fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
// abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
// The pops below reload int(Y) from the thread's stack into tmp2 (and tmp3
// on 32 bit) and release the 8 bytes reserved above before branching.
pop(tmp2);
NOT_LP64(pop(tmp3));
jcc(Assembler::parity, slow_case);
#ifdef ASSERT
// Check that int(Y) is not integer indefinite value (int
// overflow). Shouldn't happen because for values that would
// overflow, 1+int(Y)==Y which was tested earlier.
#ifndef _LP64
{
Label integer;
testl(tmp2, tmp2);
jcc(Assembler::notZero, integer);
cmpl(tmp3, 0x80000000);
jcc(Assembler::notZero, integer);
STOP("integer indefinite value shouldn't be seen here");
bind(integer);
}
#else
{
Label integer;
mov(tmp3, tmp2); // preserve tmp2 for parity check below
shlq(tmp3, 1);
jcc(Assembler::carryClear, integer);
jcc(Assembler::notZero, integer);
STOP("integer indefinite value shouldn't be seen here");
bind(integer);
}
#endif
#endif
// get rid of duplicate arguments. Stack: X^Y
if (num_fpu_regs_in_use > 0) {
fxch(); fpop();
fxch(); fpop();
} else {
ffree(2);
ffree(1);
}
// Lowest bit of int(Y) decides the sign of the result.
testl(tmp2, 1);
jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
// X <= 0, Y odd: X^Y = -abs(X)^Y
fchs(); // Stack: -abs(X)^Y Y
jmp(done);
// slow case: runtime call
bind(slow_case);
fpop(); // pop incorrect result or int(Y)
fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), 2, num_fpu_regs_in_use);
// Come here with result in F-TOS
bind(done);
}
void MacroAssembler::fpop() {
ffree();
fincstp();

View File

@ -918,24 +918,19 @@ class MacroAssembler: public Assembler {
void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
Register rax, Register rcx, Register rdx, Register tmp1 LP64_ONLY(COMMA Register tmp2));
void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
Register rdx NOT_LP64(COMMA Register tmp) LP64_ONLY(COMMA Register tmp1)
LP64_ONLY(COMMA Register tmp2) LP64_ONLY(COMMA Register tmp3) LP64_ONLY(COMMA Register tmp4));
void increase_precision();
void restore_precision();
// computes pow(x,y). Fallback to runtime call included.
void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(num_fpu_regs_in_use); }
private:
// call runtime as a fallback for trig functions and pow/exp.
void fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use);
// computes 2^(Ylog2X); Ylog2X in ST(0)
void pow_exp_core_encoding();
// computes pow(x,y) or exp(x). Fallback to runtime call included.
void pow_or_exp(int num_fpu_regs_in_use);
// these are private because users should be doing movflt/movdbl
void movss(Address dst, XMMRegister src) { Assembler::movss(dst, src); }

File diff suppressed because it is too large Load Diff

View File

@ -2126,15 +2126,6 @@ class StubGenerator: public StubCodeGenerator {
__ trigfunc('t');
__ ret(0);
}
{
StubCodeMark mark(this, "StubRoutines", "pow");
StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
__ fld_d(Address(rsp, 12));
__ fld_d(Address(rsp, 4));
__ pow_with_fallback(0);
__ ret(0);
}
}
// AES intrinsic stubs
@ -3082,6 +3073,30 @@ class StubGenerator: public StubCodeGenerator {
}
// Generates the stub that wraps MacroAssembler::fast_pow and returns the
// address of its first instruction.
address generate_libmPow() {
  address entry = __ pc();

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // fast_pow is handed xmm0-xmm7 together with rax, rcx, rdx and rbx (as tmp).
  __ fast_pow(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
              rax, rcx, rdx, rbx);

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return entry;
}
// Safefetch stubs.
@ -3310,6 +3325,7 @@ class StubGenerator: public StubCodeGenerator {
if (VM_Version::supports_sse2()) {
StubRoutines::_dexp = generate_libmExp();
StubRoutines::_dlog = generate_libmLog();
StubRoutines::_dpow = generate_libmPow();
}
}

View File

@ -3025,21 +3025,6 @@ class StubGenerator: public StubCodeGenerator {
__ addq(rsp, 8);
__ ret(0);
}
{
StubCodeMark mark(this, "StubRoutines", "pow");
StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
__ subq(rsp, 8);
__ movdbl(Address(rsp, 0), xmm1);
__ fld_d(Address(rsp, 0));
__ movdbl(Address(rsp, 0), xmm0);
__ fld_d(Address(rsp, 0));
__ pow_with_fallback(0);
__ fstp_d(Address(rsp, 0));
__ movdbl(xmm0, Address(rsp, 0));
__ addq(rsp, 8);
__ ret(0);
}
}
// AES intrinsic stubs
@ -4283,6 +4268,48 @@ class StubGenerator: public StubCodeGenerator {
}
// Generates the stub that wraps MacroAssembler::fast_pow and returns the
// address of its first instruction. Under _WIN64 the stub additionally
// preserves xmm6 and xmm7 around the computation.
address generate_libmPow() {
  address entry = __ pc();

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
  // xmm6 and xmm7 must be preserved: spill them to a 4-word stack area
  __ subptr(rsp, 4 * wordSize);
  __ movdqu(Address(rsp, 0), xmm6);
  __ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif

  // fast_pow is handed xmm0-xmm7, rax, rcx, rdx and r8-r11 (as tmp1-tmp4).
  __ fast_pow(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
              rax, rcx, rdx,
              r8, r9, r10, r11);

#ifdef _WIN64
  // reload the xmm registers belonging to the calling function and drop the spill area
  __ movdqu(xmm6, Address(rsp, 0));
  __ movdqu(xmm7, Address(rsp, 2 * wordSize));
  __ addptr(rsp, 4 * wordSize);
#endif

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return entry;
}
#undef __
#define __ masm->
@ -4478,6 +4505,7 @@ class StubGenerator: public StubCodeGenerator {
if (VM_Version::supports_sse2()) {
StubRoutines::_dexp = generate_libmExp();
StubRoutines::_dlog = generate_libmLog();
StubRoutines::_dpow = generate_libmPow();
}
}

View File

@ -9885,39 +9885,6 @@ instruct sqrtDPR_reg(regDPR dst, regDPR src) %{
ins_pipe( pipe_slow );
%}
// PowD rule for the x87 register classes, selected when UseSSE<=1.
// Matches Y = PowD(X, Y), so the result replaces Y. Only the inline fast_pow
// sequence is emitted here. rax, rdx, rcx and the flags are declared killed.
instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE<=1);
match(Set Y (PowD X Y)); // Raise X to the Yth power
effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
format %{ "fast_pow $X $Y -> $Y // KILL $rax, $rcx, $rdx" %}
ins_encode %{
// 8 bytes of stack are reserved around the sequence and released afterwards.
__ subptr(rsp, 8);
// NOTE(review): presumably pushes a copy of X's x87 slot on top of Y - confirm the "$X$$reg - 1" indexing.
__ fld_s($X$$reg - 1);
__ fast_pow();
__ addptr(rsp, 8);
%}
ins_pipe( pipe_slow );
%}
// PowD rule for XMM operands, selected when UseSSE>=2.
// The computation itself runs on the x87 stack: both inputs are bounced
// through an 8-byte stack slot into x87 registers, fast_pow is emitted, and
// the result is bounced back into dst. rax, rdx, rcx and the flags are
// declared killed.
instruct powD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE>=2);
match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power
effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
format %{ "fast_pow $src0 $src1 -> $dst // KILL $rax, $rcx, $rdx" %}
ins_encode %{
__ subptr(rsp, 8);
// Load src1 first, then src0, so src0 ends up on top of the x87 stack.
__ movdbl(Address(rsp, 0), $src1$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ movdbl(Address(rsp, 0), $src0$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ fast_pow();
// Store the x87 result to the stack slot and reload it into the XMM destination.
__ fstp_d(Address(rsp, 0));
__ movdbl($dst$$XMMRegister, Address(rsp, 0));
__ addptr(rsp, 8);
%}
ins_pipe( pipe_slow );
%}
instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
predicate (UseSSE<=1);
// The source Double operand on FPU stack

View File

@ -9864,24 +9864,6 @@ instruct log10D_reg(regD dst) %{
ins_pipe( pipe_slow );
%}
// PowD rule for XMM operands (no predicate).
// The computation itself runs on the x87 stack: both inputs are bounced
// through an 8-byte stack slot into x87 registers, fast_pow is emitted, and
// the result is bounced back into dst. rax, rdx, rcx and the flags are
// declared killed.
instruct powD_reg(regD dst, regD src0, regD src1, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{
match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power
effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
format %{ "fast_pow $src0 $src1 -> $dst // KILL $rax, $rcx, $rdx" %}
ins_encode %{
__ subptr(rsp, 8);
// Load src1 first, then src0, so src0 ends up on top of the x87 stack.
__ movdbl(Address(rsp, 0), $src1$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ movdbl(Address(rsp, 0), $src0$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ fast_pow();
// Store the x87 result to the stack slot and reload it into the XMM destination.
__ fstp_d(Address(rsp, 0));
__ movdbl($dst$$XMMRegister, Address(rsp, 0));
__ addptr(rsp, 8);
%}
ins_pipe( pipe_slow );
%}
//----------Arithmetic Conversion Instructions---------------------------------
instruct roundFloat_nop(regF dst)

View File

@ -4018,7 +4018,6 @@ int MatchRule::is_expensive() const {
strcmp(opType,"ModD")==0 ||
strcmp(opType,"ModF")==0 ||
strcmp(opType,"ModI")==0 ||
strcmp(opType,"PowD")==0 ||
strcmp(opType,"SinD")==0 ||
strcmp(opType,"SqrtD")==0 ||
strcmp(opType,"TanD")==0 ||

View File

@ -754,31 +754,6 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
break;
}
case lir_pow: {
assert(op->as_Op2() != NULL, "must be");
LIR_Op2* op2 = (LIR_Op2*)op;
// On x86 pow needs two temporary fpu stack slots: tmp1 and
// tmp2. Register input operands as temps to guarantee that it
// doesn't overlap with the temporary slots.
assert(op2->_info == NULL, "not used");
assert(op2->_opr1->is_valid() && op2->_opr2->is_valid(), "used");
assert(op2->_tmp1->is_valid() && op2->_tmp2->is_valid() && op2->_tmp3->is_valid()
&& op2->_tmp4->is_valid() && op2->_tmp5->is_valid(), "used");
assert(op2->_result->is_valid(), "used");
do_input(op2->_opr1); do_temp(op2->_opr1);
do_input(op2->_opr2); do_temp(op2->_opr2);
do_temp(op2->_tmp1);
do_temp(op2->_tmp2);
do_temp(op2->_tmp3);
do_temp(op2->_tmp4);
do_temp(op2->_tmp5);
do_output(op2->_result);
break;
}
// LIR_Op3
case lir_idiv:
case lir_irem: {
@ -1769,7 +1744,6 @@ const char * LIR_Op::name() const {
case lir_cos: s = "cos"; break;
case lir_tan: s = "tan"; break;
case lir_log10: s = "log10"; break;
case lir_pow: s = "pow"; break;
case lir_logic_and: s = "logic_and"; break;
case lir_logic_or: s = "logic_or"; break;
case lir_logic_xor: s = "logic_xor"; break;

View File

@ -962,7 +962,6 @@ enum LIR_Code {
, lir_cos
, lir_tan
, lir_log10
, lir_pow
, lir_logic_and
, lir_logic_or
, lir_logic_xor
@ -2198,7 +2197,6 @@ class LIR_List: public CompilationResourceObj {
void sin (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_sin , from, tmp1, to, tmp2)); }
void cos (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_cos , from, tmp1, to, tmp2)); }
void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }
void pow (LIR_Opr arg1, LIR_Opr arg2, LIR_Opr res, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_pow, arg1, arg2, res, tmp1, tmp2, tmp3, tmp4, tmp5)); }
void add (LIR_Opr left, LIR_Opr right, LIR_Opr res) { append(new LIR_Op2(lir_add, left, right, res)); }
void sub (LIR_Opr left, LIR_Opr right, LIR_Opr res, CodeEmitInfo* info = NULL) { append(new LIR_Op2(lir_sub, left, right, res, info)); }

View File

@ -740,7 +740,6 @@ void LIR_Assembler::emit_op2(LIR_Op2* op) {
case lir_tan:
case lir_cos:
case lir_log10:
case lir_pow:
intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op);
break;

View File

@ -6603,7 +6603,6 @@ void LinearScanStatistic::collect(LinearScan* allocator) {
case lir_cos:
case lir_abs:
case lir_log10:
case lir_pow:
case lir_logic_and:
case lir_logic_or:
case lir_logic_xor:

View File

@ -319,6 +319,7 @@ const char* Runtime1::name_for_address(address entry) {
FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32());
FUNCTION_CASE(entry, StubRoutines::dexp());
FUNCTION_CASE(entry, StubRoutines::dlog());
FUNCTION_CASE(entry, StubRoutines::dpow());
#undef FUNCTION_CASE

View File

@ -216,7 +216,6 @@ macro(PartialSubtypeCheck)
macro(Phi)
macro(PopCountI)
macro(PopCountL)
macro(PowD)
macro(PrefetchAllocation)
macro(Proj)
macro(RShiftI)

View File

@ -230,8 +230,6 @@ class LibraryCallKit : public GraphKit {
bool inline_math_negateExactL();
bool inline_math_subtractExactI(bool is_decrement);
bool inline_math_subtractExactL(bool is_decrement);
bool inline_pow();
Node* finish_pow_exp(Node* result, Node* x, Node* y, const TypeFunc* call_type, address funcAddr, const char* funcName);
bool inline_min_max(vmIntrinsics::ID id);
bool inline_notify(vmIntrinsics::ID id);
Node* generate_min_max(vmIntrinsics::ID id, Node* x, Node* y);
@ -1718,243 +1716,6 @@ bool LibraryCallKit::inline_trig(vmIntrinsics::ID id) {
return true;
}
// Post-processes the result of an inlined pow/exp node by guarding against NaN.
//   result    - node holding the intrinsic's result
//   x, y      - the intrinsic's arguments; y may be NULL, in which case only x
//               (plus its top() half) is passed to the runtime call
//   call_type - TypeFunc of the runtime fallback
//   funcAddr  - address of the runtime fallback
//   funcName  - name used for the runtime call
// Returns either `result` itself or a Phi merging `result` with the value of
// the runtime call.
//
// As long as too_many_traps(Reason_intrinsic) is false, a NaN result leads to
// an uncommon trap (Action_make_not_entrant). Otherwise an explicit branch is
// built whose slow side calls funcAddr as a leaf runtime call.
Node* LibraryCallKit::finish_pow_exp(Node* result, Node* x, Node* y, const TypeFunc* call_type, address funcAddr, const char* funcName) {
//-------------------
//result=(result.isNaN())? funcAddr():result;
// Check: If isNaN() by checking result!=result? then either trap
// or go to runtime
Node* cmpisnan = _gvn.transform(new CmpDNode(result, result));
// Build the boolean node
Node* bolisnum = _gvn.transform(new BoolNode(cmpisnan, BoolTest::eq));
if (!too_many_traps(Deoptimization::Reason_intrinsic)) {
{ BuildCutout unless(this, bolisnum, PROB_STATIC_FREQUENT);
// The pow or exp intrinsic returned a NaN, which requires a call
// to the runtime. Recompile with the runtime call.
uncommon_trap(Deoptimization::Reason_intrinsic,
Deoptimization::Action_make_not_entrant);
}
return result;
} else {
// If this inlining ever returned NaN in the past, we compile a call
// to the runtime to properly handle corner cases
IfNode* iff = create_and_xform_if(control(), bolisnum, PROB_STATIC_FREQUENT, COUNT_UNKNOWN);
Node* if_slow = _gvn.transform(new IfFalseNode(iff));
Node* if_fast = _gvn.transform(new IfTrueNode(iff));
// Only build the merge when the slow (NaN) branch is not top.
if (!if_slow->is_top()) {
RegionNode* result_region = new RegionNode(3);
PhiNode* result_val = new PhiNode(result_region, Type::DOUBLE);
// Input 1: fast path, the intrinsic's own result.
result_region->init_req(1, if_fast);
result_val->init_req(1, result);
set_control(if_slow);
const TypePtr* no_memory_effects = NULL;
Node* rt = make_runtime_call(RC_LEAF, call_type, funcAddr, funcName,
no_memory_effects,
x, top(), y, y ? top() : NULL);
Node* value = _gvn.transform(new ProjNode(rt, TypeFunc::Parms+0));
#ifdef ASSERT
Node* value_top = _gvn.transform(new ProjNode(rt, TypeFunc::Parms+1));
assert(value_top == top(), "second value must be top");
#endif
// Input 2: slow path, the value returned by the runtime call.
result_region->init_req(2, control());
result_val->init_req(2, value);
set_control(_gvn.transform(result_region));
return _gvn.transform(result_val);
} else {
return result;
}
}
}
//------------------------------inline_pow-------------------------------------
// Inline power instructions, if possible.
bool LibraryCallKit::inline_pow() {
// Pseudocode for pow
// if (y == 2) {
// return x * x;
// } else {
// if (x <= 0.0) {
// long longy = (long)y;
// if ((double)longy == y) { // if y is long
// if (y + 1 == y) longy = 0; // huge number: even
// result = ((1&longy) == 0)?-DPow(abs(x), y):DPow(abs(x), y);
// } else {
// result = NaN;
// }
// } else {
// result = DPow(x,y);
// }
// if (result != result)? {
// result = uncommon_trap() or runtime_call();
// }
// return result;
// }
Node* x = round_double_node(argument(0));
Node* y = round_double_node(argument(2));
Node* result = NULL;
Node* const_two_node = makecon(TypeD::make(2.0));
Node* cmp_node = _gvn.transform(new CmpDNode(y, const_two_node));
Node* bool_node = _gvn.transform(new BoolNode(cmp_node, BoolTest::eq));
IfNode* if_node = create_and_xform_if(control(), bool_node, PROB_STATIC_INFREQUENT, COUNT_UNKNOWN);
Node* if_true = _gvn.transform(new IfTrueNode(if_node));
Node* if_false = _gvn.transform(new IfFalseNode(if_node));
RegionNode* region_node = new RegionNode(3);
region_node->init_req(1, if_true);
Node* phi_node = new PhiNode(region_node, Type::DOUBLE);
// special case for x^y where y == 2, we can convert it to x * x
phi_node->init_req(1, _gvn.transform(new MulDNode(x, x)));
// set control to if_false since we will now process the false branch
set_control(if_false);
if (!too_many_traps(Deoptimization::Reason_intrinsic)) {
// Short form: skip the fancy tests and just check for NaN result.
result = _gvn.transform(new PowDNode(C, control(), x, y));
} else {
// If this inlining ever returned NaN in the past, include all
// checks + call to the runtime.
// Set the merge point for If node with condition of (x <= 0.0)
// There are four possible paths to region node and phi node
RegionNode *r = new RegionNode(4);
Node *phi = new PhiNode(r, Type::DOUBLE);
// Build the first if node: if (x <= 0.0)
// Node for 0 constant
Node *zeronode = makecon(TypeD::ZERO);
// Check x:0
Node *cmp = _gvn.transform(new CmpDNode(x, zeronode));
// Check: If (x<=0) then go complex path
Node *bol1 = _gvn.transform(new BoolNode( cmp, BoolTest::le ));
// Branch either way
IfNode *if1 = create_and_xform_if(control(),bol1, PROB_STATIC_INFREQUENT, COUNT_UNKNOWN);
// Fast path taken; set region slot 3
Node *fast_taken = _gvn.transform(new IfFalseNode(if1));
r->init_req(3,fast_taken); // Capture fast-control
// Fast path not-taken, i.e. slow path
Node *complex_path = _gvn.transform(new IfTrueNode(if1));
// Set fast path result
Node *fast_result = _gvn.transform(new PowDNode(C, control(), x, y));
phi->init_req(3, fast_result);
// Complex path
// Build the second if node (if y is long)
// Node for (long)y
Node *longy = _gvn.transform(new ConvD2LNode(y));
// Node for (double)((long) y)
Node *doublelongy= _gvn.transform(new ConvL2DNode(longy));
// Check (double)((long) y) : y
Node *cmplongy= _gvn.transform(new CmpDNode(doublelongy, y));
// Check if (y isn't long) then go to slow path
Node *bol2 = _gvn.transform(new BoolNode( cmplongy, BoolTest::ne ));
// Branch either way
IfNode *if2 = create_and_xform_if(complex_path,bol2, PROB_STATIC_INFREQUENT, COUNT_UNKNOWN);
Node* ylong_path = _gvn.transform(new IfFalseNode(if2));
Node *slow_path = _gvn.transform(new IfTrueNode(if2));
// Calculate DPow(abs(x), y)*(1 & (long)y)
// Node for constant 1
Node *conone = longcon(1);
// 1& (long)y
Node *signnode= _gvn.transform(new AndLNode(conone, longy));
// A huge number is always even. Detect a huge number by checking
// if y + 1 == y and set integer to be tested for parity to 0.
// Required for corner case:
// (long)9.223372036854776E18 = max_jlong
// (double)(long)9.223372036854776E18 = 9.223372036854776E18
// max_jlong is odd but 9.223372036854776E18 is even
Node* yplus1 = _gvn.transform(new AddDNode(y, makecon(TypeD::make(1))));
Node *cmpyplus1= _gvn.transform(new CmpDNode(yplus1, y));
Node *bolyplus1 = _gvn.transform(new BoolNode( cmpyplus1, BoolTest::eq ));
Node* correctedsign = NULL;
if (ConditionalMoveLimit != 0) {
correctedsign = _gvn.transform(CMoveNode::make(NULL, bolyplus1, signnode, longcon(0), TypeLong::LONG));
} else {
IfNode *ifyplus1 = create_and_xform_if(ylong_path,bolyplus1, PROB_FAIR, COUNT_UNKNOWN);
RegionNode *r = new RegionNode(3);
Node *phi = new PhiNode(r, TypeLong::LONG);
r->init_req(1, _gvn.transform(new IfFalseNode(ifyplus1)));
r->init_req(2, _gvn.transform(new IfTrueNode(ifyplus1)));
phi->init_req(1, signnode);
phi->init_req(2, longcon(0));
correctedsign = _gvn.transform(phi);
ylong_path = _gvn.transform(r);
record_for_igvn(r);
}
// zero node
Node *conzero = longcon(0);
// Check (1&(long)y)==0?
Node *cmpeq1 = _gvn.transform(new CmpLNode(correctedsign, conzero));
// Check if (1&(long)y)!=0?, if so the result is negative
Node *bol3 = _gvn.transform(new BoolNode( cmpeq1, BoolTest::ne ));
// abs(x)
Node *absx=_gvn.transform(new AbsDNode(x));
// abs(x)^y
Node *absxpowy = _gvn.transform(new PowDNode(C, control(), absx, y));
// -abs(x)^y
Node *negabsxpowy = _gvn.transform(new NegDNode (absxpowy));
// (1&(long)y)==1?-DPow(abs(x), y):DPow(abs(x), y)
Node *signresult = NULL;
if (ConditionalMoveLimit != 0) {
signresult = _gvn.transform(CMoveNode::make(NULL, bol3, absxpowy, negabsxpowy, Type::DOUBLE));
} else {
IfNode *ifyeven = create_and_xform_if(ylong_path,bol3, PROB_FAIR, COUNT_UNKNOWN);
RegionNode *r = new RegionNode(3);
Node *phi = new PhiNode(r, Type::DOUBLE);
r->init_req(1, _gvn.transform(new IfFalseNode(ifyeven)));
r->init_req(2, _gvn.transform(new IfTrueNode(ifyeven)));
phi->init_req(1, absxpowy);
phi->init_req(2, negabsxpowy);
signresult = _gvn.transform(phi);
ylong_path = _gvn.transform(r);
record_for_igvn(r);
}
// Set complex path fast result
r->init_req(2, ylong_path);
phi->init_req(2, signresult);
static const jlong nan_bits = CONST64(0x7ff8000000000000);
Node *slow_result = makecon(TypeD::make(*(double*)&nan_bits)); // return NaN
r->init_req(1,slow_path);
phi->init_req(1,slow_result);
// Post merge
set_control(_gvn.transform(r));
record_for_igvn(r);
result = _gvn.transform(phi);
}
result = finish_pow_exp(result, x, y, OptoRuntime::Math_DD_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dpow), "POW");
// control from finish_pow_exp is now input to the region node
region_node->set_req(2, control());
// the result from finish_pow_exp is now input to the phi node
phi_node->init_req(2, result);
set_control(_gvn.transform(region_node));
record_for_igvn(region_node);
set_result(_gvn.transform(phi_node));
C->set_has_split_ifs(true); // Has chance for split-if optimization
return true;
}
//------------------------------runtime_math-----------------------------
bool LibraryCallKit::runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName) {
assert(call_type == OptoRuntime::Math_DD_D_Type() || call_type == OptoRuntime::Math_D_D_Type(),
@ -2005,7 +1766,9 @@ bool LibraryCallKit::inline_math_native(vmIntrinsics::ID id) {
return StubRoutines::dexp() != NULL ?
runtime_math(OptoRuntime::Math_D_D_Type(), StubRoutines::dexp(), "dexp") :
runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dexp), "EXP");
case vmIntrinsics::_dpow: return Matcher::has_match_rule(Op_PowD) ? inline_pow() :
case vmIntrinsics::_dpow:
return StubRoutines::dpow() != NULL ?
runtime_math(OptoRuntime::Math_DD_D_Type(), StubRoutines::dpow(), "dpow") :
runtime_math(OptoRuntime::Math_DD_D_Type(), FN_PTR(SharedRuntime::dpow), "POW");
#undef FN_PTR

View File

@ -1519,17 +1519,3 @@ const Type *Log10DNode::Value( PhaseTransform *phase ) const {
return TypeD::make( StubRoutines::intrinsic_log10( d ) );
}
//=============================================================================
//------------------------------Value------------------------------------------
// Compute pow.
// Folds the node to a double constant when both operands are double
// constants; otherwise yields TOP for a TOP operand, or the generic DOUBLE
// type when an operand is not a constant. The first operand is fully
// classified before the second one is looked at.
const Type *PowDNode::Value( PhaseTransform *phase ) const {
  // Operand 1: the value being raised.
  const Type *base_type = phase->type( in(1) );
  if( base_type == Type::TOP ) {
    return Type::TOP;
  }
  if( base_type->base() != Type::DoubleCon ) {
    return Type::DOUBLE;
  }

  // Operand 2: the exponent.
  const Type *exponent_type = phase->type( in(2) );
  if( exponent_type == Type::TOP ) {
    return Type::TOP;
  }
  if( exponent_type->base() != Type::DoubleCon ) {
    return Type::DOUBLE;
  }

  // Both operands are constants: evaluate the power now.
  double base_value = base_type->getd();
  double exponent_value = exponent_type->getd();
  return TypeD::make( StubRoutines::intrinsic_pow( base_value, exponent_value ) );
}

View File

@ -491,20 +491,6 @@ public:
virtual const Type *Value( PhaseTransform *phase ) const;
};
//------------------------------PowDNode---------------------------------------
// Raise a double to a double power.
// The value inputs are in(1) (the number being raised) and in(2) (the
// exponent); the result is a double.
class PowDNode : public Node {
public:
// Forwards c, in1 and in2 to the Node constructor in that order, then flags
// the node with Flag_is_expensive and hands it to the Compile object C via
// add_expensive_node().
PowDNode(Compile* C, Node *c, Node *in1, Node *in2 ) : Node(c, in1, in2) {
init_flags(Flag_is_expensive);
C->add_expensive_node(this);
}
virtual int Opcode() const;
// The widest type this node can produce is Type::DOUBLE.
const Type *bottom_type() const { return Type::DOUBLE; }
// The result is reported as Op_RegD.
virtual uint ideal_reg() const { return Op_RegD; }
// Type computation for this node; defined out of line.
virtual const Type *Value( PhaseTransform *phase ) const;
};
//-------------------------------ReverseBytesINode--------------------------------
// reverse bytes of an integer
class ReverseBytesINode : public Node {

View File

@ -153,9 +153,9 @@ address StubRoutines::_vectorizedMismatch = NULL;
// Stub entry addresses for dexp/dlog/dpow. All start out as NULL.
address StubRoutines::_dexp = NULL;
address StubRoutines::_dlog = NULL;
address StubRoutines::_dpow = NULL;
// Function pointers for the intrinsic math routines; also NULL initially.
double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
double (* StubRoutines::_intrinsic_pow )(double, double) = NULL;
double (* StubRoutines::_intrinsic_sin )(double) = NULL;
double (* StubRoutines::_intrinsic_cos )(double) = NULL;
double (* StubRoutines::_intrinsic_tan )(double) = NULL;

View File

@ -212,6 +212,7 @@ class StubRoutines: AllStatic {
// Entry addresses of the dexp, dlog and dpow stubs.
static address _dexp;
static address _dlog;
static address _dpow;
// These are versions of the java.lang.Math methods which perform
// the same operations as the intrinsic version. They are used for
@ -384,6 +385,7 @@ class StubRoutines: AllStatic {
// Accessors that return the stored stub address unchanged.
static address dexp() { return _dexp; }
static address dlog() { return _dlog; }
static address dpow() { return _dpow; }
static address select_fill_function(BasicType t, bool aligned, const char* &name);

View File

@ -860,6 +860,7 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
static_field(StubRoutines, _mulAdd, address) \
static_field(StubRoutines, _dexp, address) \
static_field(StubRoutines, _dlog, address) \
static_field(StubRoutines, _dpow, address) \
static_field(StubRoutines, _vectorizedMismatch, address) \
static_field(StubRoutines, _jbyte_arraycopy, address) \
static_field(StubRoutines, _jshort_arraycopy, address) \
@ -2058,7 +2059,6 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
declare_c2_type(AtanDNode, Node) \
declare_c2_type(SqrtDNode, Node) \
declare_c2_type(Log10DNode, Node) \
declare_c2_type(PowDNode, Node) \
declare_c2_type(ReverseBytesINode, Node) \
declare_c2_type(ReverseBytesLNode, Node) \
declare_c2_type(ReductionNode, Node) \