8132207: update for x86 exp in the math lib

Add new java.lang.Math intrinsics for x86

Reviewed-by: kvn, iveresov
Vivek R Deshpande 2015-10-05 20:02:40 -07:00 committed by Igor Veresov
parent 12cba200a3
commit 5a633b1cbd
28 changed files with 1022 additions and 307 deletions


@@ -770,6 +770,7 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
case 0x55: // andnps
case 0x56: // orps
case 0x57: // xorps
case 0x59: //mulpd
case 0x6E: // movd
case 0x7E: // movd
case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
@@ -3030,6 +3031,15 @@ void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
emit_int8(imm8);
}
void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse2(), "");
int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ true,
VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
emit_int8(0x15);
emit_int8((unsigned char)(0xC0 | encode));
emit_int8(imm8);
}
void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");
int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
@@ -3048,6 +3058,15 @@ void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
emit_int8(imm8);
}
void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) {
assert(VM_Version::supports_sse2(), "");
int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
emit_int8((unsigned char)0xC4);
emit_int8((unsigned char)(0xC0 | encode));
emit_int8(imm8);
}
void Assembler::pmovzxbw(XMMRegister dst, Address src) {
assert(VM_Version::supports_sse4_1(), "");
if (VM_Version::supports_evex()) {
@@ -4063,6 +4082,16 @@ void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
}
}
void Assembler::mulpd(XMMRegister dst, Address src) {
_instruction_uses_vl = true;
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
if (VM_Version::supports_evex()) {
emit_simd_arith_q(0x59, dst, src, VEX_SIMD_66);
} else {
emit_simd_arith(0x59, dst, src, VEX_SIMD_66);
}
}
void Assembler::mulps(XMMRegister dst, XMMRegister src) {
_instruction_uses_vl = true;
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -4251,6 +4280,26 @@ void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, int vector
emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
}
void Assembler::unpckhpd(XMMRegister dst, XMMRegister src) {
_instruction_uses_vl = true;
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
if (VM_Version::supports_evex()) {
emit_simd_arith_q(0x15, dst, src, VEX_SIMD_66);
} else {
emit_simd_arith(0x15, dst, src, VEX_SIMD_66);
}
}
void Assembler::unpcklpd(XMMRegister dst, XMMRegister src) {
_instruction_uses_vl = true;
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
if (VM_Version::supports_evex()) {
emit_simd_arith_q(0x14, dst, src, VEX_SIMD_66);
} else {
emit_simd_arith(0x14, dst, src, VEX_SIMD_66);
}
}
void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
if (VM_Version::supports_avx512dq()) {
@@ -4871,8 +4920,9 @@ void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int
}
// AND packed integers
// logical operations packed integers
void Assembler::pand(XMMRegister dst, XMMRegister src) {
_instruction_uses_vl = true;
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
emit_simd_arith(0xDB, dst, src, VEX_SIMD_66);
}
@@ -4893,6 +4943,17 @@ void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_
emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
}
void Assembler::pandn(XMMRegister dst, XMMRegister src) {
_instruction_uses_vl = true;
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
if (VM_Version::supports_evex()) {
emit_simd_arith_q(0xDF, dst, src, VEX_SIMD_66);
}
else {
emit_simd_arith(0xDF, dst, src, VEX_SIMD_66);
}
}
void Assembler::por(XMMRegister dst, XMMRegister src) {
_instruction_uses_vl = true;
NOT_LP64(assert(VM_Version::supports_sse2(), ""));


@@ -1679,10 +1679,14 @@ private:
// SSE 4.1 extract
void pextrd(Register dst, XMMRegister src, int imm8);
void pextrq(Register dst, XMMRegister src, int imm8);
// SSE 2 extract
void pextrw(Register dst, XMMRegister src, int imm8);
// SSE 4.1 insert
void pinsrd(XMMRegister dst, Register src, int imm8);
void pinsrq(XMMRegister dst, Register src, int imm8);
// SSE 2 insert
void pinsrw(XMMRegister dst, Register src, int imm8);
// SSE4.1 packed move
void pmovzxbw(XMMRegister dst, XMMRegister src);
@@ -1933,6 +1937,7 @@ private:
// Multiply Packed Floating-Point Values
void mulpd(XMMRegister dst, XMMRegister src);
void mulpd(XMMRegister dst, Address src);
void mulps(XMMRegister dst, XMMRegister src);
void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -1959,6 +1964,9 @@ private:
void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void unpckhpd(XMMRegister dst, XMMRegister src);
void unpcklpd(XMMRegister dst, XMMRegister src);
// Bitwise Logical XOR of Packed Floating-Point Values
void xorpd(XMMRegister dst, XMMRegister src);
void xorps(XMMRegister dst, XMMRegister src);
@@ -2054,6 +2062,9 @@ private:
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Andn packed integers
void pandn(XMMRegister dst, XMMRegister src);
// Or packed integers
void por(XMMRegister dst, XMMRegister src);
void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);


@@ -2457,9 +2457,6 @@ void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, L
// Should consider not saving rbx, if not necessary
__ trigfunc('t', op->as_Op2()->fpu_stack_size());
break;
case lir_exp :
__ exp_with_fallback(op->as_Op2()->fpu_stack_size());
break;
case lir_pow :
__ pow_with_fallback(op->as_Op2()->fpu_stack_size());
break;


@@ -808,6 +808,12 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
if (x->id() == vmIntrinsics::_dexp) {
do_ExpIntrinsic(x);
return;
}
LIRItem value(x->argument_at(0), this);
bool use_fpu = false;
@@ -818,7 +824,6 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dtan:
case vmIntrinsics::_dlog:
case vmIntrinsics::_dlog10:
case vmIntrinsics::_dexp:
case vmIntrinsics::_dpow:
use_fpu = true;
}
@@ -870,7 +875,6 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dtan: __ tan (calc_input, calc_result, tmp1, tmp2); break;
case vmIntrinsics::_dlog: __ log (calc_input, calc_result, tmp1); break;
case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1); break;
case vmIntrinsics::_dexp: __ exp (calc_input, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
case vmIntrinsics::_dpow: __ pow (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
default: ShouldNotReachHere();
}
@@ -880,6 +884,32 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
}
}
void LIRGenerator::do_ExpIntrinsic(Intrinsic* x) {
LIRItem value(x->argument_at(0), this);
value.set_destroys_register();
LIR_Opr calc_result = rlock_result(x);
LIR_Opr result_reg = result_register_for(x->type());
BasicTypeList signature(1);
signature.append(T_DOUBLE);
CallingConvention* cc = frame_map()->c_calling_convention(&signature);
value.load_item_force(cc->at(0));
#ifndef _LP64
LIR_Opr tmp = FrameMap::fpu0_double_opr;
result_reg = tmp;
if (VM_Version::supports_sse2()) {
__ call_runtime_leaf(StubRoutines::dexp(), getThreadTemp(), result_reg, cc->args());
} else {
__ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dexp), getThreadTemp(), result_reg, cc->args());
}
#else
__ call_runtime_leaf(StubRoutines::dexp(), getThreadTemp(), result_reg, cc->args());
#endif
__ move(result_reg, calc_result);
}
void LIRGenerator::do_ArrayCopy(Intrinsic* x) {
assert(x->number_of_arguments() == 5, "wrong type");


@@ -814,8 +814,7 @@ void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
case lir_tan:
case lir_sin:
case lir_cos:
case lir_exp: {
case lir_cos: {
// sin, cos and exp need two temporary fpu stack slots, so there are two temporary
// registers (stored in right and temp of the operation).
// the stack allocator must guarantee that the stack slots are really free,


@@ -151,11 +151,15 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
__ pop_fTOS();
break;
case Interpreter::java_lang_math_exp:
__ exp_with_fallback(0);
// Store to stack to convert 80bit precision back to 64bits
__ push_fTOS();
__ pop_fTOS();
break;
__ subptr(rsp, 2*wordSize);
__ fstp_d(Address(rsp, 0));
if (VM_Version::supports_sse2()) {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dexp())));
} else {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::dexp)));
}
__ addptr(rsp, 2*wordSize);
break;
default :
ShouldNotReachHere();
}


@@ -252,6 +252,9 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
if (kind == Interpreter::java_lang_math_sqrt) {
__ sqrtsd(xmm0, Address(rsp, wordSize));
} else if (kind == Interpreter::java_lang_math_exp) {
__ movdbl(xmm0, Address(rsp, wordSize));
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dexp())));
} else {
__ fld_d(Address(rsp, wordSize));
switch (kind) {
@@ -278,9 +281,6 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
// empty stack slot)
__ pow_with_fallback(0);
break;
case Interpreter::java_lang_math_exp:
__ exp_with_fallback(0);
break;
default :
ShouldNotReachHere();
}


@@ -3033,6 +3033,15 @@ void MacroAssembler::fldcw(AddressLiteral src) {
Assembler::fldcw(as_Address(src));
}
void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
if (reachable(src)) {
Assembler::mulpd(dst, as_Address(src));
} else {
lea(rscratch1, src);
Assembler::mulpd(dst, Address(rscratch1, 0));
}
}
void MacroAssembler::pow_exp_core_encoding() {
// kills rax, rcx, rdx
subptr(rsp,sizeof(jdouble));
@@ -3105,19 +3114,7 @@ void MacroAssembler::fast_pow() {
BLOCK_COMMENT("} fast_pow");
}

void MacroAssembler::fast_exp() {
// computes exp(X) = 2^(X * log2(e))
// if fast computation is not possible, result is NaN. Requires
// fallback from user of this macro.
// increase precision for intermediate steps of the computation
increase_precision();
fldl2e();                // Stack: log2(e) X ...
fmulp(1);                // Stack: (X*log2(e)) ...
pow_exp_core_encoding(); // Stack: exp(X) ...
restore_precision();
}

void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
void MacroAssembler::pow_or_exp(int num_fpu_regs_in_use) {
// kills rax, rcx, rdx
// pow and exp needs 2 extra registers on the fpu stack.
Label slow_case, done;
@@ -3129,182 +3126,164 @@ void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
Register tmp2 = rax;
Register tmp3 = rcx;

if (is_exp) {
// Stack: X
fld_s(0);                   // duplicate argument for runtime call. Stack: X X
fast_exp();                 // Stack: exp(X) X
fcmp(tmp, 0, false, false); // Stack: exp(X) X
// exp(X) not equal to itself: exp(X) is NaN go to slow case.
jcc(Assembler::parity, slow_case);
// get rid of duplicate argument. Stack: exp(X)
if (num_fpu_regs_in_use > 0) {
fxch();
fpop();
} else {
ffree(1);
}
jmp(done);
} else {

// Stack: X Y
Label x_negative, y_not_2;

static double two = 2.0;
ExternalAddress two_addr((address)&two);

// constant maybe too far on 64 bit
lea(tmp2, two_addr);
fld_d(Address(tmp2, 0));    // Stack: 2 X Y
fcmp(tmp, 2, true, false);  // Stack: X Y
jcc(Assembler::parity, y_not_2);
jcc(Assembler::notEqual, y_not_2);

fxch(); fpop();             // Stack: X
fmul(0);                    // Stack: X*X

jmp(done);

bind(y_not_2);

fldz();                     // Stack: 0 X Y
fcmp(tmp, 1, true, false);  // Stack: X Y
jcc(Assembler::above, x_negative);

// X >= 0
fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
fld_s(1);                   // Stack: X Y X Y
fast_pow();                 // Stack: X^Y X Y
fcmp(tmp, 0, false, false); // Stack: X^Y X Y
// X^Y not equal to itself: X^Y is NaN go to slow case.
jcc(Assembler::parity, slow_case);
// get rid of duplicate arguments. Stack: X^Y
if (num_fpu_regs_in_use > 0) {
fxch(); fpop();
fxch(); fpop();
} else {
ffree(2);
ffree(1);
}
jmp(done);

// X <= 0
bind(x_negative);

fld_s(1);                   // Stack: Y X Y
frndint();                  // Stack: int(Y) X Y
fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
jcc(Assembler::notEqual, slow_case);

subptr(rsp, 8);

// For X^Y, when X < 0, Y has to be an integer and the final
// result depends on whether it's odd or even. We just checked
// that int(Y) == Y. We move int(Y) to gp registers as a 64 bit
// integer to test its parity. If int(Y) is huge and doesn't fit
// in the 64 bit integer range, the integer indefinite value will
// end up in the gp registers. Huge numbers are all even, the
// integer indefinite number is even so it's fine.

#ifdef ASSERT
// Let's check we don't end up with an integer indefinite number
// when not expected. First test for huge numbers: check whether
// int(Y)+1 == int(Y) which is true for very large numbers and
// those are all even. A 64 bit integer is guaranteed to not
// overflow for numbers where y+1 != y (when precision is set to
// double precision).
Label y_not_huge;

fld1();                     // Stack: 1 int(Y) X Y
fadd(1);                    // Stack: 1+int(Y) int(Y) X Y

#ifdef _LP64
// trip to memory to force the precision down from double extended
// precision
fstp_d(Address(rsp, 0));
fld_d(Address(rsp, 0));
#endif

fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
#endif

// move int(Y) as 64 bit integer to thread's stack
fistp_d(Address(rsp,0));    // Stack: X Y

#ifdef ASSERT
jcc(Assembler::notEqual, y_not_huge);

// Y is huge so we know it's even. It may not fit in a 64 bit
// integer and we don't want the debug code below to see the
// integer indefinite value so overwrite int(Y) on the thread's
// stack with 0.
movl(Address(rsp, 0), 0);
movl(Address(rsp, 4), 0);

bind(y_not_huge);
#endif

fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
fld_s(1);                   // Stack: X Y X Y
fabs();                     // Stack: abs(X) Y X Y
fast_pow();                 // Stack: abs(X)^Y X Y
fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
// abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.

pop(tmp2);
NOT_LP64(pop(tmp3));

jcc(Assembler::parity, slow_case);

#ifdef ASSERT
// Check that int(Y) is not integer indefinite value (int
// overflow). Shouldn't happen because for values that would
// overflow, 1+int(Y)==Y which was tested earlier.
#ifndef _LP64
{
Label integer;
testl(tmp2, tmp2);
jcc(Assembler::notZero, integer);
cmpl(tmp3, 0x80000000);
jcc(Assembler::notZero, integer);
STOP("integer indefinite value shouldn't be seen here");
bind(integer);
}
#else
{
Label integer;
mov(tmp3, tmp2); // preserve tmp2 for parity check below
shlq(tmp3, 1);
jcc(Assembler::carryClear, integer);
jcc(Assembler::notZero, integer);
STOP("integer indefinite value shouldn't be seen here");
bind(integer);
}
#endif
#endif

// get rid of duplicate arguments. Stack: X^Y
if (num_fpu_regs_in_use > 0) {
fxch(); fpop();
fxch(); fpop();
} else {
ffree(2);
ffree(1);
}

testl(tmp2, 1);
jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
// X <= 0, Y even: X^Y = -abs(X)^Y
fchs();                     // Stack: -abs(X)^Y Y
jmp(done);
}

// slow case: runtime call
bind(slow_case);

fpop();                     // pop incorrect result or int(Y)

fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
is_exp ? 1 : 2, num_fpu_regs_in_use);
fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), 2, num_fpu_regs_in_use);

// Come here with result in F-TOS
bind(done);


@@ -907,14 +907,14 @@ class MacroAssembler: public Assembler {
// all corner cases and may result in NaN and require fallback to a
// runtime call.
void fast_pow();
void fast_exp();
void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
Register rax, Register rcx, Register rdx, Register tmp);
void increase_precision();
void restore_precision();
// computes exp(x). Fallback to runtime call included.
void exp_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(true, num_fpu_regs_in_use); }
// computes pow(x,y). Fallback to runtime call included.
void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(false, num_fpu_regs_in_use); }
void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(num_fpu_regs_in_use); }

private:
@@ -925,7 +925,7 @@
void pow_exp_core_encoding();
// computes pow(x,y) or exp(x). Fallback to runtime call included.
void pow_or_exp(bool is_exp, int num_fpu_regs_in_use);
void pow_or_exp(int num_fpu_regs_in_use);
// these are private because users should be doing movflt/movdbl
@@ -971,6 +971,10 @@
void movsd(XMMRegister dst, Address src) { Assembler::movsd(dst, src); }
void movsd(XMMRegister dst, AddressLiteral src);
void mulpd(XMMRegister dst, XMMRegister src) { Assembler::mulpd(dst, src); }
void mulpd(XMMRegister dst, Address src) { Assembler::mulpd(dst, src); }
void mulpd(XMMRegister dst, AddressLiteral src);
void mulsd(XMMRegister dst, XMMRegister src) { Assembler::mulsd(dst, src); }
void mulsd(XMMRegister dst, Address src) { Assembler::mulsd(dst, src); }
void mulsd(XMMRegister dst, AddressLiteral src);


@@ -0,0 +1,677 @@
/*
* Copyright (c) 2015, Intel Corporation.
* Intel Math Library (LIBM) Source Code
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/******************************************************************************/
//                     ALGORITHM DESCRIPTION
//                     ---------------------
//
// Description:
//  Let K = 64 (table size).
//
//    e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
//
//  where
//    x = m*log(2)/K + y,   y in [-log(2)/K .. log(2)/K]
//    m = n*K + j,          m, n, j - signed integers, j in [-K/2 .. K/2]
//
//  Values of 2^(j/K) are tabulated as T[j] = T_hi[j] * (1 + T_lo[j]).
//
//  P(y) is a minimax polynomial approximation of exp(x)-1 on the small
//  interval [-log(2)/K .. log(2)/K] (coefficients calculated with Maple V).
//
//  To avoid problems with arithmetic overflow and underflow, the value
//  2^n is safely computed as 2^n1 * 2^n2, where n1 is in [-BIAS/2 .. BIAS/2]
//  and BIAS is the value of the exponent bias.
//
// Special cases:
//  exp(NaN) = NaN
//  exp(+INF) = +INF
//  exp(-INF) = 0
//  exp(x) = 1 for subnormals
//  For a finite argument, only exp(0) = 1 is exact.
//  For IEEE double:
//    if x >  709.782712893383973096 then exp(x) overflows
//    if x < -745.133219101941108420 then exp(x) underflows
//
/******************************************************************************/
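// ---------------------------------------------------------------------------
// For intuition only: a scalar C++ sketch of the reduction described above.
// It is not part of the generated stub; K, TBL and the degree-3 polynomial
// are stand-in choices (the real code keeps T[j] split into hi/lo parts,
// uses the tabulated minimax coefficients, and runs entirely in SSE2).
//
//   #include <cmath>
//
//   static const int K = 64;             // table size, as in the comment above
//   static double    TBL[K];             // T[j] ~ 2^(j/K), filled once
//
//   static void exp_sketch_init() {
//     for (int j = 0; j < K; j++) TBL[j] = std::exp2((double)j / K);
//   }
//
//   static double exp_sketch(double x) {
//     // x = m*log(2)/K + y, with m = n*K + j
//     double m  = std::nearbyint(x * K / std::log(2.0));
//     double y  = x - m * std::log(2.0) / K;
//     int    mi = (int)m;
//     int    j  = mi & (K - 1);           // j in [0..K-1] in this sketch
//     int    n  = (mi - j) / K;
//     double p  = y + y*y/2.0 + y*y*y/6.0;      // low-degree stand-in for P(y)
//     return std::ldexp(TBL[j] * (1.0 + p), n); // 2^n * T[j] * (1 + P(y))
//   }
//
//   // e.g. exp_sketch(1.0) agrees with std::exp(1.0) to roughly 1e-10.
// ---------------------------------------------------------------------------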
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "macroAssembler_x86.hpp"
#ifdef _MSC_VER
#define ALIGNED_(x) __declspec(align(x))
#else
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#ifdef _LP64
ALIGNED_(16) juint _cv[] =
{
0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL, 0xfefa0000UL,
0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL, 0xbc9e3b3aUL, 0x3d1cf79aUL,
0xbc9e3b3aUL, 0x3d1cf79aUL, 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL,
0x3fdfffffUL, 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL,
0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
};
ALIGNED_(16) juint _shifter[] =
{
0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
};
ALIGNED_(16) juint _mmask[] =
{
0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
};
ALIGNED_(16) juint _bias[] =
{
0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
};
ALIGNED_(16) juint _Tbl_addr[] =
{
0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x0e03754dUL,
0x3cad7bbfUL, 0x3e778060UL, 0x00002c9aUL, 0x3567f613UL, 0x3c8cd252UL,
0xd3158574UL, 0x000059b0UL, 0x61e6c861UL, 0x3c60f74eUL, 0x18759bc8UL,
0x00008745UL, 0x5d837b6cUL, 0x3c979aa6UL, 0x6cf9890fUL, 0x0000b558UL,
0x702f9cd1UL, 0x3c3ebe3dUL, 0x32d3d1a2UL, 0x0000e3ecUL, 0x1e63bcd8UL,
0x3ca3516eUL, 0xd0125b50UL, 0x00011301UL, 0x26f0387bUL, 0x3ca4c554UL,
0xaea92ddfUL, 0x0001429aUL, 0x62523fb6UL, 0x3ca95153UL, 0x3c7d517aUL,
0x000172b8UL, 0x3f1353bfUL, 0x3c8b898cUL, 0xeb6fcb75UL, 0x0001a35bUL,
0x3e3a2f5fUL, 0x3c9aecf7UL, 0x3168b9aaUL, 0x0001d487UL, 0x44a6c38dUL,
0x3c8a6f41UL, 0x88628cd6UL, 0x0002063bUL, 0xe3a8a894UL, 0x3c968efdUL,
0x6e756238UL, 0x0002387aUL, 0x981fe7f2UL, 0x3c80472bUL, 0x65e27cddUL,
0x00026b45UL, 0x6d09ab31UL, 0x3c82f7e1UL, 0xf51fdee1UL, 0x00029e9dUL,
0x720c0ab3UL, 0x3c8b3782UL, 0xa6e4030bUL, 0x0002d285UL, 0x4db0abb6UL,
0x3c834d75UL, 0x0a31b715UL, 0x000306feUL, 0x5dd3f84aUL, 0x3c8fdd39UL,
0xb26416ffUL, 0x00033c08UL, 0xcc187d29UL, 0x3ca12f8cUL, 0x373aa9caUL,
0x000371a7UL, 0x738b5e8bUL, 0x3ca7d229UL, 0x34e59ff6UL, 0x0003a7dbUL,
0xa72a4c6dUL, 0x3c859f48UL, 0x4c123422UL, 0x0003dea6UL, 0x259d9205UL,
0x3ca8b846UL, 0x21f72e29UL, 0x0004160aUL, 0x60c2ac12UL, 0x3c4363edUL,
0x6061892dUL, 0x00044e08UL, 0xdaa10379UL, 0x3c6ecce1UL, 0xb5c13cd0UL,
0x000486a2UL, 0xbb7aafb0UL, 0x3c7690ceUL, 0xd5362a27UL, 0x0004bfdaUL,
0x9b282a09UL, 0x3ca083ccUL, 0x769d2ca6UL, 0x0004f9b2UL, 0xc1aae707UL,
0x3ca509b0UL, 0x569d4f81UL, 0x0005342bUL, 0x18fdd78eUL, 0x3c933505UL,
0x36b527daUL, 0x00056f47UL, 0xe21c5409UL, 0x3c9063e1UL, 0xdd485429UL,
0x0005ab07UL, 0x2b64c035UL, 0x3c9432e6UL, 0x15ad2148UL, 0x0005e76fUL,
0x99f08c0aUL, 0x3ca01284UL, 0xb03a5584UL, 0x0006247eUL, 0x0073dc06UL,
0x3c99f087UL, 0x82552224UL, 0x00066238UL, 0x0da05571UL, 0x3c998d4dUL,
0x667f3bccUL, 0x0006a09eUL, 0x86ce4786UL, 0x3ca52bb9UL, 0x3c651a2eUL,
0x0006dfb2UL, 0x206f0dabUL, 0x3ca32092UL, 0xe8ec5f73UL, 0x00071f75UL,
0x8e17a7a6UL, 0x3ca06122UL, 0x564267c8UL, 0x00075febUL, 0x461e9f86UL,
0x3ca244acUL, 0x73eb0186UL, 0x0007a114UL, 0xabd66c55UL, 0x3c65ebe1UL,
0x36cf4e62UL, 0x0007e2f3UL, 0xbbff67d0UL, 0x3c96fe9fUL, 0x994cce12UL,
0x00082589UL, 0x14c801dfUL, 0x3c951f14UL, 0x9b4492ecUL, 0x000868d9UL,
0xc1f0eab4UL, 0x3c8db72fUL, 0x422aa0dbUL, 0x0008ace5UL, 0x59f35f44UL,
0x3c7bf683UL, 0x99157736UL, 0x0008f1aeUL, 0x9c06283cUL, 0x3ca360baUL,
0xb0cdc5e4UL, 0x00093737UL, 0x20f962aaUL, 0x3c95e8d1UL, 0x9fde4e4fUL,
0x00097d82UL, 0x2b91ce27UL, 0x3c71affcUL, 0x82a3f090UL, 0x0009c491UL,
0x589a2ebdUL, 0x3c9b6d34UL, 0x7b5de564UL, 0x000a0c66UL, 0x9ab89880UL,
0x3c95277cUL, 0xb23e255cUL, 0x000a5503UL, 0x6e735ab3UL, 0x3c846984UL,
0x5579fdbfUL, 0x000a9e6bUL, 0x92cb3387UL, 0x3c8c1a77UL, 0x995ad3adUL,
0x000ae89fUL, 0xdc2d1d96UL, 0x3ca22466UL, 0xb84f15faUL, 0x000b33a2UL,
0xb19505aeUL, 0x3ca1112eUL, 0xf2fb5e46UL, 0x000b7f76UL, 0x0a5fddcdUL,
0x3c74ffd7UL, 0x904bc1d2UL, 0x000bcc1eUL, 0x30af0cb3UL, 0x3c736eaeUL,
0xdd85529cUL, 0x000c199bUL, 0xd10959acUL, 0x3c84e08fUL, 0x2e57d14bUL,
0x000c67f1UL, 0x6c921968UL, 0x3c676b2cUL, 0xdcef9069UL, 0x000cb720UL,
0x36df99b3UL, 0x3c937009UL, 0x4a07897bUL, 0x000d072dUL, 0xa63d07a7UL,
0x3c74a385UL, 0xdcfba487UL, 0x000d5818UL, 0xd5c192acUL, 0x3c8e5a50UL,
0x03db3285UL, 0x000da9e6UL, 0x1c4a9792UL, 0x3c98bb73UL, 0x337b9b5eUL,
0x000dfc97UL, 0x603a88d3UL, 0x3c74b604UL, 0xe78b3ff6UL, 0x000e502eUL,
0x92094926UL, 0x3c916f27UL, 0xa2a490d9UL, 0x000ea4afUL, 0x41aa2008UL,
0x3c8ec3bcUL, 0xee615a27UL, 0x000efa1bUL, 0x31d185eeUL, 0x3c8a64a9UL,
0x5b6e4540UL, 0x000f5076UL, 0x4d91cd9dUL, 0x3c77893bUL, 0x819e90d8UL,
0x000fa7c1UL
};
ALIGNED_(16) juint _ALLONES[] =
{
0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
};
ALIGNED_(16) juint _ebias[] =
{
0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x3ff00000UL
};
ALIGNED_(4) juint _XMAX[] =
{
0xffffffffUL, 0x7fefffffUL
};
ALIGNED_(4) juint _XMIN[] =
{
0x00000000UL, 0x00100000UL
};
ALIGNED_(4) juint _INF[] =
{
0x00000000UL, 0x7ff00000UL
};
ALIGNED_(4) juint _ZERO[] =
{
0x00000000UL, 0x00000000UL
};
ALIGNED_(4) juint _ONE_val[] =
{
0x00000000UL, 0x3ff00000UL
};
// Registers:
// input: xmm0
// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
// rax, rdx, rcx, tmp - r11
// Code generated by Intel C compiler for LIBM library
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
Label L_2TAG_PACKET_12_0_2, B1_3, B1_5, start;
assert_different_registers(tmp, eax, ecx, edx);
jmp(start);
address cv = (address)_cv;
address Shifter = (address)_shifter;
address mmask = (address)_mmask;
address bias = (address)_bias;
address Tbl_addr = (address)_Tbl_addr;
address ALLONES = (address)_ALLONES;
address ebias = (address)_ebias;
address XMAX = (address)_XMAX;
address XMIN = (address)_XMIN;
address INF = (address)_INF;
address ZERO = (address)_ZERO;
address ONE_val = (address)_ONE_val;
bind(start);
subq(rsp, 24);
movsd(Address(rsp, 8), xmm0);
unpcklpd(xmm0, xmm0);
movdqu(xmm1, ExternalAddress(cv)); // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
movdqu(xmm6, ExternalAddress(Shifter)); // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
movdqu(xmm2, ExternalAddress(16+cv)); // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
movdqu(xmm3, ExternalAddress(32+cv)); // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
pextrw(eax, xmm0, 3);
andl(eax, 32767);
movl(edx, 16527);
subl(edx, eax);
subl(eax, 15504);
orl(edx, eax);
cmpl(edx, INT_MIN);
jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
mulpd(xmm1, xmm0);
addpd(xmm1, xmm6);
movapd(xmm7, xmm1);
subpd(xmm1, xmm6);
mulpd(xmm2, xmm1);
movdqu(xmm4, ExternalAddress(64+cv)); // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
mulpd(xmm3, xmm1);
movdqu(xmm5, ExternalAddress(80+cv)); // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
subpd(xmm0, xmm2);
movdl(eax, xmm7);
movl(ecx, eax);
andl(ecx, 63);
shll(ecx, 4);
sarl(eax, 6);
movl(edx, eax);
movdqu(xmm6, ExternalAddress(mmask)); // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
pand(xmm7, xmm6);
movdqu(xmm6, ExternalAddress(bias)); // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
paddq(xmm7, xmm6);
psllq(xmm7, 46);
subpd(xmm0, xmm3);
lea(tmp, ExternalAddress(Tbl_addr));
movdqu(xmm2, Address(ecx,tmp));
mulpd(xmm4, xmm0);
movapd(xmm6, xmm0);
movapd(xmm1, xmm0);
mulpd(xmm6, xmm6);
mulpd(xmm0, xmm6);
addpd(xmm5, xmm4);
mulsd(xmm0, xmm6);
mulpd(xmm6, ExternalAddress(48+cv)); // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
addsd(xmm1, xmm2);
unpckhpd(xmm2, xmm2);
mulpd(xmm0, xmm5);
addsd(xmm1, xmm0);
por(xmm2, xmm7);
unpckhpd(xmm0, xmm0);
addsd(xmm0, xmm1);
addsd(xmm0, xmm6);
addl(edx, 894);
cmpl(edx, 1916);
jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
mulsd(xmm0, xmm2);
addsd(xmm0, xmm2);
jmp (B1_5);
bind(L_2TAG_PACKET_1_0_2);
xorpd(xmm3, xmm3);
movdqu(xmm4, ExternalAddress(ALLONES)); // 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
movl(edx, -1022);
subl(edx, eax);
movdl(xmm5, edx);
psllq(xmm4, xmm5);
movl(ecx, eax);
sarl(eax, 1);
pinsrw(xmm3, eax, 3);
movdqu(xmm6, ExternalAddress(ebias)); // 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x3ff00000UL
psllq(xmm3, 4);
psubd(xmm2, xmm3);
mulsd(xmm0, xmm2);
cmpl(edx, 52);
jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
pand(xmm4, xmm2);
paddd(xmm3, xmm6);
subsd(xmm2, xmm4);
addsd(xmm0, xmm2);
cmpl(ecx, 1023);
jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
pextrw(ecx, xmm0, 3);
andl(ecx, 32768);
orl(edx, ecx);
cmpl(edx, 0);
jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
movapd(xmm6, xmm0);
addsd(xmm0, xmm4);
mulsd(xmm0, xmm3);
pextrw(ecx, xmm0, 3);
andl(ecx, 32752);
cmpl(ecx, 0);
jcc(Assembler::equal, L_2TAG_PACKET_5_0_2);
jmp(B1_5);
bind(L_2TAG_PACKET_5_0_2);
mulsd(xmm6, xmm3);
mulsd(xmm4, xmm3);
movdqu(xmm0, xmm6);
pxor(xmm6, xmm4);
psrad(xmm6, 31);
pshufd(xmm6, xmm6, 85);
psllq(xmm0, 1);
psrlq(xmm0, 1);
pxor(xmm0, xmm6);
psrlq(xmm6, 63);
paddq(xmm0, xmm6);
paddq(xmm0, xmm4);
movl(Address(rsp,0), 15);
jmp(L_2TAG_PACKET_6_0_2);
bind(L_2TAG_PACKET_4_0_2);
addsd(xmm0, xmm4);
mulsd(xmm0, xmm3);
jmp(B1_5);
bind(L_2TAG_PACKET_3_0_2);
addsd(xmm0, xmm4);
mulsd(xmm0, xmm3);
pextrw(ecx, xmm0, 3);
andl(ecx, 32752);
cmpl(ecx, 32752);
jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
jmp(B1_5);
bind(L_2TAG_PACKET_2_0_2);
paddd(xmm3, xmm6);
addpd(xmm0, xmm2);
mulsd(xmm0, xmm3);
movl(Address(rsp,0), 15);
jmp(L_2TAG_PACKET_6_0_2);
bind(L_2TAG_PACKET_8_0_2);
cmpl(eax, 2146435072);
jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);
movl(eax, Address(rsp,12));
cmpl(eax, INT_MIN);
jcc(Assembler::aboveEqual, L_2TAG_PACKET_10_0_2);
movsd(xmm0, ExternalAddress(XMAX)); // 0xffffffffUL, 0x7fefffffUL
mulsd(xmm0, xmm0);
bind(L_2TAG_PACKET_7_0_2);
movl(Address(rsp,0), 14);
jmp(L_2TAG_PACKET_6_0_2);
bind(L_2TAG_PACKET_10_0_2);
movsd(xmm0, ExternalAddress(XMIN)); // 0x00000000UL, 0x00100000UL
mulsd(xmm0, xmm0);
movl(Address(rsp,0), 15);
jmp(L_2TAG_PACKET_6_0_2);
bind(L_2TAG_PACKET_9_0_2);
movl(edx, Address(rsp,8));
cmpl(eax, 2146435072);
jcc(Assembler::above, L_2TAG_PACKET_11_0_2);
cmpl(edx, 0);
jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
movl(eax, Address(rsp,12));
cmpl(eax, 2146435072);
jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_2);
movsd(xmm0, ExternalAddress(INF)); // 0x00000000UL, 0x7ff00000UL
jmp(B1_5);
bind(L_2TAG_PACKET_12_0_2);
movsd(xmm0, ExternalAddress(ZERO)); // 0x00000000UL, 0x00000000UL
jmp(B1_5);
bind(L_2TAG_PACKET_11_0_2);
movsd(xmm0, Address(rsp, 8));
addsd(xmm0, xmm0);
jmp(B1_5);
bind(L_2TAG_PACKET_0_0_2);
movl(eax, Address(rsp, 12));
andl(eax, 2147483647);
cmpl(eax, 1083179008);
jcc(Assembler::aboveEqual, L_2TAG_PACKET_8_0_2);
movsd(Address(rsp, 8), xmm0);
addsd(xmm0, ExternalAddress(ONE_val)); // 0x00000000UL, 0x3ff00000UL
jmp(B1_5);
bind(L_2TAG_PACKET_6_0_2);
movq(Address(rsp, 16), xmm0);
bind(B1_3);
movq(xmm0, Address(rsp, 16));
bind(B1_5);
addq(rsp, 24);
}
#endif
#ifndef _LP64
ALIGNED_(16) juint _static_const_table[] =
{
0x00000000UL, 0xfff00000UL, 0x00000000UL, 0xfff00000UL, 0xffffffc0UL,
0x00000000UL, 0xffffffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL,
0x0000ffc0UL, 0x00000000UL, 0x00000000UL, 0x43380000UL, 0x00000000UL,
0x43380000UL, 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL,
0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL, 0xbc9e3b3aUL,
0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xfffffffeUL, 0x3fdfffffUL,
0xfffffffeUL, 0x3fdfffffUL, 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL,
0x3fa55555UL, 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL,
0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x0e03754dUL,
0x3cad7bbfUL, 0x3e778060UL, 0x00002c9aUL, 0x3567f613UL, 0x3c8cd252UL,
0xd3158574UL, 0x000059b0UL, 0x61e6c861UL, 0x3c60f74eUL, 0x18759bc8UL,
0x00008745UL, 0x5d837b6cUL, 0x3c979aa6UL, 0x6cf9890fUL, 0x0000b558UL,
0x702f9cd1UL, 0x3c3ebe3dUL, 0x32d3d1a2UL, 0x0000e3ecUL, 0x1e63bcd8UL,
0x3ca3516eUL, 0xd0125b50UL, 0x00011301UL, 0x26f0387bUL, 0x3ca4c554UL,
0xaea92ddfUL, 0x0001429aUL, 0x62523fb6UL, 0x3ca95153UL, 0x3c7d517aUL,
0x000172b8UL, 0x3f1353bfUL, 0x3c8b898cUL, 0xeb6fcb75UL, 0x0001a35bUL,
0x3e3a2f5fUL, 0x3c9aecf7UL, 0x3168b9aaUL, 0x0001d487UL, 0x44a6c38dUL,
0x3c8a6f41UL, 0x88628cd6UL, 0x0002063bUL, 0xe3a8a894UL, 0x3c968efdUL,
0x6e756238UL, 0x0002387aUL, 0x981fe7f2UL, 0x3c80472bUL, 0x65e27cddUL,
0x00026b45UL, 0x6d09ab31UL, 0x3c82f7e1UL, 0xf51fdee1UL, 0x00029e9dUL,
0x720c0ab3UL, 0x3c8b3782UL, 0xa6e4030bUL, 0x0002d285UL, 0x4db0abb6UL,
0x3c834d75UL, 0x0a31b715UL, 0x000306feUL, 0x5dd3f84aUL, 0x3c8fdd39UL,
0xb26416ffUL, 0x00033c08UL, 0xcc187d29UL, 0x3ca12f8cUL, 0x373aa9caUL,
0x000371a7UL, 0x738b5e8bUL, 0x3ca7d229UL, 0x34e59ff6UL, 0x0003a7dbUL,
0xa72a4c6dUL, 0x3c859f48UL, 0x4c123422UL, 0x0003dea6UL, 0x259d9205UL,
0x3ca8b846UL, 0x21f72e29UL, 0x0004160aUL, 0x60c2ac12UL, 0x3c4363edUL,
0x6061892dUL, 0x00044e08UL, 0xdaa10379UL, 0x3c6ecce1UL, 0xb5c13cd0UL,
0x000486a2UL, 0xbb7aafb0UL, 0x3c7690ceUL, 0xd5362a27UL, 0x0004bfdaUL,
0x9b282a09UL, 0x3ca083ccUL, 0x769d2ca6UL, 0x0004f9b2UL, 0xc1aae707UL,
0x3ca509b0UL, 0x569d4f81UL, 0x0005342bUL, 0x18fdd78eUL, 0x3c933505UL,
0x36b527daUL, 0x00056f47UL, 0xe21c5409UL, 0x3c9063e1UL, 0xdd485429UL,
0x0005ab07UL, 0x2b64c035UL, 0x3c9432e6UL, 0x15ad2148UL, 0x0005e76fUL,
0x99f08c0aUL, 0x3ca01284UL, 0xb03a5584UL, 0x0006247eUL, 0x0073dc06UL,
0x3c99f087UL, 0x82552224UL, 0x00066238UL, 0x0da05571UL, 0x3c998d4dUL,
0x667f3bccUL, 0x0006a09eUL, 0x86ce4786UL, 0x3ca52bb9UL, 0x3c651a2eUL,
0x0006dfb2UL, 0x206f0dabUL, 0x3ca32092UL, 0xe8ec5f73UL, 0x00071f75UL,
0x8e17a7a6UL, 0x3ca06122UL, 0x564267c8UL, 0x00075febUL, 0x461e9f86UL,
0x3ca244acUL, 0x73eb0186UL, 0x0007a114UL, 0xabd66c55UL, 0x3c65ebe1UL,
0x36cf4e62UL, 0x0007e2f3UL, 0xbbff67d0UL, 0x3c96fe9fUL, 0x994cce12UL,
0x00082589UL, 0x14c801dfUL, 0x3c951f14UL, 0x9b4492ecUL, 0x000868d9UL,
0xc1f0eab4UL, 0x3c8db72fUL, 0x422aa0dbUL, 0x0008ace5UL, 0x59f35f44UL,
0x3c7bf683UL, 0x99157736UL, 0x0008f1aeUL, 0x9c06283cUL, 0x3ca360baUL,
0xb0cdc5e4UL, 0x00093737UL, 0x20f962aaUL, 0x3c95e8d1UL, 0x9fde4e4fUL,
0x00097d82UL, 0x2b91ce27UL, 0x3c71affcUL, 0x82a3f090UL, 0x0009c491UL,
0x589a2ebdUL, 0x3c9b6d34UL, 0x7b5de564UL, 0x000a0c66UL, 0x9ab89880UL,
0x3c95277cUL, 0xb23e255cUL, 0x000a5503UL, 0x6e735ab3UL, 0x3c846984UL,
0x5579fdbfUL, 0x000a9e6bUL, 0x92cb3387UL, 0x3c8c1a77UL, 0x995ad3adUL,
0x000ae89fUL, 0xdc2d1d96UL, 0x3ca22466UL, 0xb84f15faUL, 0x000b33a2UL,
0xb19505aeUL, 0x3ca1112eUL, 0xf2fb5e46UL, 0x000b7f76UL, 0x0a5fddcdUL,
0x3c74ffd7UL, 0x904bc1d2UL, 0x000bcc1eUL, 0x30af0cb3UL, 0x3c736eaeUL,
0xdd85529cUL, 0x000c199bUL, 0xd10959acUL, 0x3c84e08fUL, 0x2e57d14bUL,
0x000c67f1UL, 0x6c921968UL, 0x3c676b2cUL, 0xdcef9069UL, 0x000cb720UL,
0x36df99b3UL, 0x3c937009UL, 0x4a07897bUL, 0x000d072dUL, 0xa63d07a7UL,
0x3c74a385UL, 0xdcfba487UL, 0x000d5818UL, 0xd5c192acUL, 0x3c8e5a50UL,
0x03db3285UL, 0x000da9e6UL, 0x1c4a9792UL, 0x3c98bb73UL, 0x337b9b5eUL,
0x000dfc97UL, 0x603a88d3UL, 0x3c74b604UL, 0xe78b3ff6UL, 0x000e502eUL,
0x92094926UL, 0x3c916f27UL, 0xa2a490d9UL, 0x000ea4afUL, 0x41aa2008UL,
0x3c8ec3bcUL, 0xee615a27UL, 0x000efa1bUL, 0x31d185eeUL, 0x3c8a64a9UL,
0x5b6e4540UL, 0x000f5076UL, 0x4d91cd9dUL, 0x3c77893bUL, 0x819e90d8UL,
0x000fa7c1UL, 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x7ff00000UL,
0x00000000UL, 0x00000000UL, 0xffffffffUL, 0x7fefffffUL, 0x00000000UL,
0x00100000UL
};
//registers,
// input: (rbp + 8)
// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
// rax, rdx, rcx, rbx (tmp)
// Code generated by Intel C compiler for LIBM library
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
assert_different_registers(tmp, eax, ecx, edx);
jmp(start);
address static_const_table = (address)_static_const_table;
bind(start);
subl(rsp, 120);
movl(Address(rsp, 64), tmp);
lea(tmp, ExternalAddress(static_const_table));
movdqu(xmm0, Address(rsp, 128));
unpcklpd(xmm0, xmm0);
movdqu(xmm1, Address(tmp, 64)); // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
movdqu(xmm6, Address(tmp, 48)); // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
movdqu(xmm2, Address(tmp, 80)); // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
movdqu(xmm3, Address(tmp, 96)); // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
pextrw(eax, xmm0, 3);
andl(eax, 32767);
movl(edx, 16527);
subl(edx, eax);
subl(eax, 15504);
orl(edx, eax);
cmpl(edx, INT_MIN);
jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
mulpd(xmm1, xmm0);
addpd(xmm1, xmm6);
movapd(xmm7, xmm1);
subpd(xmm1, xmm6);
mulpd(xmm2, xmm1);
movdqu(xmm4, Address(tmp, 128)); // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
mulpd(xmm3, xmm1);
movdqu(xmm5, Address(tmp, 144)); // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
subpd(xmm0, xmm2);
movdl(eax, xmm7);
movl(ecx, eax);
andl(ecx, 63);
shll(ecx, 4);
sarl(eax, 6);
movl(edx, eax);
movdqu(xmm6, Address(tmp, 16)); // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
pand(xmm7, xmm6);
movdqu(xmm6, Address(tmp, 32)); // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
paddq(xmm7, xmm6);
psllq(xmm7, 46);
subpd(xmm0, xmm3);
movdqu(xmm2, Address(tmp, ecx, Address::times_1, 160));
mulpd(xmm4, xmm0);
movapd(xmm6, xmm0);
movapd(xmm1, xmm0);
mulpd(xmm6, xmm6);
mulpd(xmm0, xmm6);
addpd(xmm5, xmm4);
mulsd(xmm0, xmm6);
mulpd(xmm6, Address(tmp, 112)); // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
addsd(xmm1, xmm2);
unpckhpd(xmm2, xmm2);
mulpd(xmm0, xmm5);
addsd(xmm1, xmm0);
por(xmm2, xmm7);
unpckhpd(xmm0, xmm0);
addsd(xmm0, xmm1);
addsd(xmm0, xmm6);
addl(edx, 894);
cmpl(edx, 1916);
jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
mulsd(xmm0, xmm2);
addsd(xmm0, xmm2);
jmp(L_2TAG_PACKET_2_0_2);
bind(L_2TAG_PACKET_1_0_2);
fnstcw(Address(rsp, 24));
movzwl(edx, Address(rsp, 24));
orl(edx, 768);
movw(Address(rsp, 28), edx);
fldcw(Address(rsp, 28));
movl(edx, eax);
sarl(eax, 1);
subl(edx, eax);
movdqu(xmm6, Address(tmp, 0)); // 0x00000000UL, 0xfff00000UL, 0x00000000UL, 0xfff00000UL
pandn(xmm6, xmm2);
addl(eax, 1023);
movdl(xmm3, eax);
psllq(xmm3, 52);
por(xmm6, xmm3);
addl(edx, 1023);
movdl(xmm4, edx);
psllq(xmm4, 52);
movsd(Address(rsp, 8), xmm0);
fld_d(Address(rsp, 8));
movsd(Address(rsp, 16), xmm6);
fld_d(Address(rsp, 16));
fmula(1);
faddp(1);
movsd(Address(rsp, 8), xmm4);
fld_d(Address(rsp, 8));
fmulp(1);
fstp_d(Address(rsp, 8));
movsd(xmm0,Address(rsp, 8));
fldcw(Address(rsp, 24));
pextrw(ecx, xmm0, 3);
andl(ecx, 32752);
cmpl(ecx, 32752);
jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
cmpl(ecx, 0);
jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
jmp(L_2TAG_PACKET_2_0_2);
cmpl(ecx, INT_MIN);
jcc(Assembler::less, L_2TAG_PACKET_3_0_2);
cmpl(ecx, -1064950997);
jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
jcc(Assembler::greater, L_2TAG_PACKET_4_0_2);
movl(edx, Address(rsp, 128));
cmpl(edx ,-17155601);
jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
jmp(L_2TAG_PACKET_4_0_2);
bind(L_2TAG_PACKET_3_0_2);
movl(edx, 14);
jmp(L_2TAG_PACKET_5_0_2);
bind(L_2TAG_PACKET_4_0_2);
movl(edx, 15);
bind(L_2TAG_PACKET_5_0_2);
movsd(Address(rsp, 0), xmm0);
movsd(xmm0, Address(rsp, 128));
fld_d(Address(rsp, 0));
jmp(L_2TAG_PACKET_6_0_2);
bind(L_2TAG_PACKET_7_0_2);
cmpl(eax, 2146435072);
jcc(Assembler::greaterEqual, L_2TAG_PACKET_8_0_2);
movl(eax, Address(rsp, 132));
cmpl(eax, INT_MIN);
jcc(Assembler::greaterEqual, L_2TAG_PACKET_9_0_2);
movsd(xmm0, Address(tmp, 1208)); // 0xffffffffUL, 0x7fefffffUL
mulsd(xmm0, xmm0);
movl(edx, 14);
jmp(L_2TAG_PACKET_5_0_2);
bind(L_2TAG_PACKET_9_0_2);
movsd(xmm0, Address(tmp, 1216));
mulsd(xmm0, xmm0);
movl(edx, 15);
jmp(L_2TAG_PACKET_5_0_2);
bind(L_2TAG_PACKET_8_0_2);
movl(edx, Address(rsp, 128));
cmpl(eax, 2146435072);
jcc(Assembler::above, L_2TAG_PACKET_10_0_2);
cmpl(edx, 0);
jcc(Assembler::notEqual, L_2TAG_PACKET_10_0_2);
movl(eax, Address(rsp, 132));
cmpl(eax, 2146435072);
jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
movsd(xmm0, Address(tmp, 1192)); // 0x00000000UL, 0x7ff00000UL
jmp(L_2TAG_PACKET_2_0_2);
bind(L_2TAG_PACKET_11_0_2);
movsd(xmm0, Address(tmp, 1200)); // 0x00000000UL, 0x00000000UL
jmp(L_2TAG_PACKET_2_0_2);
bind(L_2TAG_PACKET_10_0_2);
movsd(xmm0, Address(rsp, 128));
addsd(xmm0, xmm0);
jmp(L_2TAG_PACKET_2_0_2);
bind(L_2TAG_PACKET_0_0_2);
movl(eax, Address(rsp, 132));
andl(eax, 2147483647);
cmpl(eax, 1083179008);
jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
movsd(xmm0, Address(rsp, 128));
addsd(xmm0, Address(tmp, 1184)); // 0x00000000UL, 0x3ff00000UL
jmp(L_2TAG_PACKET_2_0_2);
bind(L_2TAG_PACKET_2_0_2);
movsd(Address(rsp, 48), xmm0);
fld_d(Address(rsp, 48));
bind(L_2TAG_PACKET_6_0_2);
movl(tmp, Address(rsp, 64));
}
#endif


@@ -2134,14 +2134,6 @@ class StubGenerator: public StubCodeGenerator {
__ trigfunc('t');
__ ret(0);
}
{
StubCodeMark mark(this, "StubRoutines", "exp");
StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
__ fld_d(Address(rsp, 4));
__ exp_with_fallback(0);
__ ret(0);
}
{
StubCodeMark mark(this, "StubRoutines", "pow");
StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
@@ -3048,6 +3040,32 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
address generate_libmExp() {
address start = __ pc();
const XMMRegister x0 = xmm0;
const XMMRegister x1 = xmm1;
const XMMRegister x2 = xmm2;
const XMMRegister x3 = xmm3;
const XMMRegister x4 = xmm4;
const XMMRegister x5 = xmm5;
const XMMRegister x6 = xmm6;
const XMMRegister x7 = xmm7;
const Register tmp = rbx;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
__ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Safefetch stubs.
void generate_safefetch(const char* name, int size, address* entry,
address* fault_pc, address* continuation_pc) {
@@ -3268,6 +3286,9 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
}
if (VM_Version::supports_sse2()) {
StubRoutines::_dexp = generate_libmExp();
}
}


@@ -3038,19 +3038,6 @@ class StubGenerator: public StubCodeGenerator {
__ addq(rsp, 8);
__ ret(0);
}
{
StubCodeMark mark(this, "StubRoutines", "exp");
StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
__ subq(rsp, 8);
__ movdbl(Address(rsp, 0), xmm0);
__ fld_d(Address(rsp, 0));
__ exp_with_fallback(0);
__ fstp_d(Address(rsp, 0));
__ movdbl(xmm0, Address(rsp, 0));
__ addq(rsp, 8);
__ ret(0);
}
{
StubCodeMark mark(this, "StubRoutines", "pow");
StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
@@ -4180,6 +4167,44 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
address generate_libmExp() {
address start = __ pc();
const XMMRegister x0 = xmm0;
const XMMRegister x1 = xmm1;
const XMMRegister x2 = xmm2;
const XMMRegister x3 = xmm3;
const XMMRegister x4 = xmm4;
const XMMRegister x5 = xmm5;
const XMMRegister x6 = xmm6;
const XMMRegister x7 = xmm7;
const Register tmp = r11;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64
// save the xmm registers which must be preserved 6-7
__ movdqu(xmm_save(6), as_XMMRegister(6));
__ movdqu(xmm_save(7), as_XMMRegister(7));
#endif
__ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
#ifdef _WIN64
// restore xmm regs belonging to calling function
__ movdqu(as_XMMRegister(6), xmm_save(6));
__ movdqu(as_XMMRegister(7), xmm_save(7));
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
#undef __
#define __ masm->
@@ -4367,6 +4392,7 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
}
StubRoutines::_dexp = generate_libmExp();
}
void generate_all() {


@@ -9911,35 +9911,6 @@ instruct powD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eDXRegI rdx, eCXR
ins_pipe( pipe_slow );
%}
instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE<=1);
match(Set dpr1 (ExpD dpr1));
effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
format %{ "fast_exp $dpr1 -> $dpr1 // KILL $rax, $rcx, $rdx" %}
ins_encode %{
__ fast_exp();
%}
ins_pipe( pipe_slow );
%}
instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE>=2);
match(Set dst (ExpD src));
effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
format %{ "fast_exp $dst -> $src // KILL $rax, $rcx, $rdx" %}
ins_encode %{
__ subptr(rsp, 8);
__ movdbl(Address(rsp, 0), $src$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ fast_exp();
__ fstp_d(Address(rsp, 0));
__ movdbl($dst$$XMMRegister, Address(rsp, 0));
__ addptr(rsp, 8);
%}
ins_pipe( pipe_slow );
%}
instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
predicate (UseSSE<=1);
// The source Double operand on FPU stack


@@ -9898,22 +9898,6 @@ instruct powD_reg(regD dst, regD src0, regD src1, rax_RegI rax, rdx_RegI rdx, rc
ins_pipe( pipe_slow );
%}
instruct expD_reg(regD dst, regD src, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{
match(Set dst (ExpD src));
effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
format %{ "fast_exp $dst -> $src // KILL $rax, $rcx, $rdx" %}
ins_encode %{
__ subptr(rsp, 8);
__ movdbl(Address(rsp, 0), $src$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ fast_exp();
__ fstp_d(Address(rsp, 0));
__ movdbl($dst$$XMMRegister, Address(rsp, 0));
__ addptr(rsp, 8);
%}
ins_pipe( pipe_slow );
%}
//----------Arithmetic Conversion Instructions---------------------------------
instruct roundFloat_nop(regF dst)


@@ -4006,7 +4006,6 @@ int MatchRule::is_expensive() const {
strcmp(opType,"DivD")==0 ||
strcmp(opType,"DivF")==0 ||
strcmp(opType,"DivI")==0 ||
strcmp(opType,"ExpD")==0 ||
strcmp(opType,"LogD")==0 || strcmp(opType,"LogD")==0 ||
strcmp(opType,"Log10D")==0 || strcmp(opType,"Log10D")==0 ||
strcmp(opType,"ModD")==0 || strcmp(opType,"ModD")==0 ||


@@ -732,8 +732,7 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
case lir_sin:
case lir_cos:
case lir_log:
case lir_log10:
case lir_exp: {
case lir_log10: {
assert(op->as_Op2() != NULL, "must be");
LIR_Op2* op2 = (LIR_Op2*)op;
@@ -743,9 +742,6 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
// overlap with the input.
assert(op2->_info == NULL, "not used");
assert(op2->_tmp5->is_illegal(), "not used");
assert(op2->_tmp2->is_valid() == (op->code() == lir_exp), "not used");
assert(op2->_tmp3->is_valid() == (op->code() == lir_exp), "not used");
assert(op2->_tmp4->is_valid() == (op->code() == lir_exp), "not used");
assert(op2->_opr1->is_valid(), "used");
do_input(op2->_opr1); do_temp(op2->_opr1);
@@ -1775,7 +1771,6 @@ const char * LIR_Op::name() const {
case lir_tan: s = "tan"; break;
case lir_log: s = "log"; break;
case lir_log10: s = "log10"; break;
case lir_exp: s = "exp"; break;
case lir_pow: s = "pow"; break;
case lir_logic_and: s = "logic_and"; break;
case lir_logic_or: s = "logic_or"; break;


@@ -961,7 +961,6 @@ enum LIR_Code {
, lir_tan
, lir_log
, lir_log10
, lir_exp
, lir_pow
, lir_logic_and
, lir_logic_or
@@ -2199,7 +2198,6 @@ class LIR_List: public CompilationResourceObj {
void sin (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_sin , from, tmp1, to, tmp2)); }
void cos (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_cos , from, tmp1, to, tmp2)); }
void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }
void exp (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_exp , from, tmp1, to, tmp2, tmp3, tmp4, tmp5)); }
void pow (LIR_Opr arg1, LIR_Opr arg2, LIR_Opr res, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_pow, arg1, arg2, res, tmp1, tmp2, tmp3, tmp4, tmp5)); }
void add (LIR_Opr left, LIR_Opr right, LIR_Opr res) { append(new LIR_Op2(lir_add, left, right, res)); }


@@ -739,7 +739,6 @@ void LIR_Assembler::emit_op2(LIR_Op2* op) {
case lir_cos:
case lir_log:
case lir_log10:
case lir_exp:
case lir_pow:
intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op);
break;


@@ -244,6 +244,7 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure {
void do_getClass(Intrinsic* x);
void do_currentThread(Intrinsic* x);
void do_MathIntrinsic(Intrinsic* x);
void do_ExpIntrinsic(Intrinsic* x);
void do_ArrayCopy(Intrinsic* x);
void do_CompareAndSwap(Intrinsic* x, ValueType* type);
void do_NIOCheckIndex(Intrinsic* x);


@@ -6588,7 +6588,6 @@ void LinearScanStatistic::collect(LinearScan* allocator) {
case lir_log10:
case lir_log:
case lir_pow:
case lir_exp:
case lir_logic_and:
case lir_logic_or:
case lir_logic_xor:


@@ -317,6 +317,7 @@ const char* Runtime1::name_for_address(address entry) {
FUNCTION_CASE(entry, TRACE_TIME_METHOD);
#endif
FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32());
FUNCTION_CASE(entry, StubRoutines::dexp());
#undef FUNCTION_CASE
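Registering the stub here lets C1 debug output print a symbolic name when a call target equals StubRoutines::dexp(). A simplified standalone sketch of the name_for_address() pattern (the FUNCTION_CASE shape below is an approximation and the routines are placeholders, not the VM's):

#include <cstdio>

typedef void (*entry_t)();

static void dexp_stub()  {}   // placeholder for StubRoutines::dexp()
static void crc32_stub() {}   // placeholder for StubRoutines::updateBytesCRC32()

// Map a raw entry point back to a printable name, as name_for_address() does.
static const char* name_for_address(entry_t entry) {
#define FUNCTION_CASE(a, f) if ((a) == (entry_t)(f)) return #f
  FUNCTION_CASE(entry, dexp_stub);
  FUNCTION_CASE(entry, crc32_stub);
#undef FUNCTION_CASE
  return "<unknown>";
}

int main() {
  std::printf("%s\n", name_for_address(dexp_stub));   // prints "dexp_stub"
  return 0;
}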


@@ -131,7 +131,6 @@ macro(DivModL)
macro(EncodeISOArray)
macro(EncodeP)
macro(EncodePKlass)
macro(ExpD)
macro(FastLock)
macro(FastUnlock)
macro(Goto)


@@ -222,7 +222,6 @@ class LibraryCallKit : public GraphKit {
bool inline_math_negateExactL();
bool inline_math_subtractExactI(bool is_decrement);
bool inline_math_subtractExactL(bool is_decrement);
bool inline_exp();
bool inline_pow();
Node* finish_pow_exp(Node* result, Node* x, Node* y, const TypeFunc* call_type, address funcAddr, const char* funcName);
bool inline_min_max(vmIntrinsics::ID id);
@@ -1535,20 +1534,6 @@ Node* LibraryCallKit::finish_pow_exp(Node* result, Node* x, Node* y, const TypeF
}
}
//------------------------------inline_exp-------------------------------------
// Inline exp instructions, if possible. The Intel hardware only misses
// really odd corner cases (+/- Infinity). Just uncommon-trap them.
bool LibraryCallKit::inline_exp() {
Node* arg = round_double_node(argument(0));
Node* n = _gvn.transform(new ExpDNode(C, control(), arg));
n = finish_pow_exp(n, arg, NULL, OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dexp), "EXP");
set_result(n);
C->set_has_split_ifs(true); // Has chance for split-if optimization
return true;
}
//------------------------------inline_pow-------------------------------------
// Inline power instructions, if possible.
bool LibraryCallKit::inline_pow() {
@@ -1776,8 +1761,9 @@ bool LibraryCallKit::inline_math_native(vmIntrinsics::ID id) {
case vmIntrinsics::_dsqrt: return Matcher::match_rule_supported(Op_SqrtD) ? inline_math(id) : false;
case vmIntrinsics::_dabs: return Matcher::has_match_rule(Op_AbsD) ? inline_math(id) : false;
case vmIntrinsics::_dexp: return Matcher::has_match_rule(Op_ExpD) ? inline_exp() :
runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dexp), "EXP");
case vmIntrinsics::_dexp:
return (UseSSE >= 2) ? runtime_math(OptoRuntime::Math_D_D_Type(), StubRoutines::dexp(), "dexp") :
runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dexp), "EXP");
case vmIntrinsics::_dpow: return Matcher::has_match_rule(Op_PowD) ? inline_pow() :
runtime_math(OptoRuntime::Math_DD_D_Type(), FN_PTR(SharedRuntime::dpow), "POW");
#undef FN_PTR
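With the ExpD rule gone, C2 now always lowers Math.exp to a call: to the new SSE2-based stub when UseSSE >= 2, otherwise to the SharedRuntime::dexp fallback, both through the same Math_D_D call type. A standalone sketch of that selection (flag and function names are stand-ins, not VM code):

#include <cmath>
#include <cstdio>

typedef double (*math_d_d_t)(double);

static double shared_runtime_dexp(double x) { return std::exp(x); } // C library fallback
static double sse2_stub_dexp(double x)      { return std::exp(x); } // generated stub stand-in

static int UseSSE = 2;   // stand-in for the VM flag

// Mirrors the dispatch above: prefer the stub when SSE2 code was generated.
static math_d_d_t pick_dexp_target() {
  return (UseSSE >= 2) ? sse2_stub_dexp : shared_runtime_dexp;
}

int main() {
  math_d_d_t dexp = pick_dexp_target();
  std::printf("exp(2.0) = %f\n", dexp(2.0));   // ~7.389056
  return 0;
}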


@@ -1530,18 +1530,6 @@ const Type *Log10DNode::Value( PhaseTransform *phase ) const {
return TypeD::make( StubRoutines::intrinsic_log10( d ) );
}
//=============================================================================
//------------------------------Value------------------------------------------
// Compute exp
const Type *ExpDNode::Value( PhaseTransform *phase ) const {
const Type *t1 = phase->type( in(1) );
if( t1 == Type::TOP ) return Type::TOP;
if( t1->base() != Type::DoubleCon ) return Type::DOUBLE;
double d = t1->getd();
return TypeD::make( StubRoutines::intrinsic_exp( d ) );
}
//=============================================================================
//------------------------------Value------------------------------------------
// Compute pow
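The deleted ExpDNode::Value() let C2 fold exp of a compile-time double constant through StubRoutines::intrinsic_exp(); with the node gone, constant arguments are simply evaluated through the runtime/stub call instead. A standalone sketch of the removed folding logic, using a tiny stand-in for C2's type lattice and std::exp in place of intrinsic_exp (illustrative only):

#include <cmath>
#include <cstdio>

// Minimal stand-in for the lattice values Value() cared about: Top (dead
// input), an unknown double, or a known double constant.
enum Kind { TOP, DOUBLE, DOUBLE_CON };

struct Type {
  Kind   kind;
  double con;    // only meaningful when kind == DOUBLE_CON
};

static Type exp_value(const Type& in) {
  if (in.kind == TOP)        return Type{TOP, 0.0};      // t1 == Type::TOP
  if (in.kind != DOUBLE_CON) return Type{DOUBLE, 0.0};   // not a DoubleCon
  return Type{DOUBLE_CON, std::exp(in.con)};             // fold the constant
}

int main() {
  Type folded = exp_value(Type{DOUBLE_CON, 1.0});
  std::printf("folded exp(1.0) = %f\n", folded.con);     // ~2.718282
  return 0;
}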


@@ -477,20 +477,6 @@ public:
virtual const Type *Value( PhaseTransform *phase ) const;
};
//------------------------------ExpDNode---------------------------------------
// Exponentiate a double
class ExpDNode : public Node {
public:
ExpDNode(Compile* C, Node *c, Node *in1) : Node(c, in1) {
init_flags(Flag_is_expensive);
C->add_expensive_node(this);
}
virtual int Opcode() const;
const Type *bottom_type() const { return Type::DOUBLE; }
virtual uint ideal_reg() const { return Op_RegD; }
virtual const Type *Value( PhaseTransform *phase ) const;
};
//------------------------------LogDNode---------------------------------------
// Log_e of a double
class LogDNode : public Node {


@@ -148,9 +148,10 @@ address StubRoutines::_mulAdd = NULL;
address StubRoutines::_montgomeryMultiply = NULL;
address StubRoutines::_montgomerySquare = NULL;
address StubRoutines::_dexp = NULL;
double (* StubRoutines::_intrinsic_log )(double) = NULL;
double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
double (* StubRoutines::_intrinsic_exp )(double) = NULL;
double (* StubRoutines::_intrinsic_pow )(double, double) = NULL;
double (* StubRoutines::_intrinsic_sin )(double) = NULL;
double (* StubRoutines::_intrinsic_cos )(double) = NULL;


@@ -207,6 +207,8 @@ class StubRoutines: AllStatic {
static address _montgomeryMultiply;
static address _montgomerySquare;
static address _dexp;
// These are versions of the java.lang.Math methods which perform
// the same operations as the intrinsic version. They are used for
// constant folding in the compiler to ensure equivalence. If the
@@ -215,7 +217,6 @@ class StubRoutines: AllStatic {
// SharedRuntime.
static double (*_intrinsic_log)(double);
static double (*_intrinsic_log10)(double);
static double (*_intrinsic_exp)(double);
static double (*_intrinsic_pow)(double, double);
static double (*_intrinsic_sin)(double);
static double (*_intrinsic_cos)(double);
@@ -375,6 +376,8 @@ class StubRoutines: AllStatic {
static address montgomeryMultiply() { return _montgomeryMultiply; }
static address montgomerySquare() { return _montgomerySquare; }
static address dexp() {return _dexp; }
static address select_fill_function(BasicType t, bool aligned, const char* &name);
static address zero_aligned_words() { return _zero_aligned_words; }
@@ -387,10 +390,6 @@ class StubRoutines: AllStatic {
assert(_intrinsic_log != NULL, "must be defined");
return _intrinsic_log10(d);
}
static double intrinsic_exp(double d) {
assert(_intrinsic_exp != NULL, "must be defined");
return _intrinsic_exp(d);
}
static double intrinsic_pow(double d, double d2) {
assert(_intrinsic_pow != NULL, "must be defined");
return _intrinsic_pow(d, d2);


@@ -837,6 +837,7 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
static_field(StubRoutines, _multiplyToLen, address) \
static_field(StubRoutines, _squareToLen, address) \
static_field(StubRoutines, _mulAdd, address) \
static_field(StubRoutines, _dexp, address) \
\
/*****************/ \
/* SharedRuntime */ \
@@ -1992,7 +1993,6 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
declare_c2_type(TanDNode, Node) \
declare_c2_type(AtanDNode, Node) \
declare_c2_type(SqrtDNode, Node) \
declare_c2_type(ExpDNode, Node) \
declare_c2_type(LogDNode, Node) \
declare_c2_type(Log10DNode, Node) \
declare_c2_type(PowDNode, Node) \