8145688: Update for x86 pow in the math lib

Optimizes Math.pow() for the 64-bit and 32-bit x86 architectures using the Intel LIBM implementation.

Reviewed-by: kvn
Authored by Vivek R Deshpande on 2015-12-23 21:09:50 -08:00; committed by Vladimir Kozlov
parent bc04deac15
commit 453650389f
27 changed files with 3714 additions and 691 deletions
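
For orientation before the diffs, here is a minimal Java sketch (ours, not part of the commit) of the corner cases that every Math.pow path touched by this change (interpreter entry, C1 runtime call, C2 stub) must preserve; run with java -ea:

    // Illustrative only: behavior all pow code paths must agree on.
    public class PowSemantics {
        public static void main(String[] args) {
            assert Math.pow(3.0, 2.0) == 9.0;         // y == 2 is reducible to x * x
            assert Math.pow(-2.0, 3.0) == -8.0;       // negative base, odd integer y: negative
            assert Math.pow(-2.0, 4.0) == 16.0;       // negative base, even integer y: positive
            assert Double.isNaN(Math.pow(-8.0, 0.5)); // negative base, non-integer y: NaN
        }
    }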


@@ -772,6 +772,7 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
case 0x55: // andnps
case 0x56: // orps
case 0x57: // xorps
case 0x58: // addpd
case 0x59: // mulpd
case 0x6E: // movd
case 0x7E: // movd
@@ -3363,6 +3364,7 @@ void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
emit_int8(imm8);
}
// The encoding for pextrw is SSE2 to support the LIBM implementation.
void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse2(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -4361,6 +4363,17 @@ void Assembler::addpd(XMMRegister dst, XMMRegister src) {
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::addpd(XMMRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x58);
emit_operand(dst, src);
}
void Assembler::addps(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);


@@ -1791,6 +1791,7 @@ private:
// Add Packed Floating-Point Values
void addpd(XMMRegister dst, XMMRegister src);
void addpd(XMMRegister dst, Address src);
void addps(XMMRegister dst, XMMRegister src);
void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);


@@ -2381,9 +2381,6 @@ void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, L
// Should consider not saving rbx, if not necessary
__ trigfunc('t', op->as_Op2()->fpu_stack_size());
break;
case lir_pow :
__ pow_with_fallback(op->as_Op2()->fpu_stack_size());
break;
default : ShouldNotReachHere();
}
} else {


@@ -810,7 +810,8 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
if (x->id() == vmIntrinsics::_dexp || x->id() == vmIntrinsics::_dlog) {
if (x->id() == vmIntrinsics::_dexp || x->id() == vmIntrinsics::_dlog ||
x->id() == vmIntrinsics::_dpow) {
do_LibmIntrinsic(x);
return;
}
@@ -824,7 +825,6 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dcos:
case vmIntrinsics::_dtan:
case vmIntrinsics::_dlog10:
case vmIntrinsics::_dpow:
use_fpu = true;
}
} else {
@@ -874,7 +874,6 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dcos: __ cos (calc_input, calc_result, tmp1, tmp2); break;
case vmIntrinsics::_dtan: __ tan (calc_input, calc_result, tmp1, tmp2); break;
case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1); break;
case vmIntrinsics::_dpow: __ pow (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
default: ShouldNotReachHere();
}
@@ -890,11 +889,25 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
LIR_Opr calc_result = rlock_result(x);
LIR_Opr result_reg = result_register_for(x->type());
BasicTypeList signature(1);
signature.append(T_DOUBLE);
CallingConvention* cc = frame_map()->c_calling_convention(&signature);
CallingConvention* cc = NULL;
value.load_item_force(cc->at(0));
if (x->id() == vmIntrinsics::_dpow) {
LIRItem value1(x->argument_at(1), this);
value1.set_destroys_register();
BasicTypeList signature(2);
signature.append(T_DOUBLE);
signature.append(T_DOUBLE);
cc = frame_map()->c_calling_convention(&signature);
value.load_item_force(cc->at(0));
value1.load_item_force(cc->at(1));
} else {
BasicTypeList signature(1);
signature.append(T_DOUBLE);
cc = frame_map()->c_calling_convention(&signature);
value.load_item_force(cc->at(0));
}
#ifndef _LP64
LIR_Opr tmp = FrameMap::fpu0_double_opr;
@@ -915,6 +928,14 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
__ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dlog), getThreadTemp(), result_reg, cc->args());
}
break;
case vmIntrinsics::_dpow:
if (VM_Version::supports_sse2()) {
__ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
}
else {
__ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), getThreadTemp(), result_reg, cc->args());
}
break;
default: ShouldNotReachHere();
}
#else
@@ -925,6 +946,9 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dlog:
__ call_runtime_leaf(StubRoutines::dlog(), getThreadTemp(), result_reg, cc->args());
break;
case vmIntrinsics::_dpow:
__ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
break;
}
#endif
__ move(result_reg, calc_result);

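The hunk above builds a two-slot (T_DOUBLE, T_DOUBLE) C calling convention for pow while the one-argument intrinsics keep the single-slot signature. In Java terms, the leaf routines being called behave like the hypothetical declarations below (the class and the native modifier are illustrative, not from the commit):

    // Hypothetical shape of the leaf targets: pure functions,
    // no safepoint, no Java-visible side effects.
    final class LibmStubs {
        static native double dpow(double x, double y); // two-slot signature
        static native double dexp(double x);           // one-slot signature
        static native double dlog(double x);           // one-slot signature
    }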

@@ -840,53 +840,6 @@ void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
break;
}
case lir_pow: {
// pow needs two temporary fpu stack slots, so there are two temporary
// registers (stored in tmp1 and tmp2 of the operation).
// The stack allocator must guarantee that the stack slots are really free,
// otherwise there might be a stack overflow.
assert(left->is_fpu_register(), "must be");
assert(right->is_fpu_register(), "must be");
assert(res->is_fpu_register(), "must be");
assert(op2->tmp1_opr()->is_fpu_register(), "tmp1 is the first temporary register");
assert(op2->tmp2_opr()->is_fpu_register(), "tmp2 is the second temporary register");
assert(fpu_num(left) != fpu_num(right) && fpu_num(left) != fpu_num(op2->tmp1_opr()) && fpu_num(left) != fpu_num(op2->tmp2_opr()) && fpu_num(left) != fpu_num(res), "need distinct temp registers");
assert(fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(right) != fpu_num(op2->tmp2_opr()) && fpu_num(right) != fpu_num(res), "need distinct temp registers");
assert(fpu_num(op2->tmp1_opr()) != fpu_num(op2->tmp2_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
assert(fpu_num(op2->tmp2_opr()) != fpu_num(res), "need distinct temp registers");
insert_free_if_dead(op2->tmp1_opr());
insert_free_if_dead(op2->tmp2_opr());
// Must bring both operands to top of stack with following operand ordering:
// * fpu stack before pow: ... right left
// * fpu stack after pow: ... left
insert_free_if_dead(res, right);
if (tos_offset(right) != 1) {
insert_exchange(right);
insert_exchange(1);
}
insert_exchange(left);
assert(tos_offset(right) == 1, "check");
assert(tos_offset(left) == 0, "check");
new_left = to_fpu_stack_top(left);
new_right = to_fpu_stack(right);
op2->set_fpu_stack_size(sim()->stack_size());
assert(sim()->stack_size() <= 6, "at least two stack slots must be free");
sim()->pop();
do_rename(right, res);
new_res = to_fpu_stack_top(res);
break;
}
default: {
assert(false, "missed a fpu-operation");
}


@@ -149,10 +149,15 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
break;
case Interpreter::java_lang_math_pow:
__ fld_d(Address(rsp, 3*wordSize)); // second argument
__ pow_with_fallback(0);
// Store to stack to convert 80-bit precision back to 64 bits
__ push_fTOS();
__ pop_fTOS();
__ subptr(rsp, 4 * wordSize);
__ fstp_d(Address(rsp, 0));
__ fstp_d(Address(rsp, 2 * wordSize));
if (VM_Version::supports_sse2()) {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dpow())));
} else {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::dpow)));
}
__ addptr(rsp, 4 * wordSize);
break;
case Interpreter::java_lang_math_exp:
__ subptr(rsp, 2*wordSize);


@@ -255,6 +255,10 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
} else if (kind == Interpreter::java_lang_math_log) {
__ movdbl(xmm0, Address(rsp, wordSize));
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dlog())));
} else if (kind == Interpreter::java_lang_math_pow) {
__ movdbl(xmm1, Address(rsp, wordSize));
__ movdbl(xmm0, Address(rsp, 3 * wordSize));
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dpow())));
} else {
__ fld_d(Address(rsp, wordSize));
switch (kind) {
@@ -273,11 +277,6 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
case Interpreter::java_lang_math_log10:
__ flog10();
break;
case Interpreter::java_lang_math_pow:
__ fld_d(Address(rsp, 3*wordSize)); // second argument (one
// empty stack slot)
__ pow_with_fallback(0);
break;
default :
ShouldNotReachHere();
}


@@ -3060,50 +3060,6 @@ void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
}
}
void MacroAssembler::pow_exp_core_encoding() {
// kills rax, rcx, rdx
subptr(rsp,sizeof(jdouble));
// computes 2^X. Stack: X ...
// f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
// keep it on the thread's stack to compute 2^int(X) later
// then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
// final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
fld_s(0); // Stack: X X ...
frndint(); // Stack: int(X) X ...
fsuba(1); // Stack: int(X) X-int(X) ...
fistp_s(Address(rsp,0)); // move int(X) as integer to thread's stack. Stack: X-int(X) ...
f2xm1(); // Stack: 2^(X-int(X))-1 ...
fld1(); // Stack: 1 2^(X-int(X))-1 ...
faddp(1); // Stack: 2^(X-int(X))
// computes 2^(int(X)): add exponent bias (1023) to int(X), then
// shift int(X)+1023 to exponent position.
// The exponent is limited to 11 bits: if int(X)+1023 does not fit in 11
// bits, set the result to NaN. 0x000 and 0x7FF are reserved exponent
// values, so detect them and set the result to NaN.
movl(rax,Address(rsp,0));
movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
addl(rax, 1023);
movl(rdx,rax);
shll(rax,20);
// Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
addl(rdx,1);
// Check that 1 < int(X)+1023+1 < 2048
// in 3 steps:
// 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
// 2- (int(X)+1023+1)&-2048 != 0
// 3- (int(X)+1023+1)&-2048 != 1
// Do 2- first because addl just updated the flags.
cmov32(Assembler::equal,rax,rcx);
cmpl(rdx,1);
cmov32(Assembler::equal,rax,rcx);
testl(rdx,rcx);
cmov32(Assembler::notEqual,rax,rcx);
movl(Address(rsp,4),rax);
movl(Address(rsp,0),0);
fmul_d(Address(rsp,0)); // Stack: 2^X ...
addptr(rsp,sizeof(jdouble));
}
void MacroAssembler::increase_precision() {
subptr(rsp, BytesPerWord);
fnstcw(Address(rsp, 0));
@@ -3119,194 +3075,6 @@ void MacroAssembler::restore_precision() {
addptr(rsp, BytesPerWord);
}
void MacroAssembler::fast_pow() {
// computes X^Y = 2^(Y * log2(X))
// if fast computation is not possible, result is NaN. Requires
// fallback from user of this macro.
// increase precision for intermediate steps of the computation
BLOCK_COMMENT("fast_pow {");
increase_precision();
fyl2x(); // Stack: (Y*log2(X)) ...
pow_exp_core_encoding(); // Stack: X^Y ...
restore_precision();
BLOCK_COMMENT("} fast_pow");
}
void MacroAssembler::pow_or_exp(int num_fpu_regs_in_use) {
// kills rax, rcx, rdx
// pow and exp need 2 extra registers on the fpu stack.
Label slow_case, done;
Register tmp = noreg;
if (!VM_Version::supports_cmov()) {
// fcmp needs a temporary so preserve rdx,
tmp = rdx;
}
Register tmp2 = rax;
Register tmp3 = rcx;
// Stack: X Y
Label x_negative, y_not_2;
static double two = 2.0;
ExternalAddress two_addr((address)&two);
// constant may be too far away on 64 bit
lea(tmp2, two_addr);
fld_d(Address(tmp2, 0)); // Stack: 2 X Y
fcmp(tmp, 2, true, false); // Stack: X Y
jcc(Assembler::parity, y_not_2);
jcc(Assembler::notEqual, y_not_2);
fxch(); fpop(); // Stack: X
fmul(0); // Stack: X*X
jmp(done);
bind(y_not_2);
fldz(); // Stack: 0 X Y
fcmp(tmp, 1, true, false); // Stack: X Y
jcc(Assembler::above, x_negative);
// X >= 0
fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
fld_s(1); // Stack: X Y X Y
fast_pow(); // Stack: X^Y X Y
fcmp(tmp, 0, false, false); // Stack: X^Y X Y
// X^Y not equal to itself: X^Y is NaN, go to the slow case.
jcc(Assembler::parity, slow_case);
// get rid of duplicate arguments. Stack: X^Y
if (num_fpu_regs_in_use > 0) {
fxch(); fpop();
fxch(); fpop();
} else {
ffree(2);
ffree(1);
}
jmp(done);
// X < 0
bind(x_negative);
fld_s(1); // Stack: Y X Y
frndint(); // Stack: int(Y) X Y
fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
jcc(Assembler::notEqual, slow_case);
subptr(rsp, 8);
// For X^Y, when X < 0, Y has to be an integer and the final
// result depends on whether it's odd or even. We just checked
// that int(Y) == Y. We move int(Y) to gp registers as a 64 bit
// integer to test its parity. If int(Y) is huge and doesn't fit
// in the 64 bit integer range, the integer indefinite value will
// end up in the gp registers. Huge numbers are all even, and the
// integer indefinite value is even as well, so it's fine.
#ifdef ASSERT
// Let's check we don't end up with an integer indefinite number
// when not expected. First test for huge numbers: check whether
// int(Y)+1 == int(Y) which is true for very large numbers and
// those are all even. A 64 bit integer is guaranteed to not
// overflow for numbers where y+1 != y (when precision is set to
// double precision).
Label y_not_huge;
fld1(); // Stack: 1 int(Y) X Y
fadd(1); // Stack: 1+int(Y) int(Y) X Y
#ifdef _LP64
// trip to memory to force the precision down from double extended
// precision
fstp_d(Address(rsp, 0));
fld_d(Address(rsp, 0));
#endif
fcmp(tmp, 1, true, false); // Stack: int(Y) X Y
#endif
// move int(Y) as 64 bit integer to thread's stack
fistp_d(Address(rsp,0)); // Stack: X Y
#ifdef ASSERT
jcc(Assembler::notEqual, y_not_huge);
// Y is huge so we know it's even. It may not fit in a 64 bit
// integer and we don't want the debug code below to see the
// integer indefinite value so overwrite int(Y) on the thread's
// stack with 0.
movl(Address(rsp, 0), 0);
movl(Address(rsp, 4), 0);
bind(y_not_huge);
#endif
fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
fld_s(1); // Stack: X Y X Y
fabs(); // Stack: abs(X) Y X Y
fast_pow(); // Stack: abs(X)^Y X Y
fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
// abs(X)^Y not equal to itself: abs(X)^Y is NaN, go to the slow case.
pop(tmp2);
NOT_LP64(pop(tmp3));
jcc(Assembler::parity, slow_case);
#ifdef ASSERT
// Check that int(Y) is not integer indefinite value (int
// overflow). Shouldn't happen because for values that would
// overflow, 1+int(Y)==Y which was tested earlier.
#ifndef _LP64
{
Label integer;
testl(tmp2, tmp2);
jcc(Assembler::notZero, integer);
cmpl(tmp3, 0x80000000);
jcc(Assembler::notZero, integer);
STOP("integer indefinite value shouldn't be seen here");
bind(integer);
}
#else
{
Label integer;
mov(tmp3, tmp2); // preserve tmp2 for parity check below
shlq(tmp3, 1);
jcc(Assembler::carryClear, integer);
jcc(Assembler::notZero, integer);
STOP("integer indefinite value shouldn't be seen here");
bind(integer);
}
#endif
#endif
// get rid of duplicate arguments. Stack: X^Y
if (num_fpu_regs_in_use > 0) {
fxch(); fpop();
fxch(); fpop();
} else {
ffree(2);
ffree(1);
}
testl(tmp2, 1);
jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
// X <= 0, Y odd: X^Y = -abs(X)^Y
fchs(); // Stack: -abs(X)^Y Y
jmp(done);
// slow case: runtime call
bind(slow_case);
fpop(); // pop incorrect result or int(Y)
fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), 2, num_fpu_regs_in_use);
// Come here with result in F-TOS
bind(done);
}
void MacroAssembler::fpop() {
ffree();
fincstp();

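The removed pow_exp_core_encoding() leaned on the IEEE 754 layout: for an integer k, 2^k is just the biased exponent k+1023 shifted into bit position 52, with the reserved encodings 0x000 and 0x7FF mapped to NaN. A minimal Java sketch of that trick (ours, covering only the normal exponent range the removed code checked for):

    final class Pow2 {
        // Returns 2^k by direct bit manipulation, or NaN outside the normal
        // exponent range, mirroring the removed 11-bit exponent check.
        static double twoToThe(int k) {
            long biased = (long) k + 1023;        // add the exponent bias
            if (biased <= 0 || biased >= 2047) {  // 0x000 and 0x7FF are reserved
                return Double.NaN;
            }
            return Double.longBitsToDouble(biased << 52);
        }
    }

For example, twoToThe(0) == 1.0, twoToThe(10) == 1024.0, and twoToThe(-1022) == Double.MIN_NORMAL.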

@ -918,24 +918,19 @@ class MacroAssembler: public Assembler {
void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
Register rax, Register rcx, Register rdx, Register tmp1 LP64_ONLY(COMMA Register tmp2));
void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
Register rdx NOT_LP64(COMMA Register tmp) LP64_ONLY(COMMA Register tmp1)
LP64_ONLY(COMMA Register tmp2) LP64_ONLY(COMMA Register tmp3) LP64_ONLY(COMMA Register tmp4));
void increase_precision();
void restore_precision();
// computes pow(x,y). Fallback to runtime call included.
void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(num_fpu_regs_in_use); }
private:
// call runtime as a fallback for trig functions and pow/exp.
void fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use);
// computes 2^(Ylog2X); Ylog2X in ST(0)
void pow_exp_core_encoding();
// computes pow(x,y) or exp(x). Fallback to runtime call included.
void pow_or_exp(int num_fpu_regs_in_use);
// these are private because users should be doing movflt/movdbl
void movss(Address dst, XMMRegister src) { Assembler::movss(dst, src); }

File diff suppressed because it is too large.


@@ -2126,15 +2126,6 @@ class StubGenerator: public StubCodeGenerator {
__ trigfunc('t');
__ ret(0);
}
{
StubCodeMark mark(this, "StubRoutines", "pow");
StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
__ fld_d(Address(rsp, 12));
__ fld_d(Address(rsp, 4));
__ pow_with_fallback(0);
__ ret(0);
}
}
// AES intrinsic stubs
@@ -3082,6 +3073,30 @@ class StubGenerator: public StubCodeGenerator {
}
address generate_libmPow() {
address start = __ pc();
const XMMRegister x0 = xmm0;
const XMMRegister x1 = xmm1;
const XMMRegister x2 = xmm2;
const XMMRegister x3 = xmm3;
const XMMRegister x4 = xmm4;
const XMMRegister x5 = xmm5;
const XMMRegister x6 = xmm6;
const XMMRegister x7 = xmm7;
const Register tmp = rbx;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
__ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Safefetch stubs.
@@ -3310,6 +3325,7 @@ class StubGenerator: public StubCodeGenerator {
if (VM_Version::supports_sse2()) {
StubRoutines::_dexp = generate_libmExp();
StubRoutines::_dlog = generate_libmLog();
StubRoutines::_dpow = generate_libmPow();
}
}


@@ -3025,21 +3025,6 @@ class StubGenerator: public StubCodeGenerator {
__ addq(rsp, 8);
__ ret(0);
}
{
StubCodeMark mark(this, "StubRoutines", "pow");
StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
__ subq(rsp, 8);
__ movdbl(Address(rsp, 0), xmm1);
__ fld_d(Address(rsp, 0));
__ movdbl(Address(rsp, 0), xmm0);
__ fld_d(Address(rsp, 0));
__ pow_with_fallback(0);
__ fstp_d(Address(rsp, 0));
__ movdbl(xmm0, Address(rsp, 0));
__ addq(rsp, 8);
__ ret(0);
}
}
// AES intrinsic stubs
@@ -4283,6 +4268,48 @@ class StubGenerator: public StubCodeGenerator {
}
address generate_libmPow() {
address start = __ pc();
const XMMRegister x0 = xmm0;
const XMMRegister x1 = xmm1;
const XMMRegister x2 = xmm2;
const XMMRegister x3 = xmm3;
const XMMRegister x4 = xmm4;
const XMMRegister x5 = xmm5;
const XMMRegister x6 = xmm6;
const XMMRegister x7 = xmm7;
const Register tmp1 = r8;
const Register tmp2 = r9;
const Register tmp3 = r10;
const Register tmp4 = r11;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64
// save the xmm registers which must be preserved (xmm6 and xmm7)
__ subptr(rsp, 4 * wordSize);
__ movdqu(Address(rsp, 0), xmm6);
__ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif
__ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
#ifdef _WIN64
// restore xmm regs belonging to calling function
__ movdqu(xmm6, Address(rsp, 0));
__ movdqu(xmm7, Address(rsp, 2 * wordSize));
__ addptr(rsp, 4 * wordSize);
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
#undef __
#define __ masm->
@@ -4478,6 +4505,7 @@ class StubGenerator: public StubCodeGenerator {
if (VM_Version::supports_sse2()) {
StubRoutines::_dexp = generate_libmExp();
StubRoutines::_dlog = generate_libmLog();
StubRoutines::_dpow = generate_libmPow();
}
}


@@ -9885,39 +9885,6 @@ instruct sqrtDPR_reg(regDPR dst, regDPR src) %{
ins_pipe( pipe_slow );
%}
instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE<=1);
match(Set Y (PowD X Y)); // Raise X to the Yth power
effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
format %{ "fast_pow $X $Y -> $Y // KILL $rax, $rcx, $rdx" %}
ins_encode %{
__ subptr(rsp, 8);
__ fld_s($X$$reg - 1);
__ fast_pow();
__ addptr(rsp, 8);
%}
ins_pipe( pipe_slow );
%}
instruct powD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE>=2);
match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power
effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
format %{ "fast_pow $src0 $src1 -> $dst // KILL $rax, $rcx, $rdx" %}
ins_encode %{
__ subptr(rsp, 8);
__ movdbl(Address(rsp, 0), $src1$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ movdbl(Address(rsp, 0), $src0$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ fast_pow();
__ fstp_d(Address(rsp, 0));
__ movdbl($dst$$XMMRegister, Address(rsp, 0));
__ addptr(rsp, 8);
%}
ins_pipe( pipe_slow );
%}
instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
predicate (UseSSE<=1);
// The source Double operand on FPU stack


@@ -9864,24 +9864,6 @@ instruct log10D_reg(regD dst) %{
ins_pipe( pipe_slow );
%}
instruct powD_reg(regD dst, regD src0, regD src1, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{
match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power
effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
format %{ "fast_pow $src0 $src1 -> $dst // KILL $rax, $rcx, $rdx" %}
ins_encode %{
__ subptr(rsp, 8);
__ movdbl(Address(rsp, 0), $src1$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ movdbl(Address(rsp, 0), $src0$$XMMRegister);
__ fld_d(Address(rsp, 0));
__ fast_pow();
__ fstp_d(Address(rsp, 0));
__ movdbl($dst$$XMMRegister, Address(rsp, 0));
__ addptr(rsp, 8);
%}
ins_pipe( pipe_slow );
%}
//----------Arithmetic Conversion Instructions---------------------------------
instruct roundFloat_nop(regF dst)


@@ -4018,7 +4018,6 @@ int MatchRule::is_expensive() const {
strcmp(opType,"ModD")==0 ||
strcmp(opType,"ModF")==0 ||
strcmp(opType,"ModI")==0 ||
strcmp(opType,"PowD")==0 ||
strcmp(opType,"SinD")==0 ||
strcmp(opType,"SqrtD")==0 ||
strcmp(opType,"TanD")==0 ||


@@ -754,31 +754,6 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
break;
}
case lir_pow: {
assert(op->as_Op2() != NULL, "must be");
LIR_Op2* op2 = (LIR_Op2*)op;
// On x86 pow needs two temporary fpu stack slots: tmp1 and
// tmp2. Register input operands as temps to guarantee that it
// doesn't overlap with the temporary slots.
assert(op2->_info == NULL, "not used");
assert(op2->_opr1->is_valid() && op2->_opr2->is_valid(), "used");
assert(op2->_tmp1->is_valid() && op2->_tmp2->is_valid() && op2->_tmp3->is_valid()
&& op2->_tmp4->is_valid() && op2->_tmp5->is_valid(), "used");
assert(op2->_result->is_valid(), "used");
do_input(op2->_opr1); do_temp(op2->_opr1);
do_input(op2->_opr2); do_temp(op2->_opr2);
do_temp(op2->_tmp1);
do_temp(op2->_tmp2);
do_temp(op2->_tmp3);
do_temp(op2->_tmp4);
do_temp(op2->_tmp5);
do_output(op2->_result);
break;
}
// LIR_Op3
case lir_idiv:
case lir_irem: {
@@ -1769,7 +1744,6 @@ const char * LIR_Op::name() const {
case lir_cos: s = "cos"; break;
case lir_tan: s = "tan"; break;
case lir_log10: s = "log10"; break;
case lir_pow: s = "pow"; break;
case lir_logic_and: s = "logic_and"; break;
case lir_logic_or: s = "logic_or"; break;
case lir_logic_xor: s = "logic_xor"; break;


@@ -962,7 +962,6 @@ enum LIR_Code {
, lir_cos
, lir_tan
, lir_log10
, lir_pow
, lir_logic_and
, lir_logic_or
, lir_logic_xor
@@ -2198,7 +2197,6 @@ class LIR_List: public CompilationResourceObj {
void sin (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_sin , from, tmp1, to, tmp2)); }
void cos (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_cos , from, tmp1, to, tmp2)); }
void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }
void pow (LIR_Opr arg1, LIR_Opr arg2, LIR_Opr res, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_pow, arg1, arg2, res, tmp1, tmp2, tmp3, tmp4, tmp5)); }
void add (LIR_Opr left, LIR_Opr right, LIR_Opr res) { append(new LIR_Op2(lir_add, left, right, res)); }
void sub (LIR_Opr left, LIR_Opr right, LIR_Opr res, CodeEmitInfo* info = NULL) { append(new LIR_Op2(lir_sub, left, right, res, info)); }


@@ -740,7 +740,6 @@ void LIR_Assembler::emit_op2(LIR_Op2* op) {
case lir_tan:
case lir_cos:
case lir_log10:
case lir_pow:
intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op);
break;


@@ -6603,7 +6603,6 @@ void LinearScanStatistic::collect(LinearScan* allocator) {
case lir_cos:
case lir_abs:
case lir_log10:
case lir_pow:
case lir_logic_and:
case lir_logic_or:
case lir_logic_xor:


@@ -319,6 +319,7 @@ const char* Runtime1::name_for_address(address entry) {
FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32());
FUNCTION_CASE(entry, StubRoutines::dexp());
FUNCTION_CASE(entry, StubRoutines::dlog());
FUNCTION_CASE(entry, StubRoutines::dpow());
#undef FUNCTION_CASE


@@ -216,7 +216,6 @@ macro(PartialSubtypeCheck)
macro(Phi)
macro(PopCountI)
macro(PopCountL)
macro(PowD)
macro(PrefetchAllocation)
macro(Proj)
macro(RShiftI)


@@ -230,8 +230,6 @@ class LibraryCallKit : public GraphKit {
bool inline_math_negateExactL();
bool inline_math_subtractExactI(bool is_decrement);
bool inline_math_subtractExactL(bool is_decrement);
bool inline_pow();
Node* finish_pow_exp(Node* result, Node* x, Node* y, const TypeFunc* call_type, address funcAddr, const char* funcName);
bool inline_min_max(vmIntrinsics::ID id);
bool inline_notify(vmIntrinsics::ID id);
Node* generate_min_max(vmIntrinsics::ID id, Node* x, Node* y);
@@ -1718,243 +1716,6 @@ bool LibraryCallKit::inline_trig(vmIntrinsics::ID id) {
return true;
}
Node* LibraryCallKit::finish_pow_exp(Node* result, Node* x, Node* y, const TypeFunc* call_type, address funcAddr, const char* funcName) {
//-------------------
//result=(result.isNaN())? funcAddr():result;
// Check for NaN by testing result != result; if NaN, either trap
// or go to the runtime
Node* cmpisnan = _gvn.transform(new CmpDNode(result, result));
// Build the boolean node
Node* bolisnum = _gvn.transform(new BoolNode(cmpisnan, BoolTest::eq));
if (!too_many_traps(Deoptimization::Reason_intrinsic)) {
{ BuildCutout unless(this, bolisnum, PROB_STATIC_FREQUENT);
// The pow or exp intrinsic returned a NaN, which requires a call
// to the runtime. Recompile with the runtime call.
uncommon_trap(Deoptimization::Reason_intrinsic,
Deoptimization::Action_make_not_entrant);
}
return result;
} else {
// If this inlining ever returned NaN in the past, we compile a call
// to the runtime to properly handle corner cases
IfNode* iff = create_and_xform_if(control(), bolisnum, PROB_STATIC_FREQUENT, COUNT_UNKNOWN);
Node* if_slow = _gvn.transform(new IfFalseNode(iff));
Node* if_fast = _gvn.transform(new IfTrueNode(iff));
if (!if_slow->is_top()) {
RegionNode* result_region = new RegionNode(3);
PhiNode* result_val = new PhiNode(result_region, Type::DOUBLE);
result_region->init_req(1, if_fast);
result_val->init_req(1, result);
set_control(if_slow);
const TypePtr* no_memory_effects = NULL;
Node* rt = make_runtime_call(RC_LEAF, call_type, funcAddr, funcName,
no_memory_effects,
x, top(), y, y ? top() : NULL);
Node* value = _gvn.transform(new ProjNode(rt, TypeFunc::Parms+0));
#ifdef ASSERT
Node* value_top = _gvn.transform(new ProjNode(rt, TypeFunc::Parms+1));
assert(value_top == top(), "second value must be top");
#endif
result_region->init_req(2, control());
result_val->init_req(2, value);
set_control(_gvn.transform(result_region));
return _gvn.transform(result_val);
} else {
return result;
}
}
}
//------------------------------inline_pow-------------------------------------
// Inline power instructions, if possible.
bool LibraryCallKit::inline_pow() {
// Pseudocode for pow
// if (y == 2) {
// return x * x;
// } else {
// if (x <= 0.0) {
// long longy = (long)y;
// if ((double)longy == y) { // if y is long
// if (y + 1 == y) longy = 0; // huge number: even
// result = ((1&longy) == 0)?DPow(abs(x), y):-DPow(abs(x), y);
// } else {
// result = NaN;
// }
// } else {
// result = DPow(x,y);
// }
// if (result != result) {
// result = uncommon_trap() or runtime_call();
// }
// return result;
// }
Node* x = round_double_node(argument(0));
Node* y = round_double_node(argument(2));
Node* result = NULL;
Node* const_two_node = makecon(TypeD::make(2.0));
Node* cmp_node = _gvn.transform(new CmpDNode(y, const_two_node));
Node* bool_node = _gvn.transform(new BoolNode(cmp_node, BoolTest::eq));
IfNode* if_node = create_and_xform_if(control(), bool_node, PROB_STATIC_INFREQUENT, COUNT_UNKNOWN);
Node* if_true = _gvn.transform(new IfTrueNode(if_node));
Node* if_false = _gvn.transform(new IfFalseNode(if_node));
RegionNode* region_node = new RegionNode(3);
region_node->init_req(1, if_true);
Node* phi_node = new PhiNode(region_node, Type::DOUBLE);
// special case for x^y where y == 2: we can convert it to x * x
phi_node->init_req(1, _gvn.transform(new MulDNode(x, x)));
// set control to if_false since we will now process the false branch
set_control(if_false);
if (!too_many_traps(Deoptimization::Reason_intrinsic)) {
// Short form: skip the fancy tests and just check for NaN result.
result = _gvn.transform(new PowDNode(C, control(), x, y));
} else {
// If this inlining ever returned NaN in the past, include all
// checks + call to the runtime.
// Set the merge point for If node with condition of (x <= 0.0)
// There are four possible paths to region node and phi node
RegionNode *r = new RegionNode(4);
Node *phi = new PhiNode(r, Type::DOUBLE);
// Build the first if node: if (x <= 0.0)
// Node for 0 constant
Node *zeronode = makecon(TypeD::ZERO);
// Check x:0
Node *cmp = _gvn.transform(new CmpDNode(x, zeronode));
// Check: If (x<=0) then go complex path
Node *bol1 = _gvn.transform(new BoolNode( cmp, BoolTest::le ));
// Branch either way
IfNode *if1 = create_and_xform_if(control(),bol1, PROB_STATIC_INFREQUENT, COUNT_UNKNOWN);
// Fast path taken; set region slot 3
Node *fast_taken = _gvn.transform(new IfFalseNode(if1));
r->init_req(3,fast_taken); // Capture fast-control
// Fast path not-taken, i.e. slow path
Node *complex_path = _gvn.transform(new IfTrueNode(if1));
// Set fast path result
Node *fast_result = _gvn.transform(new PowDNode(C, control(), x, y));
phi->init_req(3, fast_result);
// Complex path
// Build the second if node (if y is long)
// Node for (long)y
Node *longy = _gvn.transform(new ConvD2LNode(y));
// Node for (double)((long) y)
Node *doublelongy= _gvn.transform(new ConvL2DNode(longy));
// Check (double)((long) y) : y
Node *cmplongy= _gvn.transform(new CmpDNode(doublelongy, y));
// Check if (y isn't long) then go to slow path
Node *bol2 = _gvn.transform(new BoolNode( cmplongy, BoolTest::ne ));
// Branch either way
IfNode *if2 = create_and_xform_if(complex_path,bol2, PROB_STATIC_INFREQUENT, COUNT_UNKNOWN);
Node* ylong_path = _gvn.transform(new IfFalseNode(if2));
Node *slow_path = _gvn.transform(new IfTrueNode(if2));
// Calculate DPow(abs(x), y)*(1 & (long)y)
// Node for constant 1
Node *conone = longcon(1);
// 1& (long)y
Node *signnode= _gvn.transform(new AndLNode(conone, longy));
// A huge number is always even. Detect a huge number by checking
// if y + 1 == y and set integer to be tested for parity to 0.
// Required for corner case:
// (long)9.223372036854776E18 = max_jlong
// (double)(long)9.223372036854776E18 = 9.223372036854776E18
// max_jlong is odd but 9.223372036854776E18 is even
Node* yplus1 = _gvn.transform(new AddDNode(y, makecon(TypeD::make(1))));
Node *cmpyplus1= _gvn.transform(new CmpDNode(yplus1, y));
Node *bolyplus1 = _gvn.transform(new BoolNode( cmpyplus1, BoolTest::eq ));
Node* correctedsign = NULL;
if (ConditionalMoveLimit != 0) {
correctedsign = _gvn.transform(CMoveNode::make(NULL, bolyplus1, signnode, longcon(0), TypeLong::LONG));
} else {
IfNode *ifyplus1 = create_and_xform_if(ylong_path,bolyplus1, PROB_FAIR, COUNT_UNKNOWN);
RegionNode *r = new RegionNode(3);
Node *phi = new PhiNode(r, TypeLong::LONG);
r->init_req(1, _gvn.transform(new IfFalseNode(ifyplus1)));
r->init_req(2, _gvn.transform(new IfTrueNode(ifyplus1)));
phi->init_req(1, signnode);
phi->init_req(2, longcon(0));
correctedsign = _gvn.transform(phi);
ylong_path = _gvn.transform(r);
record_for_igvn(r);
}
// zero node
Node *conzero = longcon(0);
// Check (1&(long)y)==0?
Node *cmpeq1 = _gvn.transform(new CmpLNode(correctedsign, conzero));
// Check if (1&(long)y)!=0?, if so the result is negative
Node *bol3 = _gvn.transform(new BoolNode( cmpeq1, BoolTest::ne ));
// abs(x)
Node *absx=_gvn.transform(new AbsDNode(x));
// abs(x)^y
Node *absxpowy = _gvn.transform(new PowDNode(C, control(), absx, y));
// -abs(x)^y
Node *negabsxpowy = _gvn.transform(new NegDNode (absxpowy));
// (1&(long)y)==1?-DPow(abs(x), y):DPow(abs(x), y)
Node *signresult = NULL;
if (ConditionalMoveLimit != 0) {
signresult = _gvn.transform(CMoveNode::make(NULL, bol3, absxpowy, negabsxpowy, Type::DOUBLE));
} else {
IfNode *ifyeven = create_and_xform_if(ylong_path,bol3, PROB_FAIR, COUNT_UNKNOWN);
RegionNode *r = new RegionNode(3);
Node *phi = new PhiNode(r, Type::DOUBLE);
r->init_req(1, _gvn.transform(new IfFalseNode(ifyeven)));
r->init_req(2, _gvn.transform(new IfTrueNode(ifyeven)));
phi->init_req(1, absxpowy);
phi->init_req(2, negabsxpowy);
signresult = _gvn.transform(phi);
ylong_path = _gvn.transform(r);
record_for_igvn(r);
}
// Set complex path fast result
r->init_req(2, ylong_path);
phi->init_req(2, signresult);
static const jlong nan_bits = CONST64(0x7ff8000000000000);
Node *slow_result = makecon(TypeD::make(*(double*)&nan_bits)); // return NaN
r->init_req(1,slow_path);
phi->init_req(1,slow_result);
// Post merge
set_control(_gvn.transform(r));
record_for_igvn(r);
result = _gvn.transform(phi);
}
result = finish_pow_exp(result, x, y, OptoRuntime::Math_DD_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dpow), "POW");
// control from finish_pow_exp is now input to the region node
region_node->set_req(2, control());
// the result from finish_pow_exp is now input to the phi node
phi_node->init_req(2, result);
set_control(_gvn.transform(region_node));
record_for_igvn(region_node);
set_result(_gvn.transform(phi_node));
C->set_has_split_ifs(true); // Has chance for split-if optimization
return true;
}
//------------------------------runtime_math-----------------------------
bool LibraryCallKit::runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName) {
assert(call_type == OptoRuntime::Math_DD_D_Type() || call_type == OptoRuntime::Math_D_D_Type(),
@@ -2005,8 +1766,10 @@ bool LibraryCallKit::inline_math_native(vmIntrinsics::ID id) {
return StubRoutines::dexp() != NULL ?
runtime_math(OptoRuntime::Math_D_D_Type(), StubRoutines::dexp(), "dexp") :
runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dexp), "EXP");
case vmIntrinsics::_dpow: return Matcher::has_match_rule(Op_PowD) ? inline_pow() :
runtime_math(OptoRuntime::Math_DD_D_Type(), FN_PTR(SharedRuntime::dpow), "POW");
case vmIntrinsics::_dpow:
return StubRoutines::dpow() != NULL ?
runtime_math(OptoRuntime::Math_DD_D_Type(), StubRoutines::dpow(), "dpow") :
runtime_math(OptoRuntime::Math_DD_D_Type(), FN_PTR(SharedRuntime::dpow), "POW");
#undef FN_PTR
// These intrinsics are not yet correctly implemented

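To keep the removed logic legible, here is a hedged plain-Java rendering of the inline_pow() pseudocode above, including the huge-y parity corner case the comments call out; rawPow stands in for the SharedRuntime::dpow leaf call, and none of this is the committed code:

    final class InlinePowSketch {
        // Plain-Java rendering of the removed inline_pow() pseudocode.
        static double powSketch(double x, double y) {
            if (y == 2.0) {
                return x * x;                    // special case kept by the new code
            }
            if (x > 0.0) {
                return rawPow(x, y);             // fast path for a positive base
            }
            long longy = (long) y;
            if ((double) longy != y) {
                return Double.NaN;               // negative base, non-integer y
            }
            if (y + 1 == y) {
                // Huge y: (long)y may be the "integer indefinite" value, but
                // every such huge y is even, so force the parity to even.
                longy = 0;
            }
            double r = rawPow(Math.abs(x), y);
            return ((longy & 1L) == 0) ? r : -r; // odd integer y flips the sign
        }
        // Stand-in for the runtime pow leaf routine.
        static double rawPow(double x, double y) { return Math.pow(x, y); }
    }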

@@ -1519,17 +1519,3 @@ const Type *Log10DNode::Value( PhaseTransform *phase ) const {
return TypeD::make( StubRoutines::intrinsic_log10( d ) );
}
//=============================================================================
//------------------------------Value------------------------------------------
// Compute pow
const Type *PowDNode::Value( PhaseTransform *phase ) const {
const Type *t1 = phase->type( in(1) );
if( t1 == Type::TOP ) return Type::TOP;
if( t1->base() != Type::DoubleCon ) return Type::DOUBLE;
const Type *t2 = phase->type( in(2) );
if( t2 == Type::TOP ) return Type::TOP;
if( t2->base() != Type::DoubleCon ) return Type::DOUBLE;
double d1 = t1->getd();
double d2 = t2->getd();
return TypeD::make( StubRoutines::intrinsic_pow( d1, d2 ) );
}


@@ -491,20 +491,6 @@ public:
virtual const Type *Value( PhaseTransform *phase ) const;
};
//------------------------------PowDNode---------------------------------------
// Raise a double to a double power
class PowDNode : public Node {
public:
PowDNode(Compile* C, Node *c, Node *in1, Node *in2 ) : Node(c, in1, in2) {
init_flags(Flag_is_expensive);
C->add_expensive_node(this);
}
virtual int Opcode() const;
const Type *bottom_type() const { return Type::DOUBLE; }
virtual uint ideal_reg() const { return Op_RegD; }
virtual const Type *Value( PhaseTransform *phase ) const;
};
//-------------------------------ReverseBytesINode--------------------------------
// reverse bytes of an integer
class ReverseBytesINode : public Node {


@@ -153,9 +153,9 @@ address StubRoutines::_vectorizedMismatch = NULL;
address StubRoutines::_dexp = NULL;
address StubRoutines::_dlog = NULL;
address StubRoutines::_dpow = NULL;
double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
double (* StubRoutines::_intrinsic_pow )(double, double) = NULL;
double (* StubRoutines::_intrinsic_sin )(double) = NULL;
double (* StubRoutines::_intrinsic_cos )(double) = NULL;
double (* StubRoutines::_intrinsic_tan )(double) = NULL;


@@ -212,6 +212,7 @@ class StubRoutines: AllStatic {
static address _dexp;
static address _dlog;
static address _dpow;
// These are versions of the java.lang.Math methods which perform
// the same operations as the intrinsic version. They are used for
@@ -384,6 +385,7 @@ class StubRoutines: AllStatic {
static address dexp() { return _dexp; }
static address dlog() { return _dlog; }
static address dpow() { return _dpow; }
static address select_fill_function(BasicType t, bool aligned, const char* &name);


@@ -860,6 +860,7 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
static_field(StubRoutines, _mulAdd, address) \
static_field(StubRoutines, _dexp, address) \
static_field(StubRoutines, _dlog, address) \
static_field(StubRoutines, _dpow, address) \
static_field(StubRoutines, _vectorizedMismatch, address) \
static_field(StubRoutines, _jbyte_arraycopy, address) \
static_field(StubRoutines, _jshort_arraycopy, address) \
@@ -2058,7 +2059,6 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
declare_c2_type(AtanDNode, Node) \
declare_c2_type(SqrtDNode, Node) \
declare_c2_type(Log10DNode, Node) \
declare_c2_type(PowDNode, Node) \
declare_c2_type(ReverseBytesINode, Node) \
declare_c2_type(ReverseBytesLNode, Node) \
declare_c2_type(ReductionNode, Node) \