Merge

2015-06-11 12:02:12 -07:00 · 2015-06-11 12:02:12 -07:00 · 95736933bd
commit 95736933bd
parent f8cc15a29b 513d9a5ede
22 changed files with 1303 additions and 10 deletions
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
@ -2813,6 +2813,13 @@ void Assembler::orl(Register dst, Register src) {
  emit_arith(0x0B, 0xC0, dst, src);
 }
 void Assembler::orl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_int8(0x09);
  emit_operand(src, dst);
 }
 void Assembler::packuswb(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
@ -6907,6 +6914,19 @@ void Assembler::rclq(Register dst, int imm8) {
  }
 }
 void Assembler::rcrq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_int8((unsigned char)0xD1);
    emit_int8((unsigned char)(0xD8 | encode));
  } else {
    emit_int8((unsigned char)0xC1);
    emit_int8((unsigned char)(0xD8 | encode));
    emit_int8(imm8);
  }
 }
 void Assembler::rorq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp
@ -1594,6 +1594,7 @@ private:
  void orl(Register dst, int32_t imm32);
  void orl(Register dst, Address src);
  void orl(Register dst, Register src);
  void orl(Address dst, Register src);
  void orq(Address dst, int32_t imm32);
  void orq(Register dst, int32_t imm32);
@ -1694,6 +1695,8 @@ private:
  void rclq(Register dst, int imm8);
  void rcrq(Register dst, int imm8);
  void rdtsc();
  void ret(int imm16);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
@ -7750,6 +7750,503 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Regi
  pop(tmp2);
  pop(tmp1);
 }
 //Helper functions for square_to_len()
 /**
 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
 * Preserves x and z and modifies rest of the registers.
 */
 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //     huge_128 product = x[j:j+1] * x[j:j+1];
  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //     z[i+2:i+3] = (jlong)(product >>> 1);
  //     carry = (jlong)product;
  // }
  xorq(tmp5, tmp5);     // carry
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z
  Label L_first_loop, L_first_loop_exit;
  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even
  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
  imulq(raxReg, raxReg);
  shrq(raxReg, 1);
  adcq(tmp5, 0);
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);
  // Square and  right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);
  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
  rorq(raxReg, 32);    // convert big-endian to little-endian
  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
  // Right shift by 1 and save carry
  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);
  rcrq(raxReg, 1);
  adcq(tmp5, 0);
  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);
  bind(L_first_loop_exit);
 }
 /**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
  addq(sum, carry);
  adcq(tmp2, 0);
  addq(sum, op1);
  adcq(tmp2, 0);
  movq(carry, tmp2);
 }
 /**
 * Perform the following multiply add operation:
 * carry:sum = sum + op1*op2 + carry
 * Preserves op1, op2 and modifies rest of registers
 */
 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  // rdx:rax = op1 * op2
  movq(raxReg, op2);
  mulq(op1);
  //  rdx:rax = sum + carry + rdx:rax
  addq(sum, carry);
  adcq(rdxReg, 0);
  addq(sum, raxReg);
  adcq(rdxReg, 0);
  // carry:sum = rdx:sum
  movq(carry, rdxReg);
 }
 /**
 * Add 64 bit long carry into z[] with carry propogation.
 * Preserves z and carry register values and modifies rest of registers.
 *
 */
 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;
  movl(tmp1, 1);
  subl(zlen, 2);
  addq(Address(z, zlen, Address::times_4, 0), carry);
  bind(L_fourth_loop);
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
 }
 /**
 * Shift z[] left by 1 bit.
 * Preserves x, len, z and zlen registers and modifies rest of the registers.
 *
 */
 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
  Label L_fifth_loop, L_fifth_loop_exit;
  // Fifth loop
  // Perform primitiveLeftShift(z, zlen, 1)
  const Register prev_carry = tmp1;
  const Register new_carry = tmp4;
  const Register value = tmp2;
  const Register zidx = tmp3;
  // int zidx, carry;
  // long value;
  // carry = 0;
  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
  //    (carry:value)  = (z[i] << 1) | carry ;
  //    z[i] = value;
  // }
  movl(zidx, zlen);
  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
  bind(L_fifth_loop);
  decl(zidx);  // Use decl to preserve carry flag
  decl(zidx);
  jccb(Assembler::negative, L_fifth_loop_exit);
  if (UseBMI2Instructions) {
     movq(value, Address(z, zidx, Address::times_4, 0));
     rclq(value, 1);
     rorxq(value, value, 32);
     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
  }
  else {
    // clear new_carry
    xorl(new_carry, new_carry);
    // Shift z[i] by 1, or in previous carry and save new carry
    movq(value, Address(z, zidx, Address::times_4, 0));
    shlq(value, 1);
    adcl(new_carry, 0);
    orq(value, prev_carry);
    rorq(value, 0x20);
    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
    // Set previous carry = new carry
    movl(prev_carry, new_carry);
  }
  jmp(L_fifth_loop);
  bind(L_fifth_loop_exit);
 }
 /**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);
  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen +=2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //       k -= 2;
  //       long op1 = x[j:j+1];
  //       long sum = z[k:k+1];
  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //       z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }
  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;
  push(zlen);
  push(len);
  addl(zlen,2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);
  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    op2 = rdxReg;
    movq(op2, Address(x, len, Address::times_4,  0));
    rorxq(op2, op2, 32);
  }
  else {
    movq(op2, Address(x, len, Address::times_4,  0));
    rorq(op2, 32);
  }
  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);
  movq(op1, Address(x, len, Address::times_4,  0));
  rorq(op1, 32);
  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4,  0));
  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  movq(Address(z, zlen, Address::times_4, 0), sum);
  jmp(L_third_loop);
  bind(L_third_loop_exit);
  // Fourth loop
  // Add 64 bit long carry into z with carry propogation.
  // Uses offsetted zlen.
  add_one_64(z, zlen, carry, tmp1);
  pop(len);
  pop(zlen);
  jmp(L_second_loop);
  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));
  jmp(L_multiply);
  bind(L_second_loop_exit);
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);
  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
  // z[zlen-1] |= x[len-1] & 1;
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4,  -4), tmp3);
  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
 }
 /**
 * Helper function for mul_add()
 * Multiply the in[] by int k and add to out[] starting at offset offs using
 * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only quad int aligned length of in[] is operated on in this function.
 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
 * This function preserves out, in and k registers.
 * len and offset point to the appropriate index in "in" & "out" correspondingly
 * tmp5 has the carry.
 * other registers are temporary and are modified.
 *
 */
 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  Label L_first_loop, L_first_loop_exit;
  movl(tmp1, len);
  shrl(tmp1, 2);
  bind(L_first_loop);
  subl(tmp1, 1);
  jccb(Assembler::negative, L_first_loop_exit);
  subl(len, 4);
  subl(offset, 4);
  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;
  if (UseBMI2Instructions) {
    op2 = rdxReg;
  }
  movq(op1, Address(in, len, Address::times_4,  8));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4,  8));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4,  8), sum);
  movq(op1, Address(in, len, Address::times_4,  0));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4,  0));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4,  0), sum);
  jmp(L_first_loop);
  bind(L_first_loop_exit);
 }
 /**
 * Code for BigInteger::mulAdd() intrinsic
 *
 * rdi: out
 * rsi: in
 * r11: offs (out.length - offset)
 * rcx: len
 * r8:  k
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 * Multiply the in[] by word k and add to out[], return the carry in rax
 */
 void MacroAssembler::mul_add(Register out, Register in, Register offs,
   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  Label L_carry, L_last_in, L_done;
 // carry = 0;
 // for (int j=len-1; j >= 0; j--) {
 //    long product = (in[j] & LONG_MASK) * kLong +
 //                   (out[offs] & LONG_MASK) + carry;
 //    out[offs--] = (int)product;
 //    carry = product >>> 32;
 // }
 //
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);
  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry =  tmp5;
  if (UseBMI2Instructions) {
    op2 = rdxReg;
    movl(op2, k);
  }
  else {
    movl(op2, k);
  }
  xorq(carry, carry);
  //First loop
  //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
  //The carry is in tmp5
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
  //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
  decrementl(len);
  jccb(Assembler::negative, L_carry);
  decrementl(len);
  jccb(Assembler::negative, L_last_in);
  movq(op1, Address(in, len, Address::times_4,  0));
  rorq(op1, 32);
  subl(offs, 2);
  movq(sum, Address(out, offs, Address::times_4,  0));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offs, Address::times_4,  0), sum);
  testl(len, len);
  jccb(Assembler::zero, L_carry);
  //Multiply the last in[] entry, if any
  bind(L_last_in);
  movl(op1, Address(in, 0));
  movl(sum, Address(out, offs, Address::times_4,  -4));
  movl(raxReg, k);
  mull(op1); //tmp4 * eax -> edx:eax
  addl(sum, carry);
  adcl(rdxReg, 0);
  addl(sum, raxReg);
  adcl(rdxReg, 0);
  movl(carry, rdxReg);
  movl(Address(out, offs, Address::times_4,  -4), sum);
  bind(L_carry);
  //return tmp5/carry as carry in rax
  movl(rax, carry);
  bind(L_done);
  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
 }
 #endif
 /**
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
@ -1241,6 +1241,25 @@ public:
                               Register carry2);
  void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
  void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
                     Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
  void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
                            Register tmp2);
  void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
                       Register rdxReg, Register raxReg);
  void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
  void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
                       Register tmp3, Register tmp4);
  void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
                     Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
  void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
               Register raxReg);
  void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
               Register raxReg);
 #endif
  // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@ -3785,6 +3785,107 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }
 /**
   *  Arguments:
   *
  //  Input:
  //    c_rarg0   - x address
  //    c_rarg1   - x length
  //    c_rarg2   - z address
  //    c_rarg3   - z lenth
   *
   */
  address generate_squareToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "squareToLen");
    address start = __ pc();
    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
    const Register x      = rdi;
    const Register len    = rsi;
    const Register z      = r8;
    const Register zlen   = rcx;
   const Register tmp1      = r12;
   const Register tmp2      = r13;
   const Register tmp3      = r14;
   const Register tmp4      = r15;
   const Register tmp5      = rbx;
    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
       setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
                          // zlen => rcx
                          // r9 and r10 may be used to save non-volatile registers
    __ movptr(r8, rdx);
    __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
    restore_arg_regs();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
   /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - out address
   *    c_rarg1   - in address
   *    c_rarg2   - offset
   *    c_rarg3   - len
   * not Win64
   *    c_rarg4   - k
   * Win64
   *    rsp+40    - k
   */
  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "mulAdd");
    address start = __ pc();
    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    const Register out     = rdi;
    const Register in      = rsi;
    const Register offset  = r11;
    const Register len     = rcx;
    const Register k       = r8;
    // Next registers will be saved on stack in mul_add().
    const Register tmp1  = r12;
    const Register tmp2  = r13;
    const Register tmp3  = r14;
    const Register tmp4  = r15;
    const Register tmp5  = rbx;
    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
                       // len => rcx, k => r8
                       // r9 and r10 may be used to save non-volatile registers
 #ifdef _WIN64
    // last argument is on stack on Win64
    __ movl(k, Address(rsp, 6 * wordSize));
 #endif
    __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
    __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
    restore_arg_regs();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
 #undef __
 #define __ masm->
@ -4030,6 +4131,12 @@ class StubGenerator: public StubCodeGenerator {
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }
 #endif
  }
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp
@ -33,7 +33,7 @@ static bool    returns_to_call_stub(address return_pc)   { return return_pc == _
 enum platform_dependent_constants {
  code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 23000           // simply increase if too small (assembler will crash if too small)
 };
 class x86 {
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
@ -790,6 +790,12 @@ void VM_Version::get_processor_features() {
  if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
    UseMultiplyToLenIntrinsic = true;
  }
  if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
    UseSquareToLenIntrinsic = true;
  }
  if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
    UseMulAddIntrinsic = true;
  }
 #else
  if (UseMultiplyToLenIntrinsic) {
    if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
@ -797,6 +803,18 @@ void VM_Version::get_processor_features() {
    }
    FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
  }
  if (UseSquareToLenIntrinsic) {
    if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
      warning("squareToLen intrinsic is not available in 32-bit VM");
    }
    FLAG_SET_DEFAULT(UseSquareToLenIntrinsic, false);
  }
  if (UseMulAddIntrinsic) {
    if (!FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
      warning("mulAdd intrinsic is not available in 32-bit VM");
    }
    FLAG_SET_DEFAULT(UseMulAddIntrinsic, false);
  }
 #endif
 #endif // COMPILER2
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp
@ -799,6 +799,14 @@
   do_name(     multiplyToLen_name,                             "multiplyToLen")                                        \
   do_signature(multiplyToLen_signature,                        "([II[II[I)[I")                                         \
                                                                                                                        \
  do_intrinsic(_squareToLen, java_math_BigInteger, squareToLen_name, squareToLen_signature, F_S)                        \
   do_name(     squareToLen_name,                             "implSquareToLen")                                        \
   do_signature(squareToLen_signature,                        "([II[II)[I")                                             \
                                                                                                                        \
  do_intrinsic(_mulAdd, java_math_BigInteger, mulAdd_name, mulAdd_signature, F_S)                                       \
   do_name(     mulAdd_name,                                  "implMulAdd")                                             \
   do_signature(mulAdd_signature,                             "([I[IIII)I")                                             \
                                                                                                                        \
  /* java/lang/ref/Reference */                                                                                         \
  do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
                                                                                                                        \
--- a/hotspot/src/share/vm/opto/c2_globals.hpp
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp
@ -665,6 +665,12 @@
  product(bool, UseMultiplyToLenIntrinsic, false,                           \
          "Enables intrinsification of BigInteger.multiplyToLen()")         \
                                                                            \
  product(bool, UseSquareToLenIntrinsic, false,                             \
          "Enables intrinsification of BigInteger.squareToLen()")           \
                                                                            \
  product(bool, UseMulAddIntrinsic, false,                                  \
          "Enables intrinsification of BigInteger.mulAdd()")                \
                                                                            \
  product(bool, UseTypeSpeculation, true,                                   \
          "Speculatively propagate types from profiles")                    \
                                                                            \
--- a/hotspot/src/share/vm/opto/escape.cpp
+++ b/hotspot/src/share/vm/opto/escape.cpp
@ -972,7 +972,9 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                  strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 ||
-                  strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0)
+                  strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0)
                  ))) {
            call->dump();
            fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
--- a/hotspot/src/share/vm/opto/ifnode.cpp
+++ b/hotspot/src/share/vm/opto/ifnode.cpp
@ -817,19 +817,78 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f
  BoolTest::mask hi_test = this_bool->_test._test;
  BoolTest::mask cond = hi_test;
  // convert:
  //
  //          dom_bool = x {<,<=,>,>=} a
  //                           / \
  //     proj = {True,False}  /   \ otherproj = {False,True}
  //                         /
  //        this_bool = x {<,<=} b
  //                       / \
  //  fail = {True,False} /   \ success = {False,True}
  //                     /
  //
  // (Second test guaranteed canonicalized, first one may not have
  // been canonicalized yet)
  //
  // into:
  //
  // cond = (x - lo) {<u,<=u,>u,>=u} adjusted_lim
  //                       / \
  //                 fail /   \ success
  //                     /
  //
  // Figure out which of the two tests sets the upper bound and which
  // sets the lower bound if any.
  Node* adjusted_lim = NULL;
  if (hi_type->_lo > lo_type->_hi && hi_type->_hi == max_jint && lo_type->_lo == min_jint) {
    assert((dom_bool->_test.is_less() && !proj->_con) ||
           (dom_bool->_test.is_greater() && proj->_con), "incorrect test");
    // this test was canonicalized
    assert(this_bool->_test.is_less() && fail->_con, "incorrect test");
    // this_bool = <
    //   dom_bool = >= (proj = True) or dom_bool = < (proj = False)
    //     x in [a, b[ on the fail (= True) projection, b > a-1 (because of hi_type->_lo > lo_type->_hi test above):
    //     lo = a, hi = b, adjusted_lim = b-a, cond = <u
    //   dom_bool = > (proj = True) or dom_bool = <= (proj = False)
    //     x in ]a, b[ on the fail (= True) projection, b > a:
    //     lo = a+1, hi = b, adjusted_lim = b-a-1, cond = <u
    // this_bool = <=
    //   dom_bool = >= (proj = True) or dom_bool = < (proj = False)
    //     x in [a, b] on the fail (= True) projection, b+1 > a-1:
    //     lo = a, hi = b, adjusted_lim = b-a, cond = <=u
    //   dom_bool = > (proj = True) or dom_bool = <= (proj = False)
    //     x in ]a, b] on the fail (= True) projection b+1 > a:
    //     lo = a+1, hi = b, adjusted_lim = b-a, cond = <u
    //     lo = a+1, hi = b, adjusted_lim = b-a-1, cond = <=u doesn't work because a = b is possible, then hi-lo = -1
    if (lo_test == BoolTest::gt || lo_test == BoolTest::le) {
      if (hi_test == BoolTest::le) {
        adjusted_lim = igvn->transform(new SubINode(hi, lo));
        cond = BoolTest::lt;
      }
      lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
    }
  } else if (lo_type->_lo > hi_type->_hi && lo_type->_hi == max_jint && hi_type->_lo == min_jint) {
    // this_bool = <
    //   dom_bool = < (proj = True) or dom_bool = >= (proj = False)
    //     x in [b, a[ on the fail (= False) projection, a > b-1 (because of lo_type->_lo > hi_type->_hi above):
    //     lo = b, hi = a, adjusted_lim = a-b, cond = >=u
    //   dom_bool = <= (proj = True) or dom_bool = > (proj = False)
    //     x in [b, a] on the fail (= False) projection, a+1 > b-1:
    //     lo = b, hi = a, adjusted_lim = a-b, cond = >u
    // this_bool = <=
    //   dom_bool = < (proj = True) or dom_bool = >= (proj = False)
    //     x in ]b, a[ on the fail (= False) projection, a > b:
    //     lo = b+1, hi = a, adjusted_lim = a-b-1, cond = >=u
    //   dom_bool = <= (proj = True) or dom_bool = > (proj = False)
    //     x in ]b, a] on the fail (= False) projection, a+1 > b:
    //     lo = b+1, hi = a, adjusted_lim = a-b, cond = >=u
    //     lo = b+1, hi = a, adjusted_lim = a-b-1, cond = >u doesn't work because a = b is possible, then hi-lo = -1
    swap(lo, hi);
    swap(lo_type, hi_type);
    swap(lo_test, hi_test);
@ -842,6 +901,10 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f
    cond = (hi_test == BoolTest::le || hi_test == BoolTest::gt) ? BoolTest::gt : BoolTest::ge;
    if (lo_test == BoolTest::le) {
      if (cond == BoolTest::gt) {
        adjusted_lim = igvn->transform(new SubINode(hi, lo));
        cond = BoolTest::ge;
      }
      lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
    }
@ -860,7 +923,6 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f
        }
      }
    }
    lo = NULL;
    hi = NULL;
  }
@ -868,12 +930,13 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f
  if (lo && hi) {
    // Merge the two compares into a single unsigned compare by building (CmpU (n - lo) (hi - lo))
    Node* adjusted_val = igvn->transform(new SubINode(n,  lo));
-    Node* adjusted_lim = igvn->transform(new SubINode(hi, lo));
+    if (adjusted_lim == NULL) {
      adjusted_lim = igvn->transform(new SubINode(hi, lo));
    }
    Node* newcmp = igvn->transform(new CmpUNode(adjusted_val, adjusted_lim));
    Node* newbool = igvn->transform(new BoolNode(newcmp, cond));
-    igvn->is_IterGVN()->replace_input_of(dom_iff, 1, igvn->intcon(proj->_con));
+    igvn->replace_input_of(dom_iff, 1, igvn->intcon(proj->_con));
    igvn->hash_delete(this);
    set_req(1, newbool);
    return true;
--- a/hotspot/src/share/vm/opto/library_call.cpp
+++ b/hotspot/src/share/vm/opto/library_call.cpp
@ -291,6 +291,8 @@ class LibraryCallKit : public GraphKit {
  bool inline_updateBytesCRC32();
  bool inline_updateByteBufferCRC32();
  bool inline_multiplyToLen();
  bool inline_squareToLen();
  bool inline_mulAdd();
  bool inline_profileBoolean();
  bool inline_isCompileConstant();
@ -494,6 +496,14 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
    if (!UseMultiplyToLenIntrinsic) return NULL;
    break;
  case vmIntrinsics::_squareToLen:
    if (!UseSquareToLenIntrinsic) return NULL;
    break;
  case vmIntrinsics::_mulAdd:
    if (!UseMulAddIntrinsic) return NULL;
    break;
  case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
  case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
    if (!UseAESIntrinsics) return NULL;
@ -913,6 +923,12 @@ bool LibraryCallKit::try_to_inline(int predicate) {
  case vmIntrinsics::_multiplyToLen:
    return inline_multiplyToLen();
  case vmIntrinsics::_squareToLen:
    return inline_squareToLen();
  case vmIntrinsics::_mulAdd:
    return inline_mulAdd();
  case vmIntrinsics::_encodeISOArray:
    return inline_encodeISOArray();
@ -5306,6 +5322,100 @@ bool LibraryCallKit::inline_multiplyToLen() {
  return true;
 }
 //-------------inline_squareToLen------------------------------------
 bool LibraryCallKit::inline_squareToLen() {
  assert(UseSquareToLenIntrinsic, "not implementated on this platform");
  address stubAddr = StubRoutines::squareToLen();
  if (stubAddr == NULL) {
    return false; // Intrinsic's stub is not implemented on this platform
  }
  const char* stubName = "squareToLen";
  assert(callee()->signature()->size() == 4, "implSquareToLen has 4 parameters");
  Node* x    = argument(0);
  Node* len  = argument(1);
  Node* z    = argument(2);
  Node* zlen = argument(3);
  const Type* x_type = x->Value(&_gvn);
  const Type* z_type = z->Value(&_gvn);
  const TypeAryPtr* top_x = x_type->isa_aryptr();
  const TypeAryPtr* top_z = z_type->isa_aryptr();
  if (top_x  == NULL || top_x->klass()  == NULL ||
      top_z  == NULL || top_z->klass()  == NULL) {
    // failed array check
    return false;
  }
  BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
  BasicType z_elem = z_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
  if (x_elem != T_INT || z_elem != T_INT) {
    return false;
  }
  Node* x_start = array_element_address(x, intcon(0), x_elem);
  Node* z_start = array_element_address(z, intcon(0), z_elem);
  Node*  call = make_runtime_call(RC_LEAF|RC_NO_FP,
                                  OptoRuntime::squareToLen_Type(),
                                  stubAddr, stubName, TypePtr::BOTTOM,
                                  x_start, len, z_start, zlen);
  set_result(z);
  return true;
 }
 //-------------inline_mulAdd------------------------------------------
 bool LibraryCallKit::inline_mulAdd() {
  assert(UseMulAddIntrinsic, "not implementated on this platform");
  address stubAddr = StubRoutines::mulAdd();
  if (stubAddr == NULL) {
    return false; // Intrinsic's stub is not implemented on this platform
  }
  const char* stubName = "mulAdd";
  assert(callee()->signature()->size() == 5, "mulAdd has 5 parameters");
  Node* out      = argument(0);
  Node* in       = argument(1);
  Node* offset   = argument(2);
  Node* len      = argument(3);
  Node* k        = argument(4);
  const Type* out_type = out->Value(&_gvn);
  const Type* in_type = in->Value(&_gvn);
  const TypeAryPtr* top_out = out_type->isa_aryptr();
  const TypeAryPtr* top_in = in_type->isa_aryptr();
  if (top_out  == NULL || top_out->klass()  == NULL ||
      top_in == NULL || top_in->klass() == NULL) {
    // failed array check
    return false;
  }
  BasicType out_elem = out_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
  BasicType in_elem = in_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
  if (out_elem != T_INT || in_elem != T_INT) {
    return false;
  }
  Node* outlen = load_array_length(out);
  Node* new_offset = _gvn.transform(new SubINode(outlen, offset));
  Node* out_start = array_element_address(out, intcon(0), out_elem);
  Node* in_start = array_element_address(in, intcon(0), in_elem);
  Node*  call = make_runtime_call(RC_LEAF|RC_NO_FP,
                                  OptoRuntime::mulAdd_Type(),
                                  stubAddr, stubName, TypePtr::BOTTOM,
                                  out_start,in_start, new_offset, len, k);
  Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms));
  set_result(result);
  return true;
 }
 /**
 * Calculate CRC32 for byte.
--- a/hotspot/src/share/vm/opto/loopTransform.cpp
+++ b/hotspot/src/share/vm/opto/loopTransform.cpp
@ -475,7 +475,7 @@ void PhaseIdealLoop::do_peeling( IdealLoopTree *loop, Node_List &old_new ) {
  C->set_major_progress();
  // Peeling a 'main' loop in a pre/main/post situation obfuscates the
-  // 'pre' loop from the main and the 'pre' can no longer have it's
+  // 'pre' loop from the main and the 'pre' can no longer have its
  // iterations adjusted.  Therefore, we need to declare this loop as
  // no longer a 'main' loop; it will need new pre and post loops before
  // we can do further RCE.
@ -1911,10 +1911,13 @@ void PhaseIdealLoop::do_range_check( IdealLoopTree *loop, Node_List &old_new ) {
    return;
  assert(opqzm->in(1) == main_limit, "do not understand situation");
-  // Find the pre-loop limit; we will expand it's iterations to
+  // Find the pre-loop limit; we will expand its iterations to
  // not ever trip low tests.
  Node *p_f = iffm->in(0);
-  assert(p_f->Opcode() == Op_IfFalse, "");
+  // pre loop may have been optimized out
  if (p_f->Opcode() != Op_IfFalse) {
    return;
  }
  CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd();
  assert(pre_end->loopnode()->is_pre_loop(), "");
  Node *pre_opaq1 = pre_end->limit();
@ -2215,6 +2218,56 @@ void IdealLoopTree::adjust_loop_exit_prob( PhaseIdealLoop *phase ) {
  }
 }
 #ifdef ASSERT
 static CountedLoopNode* locate_pre_from_main(CountedLoopNode *cl) {
  Node *ctrl  = cl->in(LoopNode::EntryControl);
  assert(ctrl->Opcode() == Op_IfTrue || ctrl->Opcode() == Op_IfFalse, "");
  Node *iffm = ctrl->in(0);
  assert(iffm->Opcode() == Op_If, "");
  Node *p_f = iffm->in(0);
  assert(p_f->Opcode() == Op_IfFalse, "");
  CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd();
  assert(pre_end->loopnode()->is_pre_loop(), "");
  return pre_end->loopnode();
 }
 #endif
 // Remove the main and post loops and make the pre loop execute all
 // iterations. Useful when the pre loop is found empty.
 void IdealLoopTree::remove_main_post_loops(CountedLoopNode *cl, PhaseIdealLoop *phase) {
  CountedLoopEndNode* pre_end = cl->loopexit();
  Node* pre_cmp = pre_end->cmp_node();
  if (pre_cmp->in(2)->Opcode() != Op_Opaque1) {
    // Only safe to remove the main loop if the compiler optimized it
    // out based on an unknown number of iterations
    return;
  }
  // Can we find the main loop?
  if (_next == NULL) {
    return;
  }
  Node* next_head = _next->_head;
  if (!next_head->is_CountedLoop()) {
    return;
  }
  CountedLoopNode* main_head = next_head->as_CountedLoop();
  if (!main_head->is_main_loop()) {
    return;
  }
  assert(locate_pre_from_main(main_head) == cl, "bad main loop");
  Node* main_iff = main_head->in(LoopNode::EntryControl)->in(0);
  // Remove the Opaque1Node of the pre loop and make it execute all iterations
  phase->_igvn.replace_input_of(pre_cmp, 2, pre_cmp->in(2)->in(2));
  // Remove the Opaque1Node of the main loop so it can be optimized out
  Node* main_cmp = main_iff->in(1)->in(1);
  assert(main_cmp->in(2)->Opcode() == Op_Opaque1, "main loop has no opaque node?");
  phase->_igvn.replace_input_of(main_cmp, 2, main_cmp->in(2)->in(1));
 }
 //------------------------------policy_do_remove_empty_loop--------------------
 // Micro-benchmark spamming.  Policy is to always remove empty loops.
@ -2233,6 +2286,12 @@ bool IdealLoopTree::policy_do_remove_empty_loop( PhaseIdealLoop *phase ) {
  if (!phase->is_member(this, phase->get_ctrl(cl->loopexit()->in(CountedLoopEndNode::TestValue))))
    return false;             // Infinite loop
  if (cl->is_pre_loop()) {
    // If the loop we are removing is a pre-loop then the main and
    // post loop can be removed as well
    remove_main_post_loops(cl, phase);
  }
 #ifdef ASSERT
  // Ensure only one phi which is the iv.
  Node* iv = NULL;
--- a/hotspot/src/share/vm/opto/loopnode.hpp
+++ b/hotspot/src/share/vm/opto/loopnode.hpp
@ -485,6 +485,8 @@ public:
  bool is_inner()   { return is_loop() && _child == NULL; }
  bool is_counted() { return is_loop() && _head != NULL && _head->is_CountedLoop(); }
  void remove_main_post_loops(CountedLoopNode *cl, PhaseIdealLoop *phase);
 #ifndef PRODUCT
  void dump_head( ) const;      // Dump loop head only
  void dump() const;            // Dump this loop recursively
--- a/hotspot/src/share/vm/opto/runtime.cpp
+++ b/hotspot/src/share/vm/opto/runtime.cpp
@ -945,6 +945,48 @@ const TypeFunc* OptoRuntime::multiplyToLen_Type() {
  return TypeFunc::make(domain, range);
 }
 const TypeFunc* OptoRuntime::squareToLen_Type() {
  // create input type (domain)
  int num_args      = 4;
  int argcnt = num_args;
  const Type** fields = TypeTuple::fields(argcnt);
  int argp = TypeFunc::Parms;
  fields[argp++] = TypePtr::NOTNULL;    // x
  fields[argp++] = TypeInt::INT;        // len
  fields[argp++] = TypePtr::NOTNULL;    // z
  fields[argp++] = TypeInt::INT;        // zlen
  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
  // no result type needed
  fields = TypeTuple::fields(1);
  fields[TypeFunc::Parms+0] = NULL;
  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
  return TypeFunc::make(domain, range);
 }
 // for mulAdd calls, 2 pointers and 3 ints, returning int
 const TypeFunc* OptoRuntime::mulAdd_Type() {
  // create input type (domain)
  int num_args      = 5;
  int argcnt = num_args;
  const Type** fields = TypeTuple::fields(argcnt);
  int argp = TypeFunc::Parms;
  fields[argp++] = TypePtr::NOTNULL;    // out
  fields[argp++] = TypePtr::NOTNULL;    // in
  fields[argp++] = TypeInt::INT;        // offset
  fields[argp++] = TypeInt::INT;        // len
  fields[argp++] = TypeInt::INT;        // k
  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
  // returning carry (int)
  fields = TypeTuple::fields(1);
  fields[TypeFunc::Parms+0] = TypeInt::INT;
  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields);
  return TypeFunc::make(domain, range);
 }
 //------------- Interpreter state access for on stack replacement
--- a/hotspot/src/share/vm/opto/runtime.hpp
+++ b/hotspot/src/share/vm/opto/runtime.hpp
@ -312,6 +312,10 @@ private:
  static const TypeFunc* multiplyToLen_Type();
  static const TypeFunc* squareToLen_Type();
  static const TypeFunc* mulAdd_Type();
  static const TypeFunc* updateBytesCRC32_Type();
  // leaf on stack replacement interpreter accessor types
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp
@ -137,6 +137,8 @@ address StubRoutines::_updateBytesCRC32 = NULL;
 address StubRoutines::_crc_table_adr = NULL;
 address StubRoutines::_multiplyToLen = NULL;
 address StubRoutines::_squareToLen = NULL;
 address StubRoutines::_mulAdd = NULL;
 double (* StubRoutines::_intrinsic_log   )(double) = NULL;
 double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp
@ -197,6 +197,8 @@ class StubRoutines: AllStatic {
  static address _crc_table_adr;
  static address _multiplyToLen;
  static address _squareToLen;
  static address _mulAdd;
  // These are versions of the java.lang.Math methods which perform
  // the same operations as the intrinsic version.  They are used for
@ -356,6 +358,8 @@ class StubRoutines: AllStatic {
  static address crc_table_addr()      { return _crc_table_adr; }
  static address multiplyToLen()       {return _multiplyToLen; }
  static address squareToLen()         {return _squareToLen; }
  static address mulAdd()              {return _mulAdd; }
  static address select_fill_function(BasicType t, bool aligned, const char* &name);
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp
@ -831,6 +831,8 @@ typedef CompactHashtable<Symbol*, char>       SymbolCompactHashTable;
     static_field(StubRoutines,                _updateBytesCRC32,                             address)                               \
     static_field(StubRoutines,                _crc_table_adr,                                address)                               \
     static_field(StubRoutines,                _multiplyToLen,                                address)                               \
     static_field(StubRoutines,                _squareToLen,                                  address)                               \
     static_field(StubRoutines,                _mulAdd,                                       address)                               \
                                                                                                                                     \
  /*****************/                                                                                                                \
  /* SharedRuntime */                                                                                                                \
--- a/hotspot/test/compiler/intrinsics/muladd/TestMulAdd.java
+++ b/hotspot/test/compiler/intrinsics/muladd/TestMulAdd.java
@ -0,0 +1,117 @@
 /*
 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
 /**
 * @test
 * @bug 8081778
 * @summary Add C2 x86 intrinsic for BigInteger::mulAdd() method
 *
 * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
 *      -XX:+IgnoreUnrecognizedVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic
 *      -XX:CompileCommand=dontinline,TestMulAdd::main
 *      -XX:CompileCommand=option,TestMulAdd::base_multiply,ccstr,DisableIntrinsic,_mulAdd
 *      -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_mulAdd
 *      -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_mulAdd
 *      -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_mulAdd
 *      -XX:CompileCommand=option,java.math.BigInteger::mulAdd,ccstr,DisableIntrinsic,_mulAdd
 *      -XX:CompileCommand=inline,java.math.BigInteger::multiply
 *      -XX:CompileCommand=inline,java.math.BigInteger::square
 *      -XX:CompileCommand=inline,java.math.BigInteger::squareToLen
 *      -XX:CompileCommand=inline,java.math.BigInteger::mulAdd TestMulAdd
 */
 import java.util.Random;
 import java.math.*;
 public class TestMulAdd {
    // Avoid intrinsic by preventing inlining multiply() and mulAdd().
    public static BigInteger base_multiply(BigInteger op1) {
      return op1.multiply(op1);
    }
    // Generate mulAdd() intrinsic by inlining multiply().
    public static BigInteger new_multiply(BigInteger op1) {
      return op1.multiply(op1);
    }
    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
      byte[] data1 = b1.toByteArray();
      byte[] data2 = b2.toByteArray();
      if (data1.length != data2.length)
        return false;
      for (int i = 0; i < data1.length; i++) {
        if (data1[i] != data2[i])
          return false;
      }
      return true;
    }
    public static String stringify(BigInteger b) {
      String strout= "";
      byte [] data = b.toByteArray();
      for (int i = 0; i < data.length; i++) {
        strout += (String.format("%02x",data[i]) + " ");
      }
      return strout;
    }
    public static void main(String args[]) throws Exception {
      BigInteger oldsum = new BigInteger("0");
      BigInteger newsum = new BigInteger("0");
      BigInteger b1, b2, oldres, newres;
      Random rand = new Random();
      long seed = System.nanoTime();
      Random rand1 = new Random();
      long seed1 = System.nanoTime();
      rand.setSeed(seed);
      rand1.setSeed(seed1);
      for (int j = 0; j < 100000; j++) {
        int rand_int = rand1.nextInt(3136)+32;
        b1 = new BigInteger(rand_int, rand);
        oldres = base_multiply(b1);
        newres = new_multiply(b1);
        oldsum = oldsum.add(oldres);
        newsum = newsum.add(newres);
        if (!bytecompare(oldres,newres)) {
          System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
          System.out.println(b1);
          throw new Exception("Failed");
        }
      }
      if (!bytecompare(oldsum,newsum))  {
        System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
        throw new Exception("Failed");
      } else {
        System.out.println("Success");
      }
   }
 }
--- a/hotspot/test/compiler/intrinsics/squaretolen/TestSquareToLen.java
+++ b/hotspot/test/compiler/intrinsics/squaretolen/TestSquareToLen.java
@ -0,0 +1,114 @@
 /*
 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
 /**
 * @test
 * @bug 8081778
 * @summary Add C2 x86 intrinsic for BigInteger::squareToLen() method
 *
 * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
 *      -XX:CompileCommand=exclude,TestSquareToLen::main
 *      -XX:CompileCommand=option,TestSquareToLen::base_multiply,ccstr,DisableIntrinsic,_squareToLen
 *      -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_squareToLen
 *      -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_squareToLen
 *      -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_squareToLen
 *      -XX:CompileCommand=inline,java.math.BigInteger::multiply
 *      -XX:CompileCommand=inline,java.math.BigInteger::square
 *      -XX:CompileCommand=inline,java.math.BigInteger::squareToLen TestSquareToLen
 */
 import java.util.Random;
 import java.math.*;
 public class TestSquareToLen {
    // Avoid intrinsic by preventing inlining multiply() and squareToLen().
    public static BigInteger base_multiply(BigInteger op1) {
      return op1.multiply(op1);
    }
    // Generate squareToLen() intrinsic by inlining multiply().
    public static BigInteger new_multiply(BigInteger op1) {
      return op1.multiply(op1);
    }
    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
      byte[] data1 = b1.toByteArray();
      byte[] data2 = b2.toByteArray();
      if (data1.length != data2.length)
        return false;
      for (int i = 0; i < data1.length; i++) {
        if (data1[i] != data2[i])
          return false;
      }
      return true;
    }
    public static String stringify(BigInteger b) {
      String strout= "";
      byte [] data = b.toByteArray();
      for (int i = 0; i < data.length; i++) {
        strout += (String.format("%02x",data[i]) + " ");
      }
      return strout;
    }
    public static void main(String args[]) throws Exception {
      BigInteger oldsum = new BigInteger("0");
      BigInteger newsum = new BigInteger("0");
      BigInteger b1, b2, oldres, newres;
      Random rand = new Random();
      long seed = System.nanoTime();
      Random rand1 = new Random();
      long seed1 = System.nanoTime();
      rand.setSeed(seed);
      rand1.setSeed(seed1);
      for (int j = 0; j < 100000; j++) {
        int rand_int = rand1.nextInt(3136)+32;
        b1 = new BigInteger(rand_int, rand);
        oldres = base_multiply(b1);
        newres = new_multiply(b1);
        oldsum = oldsum.add(oldres);
        newsum = newsum.add(newres);
        if (!bytecompare(oldres,newres)) {
          System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
          System.out.println(b1);
          throw new Exception("Failed");
        }
      }
      if (!bytecompare(oldsum,newsum))  {
        System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
        throw new Exception("Failed");
      } else {
        System.out.println("Success");
      }
   }
 }
--- a/hotspot/test/compiler/rangechecks/TestBadFoldCompare.java
+++ b/hotspot/test/compiler/rangechecks/TestBadFoldCompare.java
@ -0,0 +1,94 @@
 /*
 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
 /*
 * @test
 * @bug 8085832
 * @summary x <= 0 || x > 0 wrongly folded as (x-1) >u -1
 * @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestBadFoldCompare
 */
 public class TestBadFoldCompare {
    static boolean test1_taken;
    static void helper1(int i, int a, int b, boolean flag) {
        if (flag) {
            if (i <= a || i > b) {
                test1_taken = true;
            }
        }
    }
    static void test1(int i, boolean flag) {
        helper1(i, 0, 0, flag);
    }
    static boolean test2_taken;
    static void helper2(int i, int a, int b, boolean flag) {
        if (flag) {
            if (i > b || i <= a) {
                test2_taken = true;
            }
        }
    }
    static void test2(int i, boolean flag) {
        helper2(i, 0, 0, flag);
    }
    static public void main(String[] args) {
        boolean success = true;
        for (int i = 0; i < 20000; i++) {
            helper1(5, 0,  10, (i%2)==0);
            helper1(-1, 0,  10, (i%2)==0);
            helper1(15, 0,  10, (i%2)==0);
            test1(0, false);
        }
        test1_taken = false;
        test1(0, true);
        if (!test1_taken) {
            System.out.println("Test1 failed");
            success = false;
        }
        for (int i = 0; i < 20000; i++) {
            helper2(5, 0,  10, (i%2)==0);
            helper2(-1, 0,  10, (i%2)==0);
            helper2(15, 0,  10, (i%2)==0);
            test2(0, false);
        }
        test2_taken = false;
        test2(0, true);
        if (!test2_taken) {
            System.out.println("Test2 failed");
            success = false;
        }
        if (!success) {
            throw new RuntimeException("Some tests failed");
        }
    }
 }