Merge

2012-10-26 12:06:55 -07:00 · 2012-10-26 12:06:55 -07:00 · b8abfa016c
commit b8abfa016c
parent ebba61f003 fc84b11227
42 changed files with 4709 additions and 69 deletions
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
@ -1007,6 +1007,67 @@ void Assembler::addss(XMMRegister dst, Address src) {
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }

+void Assembler::aesdec(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_aes(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xde);
+  emit_operand(dst, src);
+}
+
+void Assembler::aesdec(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_aes(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xde);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::aesdeclast(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_aes(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdf);
+  emit_operand(dst, src);
+}
+
+void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_aes(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdf);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::aesenc(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_aes(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdc);
+  emit_operand(dst, src);
+}
+
+void Assembler::aesenc(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_aes(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdc);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::aesenclast(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_aes(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdd);
+  emit_operand(dst, src);
+}
+
+void Assembler::aesenclast(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_aes(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0xdd);
+  emit_byte(0xC0 | encode);
+}
+
+
 void Assembler::andl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
@ -2307,6 +2368,22 @@ void Assembler::prefix(Prefix p) {
  a_byte(p);
 }

+void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_ssse3(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x00);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::pshufb(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_ssse3(), "");
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x00);
+  emit_operand(dst, src);
+}
+
 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@ -8067,6 +8144,15 @@ void MacroAssembler::movptr(Address dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
 }

+void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::movdqu(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::movdqu(dst, Address(rscratch1, 0));
+  }
+}
+
 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::movsd(dst, as_Address(src));
@ -8357,6 +8443,17 @@ void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
  }
 }

+void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
+  // Used in sign-bit flipping with aligned address.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  if (reachable(src)) {
+    Assembler::pshufb(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::pshufb(dst, Address(rscratch1, 0));
+  }
+}
+
 // AVX 3-operands instructions

 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp
@ -875,6 +875,17 @@ private:
  void addss(XMMRegister dst, Address src);
  void addss(XMMRegister dst, XMMRegister src);

+  // AES instructions
+  void aesdec(XMMRegister dst, Address src);
+  void aesdec(XMMRegister dst, XMMRegister src);
+  void aesdeclast(XMMRegister dst, Address src);
+  void aesdeclast(XMMRegister dst, XMMRegister src);
+  void aesenc(XMMRegister dst, Address src);
+  void aesenc(XMMRegister dst, XMMRegister src);
+  void aesenclast(XMMRegister dst, Address src);
+  void aesenclast(XMMRegister dst, XMMRegister src);
+
+
  void andl(Address  dst, int32_t imm32);
  void andl(Register dst, int32_t imm32);
  void andl(Register dst, Address src);
@ -1424,6 +1435,10 @@ private:
  void prefetcht2(Address src);
  void prefetchw(Address src);

+  // Shuffle Bytes
+  void pshufb(XMMRegister dst, XMMRegister src);
+  void pshufb(XMMRegister dst, Address src);
+
  // Shuffle Packed Doublewords
  void pshufd(XMMRegister dst, XMMRegister src, int mode);
  void pshufd(XMMRegister dst, Address src,     int mode);
@ -2611,6 +2626,12 @@ public:
  void divss(XMMRegister dst, Address src)        { Assembler::divss(dst, src); }
  void divss(XMMRegister dst, AddressLiteral src);

+  // Move Unaligned Double Quadword
+  void movdqu(Address     dst, XMMRegister src)   { Assembler::movdqu(dst, src); }
+  void movdqu(XMMRegister dst, Address src)       { Assembler::movdqu(dst, src); }
+  void movdqu(XMMRegister dst, XMMRegister src)   { Assembler::movdqu(dst, src); }
+  void movdqu(XMMRegister dst, AddressLiteral src);
+
  void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); }
  void movsd(Address dst, XMMRegister src)     { Assembler::movsd(dst, src); }
  void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
@ -2658,6 +2679,10 @@ public:
  void xorps(XMMRegister dst, Address src)     { Assembler::xorps(dst, src); }
  void xorps(XMMRegister dst, AddressLiteral src);

+  // Shuffle Bytes
+  void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); }
+  void pshufb(XMMRegister dst, Address src)     { Assembler::pshufb(dst, src); }
+  void pshufb(XMMRegister dst, AddressLiteral src);
  // AVX 3-operands instructions

  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); }
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
@ -2137,6 +2137,529 @@ class StubGenerator: public StubCodeGenerator {
    }
  }

+  // AES intrinsic stubs
+  enum {AESBlockSize = 16};
+
+  address generate_key_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
+    address start = __ pc();
+    __ emit_data(0x00010203, relocInfo::none, 0 );
+    __ emit_data(0x04050607, relocInfo::none, 0 );
+    __ emit_data(0x08090a0b, relocInfo::none, 0 );
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
+    return start;
+  }
+
+  // Utility routine for loading a 128-bit key word in little endian format
+  // can optionally specify that the shuffle mask is already in an xmmregister
+  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    __ movdqu(xmmdst, Address(key, offset));
+    if (xmm_shuf_mask != NULL) {
+      __ pshufb(xmmdst, xmm_shuf_mask);
+    } else {
+      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    }
+  }
+
+  // aesenc using specified key+offset
+  // can optionally specify that the shuffle mask is already in an xmmregister
+  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesenc(xmmdst, xmmtmp);
+  }
+
+  // aesdec using specified key+offset
+  // can optionally specify that the shuffle mask is already in an xmmregister
+  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesdec(xmmdst, xmmtmp);
+  }
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_encryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register keylen      = rax;
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    const XMMRegister xmm_key_shuf_mask = xmm2;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ push(rsi);
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    __ movptr(key  , key_param);
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
+
+    // For encryption, the java expanded key ordering is just what we need
+
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ pxor(xmm_result, xmm_temp);
+    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+    __ subl(keylen, 2);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+    __ BIND(L_doLast);
+    __ aesenclast(xmm_result, xmm_temp);
+    __ movdqu(Address(to, 0), xmm_result);        // store the result
+    __ xorptr(rax, rax); // return 0
+    __ pop(rsi);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register keylen      = rax;
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    const XMMRegister xmm_key_shuf_mask = xmm2;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ push(rsi);
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    __ movptr(key  , key_param);
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));
+
+    // for decryption java expanded key ordering is rotated one position from what we want
+    // so we start from 0x10 here and hit 0x00 last
+    // we don't know if the key is aligned, hence not using load-execute form
+    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
+    __ pxor  (xmm_result, xmm_temp);
+    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
+      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 192 and 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+    __ subl(keylen, 2);
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+    __ BIND(L_doLast);
+    // for decryption the aesdeclast operation is always on key+0x00
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ aesdeclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, 0), xmm_result);  // store the result
+
+    __ xorptr(rax, rax); // return 0
+    __ pop(rsi);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+  void handleSOERegisters(bool saving) {
+    const int saveFrameSizeInBytes = 4 * wordSize;
+    const Address saved_rbx     (rbp, -3 * wordSize);
+    const Address saved_rsi     (rbp, -2 * wordSize);
+    const Address saved_rdi     (rbp, -1 * wordSize);
+
+    if (saving) {
+      __ subptr(rsp, saveFrameSizeInBytes);
+      __ movptr(saved_rsi, rsi);
+      __ movptr(saved_rdi, rdi);
+      __ movptr(saved_rbx, rbx);
+    } else {
+      // restoring
+      __ movptr(rsi, saved_rsi);
+      __ movptr(rdi, saved_rdi);
+      __ movptr(rbx, saved_rbx);
+    }
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  address generate_cipherBlockChaining_encryptAESCrypt() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register rvec        = rdi;      // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
+    const Register pos         = rax;
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    // first 6 keys preloaded into xmm2-xmm7
+    const int XMM_REG_NUM_KEY_FIRST = 2;
+    const int XMM_REG_NUM_KEY_LAST  = 7;
+    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    handleSOERegisters(true /*saving*/);
+
+    // load registers from incoming parameters
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+    const Address  rvec_param (rbp, 8+12);
+    const Address  len_param  (rbp, 8+16);
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    __ movptr(key  , key_param);
+    __ movptr(rvec , rvec_param);
+    __ movptr(len_reg , len_param);
+
+    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 2 thru 7 with keys 0-5
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+    // 128 bit code follows here
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_128);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0xa0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_128);
+
+    __ BIND(L_exit);
+    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
+
+    handleSOERegisters(false /*restoring*/);
+    __ movl(rax, 0);                             // return 0 (why?)
+    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+  __ BIND(L_key_192_256);
+  // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+  __ align(OptoLoopAlignment);
+  __ BIND(L_loopTop_192);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0xc0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_192);
+    __ jmp(L_exit);
+
+  __ BIND(L_key_256);
+    // 256-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+  __ align(OptoLoopAlignment);
+  __ BIND(L_loopTop_256);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0xe0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
+
+
+  // CBC AES Decryption.
+  // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time.
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+
+  address generate_cipherBlockChaining_decryptAESCrypt() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256;
+    Label L_singleBlock_loopTop_128;
+    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register rvec        = rdi;      // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
+    const Register pos         = rax;
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    // first 6 keys preloaded into xmm2-xmm7
+    const int XMM_REG_NUM_KEY_FIRST = 2;
+    const int XMM_REG_NUM_KEY_LAST  = 7;
+    const int FIRST_NON_REG_KEY_offset = 0x70;
+    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    handleSOERegisters(true /*saving*/);
+
+    // load registers from incoming parameters
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+    const Address  rvec_param (rbp, 8+12);
+    const Address  len_param  (rbp, 8+16);
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    __ movptr(key  , key_param);
+    __ movptr(rvec , rvec_param);
+    __ movptr(len_reg , len_param);
+
+    // the java expanded key ordering is rotated one position from what we want
+    // so we start from 0x10 here and hit 0x00 last
+    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 2 thru 6 with first 5 keys
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    // inside here, use the rvec register to point to previous block cipher
+    // with which we xor at the end of each newly decrypted block
+    const Register  prev_block_cipher_ptr = rvec;
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+
+    // 128-bit code follows here, parallelized
+    __ movptr(pos, 0);
+  __ align(OptoLoopAlignment);
+  __ BIND(L_singleBlock_loopTop_128);
+    __ cmpptr(len_reg, 0);           // any blocks left??
+    __ jcc(Assembler::equal, L_exit);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) {   // 128-bit runs up to key offset a0
+      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
+    __ aesdeclast(xmm_result, xmm_temp);
+    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jmp(L_singleBlock_loopTop_128);
+
+
+    __ BIND(L_exit);
+    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+    __ movptr(rvec , rvec_param);                                     // restore this since used in loop
+    __ movdqu(Address(rvec, 0), xmm_temp);                            // final value of r stored in rvec of CipherBlockChaining object
+    handleSOERegisters(false /*restoring*/);
+    __ movl(rax, 0);                                                  // return 0 (why?)
+    __ leave();                                                       // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be optimized to use parallelism)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_192);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) {   // 192-bit runs up to key offset c0
+      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
+    __ aesdeclast(xmm_result, xmm_temp);
+    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
+    __ jmp(L_exit);
+
+    __ BIND(L_key_256);
+    // 256-bit code follows here (could be optimized to use parallelism)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_256);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) {   // 256-bit runs up to key offset e0
+      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
+    }
+    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
+    __ aesdeclast(xmm_result, xmm_temp);
+    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
+
+
 public:
  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
@ -2332,6 +2855,16 @@ class StubGenerator: public StubCodeGenerator {
    generate_arraycopy_stubs();

    generate_math_stubs();
+
+    // don't bother generating these AES intrinsic stubs unless global flag is set
+    if (UseAESIntrinsics) {
+      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others
+
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
+    }
  }


--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@ -2941,6 +2941,548 @@ class StubGenerator: public StubCodeGenerator {
    }
  }

+  // AES intrinsic stubs
+  enum {AESBlockSize = 16};
+
+  address generate_key_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
+    address start = __ pc();
+    __ emit_data64( 0x0405060700010203, relocInfo::none );
+    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
+    return start;
+  }
+
+  // Utility routine for loading a 128-bit key word in little endian format
+  // can optionally specify that the shuffle mask is already in an xmmregister
+  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    __ movdqu(xmmdst, Address(key, offset));
+    if (xmm_shuf_mask != NULL) {
+      __ pshufb(xmmdst, xmm_shuf_mask);
+    } else {
+      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    }
+  }
+
+  // aesenc using specified key+offset
+  // can optionally specify that the shuffle mask is already in an xmmregister
+  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesenc(xmmdst, xmmtmp);
+  }
+
+  // aesdec using specified key+offset
+  // can optionally specify that the shuffle mask is already in an xmmregister
+  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesdec(xmmdst, xmmtmp);
+  }
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_encryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rax;
+
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    const XMMRegister xmm_key_shuf_mask = xmm2;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
+
+    // For encryption, the java expanded key ordering is just what we need
+    // we don't know if the key is aligned, hence not using load-execute form
+
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ pxor(xmm_result, xmm_temp);
+    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+    __ subl(keylen, 2);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+    __ BIND(L_doLast);
+    __ aesenclast(xmm_result, xmm_temp);
+    __ movdqu(Address(to, 0), xmm_result);        // store the result
+    __ xorptr(rax, rax); // return 0
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rax;
+
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    const XMMRegister xmm_key_shuf_mask = xmm2;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));
+
+    // for decryption java expanded key ordering is rotated one position from what we want
+    // so we start from 0x10 here and hit 0x00 last
+    // we don't know if the key is aligned, hence not using load-execute form
+    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
+    __ pxor  (xmm_result, xmm_temp);
+    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
+      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 192 and 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+    __ subl(keylen, 2);
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+    __ BIND(L_doLast);
+    // for decryption the aesdeclast operation is always on key+0x00
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ aesdeclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, 0), xmm_result);  // store the result
+
+    __ xorptr(rax, rax); // return 0
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  address generate_cipherBlockChaining_encryptAESCrypt() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+#ifndef _WIN64
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+#else
+    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
+    const Register len_reg     = r10;      // pick the first volatile windows register
+#endif
+    const Register pos         = rax;
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    // keys 0-10 preloaded into xmm2-xmm12
+    const int XMM_REG_NUM_KEY_FIRST = 2;
+    const int XMM_REG_NUM_KEY_LAST  = 12;
+    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    // save the xmm registers which must be preserved 6-12
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+
+    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 2 thru 12 with key 0x00 - 0xa0
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+    // 128 bit code follows here
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_128);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    __ aesenclast(xmm_result, xmm_key10);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_128);
+
+    __ BIND(L_exit);
+    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
+
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ movl(rax, 0); // return 0 (why?)
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_192);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
+    load_key(xmm_temp, key, 0xc0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_192);
+    __ jmp(L_exit);
+
+    __ BIND(L_key_256);
+    // 256-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_256);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
+    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
+    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
+    load_key(xmm_temp, key, 0xe0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
+
+
+
+  // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
+  // to hide instruction latency
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+
+  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256;
+    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
+    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+#ifndef _WIN64
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+#else
+    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
+    const Register len_reg     = r10;      // pick the first volatile windows register
+#endif
+    const Register pos         = rax;
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    // keys 0-10 preloaded into xmm2-xmm12
+    const int XMM_REG_NUM_KEY_FIRST = 5;
+    const int XMM_REG_NUM_KEY_LAST  = 15;
+    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    // save the xmm registers which must be preserved 6-15
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+    // the java expanded key ordering is rotated one position from what we want
+    // so we start from 0x10 here and hit 0x00 last
+    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
+    // registers holding the four results in the parallelized loop
+    const XMMRegister xmm_result0 = xmm0;
+    const XMMRegister xmm_result1 = xmm2;
+    const XMMRegister xmm_result2 = xmm3;
+    const XMMRegister xmm_result3 = xmm4;
+
+    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+
+    // 128-bit code follows here, parallelized
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_multiBlock_loopTop_128);
+    __ cmpptr(len_reg, 4*AESBlockSize);           // see if at least 4 blocks left
+    __ jcc(Assembler::less, L_singleBlock_loopTop_128);
+
+    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));   // get next 4 blocks into xmmresult registers
+    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
+    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
+    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));
+
+#define DoFour(opc, src_reg)                    \
+    __ opc(xmm_result0, src_reg);               \
+    __ opc(xmm_result1, src_reg);               \
+    __ opc(xmm_result2, src_reg);               \
+    __ opc(xmm_result3, src_reg);
+
+    DoFour(pxor, xmm_key_first);
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      DoFour(aesdec, as_XMMRegister(rnum));
+    }
+    DoFour(aesdeclast, xmm_key_last);
+    // for each result, xor with the r vector of previous cipher block
+    __ pxor(xmm_result0, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
+    __ pxor(xmm_result1, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
+    __ pxor(xmm_result2, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
+    __ pxor(xmm_result3, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));   // this will carry over to next set of blocks
+
+    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
+    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
+    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
+    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);
+
+    __ addptr(pos, 4*AESBlockSize);
+    __ subptr(len_reg, 4*AESBlockSize);
+    __ jmp(L_multiBlock_loopTop_128);
+
+    // registers used in the non-parallelized loops
+    const XMMRegister xmm_prev_block_cipher_save = xmm2;
+    const XMMRegister xmm_temp   = xmm3;
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_128);
+    __ cmpptr(len_reg, 0);           // any blocks left??
+    __ jcc(Assembler::equal, L_exit);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
+    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    __ aesdeclast(xmm_result, xmm_key_last);
+    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
+
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jmp(L_singleBlock_loopTop_128);
+
+
+    __ BIND(L_exit);
+    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
+#ifdef _WIN64
+    // restore regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ movl(rax, 0); // return 0 (why?)
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be optimized to use parallelism)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_192);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
+    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 192-bit key goes up to c0
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+    __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
+    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
+
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
+    __ jmp(L_exit);
+
+    __ BIND(L_key_256);
+    // 256-bit code follows here (could be optimized to use parallelism)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_256);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
+    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 256-bit key goes up to e0
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
+    __ aesdeclast(xmm_result, xmm_key_last);             // xmm15 came from key+0
+    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
+
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
+
+
+
 #undef __
 #define __ masm->

@ -3135,6 +3677,16 @@ class StubGenerator: public StubCodeGenerator {
    generate_arraycopy_stubs();

    generate_math_stubs();
+
+    // don't bother generating these AES intrinsic stubs unless global flag is set
+    if (UseAESIntrinsics) {
+      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
+
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
+    }
  }

 public:
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.cpp
@ -44,3 +44,4 @@

 address StubRoutines::x86::_verify_mxcsr_entry         = NULL;
 address StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = NULL;
+address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp
@ -41,10 +41,14 @@ class x86 {
 private:
  static address _verify_mxcsr_entry;
  static address _verify_fpu_cntrl_wrd_entry;
+  // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers
+  static address _key_shuffle_mask_addr;

 public:
  static address verify_mxcsr_entry()                        { return _verify_mxcsr_entry; }
  static address verify_fpu_cntrl_wrd_entry()                { return _verify_fpu_cntrl_wrd_entry; }
+  static address key_shuffle_mask_addr()                     { return _key_shuffle_mask_addr; }
+
 };

  static bool    returns_to_call_stub(address return_pc)     { return return_pc == _call_stub_return_address; }
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.cpp
@ -56,3 +56,4 @@ address StubRoutines::x86::_float_sign_flip = NULL;
 address StubRoutines::x86::_double_sign_mask = NULL;
 address StubRoutines::x86::_double_sign_flip = NULL;
 address StubRoutines::x86::_mxcsr_std = NULL;
+address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp
@ -54,6 +54,8 @@ class x86 {
  static address _double_sign_mask;
  static address _double_sign_flip;
  static address _mxcsr_std;
+  // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers
+  static address _key_shuffle_mask_addr;

 public:

@ -116,6 +118,9 @@ class x86 {
  {
    return _mxcsr_std;
  }
+
+  static address key_shuffle_mask_addr()                     { return _key_shuffle_mask_addr; }
+
 };

 #endif // CPU_X86_VM_STUBROUTINES_X86_64_HPP
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
@ -419,13 +419,16 @@ void VM_Version::get_processor_features() {
  if (UseAVX < 1)
    _cpuFeatures &= ~CPU_AVX;

+  if (!UseAES && !FLAG_IS_DEFAULT(UseAES))
+    _cpuFeatures &= ~CPU_AES;
+
  if (logical_processors_per_package() == 1) {
    // HT processor could be installed on a system which doesn't support HT.
    _cpuFeatures &= ~CPU_HT;
  }

  char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
               cores_per_cpu(), threads_per_core(),
               cpu_family(), _model, _stepping,
               (supports_cmov() ? ", cmov" : ""),
@ -441,6 +444,7 @@ void VM_Version::get_processor_features() {
               (supports_popcnt() ? ", popcnt" : ""),
               (supports_avx()    ? ", avx" : ""),
               (supports_avx2()   ? ", avx2" : ""),
+               (supports_aes()    ? ", aes" : ""),
               (supports_mmx_ext() ? ", mmxext" : ""),
               (supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
               (supports_lzcnt()   ? ", lzcnt": ""),
@ -472,6 +476,29 @@ void VM_Version::get_processor_features() {
  if (!supports_avx ()) // Drop to 0 if no AVX  support
    UseAVX = 0;

+  // Use AES instructions if available.
+  if (supports_aes()) {
+    if (FLAG_IS_DEFAULT(UseAES)) {
+      UseAES = true;
+    }
+  } else if (UseAES) {
+    if (!FLAG_IS_DEFAULT(UseAES))
+      warning("AES instructions not available on this CPU");
+    FLAG_SET_DEFAULT(UseAES, false);
+  }
+
+  // The AES intrinsic stubs require AES instruction support (of course)
+  // but also require AVX mode for misaligned SSE access
+  if (UseAES && (UseAVX > 0)) {
+    if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+      UseAESIntrinsics = true;
+    }
+  } else if (UseAESIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
+      warning("AES intrinsics not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+  }
+
 #ifdef COMPILER2
  if (UseFPUForSpilling) {
    if (UseSSE < 2) {
@ -714,6 +741,9 @@ void VM_Version::get_processor_features() {
    if (UseAVX > 0) {
      tty->print("  UseAVX=%d",UseAVX);
    }
+    if (UseAES) {
+      tty->print("  UseAES=1");
+    }
    tty->cr();
    tty->print("Allocation");
    if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp
@ -78,7 +78,9 @@ public:
               sse4_2   : 1,
                        : 2,
               popcnt   : 1,
-                        : 3,
+                        : 1,
+               aes      : 1,
+                        : 1,
               osxsave  : 1,
               avx      : 1,
                        : 3;
@ -244,7 +246,8 @@ protected:
    CPU_TSC    = (1 << 15),
    CPU_TSCINV = (1 << 16),
    CPU_AVX    = (1 << 17),
-    CPU_AVX2   = (1 << 18)
+    CPU_AVX2   = (1 << 18),
+    CPU_AES    = (1 << 19)
  } cpuFeatureFlags;

  enum {
@ -420,6 +423,8 @@ protected:
      result |= CPU_TSC;
    if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0)
      result |= CPU_TSCINV;
+    if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0)
+      result |= CPU_AES;

    // AMD features.
    if (is_amd()) {
@ -544,6 +549,7 @@ public:
  static bool supports_avx()      { return (_cpuFeatures & CPU_AVX) != 0; }
  static bool supports_avx2()     { return (_cpuFeatures & CPU_AVX2) != 0; }
  static bool supports_tsc()      { return (_cpuFeatures & CPU_TSC)    != 0; }
+  static bool supports_aes()      { return (_cpuFeatures & CPU_AES) != 0; }

  // Intel features
  static bool is_intel_family_core() { return is_intel() &&
--- a/hotspot/src/cpu/x86/vm/x86.ad
+++ b/hotspot/src/cpu/x86/vm/x86.ad
@ -4102,9 +4102,158 @@ instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{

 // ----------------------- LogicalRightShift -----------------------------------

-// Shorts/Chars vector logical right shift produces incorrect Java result
+// Shorts vector logical right shift produces incorrect Java result
 // for negative data because java code convert short value into int with
-// sign extension before a shift.
+// sign extension before a shift. But char vectors are fine since chars are
+// unsigned values.
+
+instruct vsrl2S(vecS dst, vecS shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVS dst shift));
+  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
+  ins_encode %{
+    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2S_imm(vecS dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVS dst shift));
+  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
+  ins_encode %{
+    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4S(vecD dst, vecS shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVS dst shift));
+  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
+  ins_encode %{
+    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4S_imm(vecD dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVS dst shift));
+  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
+  ins_encode %{
+    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8S(vecX dst, vecS shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVS dst shift));
+  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
+  ins_encode %{
+    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8S_imm(vecX dst, immI8 shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVS dst shift));
+  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
+  ins_encode %{
+    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
+  ins_encode %{
+    bool vector256 = false;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
+  ins_encode %{
+    bool vector256 = true;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+  %}
+  ins_pipe( pipe_slow );
+%}

 // Integers vector logical right shift
 instruct vsrl2I(vecD dst, vecS shift) %{
--- a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp
+++ b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp
@ -1844,17 +1844,12 @@ void GraphBuilder::invoke(Bytecodes::Code code) {
        code == Bytecodes::_invokevirtual && target->is_final_method() ||
        code == Bytecodes::_invokedynamic) {
      ciMethod* inline_target = (cha_monomorphic_target != NULL) ? cha_monomorphic_target : target;
-      bool success = false;
-      if (target->is_method_handle_intrinsic()) {
-        // method handle invokes
-        success = try_method_handle_inline(target);
-      } else {
-        // static binding => check if callee is ok
-        success = try_inline(inline_target, (cha_monomorphic_target != NULL) || (exact_target != NULL), code, better_receiver);
-      }
-      CHECK_BAILOUT();
+      // static binding => check if callee is ok
+      bool success = try_inline(inline_target, (cha_monomorphic_target != NULL) || (exact_target != NULL), code, better_receiver);

+      CHECK_BAILOUT();
      clear_inline_bailout();
+
      if (success) {
        // Register dependence if JVMTI has either breakpoint
        // setting or hotswapping of methods capabilities since they may
@ -3201,6 +3196,11 @@ bool GraphBuilder::try_inline(ciMethod* callee, bool holder_known, Bytecodes::Co
    return false;
  }

+  // method handle invokes
+  if (callee->is_method_handle_intrinsic()) {
+    return try_method_handle_inline(callee);
+  }
+
  // handle intrinsics
  if (callee->intrinsic_id() != vmIntrinsics::_none) {
    if (try_inline_intrinsics(callee)) {
@ -3885,10 +3885,14 @@ bool GraphBuilder::try_method_handle_inline(ciMethod* callee) {
      ValueType* type = state()->stack_at(args_base)->type();
      if (type->is_constant()) {
        ciMethod* target = type->as_ObjectType()->constant_value()->as_method_handle()->get_vmtarget();
-        guarantee(!target->is_method_handle_intrinsic(), "should not happen");  // XXX remove
-        Bytecodes::Code bc = target->is_static() ? Bytecodes::_invokestatic : Bytecodes::_invokevirtual;
-        if (try_inline(target, /*holder_known*/ true, bc)) {
-          return true;
+        // We don't do CHA here so only inline static and statically bindable methods.
+        if (target->is_static() || target->can_be_statically_bound()) {
+          Bytecodes::Code bc = target->is_static() ? Bytecodes::_invokestatic : Bytecodes::_invokevirtual;
+          if (try_inline(target, /*holder_known*/ true, bc)) {
+            return true;
+          }
+        } else {
+          print_inlining(target, "not static or statically bindable", /*success*/ false);
        }
      } else {
        print_inlining(callee, "receiver not constant", /*success*/ false);
@ -3941,9 +3945,14 @@ bool GraphBuilder::try_method_handle_inline(ciMethod* callee) {
            }
            j += t->size();  // long and double take two slots
          }
-          Bytecodes::Code bc = target->is_static() ? Bytecodes::_invokestatic : Bytecodes::_invokevirtual;
-          if (try_inline(target, /*holder_known*/ true, bc)) {
-            return true;
+          // We don't do CHA here so only inline static and statically bindable methods.
+          if (target->is_static() || target->can_be_statically_bound()) {
+            Bytecodes::Code bc = target->is_static() ? Bytecodes::_invokestatic : Bytecodes::_invokevirtual;
+            if (try_inline(target, /*holder_known*/ true, bc)) {
+              return true;
+            }
+          } else {
+            print_inlining(target, "not static or statically bindable", /*success*/ false);
          }
        }
      } else {
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp
@ -110,6 +110,7 @@
  template(sun_jkernel_DownloadManager,               "sun/jkernel/DownloadManager")              \
  template(getBootClassPathEntryForClass_name,        "getBootClassPathEntryForClass")            \
  template(sun_misc_PostVMInitHook,                   "sun/misc/PostVMInitHook")                  \
+  template(sun_misc_Launcher_ExtClassLoader,          "sun/misc/Launcher$ExtClassLoader")         \
                                                                                                  \
  /* Java runtime version access */                                                               \
  template(sun_misc_Version,                          "sun/misc/Version")                         \
@ -723,6 +724,21 @@
  /* java/lang/ref/Reference */                                                                                         \
  do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
                                                                                                                        \
+  /* support for com.sum.crypto.provider.AESCrypt and some of its callers */                                            \
+  do_class(com_sun_crypto_provider_aescrypt,      "com/sun/crypto/provider/AESCrypt")                                   \
+  do_intrinsic(_aescrypt_encryptBlock, com_sun_crypto_provider_aescrypt, encryptBlock_name, byteArray_int_byteArray_int_signature, F_R)   \
+  do_intrinsic(_aescrypt_decryptBlock, com_sun_crypto_provider_aescrypt, decryptBlock_name, byteArray_int_byteArray_int_signature, F_R)   \
+   do_name(     encryptBlock_name,                                 "encryptBlock")                                      \
+   do_name(     decryptBlock_name,                                 "decryptBlock")                                      \
+   do_signature(byteArray_int_byteArray_int_signature,             "([BI[BI)V")                                         \
+                                                                                                                        \
+  do_class(com_sun_crypto_provider_cipherBlockChaining,            "com/sun/crypto/provider/CipherBlockChaining")       \
+   do_intrinsic(_cipherBlockChaining_encryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, encrypt_name, byteArray_int_int_byteArray_int_signature, F_R)   \
+   do_intrinsic(_cipherBlockChaining_decryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, decrypt_name, byteArray_int_int_byteArray_int_signature, F_R)   \
+   do_name(     encrypt_name,                                      "encrypt")                                           \
+   do_name(     decrypt_name,                                      "decrypt")                                           \
+   do_signature(byteArray_int_int_byteArray_int_signature,         "([BII[BI)V")                                        \
+                                                                                                                        \
  /* support for sun.misc.Unsafe */                                                                                     \
  do_class(sun_misc_Unsafe,               "sun/misc/Unsafe")                                                            \
                                                                                                                        \
--- a/hotspot/src/share/vm/oops/method.cpp
+++ b/hotspot/src/share/vm/oops/method.cpp
@ -1155,8 +1155,12 @@ methodHandle Method::clone_with_new_data(methodHandle m, u_char* new_code, int n
 vmSymbols::SID Method::klass_id_for_intrinsics(Klass* holder) {
  // if loader is not the default loader (i.e., != NULL), we can't know the intrinsics
  // because we are not loading from core libraries
-  if (InstanceKlass::cast(holder)->class_loader() != NULL)
+  // exception: the AES intrinsics come from lib/ext/sunjce_provider.jar
+  // which does not use the class default class loader so we check for its loader here
+  if ((InstanceKlass::cast(holder)->class_loader() != NULL) &&
+       InstanceKlass::cast(holder)->class_loader()->klass()->name() != vmSymbols::sun_misc_Launcher_ExtClassLoader()) {
    return vmSymbols::NO_SID;   // regardless of name, no intrinsics here
+  }

  // see if the klass name is well-known:
  Symbol* klass_name = InstanceKlass::cast(holder)->name();
--- a/hotspot/src/share/vm/opto/c2_globals.hpp
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp
@ -439,6 +439,9 @@
  product(bool, DoEscapeAnalysis, true,                                     \
          "Perform escape analysis")                                        \
                                                                            \
+  develop(bool, ExitEscapeAnalysisOnTimeout, true,                          \
+          "Exit or throw assert in EA when it reaches time limit")          \
+                                                                            \
  notproduct(bool, PrintEscapeAnalysis, false,                              \
          "Print the results of escape analysis")                           \
                                                                            \
--- a/hotspot/src/share/vm/opto/callGenerator.cpp
+++ b/hotspot/src/share/vm/opto/callGenerator.cpp
@ -670,6 +670,129 @@ CallGenerator* CallGenerator::for_method_handle_inline(JVMState* jvms, ciMethod*
 }


+//------------------------PredictedIntrinsicGenerator------------------------------
+// Internal class which handles all predicted Intrinsic calls.
+class PredictedIntrinsicGenerator : public CallGenerator {
+  CallGenerator* _intrinsic;
+  CallGenerator* _cg;
+
+public:
+  PredictedIntrinsicGenerator(CallGenerator* intrinsic,
+                              CallGenerator* cg)
+    : CallGenerator(cg->method())
+  {
+    _intrinsic = intrinsic;
+    _cg        = cg;
+  }
+
+  virtual bool      is_virtual()   const    { return true; }
+  virtual bool      is_inlined()   const    { return true; }
+  virtual bool      is_intrinsic() const    { return true; }
+
+  virtual JVMState* generate(JVMState* jvms);
+};
+
+
+CallGenerator* CallGenerator::for_predicted_intrinsic(CallGenerator* intrinsic,
+                                                      CallGenerator* cg) {
+  return new PredictedIntrinsicGenerator(intrinsic, cg);
+}
+
+
+JVMState* PredictedIntrinsicGenerator::generate(JVMState* jvms) {
+  GraphKit kit(jvms);
+  PhaseGVN& gvn = kit.gvn();
+
+  CompileLog* log = kit.C->log();
+  if (log != NULL) {
+    log->elem("predicted_intrinsic bci='%d' method='%d'",
+              jvms->bci(), log->identify(method()));
+  }
+
+  Node* slow_ctl = _intrinsic->generate_predicate(kit.sync_jvms());
+  if (kit.failing())
+    return NULL;  // might happen because of NodeCountInliningCutoff
+
+  SafePointNode* slow_map = NULL;
+  JVMState* slow_jvms;
+  if (slow_ctl != NULL) {
+    PreserveJVMState pjvms(&kit);
+    kit.set_control(slow_ctl);
+    if (!kit.stopped()) {
+      slow_jvms = _cg->generate(kit.sync_jvms());
+      if (kit.failing())
+        return NULL;  // might happen because of NodeCountInliningCutoff
+      assert(slow_jvms != NULL, "must be");
+      kit.add_exception_states_from(slow_jvms);
+      kit.set_map(slow_jvms->map());
+      if (!kit.stopped())
+        slow_map = kit.stop();
+    }
+  }
+
+  if (kit.stopped()) {
+    // Predicate is always false.
+    kit.set_jvms(slow_jvms);
+    return kit.transfer_exceptions_into_jvms();
+  }
+
+  // Generate intrinsic code:
+  JVMState* new_jvms = _intrinsic->generate(kit.sync_jvms());
+  if (new_jvms == NULL) {
+    // Intrinsic failed, so use slow code or make a direct call.
+    if (slow_map == NULL) {
+      CallGenerator* cg = CallGenerator::for_direct_call(method());
+      new_jvms = cg->generate(kit.sync_jvms());
+    } else {
+      kit.set_jvms(slow_jvms);
+      return kit.transfer_exceptions_into_jvms();
+    }
+  }
+  kit.add_exception_states_from(new_jvms);
+  kit.set_jvms(new_jvms);
+
+  // Need to merge slow and fast?
+  if (slow_map == NULL) {
+    // The fast path is the only path remaining.
+    return kit.transfer_exceptions_into_jvms();
+  }
+
+  if (kit.stopped()) {
+    // Intrinsic method threw an exception, so it's just the slow path after all.
+    kit.set_jvms(slow_jvms);
+    return kit.transfer_exceptions_into_jvms();
+  }
+
+  // Finish the diamond.
+  kit.C->set_has_split_ifs(true); // Has chance for split-if optimization
+  RegionNode* region = new (kit.C) RegionNode(3);
+  region->init_req(1, kit.control());
+  region->init_req(2, slow_map->control());
+  kit.set_control(gvn.transform(region));
+  Node* iophi = PhiNode::make(region, kit.i_o(), Type::ABIO);
+  iophi->set_req(2, slow_map->i_o());
+  kit.set_i_o(gvn.transform(iophi));
+  kit.merge_memory(slow_map->merged_memory(), region, 2);
+  uint tos = kit.jvms()->stkoff() + kit.sp();
+  uint limit = slow_map->req();
+  for (uint i = TypeFunc::Parms; i < limit; i++) {
+    // Skip unused stack slots; fast forward to monoff();
+    if (i == tos) {
+      i = kit.jvms()->monoff();
+      if( i >= limit ) break;
+    }
+    Node* m = kit.map()->in(i);
+    Node* n = slow_map->in(i);
+    if (m != n) {
+      const Type* t = gvn.type(m)->meet(gvn.type(n));
+      Node* phi = PhiNode::make(region, m, t);
+      phi->set_req(2, n);
+      kit.map()->set_req(i, gvn.transform(phi));
+    }
+  }
+  return kit.transfer_exceptions_into_jvms();
+}
+
 //-------------------------UncommonTrapCallGenerator-----------------------------
 // Internal class which handles all out-of-line calls checking receiver type.
 class UncommonTrapCallGenerator : public CallGenerator {
--- a/hotspot/src/share/vm/opto/callGenerator.hpp
+++ b/hotspot/src/share/vm/opto/callGenerator.hpp
@ -143,6 +143,9 @@ class CallGenerator : public ResourceObj {
  // Registry for intrinsics:
  static CallGenerator* for_intrinsic(ciMethod* m);
  static void register_intrinsic(ciMethod* m, CallGenerator* cg);
+  static CallGenerator* for_predicted_intrinsic(CallGenerator* intrinsic,
+                                                CallGenerator* cg);
+  virtual Node* generate_predicate(JVMState* jvms) { return NULL; };

  static void print_inlining(ciMethod* callee, int inline_level, int bci, const char* msg) {
    if (PrintInlining)
--- a/hotspot/src/share/vm/opto/compile.cpp
+++ b/hotspot/src/share/vm/opto/compile.cpp
@ -3047,9 +3047,9 @@ bool Compile::Constant::operator==(const Constant& other) {
  case T_LONG:
  case T_DOUBLE:  return (_v._value.j == other._v._value.j);
  case T_OBJECT:
-  case T_METADATA: return (_v._metadata == other._v._metadata);
  case T_ADDRESS: return (_v._value.l == other._v._value.l);
  case T_VOID:    return (_v._value.l == other._v._value.l);  // jump-table entries
+  case T_METADATA: return (_v._metadata == other._v._metadata);
  default: ShouldNotReachHere();
  }
  return false;
--- a/hotspot/src/share/vm/opto/compile.hpp
+++ b/hotspot/src/share/vm/opto/compile.hpp
@ -149,7 +149,7 @@ class Compile : public Phase {
  private:
    BasicType _type;
    union {
-    jvalue    _value;
+      jvalue    _value;
      Metadata* _metadata;
    } _v;
    int       _offset;         // offset of this constant (in bytes) relative to the constant table base.
--- a/hotspot/src/share/vm/opto/doCall.cpp
+++ b/hotspot/src/share/vm/opto/doCall.cpp
@ -107,7 +107,17 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
  // intrinsics handle strict f.p. correctly.
  if (allow_inline && allow_intrinsics) {
    CallGenerator* cg = find_intrinsic(callee, call_is_virtual);
-    if (cg != NULL)  return cg;
+    if (cg != NULL) {
+      if (cg->is_predicted()) {
+        // Code without intrinsic but, hopefully, inlined.
+        CallGenerator* inline_cg = this->call_generator(callee,
+              vtable_index, call_is_virtual, jvms, allow_inline, prof_factor, false);
+        if (inline_cg != NULL) {
+          cg = CallGenerator::for_predicted_intrinsic(cg, inline_cg);
+        }
+      }
+      return cg;
+    }
  }

  // Do method handle calls.
--- a/hotspot/src/share/vm/opto/escape.cpp
+++ b/hotspot/src/share/vm/opto/escape.cpp
@ -893,12 +893,16 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                                       arg_has_oops && (i > TypeFunc::Parms);
 #ifdef ASSERT
          if (!(is_arraycopy ||
-                call->as_CallLeaf()->_name != NULL &&
-                (strcmp(call->as_CallLeaf()->_name, "g1_wb_pre")  == 0 ||
-                 strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 ))
-          ) {
+                (call->as_CallLeaf()->_name != NULL &&
+                 (strcmp(call->as_CallLeaf()->_name, "g1_wb_pre")  == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "aescrypt_encryptBlock") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0)
+                  ))) {
            call->dump();
-            assert(false, "EA: unexpected CallLeaf");
+            fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
          }
 #endif
          // Always process arraycopy's destination object since
@ -1080,7 +1084,7 @@ bool ConnectionGraph::complete_connection_graph(
      C->log()->text("%s", (iterations >= CG_BUILD_ITER_LIMIT) ? "iterations" : "time");
      C->log()->end_elem(" limit'");
    }
-    assert(false, err_msg_res("infinite EA connection graph build (%f sec, %d iterations) with %d nodes and worklist size %d",
+    assert(ExitEscapeAnalysisOnTimeout, err_msg_res("infinite EA connection graph build (%f sec, %d iterations) with %d nodes and worklist size %d",
           time.seconds(), iterations, nodes_size(), ptnodes_worklist.length()));
    // Possible infinite build_connection_graph loop,
    // bailout (no changes to ideal graph were made).
--- a/hotspot/src/share/vm/opto/library_call.cpp
+++ b/hotspot/src/share/vm/opto/library_call.cpp
@ -44,18 +44,22 @@ class LibraryIntrinsic : public InlineCallGenerator {
 public:
 private:
  bool             _is_virtual;
+  bool             _is_predicted;
  vmIntrinsics::ID _intrinsic_id;

 public:
-  LibraryIntrinsic(ciMethod* m, bool is_virtual, vmIntrinsics::ID id)
+  LibraryIntrinsic(ciMethod* m, bool is_virtual, bool is_predicted, vmIntrinsics::ID id)
    : InlineCallGenerator(m),
      _is_virtual(is_virtual),
+      _is_predicted(is_predicted),
      _intrinsic_id(id)
  {
  }
  virtual bool is_intrinsic() const { return true; }
  virtual bool is_virtual()   const { return _is_virtual; }
+  virtual bool is_predicted()   const { return _is_predicted; }
  virtual JVMState* generate(JVMState* jvms);
+  virtual Node* generate_predicate(JVMState* jvms);
  vmIntrinsics::ID intrinsic_id() const { return _intrinsic_id; }
 };

@ -83,6 +87,7 @@ class LibraryCallKit : public GraphKit {
  int               arg_size()  const    { return callee()->arg_size(); }

  bool try_to_inline();
+  Node* try_to_predicate();

  // Helper functions to inline natives
  void push_result(RegionNode* region, PhiNode* value);
@ -148,6 +153,7 @@ class LibraryCallKit : public GraphKit {
  CallJavaNode* generate_method_call_virtual(vmIntrinsics::ID method_id) {
    return generate_method_call(method_id, true, false);
  }
+  Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static);

  Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2);
  Node* make_string_method_node(int opcode, Node* str1, Node* str2);
@ -253,6 +259,10 @@ class LibraryCallKit : public GraphKit {
  bool inline_reverseBytes(vmIntrinsics::ID id);

  bool inline_reference_get();
+  bool inline_aescrypt_Block(vmIntrinsics::ID id);
+  bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id);
+  Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
+  Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
 };


@ -306,6 +316,8 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
    }
  }

+  bool is_predicted = false;
+
  switch (id) {
  case vmIntrinsics::_compareTo:
    if (!SpecialStringCompareTo)  return NULL;
@ -413,6 +425,18 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
    break;
 #endif

+  case vmIntrinsics::_aescrypt_encryptBlock:
+  case vmIntrinsics::_aescrypt_decryptBlock:
+    if (!UseAESIntrinsics) return NULL;
+    break;
+
+  case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
+  case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+    if (!UseAESIntrinsics) return NULL;
+    // these two require the predicated logic
+    is_predicted = true;
+    break;
+
 default:
    assert(id <= vmIntrinsics::LAST_COMPILER_INLINE, "caller responsibility");
    assert(id != vmIntrinsics::_Object_init && id != vmIntrinsics::_invoke, "enum out of order?");
@ -444,7 +468,7 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
    if (!InlineUnsafeOps)  return NULL;
  }

-  return new LibraryIntrinsic(m, is_virtual, (vmIntrinsics::ID) id);
+  return new LibraryIntrinsic(m, is_virtual, is_predicted, (vmIntrinsics::ID) id);
 }

 //----------------------register_library_intrinsics-----------------------
@ -496,6 +520,47 @@ JVMState* LibraryIntrinsic::generate(JVMState* jvms) {
  return NULL;
 }

+Node* LibraryIntrinsic::generate_predicate(JVMState* jvms) {
+  LibraryCallKit kit(jvms, this);
+  Compile* C = kit.C;
+  int nodes = C->unique();
+#ifndef PRODUCT
+  assert(is_predicted(), "sanity");
+  if ((PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) && Verbose) {
+    char buf[1000];
+    const char* str = vmIntrinsics::short_name_as_C_string(intrinsic_id(), buf, sizeof(buf));
+    tty->print_cr("Predicate for intrinsic %s", str);
+  }
+#endif
+
+  Node* slow_ctl = kit.try_to_predicate();
+  if (!kit.failing()) {
+    if (C->log()) {
+      C->log()->elem("predicate_intrinsic id='%s'%s nodes='%d'",
+                     vmIntrinsics::name_at(intrinsic_id()),
+                     (is_virtual() ? " virtual='1'" : ""),
+                     C->unique() - nodes);
+    }
+    return slow_ctl; // Could be NULL if the check folds.
+  }
+
+  // The intrinsic bailed out
+  if (PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) {
+    if (jvms->has_method()) {
+      // Not a root compile.
+      const char* msg = "failed to generate predicate for intrinsic";
+      CompileTask::print_inlining(kit.callee(), jvms->depth() - 1, kit.bci(), msg);
+    } else {
+      // Root compile
+      tty->print("Did not generate predicate for intrinsic %s%s at bci:%d in",
+               vmIntrinsics::name_at(intrinsic_id()),
+               (is_virtual() ? " (virtual)" : ""), kit.bci());
+    }
+  }
+  C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_failed);
+  return NULL;
+}
+
 bool LibraryCallKit::try_to_inline() {
  // Handle symbolic names for otherwise undistinguished boolean switches:
  const bool is_store       = true;
@ -767,6 +832,14 @@ bool LibraryCallKit::try_to_inline() {
  case vmIntrinsics::_Reference_get:
    return inline_reference_get();

+  case vmIntrinsics::_aescrypt_encryptBlock:
+  case vmIntrinsics::_aescrypt_decryptBlock:
+    return inline_aescrypt_Block(intrinsic_id());
+
+  case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
+  case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+    return inline_cipherBlockChaining_AESCrypt(intrinsic_id());
+
  default:
    // If you get here, it may be that someone has added a new intrinsic
    // to the list in vmSymbols.hpp without implementing it here.
@ -780,6 +853,36 @@ bool LibraryCallKit::try_to_inline() {
  }
 }

+Node* LibraryCallKit::try_to_predicate() {
+  if (!jvms()->has_method()) {
+    // Root JVMState has a null method.
+    assert(map()->memory()->Opcode() == Op_Parm, "");
+    // Insert the memory aliasing node
+    set_all_memory(reset_memory());
+  }
+  assert(merged_memory(), "");
+
+  switch (intrinsic_id()) {
+  case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
+    return inline_cipherBlockChaining_AESCrypt_predicate(false);
+  case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+    return inline_cipherBlockChaining_AESCrypt_predicate(true);
+
+  default:
+    // If you get here, it may be that someone has added a new intrinsic
+    // to the list in vmSymbols.hpp without implementing it here.
+#ifndef PRODUCT
+    if ((PrintMiscellaneous && (Verbose || WizardMode)) || PrintOpto) {
+      tty->print_cr("*** Warning: Unimplemented predicate for intrinsic %s(%d)",
+                    vmIntrinsics::name_at(intrinsic_id()), intrinsic_id());
+    }
+#endif
+    Node* slow_ctl = control();
+    set_control(top()); // No fast path instrinsic
+    return slow_ctl;
+  }
+}
+
 //------------------------------push_result------------------------------
 // Helper function for finishing intrinsics.
 void LibraryCallKit::push_result(RegionNode* region, PhiNode* value) {
@ -3830,7 +3933,7 @@ Node* LibraryCallKit::generate_virtual_guard(Node* obj_klass,
                     vtable_index*vtableEntry::size()) * wordSize +
                     vtableEntry::method_offset_in_bytes();
  Node* entry_addr  = basic_plus_adr(obj_klass, entry_offset);
-  Node* target_call = make_load(NULL, entry_addr, TypeInstPtr::NOTNULL, T_OBJECT);
+  Node* target_call = make_load(NULL, entry_addr, TypePtr::NOTNULL, T_ADDRESS);

  // Compare the target method with the expected method (e.g., Object.hashCode).
  const TypePtr* native_call_addr = TypeMetadataPtr::make(method);
@ -5613,3 +5716,265 @@ bool LibraryCallKit::inline_reference_get() {
  push(result);
  return true;
 }
+
+
+Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString,
+                                              bool is_exact=true, bool is_static=false) {
+
+  const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr();
+  assert(tinst != NULL, "obj is null");
+  assert(tinst->klass()->is_loaded(), "obj is not loaded");
+  assert(!is_exact || tinst->klass_is_exact(), "klass not exact");
+
+  ciField* field = tinst->klass()->as_instance_klass()->get_field_by_name(ciSymbol::make(fieldName),
+                                                                          ciSymbol::make(fieldTypeString),
+                                                                          is_static);
+  if (field == NULL) return (Node *) NULL;
+  assert (field != NULL, "undefined field");
+
+  // Next code  copied from Parse::do_get_xxx():
+
+  // Compute address and memory type.
+  int offset  = field->offset_in_bytes();
+  bool is_vol = field->is_volatile();
+  ciType* field_klass = field->type();
+  assert(field_klass->is_loaded(), "should be loaded");
+  const TypePtr* adr_type = C->alias_type(field)->adr_type();
+  Node *adr = basic_plus_adr(fromObj, fromObj, offset);
+  BasicType bt = field->layout_type();
+
+  // Build the resultant type of the load
+  const Type *type = TypeOopPtr::make_from_klass(field_klass->as_klass());
+
+  // Build the load.
+  Node* loadedField = make_load(NULL, adr, type, bt, adr_type, is_vol);
+  return loadedField;
+}
+
+
+//------------------------------inline_aescrypt_Block-----------------------
+bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) {
+  address stubAddr;
+  const char *stubName;
+  assert(UseAES, "need AES instruction support");
+
+  switch(id) {
+  case vmIntrinsics::_aescrypt_encryptBlock:
+    stubAddr = StubRoutines::aescrypt_encryptBlock();
+    stubName = "aescrypt_encryptBlock";
+    break;
+  case vmIntrinsics::_aescrypt_decryptBlock:
+    stubAddr = StubRoutines::aescrypt_decryptBlock();
+    stubName = "aescrypt_decryptBlock";
+    break;
+  }
+  if (stubAddr == NULL) return false;
+
+  // Restore the stack and pop off the arguments.
+  int nargs = 5;  // this + 2 oop/offset combos
+  assert(callee()->signature()->size() == nargs-1, "encryptBlock has 4 arguments");
+
+  Node *aescrypt_object  = argument(0);
+  Node *src         = argument(1);
+  Node *src_offset  = argument(2);
+  Node *dest        = argument(3);
+  Node *dest_offset = argument(4);
+
+  // (1) src and dest are arrays.
+  const Type* src_type = src->Value(&_gvn);
+  const Type* dest_type = dest->Value(&_gvn);
+  const TypeAryPtr* top_src = src_type->isa_aryptr();
+  const TypeAryPtr* top_dest = dest_type->isa_aryptr();
+  assert (top_src  != NULL && top_src->klass()  != NULL &&  top_dest != NULL && top_dest->klass() != NULL, "args are strange");
+
+  // for the quick and dirty code we will skip all the checks.
+  // we are just trying to get the call to be generated.
+  Node* src_start  = src;
+  Node* dest_start = dest;
+  if (src_offset != NULL || dest_offset != NULL) {
+    assert(src_offset != NULL && dest_offset != NULL, "");
+    src_start  = array_element_address(src,  src_offset,  T_BYTE);
+    dest_start = array_element_address(dest, dest_offset, T_BYTE);
+  }
+
+  // now need to get the start of its expanded key array
+  // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java
+  Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object);
+  if (k_start == NULL) return false;
+
+  // Call the stub.
+  make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(),
+                    stubAddr, stubName, TypePtr::BOTTOM,
+                    src_start, dest_start, k_start);
+
+  return true;
+}
+
+//------------------------------inline_cipherBlockChaining_AESCrypt-----------------------
+bool LibraryCallKit::inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id) {
+  address stubAddr;
+  const char *stubName;
+
+  assert(UseAES, "need AES instruction support");
+
+  switch(id) {
+  case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
+    stubAddr = StubRoutines::cipherBlockChaining_encryptAESCrypt();
+    stubName = "cipherBlockChaining_encryptAESCrypt";
+    break;
+  case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+    stubAddr = StubRoutines::cipherBlockChaining_decryptAESCrypt();
+    stubName = "cipherBlockChaining_decryptAESCrypt";
+    break;
+  }
+  if (stubAddr == NULL) return false;
+
+
+  // Restore the stack and pop off the arguments.
+  int nargs = 6;  // this + oop/offset + len + oop/offset
+  assert(callee()->signature()->size() == nargs-1, "wrong number of arguments");
+  Node *cipherBlockChaining_object  = argument(0);
+  Node *src         = argument(1);
+  Node *src_offset  = argument(2);
+  Node *len         = argument(3);
+  Node *dest        = argument(4);
+  Node *dest_offset = argument(5);
+
+  // (1) src and dest are arrays.
+  const Type* src_type = src->Value(&_gvn);
+  const Type* dest_type = dest->Value(&_gvn);
+  const TypeAryPtr* top_src = src_type->isa_aryptr();
+  const TypeAryPtr* top_dest = dest_type->isa_aryptr();
+  assert (top_src  != NULL && top_src->klass()  != NULL
+          &&  top_dest != NULL && top_dest->klass() != NULL, "args are strange");
+
+  // checks are the responsibility of the caller
+  Node* src_start  = src;
+  Node* dest_start = dest;
+  if (src_offset != NULL || dest_offset != NULL) {
+    assert(src_offset != NULL && dest_offset != NULL, "");
+    src_start  = array_element_address(src,  src_offset,  T_BYTE);
+    dest_start = array_element_address(dest, dest_offset, T_BYTE);
+  }
+
+  // if we are in this set of code, we "know" the embeddedCipher is an AESCrypt object
+  // (because of the predicated logic executed earlier).
+  // so we cast it here safely.
+  // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java
+
+  Node* embeddedCipherObj = load_field_from_object(cipherBlockChaining_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+  if (embeddedCipherObj == NULL) return false;
+
+  // cast it to what we know it will be at runtime
+  const TypeInstPtr* tinst = _gvn.type(cipherBlockChaining_object)->isa_instptr();
+  assert(tinst != NULL, "CBC obj is null");
+  assert(tinst->klass()->is_loaded(), "CBC obj is not loaded");
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  if (!klass_AESCrypt->is_loaded()) return false;
+
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+  const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt);
+  const TypeOopPtr* xtype = aklass->as_instance_type();
+  Node* aescrypt_object = new(C) CheckCastPPNode(control(), embeddedCipherObj, xtype);
+  aescrypt_object = _gvn.transform(aescrypt_object);
+
+  // we need to get the start of the aescrypt_object's expanded key array
+  Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object);
+  if (k_start == NULL) return false;
+
+  // similarly, get the start address of the r vector
+  Node* objRvec = load_field_from_object(cipherBlockChaining_object, "r", "[B", /*is_exact*/ false);
+  if (objRvec == NULL) return false;
+  Node* r_start = array_element_address(objRvec, intcon(0), T_BYTE);
+
+  // Call the stub, passing src_start, dest_start, k_start, r_start and src_len
+  make_runtime_call(RC_LEAF|RC_NO_FP,
+                    OptoRuntime::cipherBlockChaining_aescrypt_Type(),
+                    stubAddr, stubName, TypePtr::BOTTOM,
+                    src_start, dest_start, k_start, r_start, len);
+
+  // return is void so no result needs to be pushed
+
+  return true;
+}
+
+//------------------------------get_key_start_from_aescrypt_object-----------------------
+Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) {
+  Node* objAESCryptKey = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false);
+  assert (objAESCryptKey != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
+  if (objAESCryptKey == NULL) return (Node *) NULL;
+
+  // now have the array, need to get the start address of the K array
+  Node* k_start = array_element_address(objAESCryptKey, intcon(0), T_INT);
+  return k_start;
+}
+
+//----------------------------inline_cipherBlockChaining_AESCrypt_predicate----------------------------
+// Return node representing slow path of predicate check.
+// the pseudo code we want to emulate with this predicate is:
+// for encryption:
+//    if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath
+// for decryption:
+//    if ((embeddedCipherObj instanceof AESCrypt) && (cipher!=plain)) do_intrinsic, else do_javapath
+//    note cipher==plain is more conservative than the original java code but that's OK
+//
+Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting) {
+  // First, check receiver for NULL since it is virtual method.
+  int nargs = arg_size();
+  Node* objCBC = argument(0);
+  _sp += nargs;
+  objCBC = do_null_check(objCBC, T_OBJECT);
+  _sp -= nargs;
+
+  if (stopped()) return NULL; // Always NULL
+
+  // Load embeddedCipher field of CipherBlockChaining object.
+  Node* embeddedCipherObj = load_field_from_object(objCBC, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+
+  // get AESCrypt klass for instanceOf check
+  // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point
+  // will have same classloader as CipherBlockChaining object
+  const TypeInstPtr* tinst = _gvn.type(objCBC)->isa_instptr();
+  assert(tinst != NULL, "CBCobj is null");
+  assert(tinst->klass()->is_loaded(), "CBCobj is not loaded");
+
+  // we want to do an instanceof comparison against the AESCrypt class
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  if (!klass_AESCrypt->is_loaded()) {
+    // if AESCrypt is not even loaded, we never take the intrinsic fast path
+    Node* ctrl = control();
+    set_control(top()); // no regular fast path
+    return ctrl;
+  }
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+
+  _sp += nargs;          // gen_instanceof might do an uncommon trap
+  Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt)));
+  _sp -= nargs;
+  Node* cmp_instof  = _gvn.transform(new (C) CmpINode(instof, intcon(1)));
+  Node* bool_instof  = _gvn.transform(new (C) BoolNode(cmp_instof, BoolTest::ne));
+
+  Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN);
+
+  // for encryption, we are done
+  if (!decrypting)
+    return instof_false;  // even if it is NULL
+
+  // for decryption, we need to add a further check to avoid
+  // taking the intrinsic path when cipher and plain are the same
+  // see the original java code for why.
+  RegionNode* region = new(C) RegionNode(3);
+  region->init_req(1, instof_false);
+  Node* src = argument(1);
+  Node *dest = argument(4);
+  Node* cmp_src_dest = _gvn.transform(new (C) CmpPNode(src, dest));
+  Node* bool_src_dest = _gvn.transform(new (C) BoolNode(cmp_src_dest, BoolTest::eq));
+  Node* src_dest_conjoint = generate_guard(bool_src_dest, NULL, PROB_MIN);
+  region->init_req(2, src_dest_conjoint);
+
+  record_for_igvn(region);
+  return _gvn.transform(region);
+
+}
+
+
--- a/hotspot/src/share/vm/opto/mulnode.cpp
+++ b/hotspot/src/share/vm/opto/mulnode.cpp
@ -479,24 +479,27 @@ Node *AndINode::Ideal(PhaseGVN *phase, bool can_reshape) {
    return new (phase->C) AndINode(load,phase->intcon(mask&0xFFFF));

  // Masking bits off of a Short?  Loading a Character does some masking
-  if (lop == Op_LoadS && (mask & 0xFFFF0000) == 0 ) {
-    Node *ldus = new (phase->C) LoadUSNode(load->in(MemNode::Control),
-                                              load->in(MemNode::Memory),
-                                              load->in(MemNode::Address),
-                                              load->adr_type());
-    ldus = phase->transform(ldus);
-    return new (phase->C) AndINode(ldus, phase->intcon(mask & 0xFFFF));
-  }
+  if (can_reshape &&
+      load->outcnt() == 1 && load->unique_out() == this) {
+    if (lop == Op_LoadS && (mask & 0xFFFF0000) == 0 ) {
+      Node *ldus = new (phase->C) LoadUSNode(load->in(MemNode::Control),
+                                             load->in(MemNode::Memory),
+                                             load->in(MemNode::Address),
+                                             load->adr_type());
+      ldus = phase->transform(ldus);
+      return new (phase->C) AndINode(ldus, phase->intcon(mask & 0xFFFF));
+    }

-  // Masking sign bits off of a Byte?  Do an unsigned byte load plus
-  // an and.
-  if (lop == Op_LoadB && (mask & 0xFFFFFF00) == 0) {
-    Node* ldub = new (phase->C) LoadUBNode(load->in(MemNode::Control),
-                                              load->in(MemNode::Memory),
-                                              load->in(MemNode::Address),
-                                              load->adr_type());
-    ldub = phase->transform(ldub);
-    return new (phase->C) AndINode(ldub, phase->intcon(mask));
+    // Masking sign bits off of a Byte?  Do an unsigned byte load plus
+    // an and.
+    if (lop == Op_LoadB && (mask & 0xFFFFFF00) == 0) {
+      Node* ldub = new (phase->C) LoadUBNode(load->in(MemNode::Control),
+                                             load->in(MemNode::Memory),
+                                             load->in(MemNode::Address),
+                                             load->adr_type());
+      ldub = phase->transform(ldub);
+      return new (phase->C) AndINode(ldub, phase->intcon(mask));
+    }
  }

  // Masking off sign bits?  Dont make them!
@ -923,7 +926,9 @@ Node *RShiftINode::Ideal(PhaseGVN *phase, bool can_reshape) {
      set_req(2, phase->intcon(0));
      return this;
    }
-    else if( ld->Opcode() == Op_LoadUS )
+    else if( can_reshape &&
+             ld->Opcode() == Op_LoadUS &&
+             ld->outcnt() == 1 && ld->unique_out() == shl)
      // Replace zero-extension-load with sign-extension-load
      return new (phase->C) LoadSNode( ld->in(MemNode::Control),
                                ld->in(MemNode::Memory),
--- a/hotspot/src/share/vm/opto/runtime.cpp
+++ b/hotspot/src/share/vm/opto/runtime.cpp
@ -811,6 +811,48 @@ const TypeFunc* OptoRuntime::array_fill_Type() {
  return TypeFunc::make(domain, range);
 }

+// for aescrypt encrypt/decrypt operations, just three pointers returning void (length is constant)
+const TypeFunc* OptoRuntime::aescrypt_block_Type() {
+  // create input type (domain)
+  int num_args      = 3;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // src
+  fields[argp++] = TypePtr::NOTNULL;    // dest
+  fields[argp++] = TypePtr::NOTNULL;    // k array
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // no result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = NULL; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
+// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning void
+const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() {
+  // create input type (domain)
+  int num_args      = 5;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // src
+  fields[argp++] = TypePtr::NOTNULL;    // dest
+  fields[argp++] = TypePtr::NOTNULL;    // k array
+  fields[argp++] = TypePtr::NOTNULL;    // r array
+  fields[argp++] = TypeInt::INT;        // src len
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // no result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = NULL; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
 //------------- Interpreter state access for on stack replacement
 const TypeFunc* OptoRuntime::osr_end_Type() {
  // create input type (domain)
--- a/hotspot/src/share/vm/opto/runtime.hpp
+++ b/hotspot/src/share/vm/opto/runtime.hpp
@ -280,6 +280,9 @@ private:

  static const TypeFunc* array_fill_Type();

+  static const TypeFunc* aescrypt_block_Type();
+  static const TypeFunc* cipherBlockChaining_aescrypt_Type();
+
  // leaf on stack replacement interpreter accessor types
  static const TypeFunc* osr_end_Type();

--- a/hotspot/src/share/vm/opto/superword.cpp
+++ b/hotspot/src/share/vm/opto/superword.cpp
@ -1776,16 +1776,15 @@ void SuperWord::compute_vector_element_type() {
    set_velt_type(n, container_type(n));
  }

-  // Propagate narrowed type backwards through operations
+  // Propagate integer narrowed type backwards through operations
  // that don't depend on higher order bits
  for (int i = _block.length() - 1; i >= 0; i--) {
    Node* n = _block.at(i);
    // Only integer types need be examined
-    const Type* vt = velt_type(n);
-    if (vt->basic_type() == T_INT) {
+    const Type* vtn = velt_type(n);
+    if (vtn->basic_type() == T_INT) {
      uint start, end;
      VectorNode::vector_operands(n, &start, &end);
-      const Type* vt = velt_type(n);

      for (uint j = start; j < end; j++) {
        Node* in  = n->in(j);
@ -1801,6 +1800,24 @@ void SuperWord::compute_vector_element_type() {
            }
          }
          if (same_type) {
+            // For right shifts of small integer types (bool, byte, char, short)
+            // we need precise information about sign-ness. Only Load nodes have
+            // this information because Store nodes are the same for signed and
+            // unsigned values. And any arithmetic operation after a load may
+            // expand a value to signed Int so such right shifts can't be used
+            // because vector elements do not have upper bits of Int.
+            const Type* vt = vtn;
+            if (VectorNode::is_shift(in)) {
+              Node* load = in->in(1);
+              if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) {
+                vt = velt_type(load);
+              } else if (in->Opcode() != Op_LShiftI) {
+                // Widen type to Int to avoid creation of right shift vector
+                // (align + data_size(s1) check in stmts_can_pack() will fail).
+                // Note, left shifts work regardless type.
+                vt = TypeInt::INT;
+              }
+            }
            set_velt_type(in, vt);
          }
        }
@ -1841,7 +1858,20 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
 // Smallest type containing range of values
 const Type* SuperWord::container_type(Node* n) {
  if (n->is_Mem()) {
-    return Type::get_const_basic_type(n->as_Mem()->memory_type());
+    BasicType bt = n->as_Mem()->memory_type();
+    if (n->is_Store() && (bt == T_CHAR)) {
+      // Use T_SHORT type instead of T_CHAR for stored values because any
+      // preceding arithmetic operation extends values to signed Int.
+      bt = T_SHORT;
+    }
+    if (n->Opcode() == Op_LoadUB) {
+      // Adjust type for unsigned byte loads, it is important for right shifts.
+      // T_BOOLEAN is used because there is no basic type representing type
+      // TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only
+      // size (one byte) and sign is important.
+      bt = T_BOOLEAN;
+    }
+    return Type::get_const_basic_type(bt);
  }
  const Type* t = _igvn.type(n);
  if (t->basic_type() == T_INT) {
--- a/hotspot/src/share/vm/opto/type.cpp
+++ b/hotspot/src/share/vm/opto/type.cpp
@ -61,7 +61,7 @@ Type::TypeInfo Type::_type_info[Type::lastype] = {
  { Bad,             T_ILLEGAL,    "tuple:",        false, Node::NotAMachineReg, relocInfo::none          },  // Tuple
  { Bad,             T_ARRAY,      "array:",        false, Node::NotAMachineReg, relocInfo::none          },  // Array

-#if defined(IA32) || defined(AMD64)
+#ifndef SPARC
  { Bad,             T_ILLEGAL,    "vectors:",      false, Op_VecS,              relocInfo::none          },  // VectorS
  { Bad,             T_ILLEGAL,    "vectord:",      false, Op_VecD,              relocInfo::none          },  // VectorD
  { Bad,             T_ILLEGAL,    "vectorx:",      false, Op_VecX,              relocInfo::none          },  // VectorX
--- a/hotspot/src/share/vm/opto/vectornode.cpp
+++ b/hotspot/src/share/vm/opto/vectornode.cpp
@ -29,8 +29,7 @@
 //------------------------------VectorNode--------------------------------------

 // Return the vector operator for the specified scalar operation
-// and vector length.  Also used to check if the code generator
-// supports the vector operation.
+// and vector length.
 int VectorNode::opcode(int sopc, BasicType bt) {
  switch (sopc) {
  case Op_AddI:
@ -75,7 +74,7 @@ int VectorNode::opcode(int sopc, BasicType bt) {
    case T_BYTE:   return 0;   // Unimplemented
    case T_CHAR:
    case T_SHORT:  return Op_MulVS;
-    case T_INT:    return Matcher::match_rule_supported(Op_MulVI) ? Op_MulVI : 0; // SSE4_1
+    case T_INT:    return Op_MulVI;
    }
    ShouldNotReachHere();
  case Op_MulF:
@ -104,9 +103,9 @@ int VectorNode::opcode(int sopc, BasicType bt) {
    return Op_LShiftVL;
  case Op_RShiftI:
    switch (bt) {
-    case T_BOOLEAN:
+    case T_BOOLEAN:return Op_URShiftVB; // boolean is unsigned value
+    case T_CHAR:   return Op_URShiftVS; // char is unsigned value
    case T_BYTE:   return Op_RShiftVB;
-    case T_CHAR:
    case T_SHORT:  return Op_RShiftVS;
    case T_INT:    return Op_RShiftVI;
    }
@ -116,10 +115,14 @@ int VectorNode::opcode(int sopc, BasicType bt) {
    return Op_RShiftVL;
  case Op_URShiftI:
    switch (bt) {
-    case T_BOOLEAN:
-    case T_BYTE:   return Op_URShiftVB;
-    case T_CHAR:
-    case T_SHORT:  return Op_URShiftVS;
+    case T_BOOLEAN:return Op_URShiftVB;
+    case T_CHAR:   return Op_URShiftVS;
+    case T_BYTE:
+    case T_SHORT:  return 0; // Vector logical right shift for signed short
+                             // values produces incorrect Java result for
+                             // negative data because java code should convert
+                             // a short value into int value with sign
+                             // extension before a shift.
    case T_INT:    return Op_URShiftVI;
    }
    ShouldNotReachHere();
@ -157,12 +160,14 @@ int VectorNode::opcode(int sopc, BasicType bt) {
  return 0; // Unimplemented
 }

+// Also used to check if the code generator
+// supports the vector operation.
 bool VectorNode::implemented(int opc, uint vlen, BasicType bt) {
  if (is_java_primitive(bt) &&
      (vlen > 1) && is_power_of_2(vlen) &&
      Matcher::vector_size_supported(bt, vlen)) {
    int vopc = VectorNode::opcode(opc, bt);
-    return vopc > 0 && Matcher::has_match_rule(vopc);
+    return vopc > 0 && Matcher::match_rule_supported(vopc);
  }
  return false;
 }
--- a/hotspot/src/share/vm/prims/unsafe.cpp
+++ b/hotspot/src/share/vm/prims/unsafe.cpp
@ -124,6 +124,8 @@ inline void* index_oop_from_field_offset_long(oop p, jlong field_offset) {
      assert((void*)p->obj_field_addr<oop>((jint)byte_offset) == ptr_plus_disp,
             "raw [ptr+disp] must be consistent with oop::field_base");
    }
+    jlong p_size = HeapWordSize * (jlong)(p->size());
+    assert(byte_offset < p_size, err_msg("Unsafe access: offset " INT64_FORMAT " > object's size " INT64_FORMAT, byte_offset, p_size));
  }
 #endif
  if (sizeof(char*) == sizeof(jint))    // (this constant folds!)
--- a/hotspot/src/share/vm/runtime/globals.hpp
+++ b/hotspot/src/share/vm/runtime/globals.hpp
@ -533,6 +533,9 @@ class CommandLineFlags {
  product(intx, UseSSE, 99,                                                 \
          "Highest supported SSE instructions set on x86/x64")              \
                                                                            \
+  product(bool, UseAES, false,                                               \
+          "Control whether AES instructions can be used on x86/x64")        \
+                                                                            \
  product(uintx, LargePageSizeInBytes, 0,                                   \
          "Large page size (0 to let VM choose the page size")              \
                                                                            \
@ -635,6 +638,9 @@ class CommandLineFlags {
  product(bool, UseSSE42Intrinsics, false,                                  \
          "SSE4.2 versions of intrinsics")                                  \
                                                                            \
+  product(bool, UseAESIntrinsics, false,                                    \
+          "use intrinsics for AES versions of crypto")                      \
+                                                                            \
  develop(bool, TraceCallFixup, false,                                      \
          "traces all call fixups")                                         \
                                                                            \
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp
@ -120,6 +120,10 @@ address StubRoutines::_arrayof_jbyte_fill;
 address StubRoutines::_arrayof_jshort_fill;
 address StubRoutines::_arrayof_jint_fill;

+address StubRoutines::_aescrypt_encryptBlock               = NULL;
+address StubRoutines::_aescrypt_decryptBlock               = NULL;
+address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
+address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;

 double (* StubRoutines::_intrinsic_log   )(double) = NULL;
 double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp
@ -199,6 +199,11 @@ class StubRoutines: AllStatic {
  // zero heap space aligned to jlong (8 bytes)
  static address _zero_aligned_words;

+  static address _aescrypt_encryptBlock;
+  static address _aescrypt_decryptBlock;
+  static address _cipherBlockChaining_encryptAESCrypt;
+  static address _cipherBlockChaining_decryptAESCrypt;
+
  // These are versions of the java.lang.Math methods which perform
  // the same operations as the intrinsic version.  They are used for
  // constant folding in the compiler to ensure equivalence.  If the
@ -330,6 +335,11 @@ class StubRoutines: AllStatic {
  static address arrayof_jshort_fill() { return _arrayof_jshort_fill; }
  static address arrayof_jint_fill()   { return _arrayof_jint_fill; }

+  static address aescrypt_encryptBlock()                { return _aescrypt_encryptBlock; }
+  static address aescrypt_decryptBlock()                { return _aescrypt_decryptBlock; }
+  static address cipherBlockChaining_encryptAESCrypt()  { return _cipherBlockChaining_encryptAESCrypt; }
+  static address cipherBlockChaining_decryptAESCrypt()  { return _cipherBlockChaining_decryptAESCrypt; }
+
  static address select_fill_function(BasicType t, bool aligned, const char* &name);

  static address zero_aligned_words()   { return _zero_aligned_words; }
--- a/hotspot/test/compiler/6340864/TestByteVect.java
+++ b/hotspot/test/compiler/6340864/TestByteVect.java
@ -33,7 +33,7 @@
 public class TestByteVect {
  private static final int ARRLEN = 997;
  private static final int ITERS  = 11000;
-  private static final int ADD_INIT = 0;
+  private static final int ADD_INIT = 63;
  private static final int BIT_MASK = 0xB7;
  private static final int VALUE = 3;
  private static final int SHIFT = 8;
@ -76,6 +76,7 @@ public class TestByteVect {
      test_subc(a0, a1);
      test_subv(a0, a1, (byte)VALUE);
      test_suba(a0, a1, a2);
+
      test_mulc(a0, a1);
      test_mulv(a0, a1, (byte)VALUE);
      test_mula(a0, a1, a2);
@ -88,6 +89,7 @@ public class TestByteVect {
      test_divc_n(a0, a1);
      test_divv(a0, a1, (byte)-VALUE);
      test_diva(a0, a1, a3);
+
      test_andc(a0, a1);
      test_andv(a0, a1, (byte)BIT_MASK);
      test_anda(a0, a1, a4);
@ -97,30 +99,49 @@ public class TestByteVect {
      test_xorc(a0, a1);
      test_xorv(a0, a1, (byte)BIT_MASK);
      test_xora(a0, a1, a4);
+
      test_sllc(a0, a1);
      test_sllv(a0, a1, VALUE);
      test_srlc(a0, a1);
      test_srlv(a0, a1, VALUE);
      test_srac(a0, a1);
      test_srav(a0, a1, VALUE);
+
      test_sllc_n(a0, a1);
      test_sllv(a0, a1, -VALUE);
      test_srlc_n(a0, a1);
      test_srlv(a0, a1, -VALUE);
      test_srac_n(a0, a1);
      test_srav(a0, a1, -VALUE);
+
      test_sllc_o(a0, a1);
      test_sllv(a0, a1, SHIFT);
      test_srlc_o(a0, a1);
      test_srlv(a0, a1, SHIFT);
      test_srac_o(a0, a1);
      test_srav(a0, a1, SHIFT);
+
      test_sllc_on(a0, a1);
      test_sllv(a0, a1, -SHIFT);
      test_srlc_on(a0, a1);
      test_srlv(a0, a1, -SHIFT);
      test_srac_on(a0, a1);
      test_srav(a0, a1, -SHIFT);
+
+      test_sllc_add(a0, a1);
+      test_sllv_add(a0, a1, ADD_INIT);
+      test_srlc_add(a0, a1);
+      test_srlv_add(a0, a1, ADD_INIT);
+      test_srac_add(a0, a1);
+      test_srav_add(a0, a1, ADD_INIT);
+
+      test_sllc_and(a0, a1);
+      test_sllv_and(a0, a1, BIT_MASK);
+      test_srlc_and(a0, a1);
+      test_srlv_and(a0, a1, BIT_MASK);
+      test_srac_and(a0, a1);
+      test_srav_and(a0, a1, BIT_MASK);
+
      test_pack2(p2, a1);
      test_unpack2(a0, p2);
      test_pack2_swap(p2, a1);
@ -369,6 +390,60 @@ public class TestByteVect {
        errn += verify("test_srav_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-SHIFT)));
      }

+      test_sllc_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_add: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) + ADD_INIT)<<VALUE));
+      }
+      test_sllv_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_add: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) + ADD_INIT)<<VALUE));
+      }
+
+      test_srlc_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_add: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) + ADD_INIT)>>>VALUE));
+      }
+      test_srlv_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_add: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) + ADD_INIT)>>>VALUE));
+      }
+
+      test_srac_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_add: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) + ADD_INIT)>>VALUE));
+      }
+      test_srav_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_add: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) + ADD_INIT)>>VALUE));
+      }
+
+      test_sllc_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_and: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) & BIT_MASK)<<VALUE));
+      }
+      test_sllv_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_and: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) & BIT_MASK)<<VALUE));
+      }
+
+      test_srlc_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_and: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) & BIT_MASK)>>>VALUE));
+      }
+      test_srlv_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_and: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) & BIT_MASK)>>>VALUE));
+      }
+
+      test_srac_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_and: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) & BIT_MASK)>>VALUE));
+      }
+      test_srav_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_and: ", i, a0[i], (byte)(((byte)(ADD_INIT+i) & BIT_MASK)>>VALUE));
+      }
+
      test_pack2(p2, a1);
      for (int i=0; i<ARRLEN/2; i++) {
        errn += verify("test_pack2: ", i, p2[i], (short)(((short)(ADD_INIT+2*i) & 0xFF) | ((short)(ADD_INIT+2*i+1) << 8)));
@ -803,6 +878,84 @@ public class TestByteVect {
    end = System.currentTimeMillis();
    System.out.println("test_srav_on: " + (end - start));

+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_and: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_and: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_and: " + (end - start));
+
    start = System.currentTimeMillis();
    for (int i=0; i<ITERS; i++) {
      test_pack2(p2, a1);
@ -1036,6 +1189,26 @@ public class TestByteVect {
      a0[i] = (byte)(a1[i]<<b);
    }
  }
+  static void test_sllc_add(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] + ADD_INIT)<<VALUE);
+    }
+  }
+  static void test_sllv_add(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] + b)<<VALUE);
+    }
+  }
+  static void test_sllc_and(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] & BIT_MASK)<<VALUE);
+    }
+  }
+  static void test_sllv_and(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] & b)<<VALUE);
+    }
+  }

  static void test_srlc(byte[] a0, byte[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
@ -1062,6 +1235,26 @@ public class TestByteVect {
      a0[i] = (byte)(a1[i]>>>b);
    }
  }
+  static void test_srlc_add(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] + ADD_INIT)>>>VALUE);
+    }
+  }
+  static void test_srlv_add(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] + b)>>>VALUE);
+    }
+  }
+  static void test_srlc_and(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] & BIT_MASK)>>>VALUE);
+    }
+  }
+  static void test_srlv_and(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] & b)>>>VALUE);
+    }
+  }

  static void test_srac(byte[] a0, byte[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
@ -1088,6 +1281,26 @@ public class TestByteVect {
      a0[i] = (byte)(a1[i]>>b);
    }
  }
+  static void test_srac_add(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] + ADD_INIT)>>VALUE);
+    }
+  }
+  static void test_srav_add(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] + b)>>VALUE);
+    }
+  }
+  static void test_srac_and(byte[] a0, byte[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] & BIT_MASK)>>VALUE);
+    }
+  }
+  static void test_srav_and(byte[] a0, byte[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (byte)((a1[i] & b)>>VALUE);
+    }
+  }

  static void test_pack2(short[] p2, byte[] a1) {
    if (p2.length*2 > a1.length) return;
--- a/hotspot/test/compiler/6340864/TestIntVect.java
+++ b/hotspot/test/compiler/6340864/TestIntVect.java
@ -74,6 +74,7 @@ public class TestIntVect {
      test_subc(a0, a1);
      test_subv(a0, a1, (int)VALUE);
      test_suba(a0, a1, a2);
+
      test_mulc(a0, a1);
      test_mulv(a0, a1, (int)VALUE);
      test_mula(a0, a1, a2);
@ -86,6 +87,7 @@ public class TestIntVect {
      test_divc_n(a0, a1);
      test_divv(a0, a1, (int)-VALUE);
      test_diva(a0, a1, a3);
+
      test_andc(a0, a1);
      test_andv(a0, a1, (int)BIT_MASK);
      test_anda(a0, a1, a4);
@ -95,30 +97,49 @@ public class TestIntVect {
      test_xorc(a0, a1);
      test_xorv(a0, a1, (int)BIT_MASK);
      test_xora(a0, a1, a4);
+
      test_sllc(a0, a1);
      test_sllv(a0, a1, VALUE);
      test_srlc(a0, a1);
      test_srlv(a0, a1, VALUE);
      test_srac(a0, a1);
      test_srav(a0, a1, VALUE);
+
      test_sllc_n(a0, a1);
      test_sllv(a0, a1, -VALUE);
      test_srlc_n(a0, a1);
      test_srlv(a0, a1, -VALUE);
      test_srac_n(a0, a1);
      test_srav(a0, a1, -VALUE);
+
      test_sllc_o(a0, a1);
      test_sllv(a0, a1, SHIFT);
      test_srlc_o(a0, a1);
      test_srlv(a0, a1, SHIFT);
      test_srac_o(a0, a1);
      test_srav(a0, a1, SHIFT);
+
      test_sllc_on(a0, a1);
      test_sllv(a0, a1, -SHIFT);
      test_srlc_on(a0, a1);
      test_srlv(a0, a1, -SHIFT);
      test_srac_on(a0, a1);
      test_srav(a0, a1, -SHIFT);
+
+      test_sllc_add(a0, a1);
+      test_sllv_add(a0, a1, ADD_INIT);
+      test_srlc_add(a0, a1);
+      test_srlv_add(a0, a1, ADD_INIT);
+      test_srac_add(a0, a1);
+      test_srav_add(a0, a1, ADD_INIT);
+
+      test_sllc_and(a0, a1);
+      test_sllv_and(a0, a1, BIT_MASK);
+      test_srlc_and(a0, a1);
+      test_srlv_and(a0, a1, BIT_MASK);
+      test_srac_and(a0, a1);
+      test_srav_and(a0, a1, BIT_MASK);
+
      test_pack2(p2, a1);
      test_unpack2(a0, p2);
      test_pack2_swap(p2, a1);
@ -359,6 +380,60 @@ public class TestIntVect {
        errn += verify("test_srav_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-SHIFT)));
      }

+      test_sllc_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_add: ", i, a0[i], (int)(((int)(ADD_INIT+i) + ADD_INIT)<<VALUE));
+      }
+      test_sllv_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_add: ", i, a0[i], (int)(((int)(ADD_INIT+i) + ADD_INIT)<<VALUE));
+      }
+
+      test_srlc_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_add: ", i, a0[i], (int)(((int)(ADD_INIT+i) + ADD_INIT)>>>VALUE));
+      }
+      test_srlv_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_add: ", i, a0[i], (int)(((int)(ADD_INIT+i) + ADD_INIT)>>>VALUE));
+      }
+
+      test_srac_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_add: ", i, a0[i], (int)(((int)(ADD_INIT+i) + ADD_INIT)>>VALUE));
+      }
+      test_srav_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_add: ", i, a0[i], (int)(((int)(ADD_INIT+i) + ADD_INIT)>>VALUE));
+      }
+
+      test_sllc_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_and: ", i, a0[i], (int)(((int)(ADD_INIT+i) & BIT_MASK)<<VALUE));
+      }
+      test_sllv_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_and: ", i, a0[i], (int)(((int)(ADD_INIT+i) & BIT_MASK)<<VALUE));
+      }
+
+      test_srlc_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_and: ", i, a0[i], (int)(((int)(ADD_INIT+i) & BIT_MASK)>>>VALUE));
+      }
+      test_srlv_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_and: ", i, a0[i], (int)(((int)(ADD_INIT+i) & BIT_MASK)>>>VALUE));
+      }
+
+      test_srac_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_and: ", i, a0[i], (int)(((int)(ADD_INIT+i) & BIT_MASK)>>VALUE));
+      }
+      test_srav_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_and: ", i, a0[i], (int)(((int)(ADD_INIT+i) & BIT_MASK)>>VALUE));
+      }
+
      test_pack2(p2, a1);
      for (int i=0; i<ARRLEN/2; i++) {
        errn += verify("test_pack2: ", i, p2[i], ((long)(ADD_INIT+2*i) & 0xFFFFFFFFl) | ((long)(ADD_INIT+2*i+1) << 32));
@ -725,6 +800,84 @@ public class TestIntVect {
    end = System.currentTimeMillis();
    System.out.println("test_srav_on: " + (end - start));

+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_and: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_and: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_and: " + (end - start));
+
    start = System.currentTimeMillis();
    for (int i=0; i<ITERS; i++) {
      test_pack2(p2, a1);
@ -908,6 +1061,26 @@ public class TestIntVect {
      a0[i] = (int)(a1[i]<<b);
    }
  }
+  static void test_sllc_add(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] + ADD_INIT)<<VALUE);
+    }
+  }
+  static void test_sllv_add(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] + b)<<VALUE);
+    }
+  }
+  static void test_sllc_and(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] & BIT_MASK)<<VALUE);
+    }
+  }
+  static void test_sllv_and(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] & b)<<VALUE);
+    }
+  }

  static void test_srlc(int[] a0, int[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
@ -934,6 +1107,26 @@ public class TestIntVect {
      a0[i] = (int)(a1[i]>>>b);
    }
  }
+  static void test_srlc_add(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] + ADD_INIT)>>>VALUE);
+    }
+  }
+  static void test_srlv_add(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] + b)>>>VALUE);
+    }
+  }
+  static void test_srlc_and(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] & BIT_MASK)>>>VALUE);
+    }
+  }
+  static void test_srlv_and(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] & b)>>>VALUE);
+    }
+  }

  static void test_srac(int[] a0, int[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
@ -960,6 +1153,26 @@ public class TestIntVect {
      a0[i] = (int)(a1[i]>>b);
    }
  }
+  static void test_srac_add(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] + ADD_INIT)>>VALUE);
+    }
+  }
+  static void test_srav_add(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] + b)>>VALUE);
+    }
+  }
+  static void test_srac_and(int[] a0, int[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] & BIT_MASK)>>VALUE);
+    }
+  }
+  static void test_srav_and(int[] a0, int[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (int)((a1[i] & b)>>VALUE);
+    }
+  }

  static void test_pack2(long[] p2, int[] a1) {
    if (p2.length*2 > a1.length) return;
--- a/hotspot/test/compiler/6340864/TestLongVect.java
+++ b/hotspot/test/compiler/6340864/TestLongVect.java
@ -73,6 +73,7 @@ public class TestLongVect {
      test_subc(a0, a1);
      test_subv(a0, a1, (long)VALUE);
      test_suba(a0, a1, a2);
+
      test_mulc(a0, a1);
      test_mulv(a0, a1, (long)VALUE);
      test_mula(a0, a1, a2);
@ -85,6 +86,7 @@ public class TestLongVect {
      test_divc_n(a0, a1);
      test_divv(a0, a1, (long)-VALUE);
      test_diva(a0, a1, a3);
+
      test_andc(a0, a1);
      test_andv(a0, a1, (long)BIT_MASK);
      test_anda(a0, a1, a4);
@ -94,30 +96,48 @@ public class TestLongVect {
      test_xorc(a0, a1);
      test_xorv(a0, a1, (long)BIT_MASK);
      test_xora(a0, a1, a4);
+
      test_sllc(a0, a1);
      test_sllv(a0, a1, VALUE);
      test_srlc(a0, a1);
      test_srlv(a0, a1, VALUE);
      test_srac(a0, a1);
      test_srav(a0, a1, VALUE);
+
      test_sllc_n(a0, a1);
      test_sllv(a0, a1, -VALUE);
      test_srlc_n(a0, a1);
      test_srlv(a0, a1, -VALUE);
      test_srac_n(a0, a1);
      test_srav(a0, a1, -VALUE);
+
      test_sllc_o(a0, a1);
      test_sllv(a0, a1, SHIFT);
      test_srlc_o(a0, a1);
      test_srlv(a0, a1, SHIFT);
      test_srac_o(a0, a1);
      test_srav(a0, a1, SHIFT);
+
      test_sllc_on(a0, a1);
      test_sllv(a0, a1, -SHIFT);
      test_srlc_on(a0, a1);
      test_srlv(a0, a1, -SHIFT);
      test_srac_on(a0, a1);
      test_srav(a0, a1, -SHIFT);
+
+      test_sllc_add(a0, a1);
+      test_sllv_add(a0, a1, ADD_INIT);
+      test_srlc_add(a0, a1);
+      test_srlv_add(a0, a1, ADD_INIT);
+      test_srac_add(a0, a1);
+      test_srav_add(a0, a1, ADD_INIT);
+
+      test_sllc_and(a0, a1);
+      test_sllv_and(a0, a1, BIT_MASK);
+      test_srlc_and(a0, a1);
+      test_srlv_and(a0, a1, BIT_MASK);
+      test_srac_and(a0, a1);
+      test_srav_and(a0, a1, BIT_MASK);
    }
    // Test and verify results
    System.out.println("Verification");
@ -354,6 +374,60 @@ public class TestLongVect {
        errn += verify("test_srav_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-SHIFT)));
      }

+      test_sllc_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_add: ", i, a0[i], (long)(((long)(ADD_INIT+i) + ADD_INIT)<<VALUE));
+      }
+      test_sllv_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_add: ", i, a0[i], (long)(((long)(ADD_INIT+i) + ADD_INIT)<<VALUE));
+      }
+
+      test_srlc_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_add: ", i, a0[i], (long)(((long)(ADD_INIT+i) + ADD_INIT)>>>VALUE));
+      }
+      test_srlv_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_add: ", i, a0[i], (long)(((long)(ADD_INIT+i) + ADD_INIT)>>>VALUE));
+      }
+
+      test_srac_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_add: ", i, a0[i], (long)(((long)(ADD_INIT+i) + ADD_INIT)>>VALUE));
+      }
+      test_srav_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_add: ", i, a0[i], (long)(((long)(ADD_INIT+i) + ADD_INIT)>>VALUE));
+      }
+
+      test_sllc_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_and: ", i, a0[i], (long)(((long)(ADD_INIT+i) & BIT_MASK)<<VALUE));
+      }
+      test_sllv_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_and: ", i, a0[i], (long)(((long)(ADD_INIT+i) & BIT_MASK)<<VALUE));
+      }
+
+      test_srlc_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_and: ", i, a0[i], (long)(((long)(ADD_INIT+i) & BIT_MASK)>>>VALUE));
+      }
+      test_srlv_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_and: ", i, a0[i], (long)(((long)(ADD_INIT+i) & BIT_MASK)>>>VALUE));
+      }
+
+      test_srac_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_and: ", i, a0[i], (long)(((long)(ADD_INIT+i) & BIT_MASK)>>VALUE));
+      }
+      test_srav_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_and: ", i, a0[i], (long)(((long)(ADD_INIT+i) & BIT_MASK)>>VALUE));
+      }
+
    }

    if (errn > 0)
@ -696,6 +770,84 @@ public class TestLongVect {
    end = System.currentTimeMillis();
    System.out.println("test_srav_on: " + (end - start));

+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_and: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_and: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_and: " + (end - start));
+
    return errn;
  }

@ -854,6 +1006,26 @@ public class TestLongVect {
      a0[i] = (long)(a1[i]<<b);
    }
  }
+  static void test_sllc_add(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] + ADD_INIT)<<VALUE);
+    }
+  }
+  static void test_sllv_add(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] + b)<<VALUE);
+    }
+  }
+  static void test_sllc_and(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] & BIT_MASK)<<VALUE);
+    }
+  }
+  static void test_sllv_and(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] & b)<<VALUE);
+    }
+  }

  static void test_srlc(long[] a0, long[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
@ -880,6 +1052,26 @@ public class TestLongVect {
      a0[i] = (long)(a1[i]>>>b);
    }
  }
+  static void test_srlc_add(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] + ADD_INIT)>>>VALUE);
+    }
+  }
+  static void test_srlv_add(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] + b)>>>VALUE);
+    }
+  }
+  static void test_srlc_and(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] & BIT_MASK)>>>VALUE);
+    }
+  }
+  static void test_srlv_and(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] & b)>>>VALUE);
+    }
+  }

  static void test_srac(long[] a0, long[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
@ -906,6 +1098,26 @@ public class TestLongVect {
      a0[i] = (long)(a1[i]>>b);
    }
  }
+  static void test_srac_add(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] + ADD_INIT)>>VALUE);
+    }
+  }
+  static void test_srav_add(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] + b)>>VALUE);
+    }
+  }
+  static void test_srac_and(long[] a0, long[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] & BIT_MASK)>>VALUE);
+    }
+  }
+  static void test_srav_and(long[] a0, long[] a1, long b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (long)((a1[i] & b)>>VALUE);
+    }
+  }

  static int verify(String text, int i, long elem, long val) {
    if (elem != val) {
--- a/hotspot/test/compiler/6340864/TestShortVect.java
+++ b/hotspot/test/compiler/6340864/TestShortVect.java
@ -75,6 +75,7 @@ public class TestShortVect {
      test_subc(a0, a1);
      test_subv(a0, a1, (short)VALUE);
      test_suba(a0, a1, a2);
+
      test_mulc(a0, a1);
      test_mulv(a0, a1, (short)VALUE);
      test_mula(a0, a1, a2);
@ -87,6 +88,7 @@ public class TestShortVect {
      test_divc_n(a0, a1);
      test_divv(a0, a1, (short)-VALUE);
      test_diva(a0, a1, a3);
+
      test_andc(a0, a1);
      test_andv(a0, a1, (short)BIT_MASK);
      test_anda(a0, a1, a4);
@ -96,30 +98,49 @@ public class TestShortVect {
      test_xorc(a0, a1);
      test_xorv(a0, a1, (short)BIT_MASK);
      test_xora(a0, a1, a4);
+
      test_sllc(a0, a1);
      test_sllv(a0, a1, VALUE);
      test_srlc(a0, a1);
      test_srlv(a0, a1, VALUE);
      test_srac(a0, a1);
      test_srav(a0, a1, VALUE);
+
      test_sllc_n(a0, a1);
      test_sllv(a0, a1, -VALUE);
      test_srlc_n(a0, a1);
      test_srlv(a0, a1, -VALUE);
      test_srac_n(a0, a1);
      test_srav(a0, a1, -VALUE);
+
      test_sllc_o(a0, a1);
      test_sllv(a0, a1, SHIFT);
      test_srlc_o(a0, a1);
      test_srlv(a0, a1, SHIFT);
      test_srac_o(a0, a1);
      test_srav(a0, a1, SHIFT);
+
      test_sllc_on(a0, a1);
      test_sllv(a0, a1, -SHIFT);
      test_srlc_on(a0, a1);
      test_srlv(a0, a1, -SHIFT);
      test_srac_on(a0, a1);
      test_srav(a0, a1, -SHIFT);
+
+      test_sllc_add(a0, a1);
+      test_sllv_add(a0, a1, ADD_INIT);
+      test_srlc_add(a0, a1);
+      test_srlv_add(a0, a1, ADD_INIT);
+      test_srac_add(a0, a1);
+      test_srav_add(a0, a1, ADD_INIT);
+
+      test_sllc_and(a0, a1);
+      test_sllv_and(a0, a1, BIT_MASK);
+      test_srlc_and(a0, a1);
+      test_srlv_and(a0, a1, BIT_MASK);
+      test_srac_and(a0, a1);
+      test_srav_and(a0, a1, BIT_MASK);
+
      test_pack2(p2, a1);
      test_unpack2(a0, p2);
      test_pack2_swap(p2, a1);
@ -364,6 +385,60 @@ public class TestShortVect {
        errn += verify("test_srav_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-SHIFT)));
      }

+      test_sllc_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_add: ", i, a0[i], (short)(((short)(ADD_INIT+i) + ADD_INIT)<<VALUE));
+      }
+      test_sllv_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_add: ", i, a0[i], (short)(((short)(ADD_INIT+i) + ADD_INIT)<<VALUE));
+      }
+
+      test_srlc_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_add: ", i, a0[i], (short)(((short)(ADD_INIT+i) + ADD_INIT)>>>VALUE));
+      }
+      test_srlv_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_add: ", i, a0[i], (short)(((short)(ADD_INIT+i) + ADD_INIT)>>>VALUE));
+      }
+
+      test_srac_add(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_add: ", i, a0[i], (short)(((short)(ADD_INIT+i) + ADD_INIT)>>VALUE));
+      }
+      test_srav_add(a0, a1, ADD_INIT);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_add: ", i, a0[i], (short)(((short)(ADD_INIT+i) + ADD_INIT)>>VALUE));
+      }
+
+      test_sllc_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllc_and: ", i, a0[i], (short)(((short)(ADD_INIT+i) & BIT_MASK)<<VALUE));
+      }
+      test_sllv_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_sllv_and: ", i, a0[i], (short)(((short)(ADD_INIT+i) & BIT_MASK)<<VALUE));
+      }
+
+      test_srlc_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlc_and: ", i, a0[i], (short)(((short)(ADD_INIT+i) & BIT_MASK)>>>VALUE));
+      }
+      test_srlv_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srlv_and: ", i, a0[i], (short)(((short)(ADD_INIT+i) & BIT_MASK)>>>VALUE));
+      }
+
+      test_srac_and(a0, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srac_and: ", i, a0[i], (short)(((short)(ADD_INIT+i) & BIT_MASK)>>VALUE));
+      }
+      test_srav_and(a0, a1, BIT_MASK);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_srav_and: ", i, a0[i], (short)(((short)(ADD_INIT+i) & BIT_MASK)>>VALUE));
+      }
+
      test_pack2(p2, a1);
      for (int i=0; i<ARRLEN/2; i++) {
        errn += verify("test_pack2: ", i, p2[i], ((int)(ADD_INIT+2*i) & 0xFFFF) | ((int)(ADD_INIT+2*i+1) << 16));
@ -760,6 +835,84 @@ public class TestShortVect {
    end = System.currentTimeMillis();
    System.out.println("test_srav_on: " + (end - start));

+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_add(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_add: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav_add(a0, a1, ADD_INIT);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_add: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllc_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllc_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_sllv_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_sllv_and: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlc_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlc_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srlv_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srlv_and: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srac_and(a0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srac_and: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_srav_and(a0, a1, BIT_MASK);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_srav_and: " + (end - start));
+
    start = System.currentTimeMillis();
    for (int i=0; i<ITERS; i++) {
      test_pack2(p2, a1);
@ -968,6 +1121,26 @@ public class TestShortVect {
      a0[i] = (short)(a1[i]<<b);
    }
  }
+  static void test_sllc_add(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] + ADD_INIT)<<VALUE);
+    }
+  }
+  static void test_sllv_add(short[] a0, short[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] + b)<<VALUE);
+    }
+  }
+  static void test_sllc_and(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] & BIT_MASK)<<VALUE);
+    }
+  }
+  static void test_sllv_and(short[] a0, short[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] & b)<<VALUE);
+    }
+  }

  static void test_srlc(short[] a0, short[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
@ -994,6 +1167,26 @@ public class TestShortVect {
      a0[i] = (short)(a1[i]>>>b);
    }
  }
+  static void test_srlc_add(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] + ADD_INIT)>>>VALUE);
+    }
+  }
+  static void test_srlv_add(short[] a0, short[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] + b)>>>VALUE);
+    }
+  }
+  static void test_srlc_and(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] & BIT_MASK)>>>VALUE);
+    }
+  }
+  static void test_srlv_and(short[] a0, short[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] & b)>>>VALUE);
+    }
+  }

  static void test_srac(short[] a0, short[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
@ -1020,6 +1213,26 @@ public class TestShortVect {
      a0[i] = (short)(a1[i]>>b);
    }
  }
+  static void test_srac_add(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] + ADD_INIT)>>VALUE);
+    }
+  }
+  static void test_srav_add(short[] a0, short[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] + b)>>VALUE);
+    }
+  }
+  static void test_srac_and(short[] a0, short[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] & BIT_MASK)>>VALUE);
+    }
+  }
+  static void test_srav_and(short[] a0, short[] a1, int b) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = (short)((a1[i] & b)>>VALUE);
+    }
+  }

  static void test_pack2(int[] p2, short[] a1) {
    if (p2.length*2 > a1.length) return;
--- a/hotspot/test/compiler/7184394/TestAESBase.java
+++ b/hotspot/test/compiler/7184394/TestAESBase.java
@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @author Tom Deneau
+ */
+
+import javax.crypto.Cipher;
+import javax.crypto.KeyGenerator;
+import javax.crypto.SecretKey;
+import javax.crypto.spec.IvParameterSpec;
+import javax.crypto.spec.SecretKeySpec;
+import java.security.AlgorithmParameters;
+
+import java.util.Random;
+import java.util.Arrays;
+
+abstract public class TestAESBase {
+  int msgSize = Integer.getInteger("msgSize", 646);
+  boolean checkOutput = Boolean.getBoolean("checkOutput");
+  boolean noReinit = Boolean.getBoolean("noReinit");
+  int keySize = Integer.getInteger("keySize", 128);
+  String algorithm = System.getProperty("algorithm", "AES");
+  String mode = System.getProperty("mode", "CBC");
+  byte[] input;
+  byte[] encode;
+  byte[] expectedEncode;
+  byte[] decode;
+  byte[] expectedDecode;
+  Random random = new Random(0);
+  Cipher cipher;
+  Cipher dCipher;
+  String paddingStr = "PKCS5Padding";
+  AlgorithmParameters algParams;
+  SecretKey key;
+  int ivLen;
+
+  static int numThreads = 0;
+  int  threadId;
+  static synchronized int getThreadId() {
+    int id = numThreads;
+    numThreads++;
+    return id;
+  }
+
+  abstract public void run();
+
+  public void prepare() {
+    try {
+    System.out.println("\nmsgSize=" + msgSize + ", key size=" + keySize + ", reInit=" + !noReinit + ", checkOutput=" + checkOutput);
+
+      int keyLenBytes = (keySize == 0 ? 16 : keySize/8);
+      byte keyBytes[] = new byte[keyLenBytes];
+      if (keySize == 128)
+        keyBytes = new byte[] {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7};
+      else
+        random.nextBytes(keyBytes);
+
+      key = new SecretKeySpec(keyBytes, algorithm);
+      if (threadId == 0) {
+        System.out.println("Algorithm: " + key.getAlgorithm() + "("
+                           + key.getEncoded().length * 8 + "bit)");
+      }
+      input = new byte[msgSize];
+      for (int i=0; i<input.length; i++) {
+        input[i] = (byte) (i & 0xff);
+      }
+
+      cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
+      dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
+
+      ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
+      IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
+
+      cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+      algParams = cipher.getParameters();
+      dCipher.init(Cipher.DECRYPT_MODE, key, algParams);
+      if (threadId == 0) {
+        childShowCipher();
+      }
+
+      // do one encode and decode in preparation
+      // this will also create the encode buffer and decode buffer
+      encode = cipher.doFinal(input);
+      decode = dCipher.doFinal(encode);
+      if (checkOutput) {
+        expectedEncode = (byte[]) encode.clone();
+        expectedDecode = (byte[]) decode.clone();
+        showArray(key.getEncoded()  ,  "key:    ");
+        showArray(input,  "input:  ");
+        showArray(encode, "encode: ");
+        showArray(decode, "decode: ");
+      }
+    }
+    catch (Exception e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+  }
+
+  void showArray(byte b[], String name) {
+    System.out.format("%s [%d]: ", name, b.length);
+    for (int i=0; i<Math.min(b.length, 32); i++) {
+      System.out.format("%02x ", b[i] & 0xff);
+    }
+    System.out.println();
+  }
+
+  void compareArrays(byte b[], byte exp[]) {
+    if (b.length != exp.length) {
+      System.out.format("different lengths for actual and expected output arrays\n");
+      showArray(b, "test: ");
+      showArray(exp, "exp : ");
+      System.exit(1);
+    }
+    for (int i=0; i< exp.length; i++) {
+      if (b[i] != exp[i]) {
+        System.out.format("output error at index %d: got %02x, expected %02x\n", i, b[i] & 0xff, exp[i] & 0xff);
+        showArray(b, "test: ");
+        showArray(exp, "exp : ");
+        System.exit(1);
+      }
+    }
+  }
+
+
+  void showCipher(Cipher c, String kind) {
+    System.out.println(kind + " cipher provider: " + cipher.getProvider());
+    System.out.println(kind + " cipher algorithm: " + cipher.getAlgorithm());
+  }
+
+  abstract void childShowCipher();
+}
--- a/hotspot/test/compiler/7184394/TestAESDecode.java
+++ b/hotspot/test/compiler/7184394/TestAESDecode.java
@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @author Tom Deneau
+ */
+
+import javax.crypto.Cipher;
+
+public class TestAESDecode extends TestAESBase {
+  @Override
+  public void run() {
+    try {
+      if (!noReinit) dCipher.init(Cipher.DECRYPT_MODE, key, algParams);
+      if (checkOutput) {
+        // checked version creates new output buffer each time
+        decode = dCipher.doFinal(encode, 0, encode.length);
+        compareArrays(decode, expectedDecode);
+      } else {
+        // non-checked version outputs to existing encode buffer for maximum speed
+        decode = new byte[dCipher.getOutputSize(encode.length)];
+        dCipher.doFinal(encode, 0, encode.length, decode);
+      }
+    }
+    catch (Exception e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+  }
+
+  @Override
+  void childShowCipher() {
+    showCipher(dCipher, "Decryption");
+  }
+
+}
--- a/hotspot/test/compiler/7184394/TestAESEncode.java
+++ b/hotspot/test/compiler/7184394/TestAESEncode.java
@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @author Tom Deneau
+ */
+
+import javax.crypto.Cipher;
+
+public class TestAESEncode extends TestAESBase {
+  @Override
+  public void run() {
+    try {
+      if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+      if (checkOutput) {
+        // checked version creates new output buffer each time
+        encode = cipher.doFinal(input, 0, msgSize);
+        compareArrays(encode, expectedEncode);
+      } else {
+        // non-checked version outputs to existing encode buffer for maximum speed
+        encode = new byte[cipher.getOutputSize(msgSize)];
+        cipher.doFinal(input, 0, msgSize, encode);
+      }
+    }
+    catch (Exception e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+  }
+
+  @Override
+  void childShowCipher() {
+    showCipher(cipher, "Encryption");
+  }
+
+}
--- a/hotspot/test/compiler/7184394/TestAESMain.java
+++ b/hotspot/test/compiler/7184394/TestAESMain.java
@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7184394
+ * @summary add intrinsics to use AES instructions
+ *
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true TestAESMain
+ *
+ * @author Tom Deneau
+ */
+
+public class TestAESMain {
+  public static void main(String[] args) {
+    int iters = (args.length > 0 ? Integer.valueOf(args[0]) : 1000000);
+    System.out.println(iters + " iterations");
+    TestAESEncode etest = new TestAESEncode();
+    etest.prepare();
+    long start = System.nanoTime();
+    for (int i=0; i<iters; i++) {
+      etest.run();
+    }
+    long end = System.nanoTime();
+    System.out.println("TestAESEncode runtime was " + (double)((end - start)/1000000000.0) + " ms");
+
+    TestAESDecode dtest = new TestAESDecode();
+    dtest.prepare();
+    start = System.nanoTime();
+    for (int i=0; i<iters; i++) {
+      dtest.run();
+    }
+    end = System.nanoTime();
+    System.out.println("TestAESDecode runtime was " + (double)((end - start)/1000000000.0) + " ms");
+  }
+}
--- a/hotspot/test/compiler/8000805/Test8000805.java
+++ b/hotspot/test/compiler/8000805/Test8000805.java
@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8000805
+ * @summary JMM issue: short loads are non-atomic
+ *
+ * @run main/othervm -server -XX:-TieredCompilation -Xcomp -XX:+PrintCompilation -XX:CompileOnly=Test8000805.loadS2LmaskFF,Test8000805.loadS2Lmask16,Test8000805.loadS2Lmask13,Test8000805.loadUS_signExt,Test8000805.loadB2L_mask8 Test8000805
+ */
+
+public class Test8000805 {
+    static long loadS2LmaskFF   (short[] sa) { return sa[0] & 0xFF; }
+    static long loadS2LmaskFF_1 (short[] sa) { return sa[0] & 0xFF; }
+
+    static long loadS2Lmask16   (short[] sa) { return sa[0] & 0xFFFE; }
+    static long loadS2Lmask16_1 (short[] sa) { return sa[0] & 0xFFFE; }
+
+    static long loadS2Lmask13   (short[] sa) { return sa[0] & 0x0FFF; }
+    static long loadS2Lmask13_1 (short[] sa) { return sa[0] & 0x0FFF; }
+
+    static int loadUS_signExt   (char[] ca) { return (ca[0] << 16) >> 16; }
+    static int loadUS_signExt_1 (char[] ca) { return (ca[0] << 16) >> 16; }
+
+    static long loadB2L_mask8   (byte[] ba) { return ba[0] & 0x55; }
+    static long loadB2L_mask8_1 (byte[] ba) { return ba[0] & 0x55; }
+
+    public static void main(String[] args) {
+        for (int i = Byte.MIN_VALUE; i < Byte.MAX_VALUE; i++) {
+            byte[] ba = new byte[]  { (byte) i};
+
+            { long v1 = loadB2L_mask8(ba);
+              long v2 = loadB2L_mask8_1(ba);
+              if (v1 != v2)
+              throw new InternalError(String.format("loadB2L_mask8 failed: %x != %x", v1, v2)); }
+        }
+
+        for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) {
+            short[] sa = new short[] { (short)i };
+            char[] ca = new char[] { (char)i };
+
+            { long v1 = loadS2LmaskFF(sa);
+              long v2 = loadS2LmaskFF_1(sa);
+              if (v1 != v2)
+              throw new InternalError(String.format("loadS2LmaskFF failed: %x != %x", v1, v2)); }
+
+            { long v1 = loadS2Lmask16(sa);
+              long v2 = loadS2Lmask16_1(sa);
+              if (v1 != v2)
+              throw new InternalError(String.format("loadS2Lmask16 failed: %x != %x", v1, v2)); }
+
+            { long v1 = loadS2Lmask13(sa);
+              long v2 = loadS2Lmask13_1(sa);
+              if (v1 != v2)
+              throw new InternalError(String.format("loadS2Lmask13 failed: %x != %x", v1, v2)); }
+
+            { int v1 = loadUS_signExt(ca);
+              int v2 = loadUS_signExt_1(ca);
+              if (v1 != v2)
+                throw new InternalError(String.format("loadUS_signExt failed: %x != %x", v1, v2)); }
+        }
+
+        System.out.println("TEST PASSED.");
+    }
+}
--- a/hotspot/test/compiler/8001183/TestCharVect.java
+++ b/hotspot/test/compiler/8001183/TestCharVect.java