diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp index 378c1f8c6e7..6b9677d30c5 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp @@ -1007,6 +1007,67 @@ void Assembler::addss(XMMRegister dst, Address src) { emit_simd_arith(0x58, dst, src, VEX_SIMD_F3); } +void Assembler::aesdec(XMMRegister dst, Address src) { + assert(VM_Version::supports_aes(), ""); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xde); + emit_operand(dst, src); +} + +void Assembler::aesdec(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_aes(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xde); + emit_byte(0xC0 | encode); +} + +void Assembler::aesdeclast(XMMRegister dst, Address src) { + assert(VM_Version::supports_aes(), ""); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdf); + emit_operand(dst, src); +} + +void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_aes(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdf); + emit_byte(0xC0 | encode); +} + +void Assembler::aesenc(XMMRegister dst, Address src) { + assert(VM_Version::supports_aes(), ""); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdc); + emit_operand(dst, src); +} + +void Assembler::aesenc(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_aes(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdc); + emit_byte(0xC0 | encode); +} + +void Assembler::aesenclast(XMMRegister dst, Address src) { + assert(VM_Version::supports_aes(), ""); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdd); 
+ emit_operand(dst, src); +} + +void Assembler::aesenclast(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_aes(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdd); + emit_byte(0xC0 | encode); +} + + void Assembler::andl(Address dst, int32_t imm32) { InstructionMark im(this); prefix(dst); @@ -2307,6 +2368,22 @@ void Assembler::prefix(Prefix p) { a_byte(p); } +void Assembler::pshufb(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0x00); + emit_byte(0xC0 | encode); +} + +void Assembler::pshufb(XMMRegister dst, Address src) { + assert(VM_Version::supports_ssse3(), ""); + assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0x00); + emit_operand(dst, src); +} + void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) { assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); @@ -8067,6 +8144,15 @@ void MacroAssembler::movptr(Address dst, Register src) { LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); } +void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) { + if (reachable(src)) { + Assembler::movdqu(dst, as_Address(src)); + } else { + lea(rscratch1, src); + Assembler::movdqu(dst, Address(rscratch1, 0)); + } +} + void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) { if (reachable(src)) { Assembler::movsd(dst, as_Address(src)); @@ -8357,6 +8443,17 @@ void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) { } } +void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) { + // Shuffles the bytes of dst using a mask in memory; the mask address must be 16-byte aligned unless AVX is in use. 
+ assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); + if (reachable(src)) { + Assembler::pshufb(dst, as_Address(src)); + } else { + lea(rscratch1, src); + Assembler::pshufb(dst, Address(rscratch1, 0)); + } +} + // AVX 3-operands instructions void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp index c936e13f5d8..8a9bbaf424e 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp @@ -875,6 +875,17 @@ private: void addss(XMMRegister dst, Address src); void addss(XMMRegister dst, XMMRegister src); + // AES instructions + void aesdec(XMMRegister dst, Address src); + void aesdec(XMMRegister dst, XMMRegister src); + void aesdeclast(XMMRegister dst, Address src); + void aesdeclast(XMMRegister dst, XMMRegister src); + void aesenc(XMMRegister dst, Address src); + void aesenc(XMMRegister dst, XMMRegister src); + void aesenclast(XMMRegister dst, Address src); + void aesenclast(XMMRegister dst, XMMRegister src); + + void andl(Address dst, int32_t imm32); void andl(Register dst, int32_t imm32); void andl(Register dst, Address src); @@ -1424,6 +1435,10 @@ private: void prefetcht2(Address src); void prefetchw(Address src); + // Shuffle Bytes + void pshufb(XMMRegister dst, XMMRegister src); + void pshufb(XMMRegister dst, Address src); + // Shuffle Packed Doublewords void pshufd(XMMRegister dst, XMMRegister src, int mode); void pshufd(XMMRegister dst, Address src, int mode); @@ -2611,6 +2626,12 @@ public: void divss(XMMRegister dst, Address src) { Assembler::divss(dst, src); } void divss(XMMRegister dst, AddressLiteral src); + // Move Unaligned Double Quadword + void movdqu(Address dst, XMMRegister src) { Assembler::movdqu(dst, src); } + void movdqu(XMMRegister dst, Address src) { Assembler::movdqu(dst, src); } + void movdqu(XMMRegister dst, XMMRegister 
src) { Assembler::movdqu(dst, src); } + void movdqu(XMMRegister dst, AddressLiteral src); + void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); } void movsd(Address dst, XMMRegister src) { Assembler::movsd(dst, src); } void movsd(XMMRegister dst, Address src) { Assembler::movsd(dst, src); } @@ -2658,6 +2679,10 @@ public: void xorps(XMMRegister dst, Address src) { Assembler::xorps(dst, src); } void xorps(XMMRegister dst, AddressLiteral src); + // Shuffle Bytes + void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); } + void pshufb(XMMRegister dst, Address src) { Assembler::pshufb(dst, src); } + void pshufb(XMMRegister dst, AddressLiteral src); // AVX 3-operands instructions void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); } diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp index f149fde83ab..d8b61e0b2fd 100644 --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp @@ -2137,6 +2137,529 @@ class StubGenerator: public StubCodeGenerator { } } + // AES intrinsic stubs + enum {AESBlockSize = 16}; + + address generate_key_shuffle_mask() { + __ align(16); + StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask"); + address start = __ pc(); + __ emit_data(0x00010203, relocInfo::none, 0 ); + __ emit_data(0x04050607, relocInfo::none, 0 ); + __ emit_data(0x08090a0b, relocInfo::none, 0 ); + __ emit_data(0x0c0d0e0f, relocInfo::none, 0 ); + return start; + } + + // Utility routine for loading a 128-bit key word in little endian format + // can optionally specify that the shuffle mask is already in an xmmregister; + // the "no register" default is xnoreg rather than NULL because NULL has the same encoding as xmm0 + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=xnoreg) { + __ movdqu(xmmdst, Address(key, offset)); + if (xmm_shuf_mask != xnoreg) { + __ pshufb(xmmdst, xmm_shuf_mask); + } else { + __ pshufb(xmmdst, 
ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + } + } + + // aesenc using specified key+offset + // can optionally specify that the shuffle mask is already in an xmmregister (xnoreg = load the mask from memory) + void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=xnoreg) { + load_key(xmmtmp, key, offset, xmm_shuf_mask); + __ aesenc(xmmdst, xmmtmp); + } + + // aesdec using specified key+offset + // can optionally specify that the shuffle mask is already in an xmmregister (xnoreg = load the mask from memory) + void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=xnoreg) { + load_key(xmmtmp, key, offset, xmm_shuf_mask); + __ aesdec(xmmdst, xmmtmp); + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_encryptBlock() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); + Label L_doLast; + address start = __ pc(); + + const Register from = rsi; // source array address + const Register to = rdx; // destination array address + const Register key = rcx; // key array address + const Register keylen = rax; + const Address from_param(rbp, 8+0); + const Address to_param (rbp, 8+4); + const Address key_param (rbp, 8+8); + + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + const XMMRegister xmm_key_shuf_mask = xmm2; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + __ push(rsi); + __ movptr(from , from_param); + __ movptr(to , to_param); + __ movptr(key , key_param); + + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + // keylen = # of 32-bit words, convert to 128-bit words + __ shrl(keylen, 2); + __ subl(keylen, 11); // 
every key has at least 11 128-bit words, some have more + + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input + + // For encryption, the java expanded key ordering is just what we need + + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); + __ pxor(xmm_result, xmm_temp); + for (int offset = 0x10; offset <= 0x90; offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); + } + load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); + __ cmpl(keylen, 0); + __ jcc(Assembler::equal, L_doLast); + __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys + aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); + load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); + __ subl(keylen, 2); + __ jcc(Assembler::equal, L_doLast); + __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys + aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); + load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); + + __ BIND(L_doLast); + __ aesenclast(xmm_result, xmm_temp); + __ movdqu(Address(to, 0), xmm_result); // store the result + __ xorptr(rax, rax); // return 0 + __ pop(rsi); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_decryptBlock() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); + Label L_doLast; + address start = __ pc(); + + const Register from = rsi; // source array address + const Register to = rdx; // destination array address + const Register key = rcx; // key array address + const Register keylen = rax; + const Address 
from_param(rbp, 8+0); + const Address to_param (rbp, 8+4); + const Address key_param (rbp, 8+8); + + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + const XMMRegister xmm_key_shuf_mask = xmm2; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + __ push(rsi); + __ movptr(from , from_param); + __ movptr(to , to_param); + __ movptr(key , key_param); + + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + // keylen = # of 32-bit words, convert to 128-bit words + __ shrl(keylen, 2); + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more + + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_result, Address(from, 0)); + + // for decryption java expanded key ordering is rotated one position from what we want + // so we start from 0x10 here and hit 0x00 last + // we don't know if the key is aligned, hence not using load-execute form + load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); + __ pxor (xmm_result, xmm_temp); + for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { + aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); + } + __ cmpl(keylen, 0); + __ jcc(Assembler::equal, L_doLast); + // only in 192 and 256 bit keys + aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); + aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); + __ subl(keylen, 2); + __ jcc(Assembler::equal, L_doLast); + // only in 256 bit keys + aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); + aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); + + __ BIND(L_doLast); + // for decryption the aesdeclast operation is always on key+0x00 + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); + __ aesdeclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, 0), xmm_result); // store the result + + __ xorptr(rax, rax); // return 0 + __ pop(rsi); 
+ __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + void handleSOERegisters(bool saving) { + const int saveFrameSizeInBytes = 4 * wordSize; + const Address saved_rbx (rbp, -3 * wordSize); + const Address saved_rsi (rbp, -2 * wordSize); + const Address saved_rdi (rbp, -1 * wordSize); + + if (saving) { + __ subptr(rsp, saveFrameSizeInBytes); + __ movptr(saved_rsi, rsi); + __ movptr(saved_rdi, rdi); + __ movptr(saved_rbx, rbx); + } else { + // restoring + __ movptr(rsi, saved_rsi); + __ movptr(rdi, saved_rdi); + __ movptr(rbx, saved_rbx); + } + } + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + address generate_cipherBlockChaining_encryptAESCrypt() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); + address start = __ pc(); + + Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; + const Register from = rsi; // source array address + const Register to = rdx; // destination array address + const Register key = rcx; // key array address + const Register rvec = rdi; // r byte array initialized from initvector array address + // and left with the results of the last encryption block + const Register len_reg = rbx; // src len (must be multiple of blocksize 16) + const Register pos = rax; + + // xmm register assignments for the loops below + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + // first 6 keys preloaded into xmm2-xmm7 + const int XMM_REG_NUM_KEY_FIRST = 2; + const int XMM_REG_NUM_KEY_LAST = 7; + const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + + __ enter(); // required for proper 
stackwalking of RuntimeStub frame + handleSOERegisters(true /*saving*/); + + // load registers from incoming parameters + const Address from_param(rbp, 8+0); + const Address to_param (rbp, 8+4); + const Address key_param (rbp, 8+8); + const Address rvec_param (rbp, 8+12); + const Address len_param (rbp, 8+16); + __ movptr(from , from_param); + __ movptr(to , to_param); + __ movptr(key , key_param); + __ movptr(rvec , rvec_param); + __ movptr(len_reg , len_param); + + const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + // load up xmm regs 2 thru 7 with keys 0-5 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); + offset += 0x10; + } + + __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec + + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rax, 44); + __ jcc(Assembler::notEqual, L_key_192_256); + + // 128 bit code follows here + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_128); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0xa0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), 
xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_128); + + __ BIND(L_exit); + __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object + + handleSOERegisters(false /*restoring*/); + __ movl(rax, 0); // return 0 (why?) + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + __ BIND(L_key_192_256); + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + __ cmpl(rax, 52); + __ jcc(Assembler::notEqual, L_key_256); + + // 192-bit code follows here (could be changed to use more xmm registers) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_192); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0xc0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_192); + __ jmp(L_exit); + + __ BIND(L_key_256); + // 256-bit code follows here (could be changed to use more xmm registers) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_256); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the 
current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0xe0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_256); + __ jmp(L_exit); + + return start; + } + + + // CBC AES Decryption. + // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time. + // + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + + address generate_cipherBlockChaining_decryptAESCrypt() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); + address start = __ pc(); + + Label L_exit, L_key_192_256, L_key_256; + Label L_singleBlock_loopTop_128; + Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; + const Register from = rsi; // source array address + const Register to = rdx; // destination array address + const Register key = rcx; // key array address + const Register rvec = rdi; // r byte array initialized from initvector array address + // and left with the results of the last encryption block + const Register len_reg = rbx; // src len (must be multiple of blocksize 16) + const Register pos = rax; + + // xmm register assignments for the loops below + 
const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + // first 6 keys preloaded into xmm2-xmm7 + const int XMM_REG_NUM_KEY_FIRST = 2; + const int XMM_REG_NUM_KEY_LAST = 7; + const int FIRST_NON_REG_KEY_offset = 0x70; + const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + + __ enter(); // required for proper stackwalking of RuntimeStub frame + handleSOERegisters(true /*saving*/); + + // load registers from incoming parameters + const Address from_param(rbp, 8+0); + const Address to_param (rbp, 8+4); + const Address key_param (rbp, 8+8); + const Address rvec_param (rbp, 8+12); + const Address len_param (rbp, 8+16); + __ movptr(from , from_param); + __ movptr(to , to_param); + __ movptr(key , key_param); + __ movptr(rvec , rvec_param); + __ movptr(len_reg , len_param); + + // the java expanded key ordering is rotated one position from what we want + // so we start from 0x10 here and hit 0x00 last + const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + // load up xmm regs 2 thru 7 with the first 6 keys (expanded-key offsets 0x10 thru 0x60) + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); + offset += 0x10; + } + + // inside here, use the rvec register to point to previous block cipher + // with which we xor at the end of each newly decrypted block + // NOTE(review): the previous cipher block is re-read from the source array after the output block has been stored, + // so this presumably relies on source and destination not overlapping -- confirm against the caller + const Register prev_block_cipher_ptr = rvec; + + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rax, 44); + __ jcc(Assembler::notEqual, L_key_192_256); + + + // 128-bit code follows here, one block per iteration (not parallelized) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_128); + __ 
cmpptr(len_reg, 0); // any blocks left?? + __ jcc(Assembler::equal, L_exit); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) { // 128-bit runs up to key offset a0 + aes_dec_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 + __ aesdeclast(xmm_result, xmm_temp); + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jmp(L_singleBlock_loopTop_128); + + + __ BIND(L_exit); + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); + __ movptr(rvec , rvec_param); // restore this since used in loop + __ movdqu(Address(rvec, 0), xmm_temp); // final value of r stored in rvec of CipherBlockChaining object + handleSOERegisters(false /*restoring*/); + __ movl(rax, 0); // return 0 (why?) 
+ __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + + __ BIND(L_key_192_256); + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + __ cmpl(rax, 52); + __ jcc(Assembler::notEqual, L_key_256); + + // 192-bit code follows here (could be optimized to use parallelism) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_192); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) { // 192-bit runs up to key offset c0 + aes_dec_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 + __ aesdeclast(xmm_result, xmm_temp); + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); + __ jmp(L_exit); + + __ BIND(L_key_256); + // 256-bit code follows here (could be optimized to use parallelism) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_256); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ 
aesdec(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) { // 256-bit runs up to key offset e0 + aes_dec_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 + __ aesdeclast(xmm_result, xmm_temp); + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); + __ jmp(L_exit); + + return start; + } + + public: // Information about frame layout at time of blocking runtime call. // Note that we only have to preserve callee-saved registers since @@ -2332,6 +2855,16 @@ class StubGenerator: public StubCodeGenerator { generate_arraycopy_stubs(); generate_math_stubs(); + + // don't bother generating these AES intrinsic stubs unless global flag is set + if (UseAESIntrinsics) { + StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // might be needed by the others + + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); + } } diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp index 8ae595a56da..3e223387c94 100644 --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ 
b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -2941,6 +2941,548 @@ class StubGenerator: public StubCodeGenerator { } } + // AES intrinsic stubs + enum {AESBlockSize = 16}; + + address generate_key_shuffle_mask() { + __ align(16); + StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask"); + address start = __ pc(); + __ emit_data64( 0x0405060700010203, relocInfo::none ); + __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none ); + return start; + } + + // Utility routine for loading a 128-bit key word in little endian format + // can optionally specify that the shuffle mask is already in an xmmregister; + // the "no register" default is xnoreg rather than NULL because NULL has the same encoding as xmm0 + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=xnoreg) { + __ movdqu(xmmdst, Address(key, offset)); + if (xmm_shuf_mask != xnoreg) { + __ pshufb(xmmdst, xmm_shuf_mask); + } else { + __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + } + } + + // aesenc using specified key+offset + // can optionally specify that the shuffle mask is already in an xmmregister (xnoreg = load the mask from memory) + void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=xnoreg) { + load_key(xmmtmp, key, offset, xmm_shuf_mask); + __ aesenc(xmmdst, xmmtmp); + } + + // aesdec using specified key+offset + // can optionally specify that the shuffle mask is already in an xmmregister (xnoreg = load the mask from memory) + void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=xnoreg) { + load_key(xmmtmp, key, offset, xmm_shuf_mask); + __ aesdec(xmmdst, xmmtmp); + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_encryptBlock() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); + Label L_doLast; + 
address start = __ pc(); + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register keylen = rax; + + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + const XMMRegister xmm_key_shuf_mask = xmm2; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + // keylen = # of 32-bit words, convert to 128-bit words + __ shrl(keylen, 2); + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more + + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input + + // For encryption, the java expanded key ordering is just what we need + // we don't know if the key is aligned, hence not using load-execute form + + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); + __ pxor(xmm_result, xmm_temp); + for (int offset = 0x10; offset <= 0x90; offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); + } + load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); + __ cmpl(keylen, 0); + __ jcc(Assembler::equal, L_doLast); + __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys + aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); + load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); + __ subl(keylen, 2); + __ jcc(Assembler::equal, L_doLast); + __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys + aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); + load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); + + __ BIND(L_doLast); + __ aesenclast(xmm_result, xmm_temp); + __ movdqu(Address(to, 0), xmm_result); // store the result + __ xorptr(rax, rax); // return 0 + __ leave(); // required for proper stackwalking of RuntimeStub 
frame + __ ret(0); + + return start; + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_decryptBlock() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); + Label L_doLast; + address start = __ pc(); + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register keylen = rax; + + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + const XMMRegister xmm_key_shuf_mask = xmm2; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + // keylen = # of 32-bit words, convert to 128-bit words + __ shrl(keylen, 2); + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more + + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_result, Address(from, 0)); + + // for decryption java expanded key ordering is rotated one position from what we want + // so we start from 0x10 here and hit 0x00 last + // we don't know if the key is aligned, hence not using load-execute form + load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); + __ pxor (xmm_result, xmm_temp); + for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { + aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); + } + __ cmpl(keylen, 0); + __ jcc(Assembler::equal, L_doLast); + // only in 192 and 256 bit keys + aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); + aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); + __ subl(keylen, 
2); + __ jcc(Assembler::equal, L_doLast); + // only in 256 bit keys + aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); + aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); + + __ BIND(L_doLast); + // for decryption the aesdeclast operation is always on key+0x00 + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); + __ aesdeclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, 0), xmm_result); // store the result + + __ xorptr(rax, rax); // return 0 + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + address generate_cipherBlockChaining_encryptAESCrypt() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); + address start = __ pc(); + + Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register rvec = c_rarg3; // r byte array initialized from initvector array address + // and left with the results of the last encryption block +#ifndef _WIN64 + const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) +#else + const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 + const Register len_reg = r10; // pick the first volatile windows register +#endif + const Register pos = rax; + + // xmm register assignments for the loops below + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + // keys 0-10 preloaded into xmm2-xmm12 + 
const int XMM_REG_NUM_KEY_FIRST = 2; + const int XMM_REG_NUM_KEY_LAST = 12; + const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST); + + __ enter(); // required for proper stackwalking of RuntimeStub frame + +#ifdef _WIN64 + // on win64, fill len_reg from stack position + __ movl(len_reg, len_mem); + // save the xmm registers which must be preserved 6-12 + __ subptr(rsp, -rsp_after_call_off * wordSize); + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(xmm_save(i), as_XMMRegister(i)); + } +#endif + + const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + // load up xmm regs 2 thru 12 with key 0x00 - 0xa0 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); + offset += 0x10; + } + + __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec + + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rax, 44); + __ jcc(Assembler::notEqual, L_key_192_256); + + // 128 bit code follows here + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_128); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + __ aesenclast(xmm_result, xmm_key10); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into 
the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_128); + + __ BIND(L_exit); + __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object + +#ifdef _WIN64 + // restore xmm regs belonging to calling function + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(as_XMMRegister(i), xmm_save(i)); + } +#endif + __ movl(rax, 0); // return 0 (why?) + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + __ BIND(L_key_192_256); + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + __ cmpl(rax, 52); + __ jcc(Assembler::notEqual, L_key_256); + + // 192-bit code follows here (could be changed to use more xmm registers) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_192); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + aes_enc_key(xmm_result, xmm_temp, key, 0xb0); + load_key(xmm_temp, key, 0xc0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_192); + __ jmp(L_exit); + + __ BIND(L_key_256); + // 256-bit code follows here (could be changed to use more xmm registers) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_256); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, 
xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + aes_enc_key(xmm_result, xmm_temp, key, 0xb0); + aes_enc_key(xmm_result, xmm_temp, key, 0xc0); + aes_enc_key(xmm_result, xmm_temp, key, 0xd0); + load_key(xmm_temp, key, 0xe0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_256); + __ jmp(L_exit); + + return start; + } + + + + // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time + // to hide instruction latency + // + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + + address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); + address start = __ pc(); + + Label L_exit, L_key_192_256, L_key_256; + Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128; + Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register rvec = c_rarg3; // r byte array initialized from initvector array address + // and left with the results of the last encryption block +#ifndef _WIN64 + const Register len_reg = c_rarg4; // src len (must be 
multiple of blocksize 16) +#else + const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 + const Register len_reg = r10; // pick the first volatile windows register +#endif + const Register pos = rax; + + // xmm register assignments for the loops below + const XMMRegister xmm_result = xmm0; + // keys 0-10 preloaded into xmm5-xmm15 (rotated: key 0 ends up in xmm15) + const int XMM_REG_NUM_KEY_FIRST = 5; + const int XMM_REG_NUM_KEY_LAST = 15; + const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); + + __ enter(); // required for proper stackwalking of RuntimeStub frame + +#ifdef _WIN64 + // on win64, fill len_reg from stack position + __ movl(len_reg, len_mem); + // save the xmm registers which must be preserved 6-15 + __ subptr(rsp, -rsp_after_call_off * wordSize); + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(xmm_save(i), as_XMMRegister(i)); + } +#endif + // the java expanded key ordering is rotated one position from what we want + // so we start from 0x10 here and hit 0x00 last + const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00; + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); + offset += 0x10; + } + + const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block + // registers holding the four results in the parallelized loop + const XMMRegister xmm_result0 = xmm0; + const XMMRegister xmm_result1 = xmm2; + const XMMRegister xmm_result2 = xmm3; + const XMMRegister xmm_result3 = xmm4; + + __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec + + // now split to 
different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rax, 44); + __ jcc(Assembler::notEqual, L_key_192_256); + + + // 128-bit code follows here, parallelized + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_multiBlock_loopTop_128); + __ cmpptr(len_reg, 4*AESBlockSize); // see if at least 4 blocks left + __ jcc(Assembler::less, L_singleBlock_loopTop_128); + + __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize)); // get next 4 blocks into xmmresult registers + __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize)); + __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize)); + __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize)); + +#define DoFour(opc, src_reg) \ + __ opc(xmm_result0, src_reg); \ + __ opc(xmm_result1, src_reg); \ + __ opc(xmm_result2, src_reg); \ + __ opc(xmm_result3, src_reg); + + DoFour(pxor, xmm_key_first); + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { + DoFour(aesdec, as_XMMRegister(rnum)); + } + DoFour(aesdeclast, xmm_key_last); + // for each result, xor with the r vector of previous cipher block + __ pxor(xmm_result0, xmm_prev_block_cipher); + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize)); + __ pxor(xmm_result1, xmm_prev_block_cipher); + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize)); + __ pxor(xmm_result2, xmm_prev_block_cipher); + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize)); + __ pxor(xmm_result3, xmm_prev_block_cipher); + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize)); // this will carry over to next set of blocks + + __ movdqu(Address(to, pos, Address::times_1, 
0*AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output + __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1); + __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2); + __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3); + + __ addptr(pos, 4*AESBlockSize); + __ subptr(len_reg, 4*AESBlockSize); + __ jmp(L_multiBlock_loopTop_128); + + // registers used in the non-parallelized loops + const XMMRegister xmm_prev_block_cipher_save = xmm2; + const XMMRegister xmm_temp = xmm3; + + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_128); + __ cmpptr(len_reg, 0); // any blocks left?? + __ jcc(Assembler::equal, L_exit); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + __ aesdeclast(xmm_result, xmm_key_last); + __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block + + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jmp(L_singleBlock_loopTop_128); + + + __ BIND(L_exit); + __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object +#ifdef _WIN64 + // restore regs belonging to calling function + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(as_XMMRegister(i), xmm_save(i)); + } +#endif + __ movl(rax, 0); // return 0 (why?) 
+ __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + + __ BIND(L_key_192_256); + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + __ cmpl(rax, 52); + __ jcc(Assembler::notEqual, L_key_256); + + // 192-bit code follows here (could be optimized to use parallelism) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_192); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 192-bit key goes up to c0 + aes_dec_key(xmm_result, xmm_temp, key, 0xc0); + __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 + __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block + + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); + __ jmp(L_exit); + + __ BIND(L_key_256); + // 256-bit code follows here (could be optimized to use parallelism) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_256); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= 
XMM_REG_NUM_KEY_LAST - 1; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 256-bit key goes up to e0 + aes_dec_key(xmm_result, xmm_temp, key, 0xc0); + aes_dec_key(xmm_result, xmm_temp, key, 0xd0); + aes_dec_key(xmm_result, xmm_temp, key, 0xe0); + __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 + __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block + + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); + __ jmp(L_exit); + + return start; + } + + + #undef __ #define __ masm-> @@ -3135,6 +3677,16 @@ class StubGenerator: public StubCodeGenerator { generate_arraycopy_stubs(); generate_math_stubs(); + + // don't bother generating these AES intrinsic stubs unless global flag is set + if (UseAESIntrinsics) { + StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others + + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); + } } public: diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.cpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.cpp index 6ec4121b9e8..cfd4f33a6ca 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.cpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.cpp @@ -44,3 +44,4 @@ address StubRoutines::x86::_verify_mxcsr_entry = NULL; address 
StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = NULL; +address StubRoutines::x86::_key_shuffle_mask_addr = NULL; diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp index 64767c8ad40..d53124fc6c8 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp @@ -41,10 +41,14 @@ class x86 { private: static address _verify_mxcsr_entry; static address _verify_fpu_cntrl_wrd_entry; + // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers + static address _key_shuffle_mask_addr; public: static address verify_mxcsr_entry() { return _verify_mxcsr_entry; } static address verify_fpu_cntrl_wrd_entry() { return _verify_fpu_cntrl_wrd_entry; } + static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } + }; static bool returns_to_call_stub(address return_pc) { return return_pc == _call_stub_return_address; } diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.cpp index 084bbf8fb82..cf8ec5d7b4c 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.cpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.cpp @@ -56,3 +56,4 @@ address StubRoutines::x86::_float_sign_flip = NULL; address StubRoutines::x86::_double_sign_mask = NULL; address StubRoutines::x86::_double_sign_flip = NULL; address StubRoutines::x86::_mxcsr_std = NULL; +address StubRoutines::x86::_key_shuffle_mask_addr = NULL; diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp index 9b9cede4f2d..c3efeecb759 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp @@ -54,6 +54,8 @@ class x86 { static address _double_sign_mask; static address _double_sign_flip; static address _mxcsr_std; + // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers + static address _key_shuffle_mask_addr; 
public: @@ -116,6 +118,9 @@ class x86 { { return _mxcsr_std; } + + static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } + }; #endif // CPU_X86_VM_STUBROUTINES_X86_64_HPP diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp index bf7b3c213f1..182b0ab1a5f 100644 --- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp +++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp @@ -419,13 +419,16 @@ void VM_Version::get_processor_features() { if (UseAVX < 1) _cpuFeatures &= ~CPU_AVX; + if (!UseAES && !FLAG_IS_DEFAULT(UseAES)) + _cpuFeatures &= ~CPU_AES; + if (logical_processors_per_package() == 1) { // HT processor could be installed on a system which doesn't support HT. _cpuFeatures &= ~CPU_HT; } char buf[256]; - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), cpu_family(), _model, _stepping, (supports_cmov() ? ", cmov" : ""), @@ -441,6 +444,7 @@ void VM_Version::get_processor_features() { (supports_popcnt() ? ", popcnt" : ""), (supports_avx() ? ", avx" : ""), (supports_avx2() ? ", avx2" : ""), + (supports_aes() ? ", aes" : ""), (supports_mmx_ext() ? ", mmxext" : ""), (supports_3dnow_prefetch() ? ", 3dnowpref" : ""), (supports_lzcnt() ? ", lzcnt": ""), @@ -472,6 +476,29 @@ void VM_Version::get_processor_features() { if (!supports_avx ()) // Drop to 0 if no AVX support UseAVX = 0; + // Use AES instructions if available. 
+ if (supports_aes()) { + if (FLAG_IS_DEFAULT(UseAES)) { + UseAES = true; + } + } else if (UseAES) { + if (!FLAG_IS_DEFAULT(UseAES)) + warning("AES instructions not available on this CPU"); + FLAG_SET_DEFAULT(UseAES, false); + } + + // The AES intrinsic stubs require AES instruction support (of course) + // but also require AVX mode for misaligned SSE access + if (UseAES && (UseAVX > 0)) { + if (FLAG_IS_DEFAULT(UseAESIntrinsics)) { + UseAESIntrinsics = true; + } + } else if (UseAESIntrinsics) { + if (!FLAG_IS_DEFAULT(UseAESIntrinsics)) + warning("AES intrinsics not available on this CPU"); + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + #ifdef COMPILER2 if (UseFPUForSpilling) { if (UseSSE < 2) { @@ -714,6 +741,9 @@ void VM_Version::get_processor_features() { if (UseAVX > 0) { tty->print(" UseAVX=%d",UseAVX); } + if (UseAES) { + tty->print(" UseAES=1"); + } tty->cr(); tty->print("Allocation"); if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) { diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp index 92cdbd3fd03..12bd3b770d5 100644 --- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp +++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp @@ -78,7 +78,9 @@ public: sse4_2 : 1, : 2, popcnt : 1, - : 3, + : 1, + aes : 1, + : 1, osxsave : 1, avx : 1, : 3; @@ -244,7 +246,8 @@ protected: CPU_TSC = (1 << 15), CPU_TSCINV = (1 << 16), CPU_AVX = (1 << 17), - CPU_AVX2 = (1 << 18) + CPU_AVX2 = (1 << 18), + CPU_AES = (1 << 19) } cpuFeatureFlags; enum { @@ -420,6 +423,8 @@ protected: result |= CPU_TSC; if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0) result |= CPU_TSCINV; + if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0) + result |= CPU_AES; // AMD features. 
if (is_amd()) { @@ -544,6 +549,7 @@ public: static bool supports_avx() { return (_cpuFeatures & CPU_AVX) != 0; } static bool supports_avx2() { return (_cpuFeatures & CPU_AVX2) != 0; } static bool supports_tsc() { return (_cpuFeatures & CPU_TSC) != 0; } + static bool supports_aes() { return (_cpuFeatures & CPU_AES) != 0; } // Intel features static bool is_intel_family_core() { return is_intel() && diff --git a/hotspot/src/cpu/x86/vm/x86.ad b/hotspot/src/cpu/x86/vm/x86.ad index a2cf6f7945d..c49d0e6c31c 100644 --- a/hotspot/src/cpu/x86/vm/x86.ad +++ b/hotspot/src/cpu/x86/vm/x86.ad @@ -4102,9 +4102,158 @@ instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{ // ----------------------- LogicalRightShift ----------------------------------- -// Shorts/Chars vector logical right shift produces incorrect Java result +// Shorts vector logical right shift produces incorrect Java result // for negative data because java code convert short value into int with -// sign extension before a shift. +// sign extension before a shift. But char vectors are fine since chars are +// unsigned values. + +instruct vsrl2S(vecS dst, vecS shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (URShiftVS dst shift)); + format %{ "psrlw $dst,$shift\t! logical right shift packed2S" %} + ins_encode %{ + __ psrlw($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl2S_imm(vecS dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (URShiftVS dst shift)); + format %{ "psrlw $dst,$shift\t! logical right shift packed2S" %} + ins_encode %{ + __ psrlw($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! 
logical right shift packed2S" %} + ins_encode %{ + bool vector256 = false; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed2S" %} + ins_encode %{ + bool vector256 = false; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4S(vecD dst, vecS shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (URShiftVS dst shift)); + format %{ "psrlw $dst,$shift\t! logical right shift packed4S" %} + ins_encode %{ + __ psrlw($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4S_imm(vecD dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (URShiftVS dst shift)); + format %{ "psrlw $dst,$shift\t! logical right shift packed4S" %} + ins_encode %{ + __ psrlw($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed4S" %} + ins_encode %{ + bool vector256 = false; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! 
logical right shift packed4S" %} + ins_encode %{ + bool vector256 = false; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl8S(vecX dst, vecS shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (URShiftVS dst shift)); + format %{ "psrlw $dst,$shift\t! logical right shift packed8S" %} + ins_encode %{ + __ psrlw($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl8S_imm(vecX dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (URShiftVS dst shift)); + format %{ "psrlw $dst,$shift\t! logical right shift packed8S" %} + ins_encode %{ + __ psrlw($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! 
logical right shift packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (URShiftVS src shift)); + format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} // Integers vector logical right shift instruct vsrl2I(vecD dst, vecS shift) %{ diff --git a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp index 1bbdc5afbca..941dd120a01 100644 --- a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp +++ b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp @@ -1844,17 +1844,12 @@ void GraphBuilder::invoke(Bytecodes::Code code) { code == Bytecodes::_invokevirtual && target->is_final_method() || code == Bytecodes::_invokedynamic) { ciMethod* inline_target = (cha_monomorphic_target != NULL) ? 
cha_monomorphic_target : target; - bool success = false; - if (target->is_method_handle_intrinsic()) { - // method handle invokes - success = try_method_handle_inline(target); - } else { - // static binding => check if callee is ok - success = try_inline(inline_target, (cha_monomorphic_target != NULL) || (exact_target != NULL), code, better_receiver); - } - CHECK_BAILOUT(); + // static binding => check if callee is ok + bool success = try_inline(inline_target, (cha_monomorphic_target != NULL) || (exact_target != NULL), code, better_receiver); + CHECK_BAILOUT(); clear_inline_bailout(); + if (success) { // Register dependence if JVMTI has either breakpoint // setting or hotswapping of methods capabilities since they may @@ -3201,6 +3196,11 @@ bool GraphBuilder::try_inline(ciMethod* callee, bool holder_known, Bytecodes::Co return false; } + // method handle invokes + if (callee->is_method_handle_intrinsic()) { + return try_method_handle_inline(callee); + } + // handle intrinsics if (callee->intrinsic_id() != vmIntrinsics::_none) { if (try_inline_intrinsics(callee)) { @@ -3885,10 +3885,14 @@ bool GraphBuilder::try_method_handle_inline(ciMethod* callee) { ValueType* type = state()->stack_at(args_base)->type(); if (type->is_constant()) { ciMethod* target = type->as_ObjectType()->constant_value()->as_method_handle()->get_vmtarget(); - guarantee(!target->is_method_handle_intrinsic(), "should not happen"); // XXX remove - Bytecodes::Code bc = target->is_static() ? Bytecodes::_invokestatic : Bytecodes::_invokevirtual; - if (try_inline(target, /*holder_known*/ true, bc)) { - return true; + // We don't do CHA here so only inline static and statically bindable methods. + if (target->is_static() || target->can_be_statically_bound()) { + Bytecodes::Code bc = target->is_static() ? 
Bytecodes::_invokestatic : Bytecodes::_invokevirtual; + if (try_inline(target, /*holder_known*/ true, bc)) { + return true; + } + } else { + print_inlining(target, "not static or statically bindable", /*success*/ false); } } else { print_inlining(callee, "receiver not constant", /*success*/ false); @@ -3941,9 +3945,14 @@ bool GraphBuilder::try_method_handle_inline(ciMethod* callee) { } j += t->size(); // long and double take two slots } - Bytecodes::Code bc = target->is_static() ? Bytecodes::_invokestatic : Bytecodes::_invokevirtual; - if (try_inline(target, /*holder_known*/ true, bc)) { - return true; + // We don't do CHA here so only inline static and statically bindable methods. + if (target->is_static() || target->can_be_statically_bound()) { + Bytecodes::Code bc = target->is_static() ? Bytecodes::_invokestatic : Bytecodes::_invokevirtual; + if (try_inline(target, /*holder_known*/ true, bc)) { + return true; + } + } else { + print_inlining(target, "not static or statically bindable", /*success*/ false); } } } else { diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp index 06fdb35be19..2febc7b5675 100644 --- a/hotspot/src/share/vm/classfile/vmSymbols.hpp +++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp @@ -110,6 +110,7 @@ template(sun_jkernel_DownloadManager, "sun/jkernel/DownloadManager") \ template(getBootClassPathEntryForClass_name, "getBootClassPathEntryForClass") \ template(sun_misc_PostVMInitHook, "sun/misc/PostVMInitHook") \ + template(sun_misc_Launcher_ExtClassLoader, "sun/misc/Launcher$ExtClassLoader") \ \ /* Java runtime version access */ \ template(sun_misc_Version, "sun/misc/Version") \ @@ -723,6 +724,21 @@ /* java/lang/ref/Reference */ \ do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ \ + /* support for com.sun.crypto.provider.AESCrypt and some of its callers */ \ + do_class(com_sun_crypto_provider_aescrypt, "com/sun/crypto/provider/AESCrypt") \ + 
do_intrinsic(_aescrypt_encryptBlock, com_sun_crypto_provider_aescrypt, encryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \ + do_intrinsic(_aescrypt_decryptBlock, com_sun_crypto_provider_aescrypt, decryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \ + do_name( encryptBlock_name, "encryptBlock") \ + do_name( decryptBlock_name, "decryptBlock") \ + do_signature(byteArray_int_byteArray_int_signature, "([BI[BI)V") \ + \ + do_class(com_sun_crypto_provider_cipherBlockChaining, "com/sun/crypto/provider/CipherBlockChaining") \ + do_intrinsic(_cipherBlockChaining_encryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, encrypt_name, byteArray_int_int_byteArray_int_signature, F_R) \ + do_intrinsic(_cipherBlockChaining_decryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, decrypt_name, byteArray_int_int_byteArray_int_signature, F_R) \ + do_name( encrypt_name, "encrypt") \ + do_name( decrypt_name, "decrypt") \ + do_signature(byteArray_int_int_byteArray_int_signature, "([BII[BI)V") \ + \ /* support for sun.misc.Unsafe */ \ do_class(sun_misc_Unsafe, "sun/misc/Unsafe") \ \ diff --git a/hotspot/src/share/vm/oops/method.cpp b/hotspot/src/share/vm/oops/method.cpp index 5a1032f771e..9849829ea29 100644 --- a/hotspot/src/share/vm/oops/method.cpp +++ b/hotspot/src/share/vm/oops/method.cpp @@ -1155,8 +1155,12 @@ methodHandle Method::clone_with_new_data(methodHandle m, u_char* new_code, int n vmSymbols::SID Method::klass_id_for_intrinsics(Klass* holder) { // if loader is not the default loader (i.e., != NULL), we can't know the intrinsics // because we are not loading from core libraries - if (InstanceKlass::cast(holder)->class_loader() != NULL) + // exception: the AES intrinsics come from lib/ext/sunjce_provider.jar + // which does not use the class default class loader so we check for its loader here + if ((InstanceKlass::cast(holder)->class_loader() != NULL) && + InstanceKlass::cast(holder)->class_loader()->klass()->name() != 
vmSymbols::sun_misc_Launcher_ExtClassLoader()) { return vmSymbols::NO_SID; // regardless of name, no intrinsics here + } // see if the klass name is well-known: Symbol* klass_name = InstanceKlass::cast(holder)->name(); diff --git a/hotspot/src/share/vm/opto/c2_globals.hpp b/hotspot/src/share/vm/opto/c2_globals.hpp index 80996a5d713..4fdebf526cb 100644 --- a/hotspot/src/share/vm/opto/c2_globals.hpp +++ b/hotspot/src/share/vm/opto/c2_globals.hpp @@ -439,6 +439,9 @@ product(bool, DoEscapeAnalysis, true, \ "Perform escape analysis") \ \ + develop(bool, ExitEscapeAnalysisOnTimeout, true, \ + "Exit or throw assert in EA when it reaches time limit") \ + \ notproduct(bool, PrintEscapeAnalysis, false, \ "Print the results of escape analysis") \ \ diff --git a/hotspot/src/share/vm/opto/callGenerator.cpp b/hotspot/src/share/vm/opto/callGenerator.cpp index 547096b3dfc..93f2b859ba0 100644 --- a/hotspot/src/share/vm/opto/callGenerator.cpp +++ b/hotspot/src/share/vm/opto/callGenerator.cpp @@ -670,6 +670,129 @@ CallGenerator* CallGenerator::for_method_handle_inline(JVMState* jvms, ciMethod* } +//------------------------PredictedIntrinsicGenerator------------------------------ +// Internal class which handles all predicted Intrinsic calls. 
+class PredictedIntrinsicGenerator : public CallGenerator { + CallGenerator* _intrinsic; + CallGenerator* _cg; + +public: + PredictedIntrinsicGenerator(CallGenerator* intrinsic, + CallGenerator* cg) + : CallGenerator(cg->method()) + { + _intrinsic = intrinsic; + _cg = cg; + } + + virtual bool is_virtual() const { return true; } + virtual bool is_inlined() const { return true; } + virtual bool is_intrinsic() const { return true; } + + virtual JVMState* generate(JVMState* jvms); +}; + + +CallGenerator* CallGenerator::for_predicted_intrinsic(CallGenerator* intrinsic, + CallGenerator* cg) { + return new PredictedIntrinsicGenerator(intrinsic, cg); +} + + +JVMState* PredictedIntrinsicGenerator::generate(JVMState* jvms) { + GraphKit kit(jvms); + PhaseGVN& gvn = kit.gvn(); + + CompileLog* log = kit.C->log(); + if (log != NULL) { + log->elem("predicted_intrinsic bci='%d' method='%d'", + jvms->bci(), log->identify(method())); + } + + Node* slow_ctl = _intrinsic->generate_predicate(kit.sync_jvms()); + if (kit.failing()) + return NULL; // might happen because of NodeCountInliningCutoff + + SafePointNode* slow_map = NULL; + JVMState* slow_jvms; + if (slow_ctl != NULL) { + PreserveJVMState pjvms(&kit); + kit.set_control(slow_ctl); + if (!kit.stopped()) { + slow_jvms = _cg->generate(kit.sync_jvms()); + if (kit.failing()) + return NULL; // might happen because of NodeCountInliningCutoff + assert(slow_jvms != NULL, "must be"); + kit.add_exception_states_from(slow_jvms); + kit.set_map(slow_jvms->map()); + if (!kit.stopped()) + slow_map = kit.stop(); + } + } + + if (kit.stopped()) { + // Predicate is always false. + kit.set_jvms(slow_jvms); + return kit.transfer_exceptions_into_jvms(); + } + + // Generate intrinsic code: + JVMState* new_jvms = _intrinsic->generate(kit.sync_jvms()); + if (new_jvms == NULL) { + // Intrinsic failed, so use slow code or make a direct call. 
+ if (slow_map == NULL) { + CallGenerator* cg = CallGenerator::for_direct_call(method()); + new_jvms = cg->generate(kit.sync_jvms()); + } else { + kit.set_jvms(slow_jvms); + return kit.transfer_exceptions_into_jvms(); + } + } + kit.add_exception_states_from(new_jvms); + kit.set_jvms(new_jvms); + + // Need to merge slow and fast? + if (slow_map == NULL) { + // The fast path is the only path remaining. + return kit.transfer_exceptions_into_jvms(); + } + + if (kit.stopped()) { + // Intrinsic method threw an exception, so it's just the slow path after all. + kit.set_jvms(slow_jvms); + return kit.transfer_exceptions_into_jvms(); + } + + // Finish the diamond. + kit.C->set_has_split_ifs(true); // Has chance for split-if optimization + RegionNode* region = new (kit.C) RegionNode(3); + region->init_req(1, kit.control()); + region->init_req(2, slow_map->control()); + kit.set_control(gvn.transform(region)); + Node* iophi = PhiNode::make(region, kit.i_o(), Type::ABIO); + iophi->set_req(2, slow_map->i_o()); + kit.set_i_o(gvn.transform(iophi)); + kit.merge_memory(slow_map->merged_memory(), region, 2); + uint tos = kit.jvms()->stkoff() + kit.sp(); + uint limit = slow_map->req(); + for (uint i = TypeFunc::Parms; i < limit; i++) { + // Skip unused stack slots; fast forward to monoff(); + if (i == tos) { + i = kit.jvms()->monoff(); + if( i >= limit ) break; + } + Node* m = kit.map()->in(i); + Node* n = slow_map->in(i); + if (m != n) { + const Type* t = gvn.type(m)->meet(gvn.type(n)); + Node* phi = PhiNode::make(region, m, t); + phi->set_req(2, n); + kit.map()->set_req(i, gvn.transform(phi)); + } + } + return kit.transfer_exceptions_into_jvms(); +} + //-------------------------UncommonTrapCallGenerator----------------------------- // Internal class which handles all out-of-line calls checking receiver type. 
class UncommonTrapCallGenerator : public CallGenerator { diff --git a/hotspot/src/share/vm/opto/callGenerator.hpp b/hotspot/src/share/vm/opto/callGenerator.hpp index 3cfd39df63e..ae59173bf7d 100644 --- a/hotspot/src/share/vm/opto/callGenerator.hpp +++ b/hotspot/src/share/vm/opto/callGenerator.hpp @@ -143,6 +143,9 @@ class CallGenerator : public ResourceObj { // Registry for intrinsics: static CallGenerator* for_intrinsic(ciMethod* m); static void register_intrinsic(ciMethod* m, CallGenerator* cg); + static CallGenerator* for_predicted_intrinsic(CallGenerator* intrinsic, + CallGenerator* cg); + virtual Node* generate_predicate(JVMState* jvms) { return NULL; }; static void print_inlining(ciMethod* callee, int inline_level, int bci, const char* msg) { if (PrintInlining) diff --git a/hotspot/src/share/vm/opto/compile.cpp b/hotspot/src/share/vm/opto/compile.cpp index d870872a9d4..c4da70ee32c 100644 --- a/hotspot/src/share/vm/opto/compile.cpp +++ b/hotspot/src/share/vm/opto/compile.cpp @@ -3047,9 +3047,9 @@ bool Compile::Constant::operator==(const Constant& other) { case T_LONG: case T_DOUBLE: return (_v._value.j == other._v._value.j); case T_OBJECT: - case T_METADATA: return (_v._metadata == other._v._metadata); case T_ADDRESS: return (_v._value.l == other._v._value.l); case T_VOID: return (_v._value.l == other._v._value.l); // jump-table entries + case T_METADATA: return (_v._metadata == other._v._metadata); default: ShouldNotReachHere(); } return false; diff --git a/hotspot/src/share/vm/opto/compile.hpp b/hotspot/src/share/vm/opto/compile.hpp index 6bcf3d3da20..44bf277689c 100644 --- a/hotspot/src/share/vm/opto/compile.hpp +++ b/hotspot/src/share/vm/opto/compile.hpp @@ -149,7 +149,7 @@ class Compile : public Phase { private: BasicType _type; union { - jvalue _value; + jvalue _value; Metadata* _metadata; } _v; int _offset; // offset of this constant (in bytes) relative to the constant table base. 
diff --git a/hotspot/src/share/vm/opto/doCall.cpp b/hotspot/src/share/vm/opto/doCall.cpp index 30a01f34b3a..95d14884163 100644 --- a/hotspot/src/share/vm/opto/doCall.cpp +++ b/hotspot/src/share/vm/opto/doCall.cpp @@ -107,7 +107,17 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool // intrinsics handle strict f.p. correctly. if (allow_inline && allow_intrinsics) { CallGenerator* cg = find_intrinsic(callee, call_is_virtual); - if (cg != NULL) return cg; + if (cg != NULL) { + if (cg->is_predicted()) { + // Code without intrinsic but, hopefully, inlined. + CallGenerator* inline_cg = this->call_generator(callee, + vtable_index, call_is_virtual, jvms, allow_inline, prof_factor, false); + if (inline_cg != NULL) { + cg = CallGenerator::for_predicted_intrinsic(cg, inline_cg); + } + } + return cg; + } } // Do method handle calls. diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp index 9fd3180505d..a5aa47119da 100644 --- a/hotspot/src/share/vm/opto/escape.cpp +++ b/hotspot/src/share/vm/opto/escape.cpp @@ -893,12 +893,16 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { arg_has_oops && (i > TypeFunc::Parms); #ifdef ASSERT if (!(is_arraycopy || - call->as_CallLeaf()->_name != NULL && - (strcmp(call->as_CallLeaf()->_name, "g1_wb_pre") == 0 || - strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 )) - ) { + (call->as_CallLeaf()->_name != NULL && + (strcmp(call->as_CallLeaf()->_name, "g1_wb_pre") == 0 || + strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 || + strcmp(call->as_CallLeaf()->_name, "aescrypt_encryptBlock") == 0 || + strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 || + strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0) + ))) { call->dump(); - assert(false, "EA: unexpected CallLeaf"); + fatal(err_msg_res("EA unexpected CallLeaf %s", 
call->as_CallLeaf()->_name)); } #endif // Always process arraycopy's destination object since @@ -1080,7 +1084,7 @@ bool ConnectionGraph::complete_connection_graph( C->log()->text("%s", (iterations >= CG_BUILD_ITER_LIMIT) ? "iterations" : "time"); C->log()->end_elem(" limit'"); } - assert(false, err_msg_res("infinite EA connection graph build (%f sec, %d iterations) with %d nodes and worklist size %d", + assert(ExitEscapeAnalysisOnTimeout, err_msg_res("infinite EA connection graph build (%f sec, %d iterations) with %d nodes and worklist size %d", time.seconds(), iterations, nodes_size(), ptnodes_worklist.length())); // Possible infinite build_connection_graph loop, // bailout (no changes to ideal graph were made). diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp index f34df79b476..6b90061ff4f 100644 --- a/hotspot/src/share/vm/opto/library_call.cpp +++ b/hotspot/src/share/vm/opto/library_call.cpp @@ -44,18 +44,22 @@ class LibraryIntrinsic : public InlineCallGenerator { public: private: bool _is_virtual; + bool _is_predicted; vmIntrinsics::ID _intrinsic_id; public: - LibraryIntrinsic(ciMethod* m, bool is_virtual, vmIntrinsics::ID id) + LibraryIntrinsic(ciMethod* m, bool is_virtual, bool is_predicted, vmIntrinsics::ID id) : InlineCallGenerator(m), _is_virtual(is_virtual), + _is_predicted(is_predicted), _intrinsic_id(id) { } virtual bool is_intrinsic() const { return true; } virtual bool is_virtual() const { return _is_virtual; } + virtual bool is_predicted() const { return _is_predicted; } virtual JVMState* generate(JVMState* jvms); + virtual Node* generate_predicate(JVMState* jvms); vmIntrinsics::ID intrinsic_id() const { return _intrinsic_id; } }; @@ -83,6 +87,7 @@ class LibraryCallKit : public GraphKit { int arg_size() const { return callee()->arg_size(); } bool try_to_inline(); + Node* try_to_predicate(); // Helper functions to inline natives void push_result(RegionNode* region, PhiNode* value); @@ -148,6 +153,7 
@@ class LibraryCallKit : public GraphKit { CallJavaNode* generate_method_call_virtual(vmIntrinsics::ID method_id) { return generate_method_call(method_id, true, false); } + Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static); Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2); Node* make_string_method_node(int opcode, Node* str1, Node* str2); @@ -253,6 +259,10 @@ class LibraryCallKit : public GraphKit { bool inline_reverseBytes(vmIntrinsics::ID id); bool inline_reference_get(); + bool inline_aescrypt_Block(vmIntrinsics::ID id); + bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id); + Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); + Node* get_key_start_from_aescrypt_object(Node* aescrypt_object); }; @@ -306,6 +316,8 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { } } + bool is_predicted = false; + switch (id) { case vmIntrinsics::_compareTo: if (!SpecialStringCompareTo) return NULL; @@ -413,6 +425,18 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { break; #endif + case vmIntrinsics::_aescrypt_encryptBlock: + case vmIntrinsics::_aescrypt_decryptBlock: + if (!UseAESIntrinsics) return NULL; + break; + + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + if (!UseAESIntrinsics) return NULL; + // these two require the predicated logic + is_predicted = true; + break; + default: assert(id <= vmIntrinsics::LAST_COMPILER_INLINE, "caller responsibility"); assert(id != vmIntrinsics::_Object_init && id != vmIntrinsics::_invoke, "enum out of order?"); @@ -444,7 +468,7 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { if (!InlineUnsafeOps) return NULL; } - return new LibraryIntrinsic(m, is_virtual, (vmIntrinsics::ID) id); + return new LibraryIntrinsic(m, is_virtual, 
is_predicted, (vmIntrinsics::ID) id); } //----------------------register_library_intrinsics----------------------- @@ -496,6 +520,47 @@ JVMState* LibraryIntrinsic::generate(JVMState* jvms) { return NULL; } +Node* LibraryIntrinsic::generate_predicate(JVMState* jvms) { + LibraryCallKit kit(jvms, this); + Compile* C = kit.C; + int nodes = C->unique(); +#ifndef PRODUCT + assert(is_predicted(), "sanity"); + if ((PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) && Verbose) { + char buf[1000]; + const char* str = vmIntrinsics::short_name_as_C_string(intrinsic_id(), buf, sizeof(buf)); + tty->print_cr("Predicate for intrinsic %s", str); + } +#endif + + Node* slow_ctl = kit.try_to_predicate(); + if (!kit.failing()) { + if (C->log()) { + C->log()->elem("predicate_intrinsic id='%s'%s nodes='%d'", + vmIntrinsics::name_at(intrinsic_id()), + (is_virtual() ? " virtual='1'" : ""), + C->unique() - nodes); + } + return slow_ctl; // Could be NULL if the check folds. + } + + // The intrinsic bailed out + if (PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) { + if (jvms->has_method()) { + // Not a root compile. + const char* msg = "failed to generate predicate for intrinsic"; + CompileTask::print_inlining(kit.callee(), jvms->depth() - 1, kit.bci(), msg); + } else { + // Root compile + tty->print("Did not generate predicate for intrinsic %s%s at bci:%d in", + vmIntrinsics::name_at(intrinsic_id()), + (is_virtual() ? 
" (virtual)" : ""), kit.bci()); + } + } + C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_failed); + return NULL; +} + bool LibraryCallKit::try_to_inline() { // Handle symbolic names for otherwise undistinguished boolean switches: const bool is_store = true; @@ -767,6 +832,14 @@ bool LibraryCallKit::try_to_inline() { case vmIntrinsics::_Reference_get: return inline_reference_get(); + case vmIntrinsics::_aescrypt_encryptBlock: + case vmIntrinsics::_aescrypt_decryptBlock: + return inline_aescrypt_Block(intrinsic_id()); + + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt(intrinsic_id()); + default: // If you get here, it may be that someone has added a new intrinsic // to the list in vmSymbols.hpp without implementing it here. @@ -780,6 +853,36 @@ bool LibraryCallKit::try_to_inline() { } } +Node* LibraryCallKit::try_to_predicate() { + if (!jvms()->has_method()) { + // Root JVMState has a null method. + assert(map()->memory()->Opcode() == Op_Parm, ""); + // Insert the memory aliasing node + set_all_memory(reset_memory()); + } + assert(merged_memory(), ""); + + switch (intrinsic_id()) { + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt_predicate(false); + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt_predicate(true); + + default: + // If you get here, it may be that someone has added a new intrinsic + // to the list in vmSymbols.hpp without implementing it here. 
+#ifndef PRODUCT + if ((PrintMiscellaneous && (Verbose || WizardMode)) || PrintOpto) { + tty->print_cr("*** Warning: Unimplemented predicate for intrinsic %s(%d)", + vmIntrinsics::name_at(intrinsic_id()), intrinsic_id()); + } +#endif + Node* slow_ctl = control(); + set_control(top()); // No fast path intrinsic + return slow_ctl; + } +} + //------------------------------push_result------------------------------ // Helper function for finishing intrinsics. void LibraryCallKit::push_result(RegionNode* region, PhiNode* value) { @@ -3830,7 +3933,7 @@ Node* LibraryCallKit::generate_virtual_guard(Node* obj_klass, vtable_index*vtableEntry::size()) * wordSize + vtableEntry::method_offset_in_bytes(); Node* entry_addr = basic_plus_adr(obj_klass, entry_offset); - Node* target_call = make_load(NULL, entry_addr, TypeInstPtr::NOTNULL, T_OBJECT); + Node* target_call = make_load(NULL, entry_addr, TypePtr::NOTNULL, T_ADDRESS); // Compare the target method with the expected method (e.g., Object.hashCode). const TypePtr* native_call_addr = TypeMetadataPtr::make(method); @@ -5613,3 +5716,265 @@ bool LibraryCallKit::inline_reference_get() { push(result); return true; } + + +Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, + bool is_exact=true, bool is_static=false) { + + const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr(); + assert(tinst != NULL, "obj is null"); + assert(tinst->klass()->is_loaded(), "obj is not loaded"); + assert(!is_exact || tinst->klass_is_exact(), "klass not exact"); + + ciField* field = tinst->klass()->as_instance_klass()->get_field_by_name(ciSymbol::make(fieldName), + ciSymbol::make(fieldTypeString), + is_static); + if (field == NULL) return (Node *) NULL; + assert (field != NULL, "undefined field"); + + // Next code copied from Parse::do_get_xxx(): + + // Compute address and memory type. 
+ int offset = field->offset_in_bytes(); + bool is_vol = field->is_volatile(); + ciType* field_klass = field->type(); + assert(field_klass->is_loaded(), "should be loaded"); + const TypePtr* adr_type = C->alias_type(field)->adr_type(); + Node *adr = basic_plus_adr(fromObj, fromObj, offset); + BasicType bt = field->layout_type(); + + // Build the resultant type of the load + const Type *type = TypeOopPtr::make_from_klass(field_klass->as_klass()); + + // Build the load. + Node* loadedField = make_load(NULL, adr, type, bt, adr_type, is_vol); + return loadedField; +} + + +//------------------------------inline_aescrypt_Block----------------------- +bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) { + address stubAddr; + const char *stubName; + assert(UseAES, "need AES instruction support"); + + switch(id) { + case vmIntrinsics::_aescrypt_encryptBlock: + stubAddr = StubRoutines::aescrypt_encryptBlock(); + stubName = "aescrypt_encryptBlock"; + break; + case vmIntrinsics::_aescrypt_decryptBlock: + stubAddr = StubRoutines::aescrypt_decryptBlock(); + stubName = "aescrypt_decryptBlock"; + break; + } + if (stubAddr == NULL) return false; + + // Restore the stack and pop off the arguments. + int nargs = 5; // this + 2 oop/offset combos + assert(callee()->signature()->size() == nargs-1, "encryptBlock has 4 arguments"); + + Node *aescrypt_object = argument(0); + Node *src = argument(1); + Node *src_offset = argument(2); + Node *dest = argument(3); + Node *dest_offset = argument(4); + + // (1) src and dest are arrays. + const Type* src_type = src->Value(&_gvn); + const Type* dest_type = dest->Value(&_gvn); + const TypeAryPtr* top_src = src_type->isa_aryptr(); + const TypeAryPtr* top_dest = dest_type->isa_aryptr(); + assert (top_src != NULL && top_src->klass() != NULL && top_dest != NULL && top_dest->klass() != NULL, "args are strange"); + + // for the quick and dirty code we will skip all the checks. + // we are just trying to get the call to be generated. 
+ Node* src_start = src; + Node* dest_start = dest; + if (src_offset != NULL || dest_offset != NULL) { + assert(src_offset != NULL && dest_offset != NULL, ""); + src_start = array_element_address(src, src_offset, T_BYTE); + dest_start = array_element_address(dest, dest_offset, T_BYTE); + } + + // now need to get the start of its expanded key array + // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java + Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object); + if (k_start == NULL) return false; + + // Call the stub. + make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start); + + return true; +} + +//------------------------------inline_cipherBlockChaining_AESCrypt----------------------- +bool LibraryCallKit::inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id) { + address stubAddr; + const char *stubName; + + assert(UseAES, "need AES instruction support"); + + switch(id) { + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + stubAddr = StubRoutines::cipherBlockChaining_encryptAESCrypt(); + stubName = "cipherBlockChaining_encryptAESCrypt"; + break; + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + stubAddr = StubRoutines::cipherBlockChaining_decryptAESCrypt(); + stubName = "cipherBlockChaining_decryptAESCrypt"; + break; + } + if (stubAddr == NULL) return false; + + + // Restore the stack and pop off the arguments. + int nargs = 6; // this + oop/offset + len + oop/offset + assert(callee()->signature()->size() == nargs-1, "wrong number of arguments"); + Node *cipherBlockChaining_object = argument(0); + Node *src = argument(1); + Node *src_offset = argument(2); + Node *len = argument(3); + Node *dest = argument(4); + Node *dest_offset = argument(5); + + // (1) src and dest are arrays. 
+ const Type* src_type = src->Value(&_gvn); + const Type* dest_type = dest->Value(&_gvn); + const TypeAryPtr* top_src = src_type->isa_aryptr(); + const TypeAryPtr* top_dest = dest_type->isa_aryptr(); + assert (top_src != NULL && top_src->klass() != NULL + && top_dest != NULL && top_dest->klass() != NULL, "args are strange"); + + // checks are the responsibility of the caller + Node* src_start = src; + Node* dest_start = dest; + if (src_offset != NULL || dest_offset != NULL) { + assert(src_offset != NULL && dest_offset != NULL, ""); + src_start = array_element_address(src, src_offset, T_BYTE); + dest_start = array_element_address(dest, dest_offset, T_BYTE); + } + + // if we are in this set of code, we "know" the embeddedCipher is an AESCrypt object + // (because of the predicated logic executed earlier). + // so we cast it here safely. + // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java + + Node* embeddedCipherObj = load_field_from_object(cipherBlockChaining_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); + if (embeddedCipherObj == NULL) return false; + + // cast it to what we know it will be at runtime + const TypeInstPtr* tinst = _gvn.type(cipherBlockChaining_object)->isa_instptr(); + assert(tinst != NULL, "CBC obj is null"); + assert(tinst->klass()->is_loaded(), "CBC obj is not loaded"); + ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); + if (!klass_AESCrypt->is_loaded()) return false; + + ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); + const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt); + const TypeOopPtr* xtype = aklass->as_instance_type(); + Node* aescrypt_object = new(C) CheckCastPPNode(control(), embeddedCipherObj, xtype); + aescrypt_object = _gvn.transform(aescrypt_object); + + // we need to get the start of the aescrypt_object's 
expanded key array + Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object); + if (k_start == NULL) return false; + + // similarly, get the start address of the r vector + Node* objRvec = load_field_from_object(cipherBlockChaining_object, "r", "[B", /*is_exact*/ false); + if (objRvec == NULL) return false; + Node* r_start = array_element_address(objRvec, intcon(0), T_BYTE); + + // Call the stub, passing src_start, dest_start, k_start, r_start and src_len + make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::cipherBlockChaining_aescrypt_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start, r_start, len); + + // return is void so no result needs to be pushed + + return true; +} + +//------------------------------get_key_start_from_aescrypt_object----------------------- +Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) { + Node* objAESCryptKey = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false); + assert (objAESCryptKey != NULL, "wrong version of com.sun.crypto.provider.AESCrypt"); + if (objAESCryptKey == NULL) return (Node *) NULL; + + // now have the array, need to get the start address of the K array + Node* k_start = array_element_address(objAESCryptKey, intcon(0), T_INT); + return k_start; +} + +//----------------------------inline_cipherBlockChaining_AESCrypt_predicate---------------------------- +// Return node representing slow path of predicate check. +// the pseudo code we want to emulate with this predicate is: +// for encryption: +// if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath +// for decryption: +// if ((embeddedCipherObj instanceof AESCrypt) && (cipher!=plain)) do_intrinsic, else do_javapath +// note cipher==plain is more conservative than the original java code but that's OK +// +Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting) { + // First, check receiver for NULL since it is virtual method. 
+ int nargs = arg_size(); + Node* objCBC = argument(0); + _sp += nargs; + objCBC = do_null_check(objCBC, T_OBJECT); + _sp -= nargs; + + if (stopped()) return NULL; // Always NULL + + // Load embeddedCipher field of CipherBlockChaining object. + Node* embeddedCipherObj = load_field_from_object(objCBC, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); + + // get AESCrypt klass for instanceOf check + // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point + // will have same classloader as CipherBlockChaining object + const TypeInstPtr* tinst = _gvn.type(objCBC)->isa_instptr(); + assert(tinst != NULL, "CBCobj is null"); + assert(tinst->klass()->is_loaded(), "CBCobj is not loaded"); + + // we want to do an instanceof comparison against the AESCrypt class + ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); + if (!klass_AESCrypt->is_loaded()) { + // if AESCrypt is not even loaded, we never take the intrinsic fast path + Node* ctrl = control(); + set_control(top()); // no regular fast path + return ctrl; + } + ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); + + _sp += nargs; // gen_instanceof might do an uncommon trap + Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt))); + _sp -= nargs; + Node* cmp_instof = _gvn.transform(new (C) CmpINode(instof, intcon(1))); + Node* bool_instof = _gvn.transform(new (C) BoolNode(cmp_instof, BoolTest::ne)); + + Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN); + + // for encryption, we are done + if (!decrypting) + return instof_false; // even if it is NULL + + // for decryption, we need to add a further check to avoid + // taking the intrinsic path when cipher and plain are the same + // see the original java code for why. 
+ RegionNode* region = new(C) RegionNode(3); + region->init_req(1, instof_false); + Node* src = argument(1); + Node *dest = argument(4); + Node* cmp_src_dest = _gvn.transform(new (C) CmpPNode(src, dest)); + Node* bool_src_dest = _gvn.transform(new (C) BoolNode(cmp_src_dest, BoolTest::eq)); + Node* src_dest_conjoint = generate_guard(bool_src_dest, NULL, PROB_MIN); + region->init_req(2, src_dest_conjoint); + + record_for_igvn(region); + return _gvn.transform(region); + +} + + diff --git a/hotspot/src/share/vm/opto/mulnode.cpp b/hotspot/src/share/vm/opto/mulnode.cpp index 4572a265e57..4047b933fbd 100644 --- a/hotspot/src/share/vm/opto/mulnode.cpp +++ b/hotspot/src/share/vm/opto/mulnode.cpp @@ -479,24 +479,27 @@ Node *AndINode::Ideal(PhaseGVN *phase, bool can_reshape) { return new (phase->C) AndINode(load,phase->intcon(mask&0xFFFF)); // Masking bits off of a Short? Loading a Character does some masking - if (lop == Op_LoadS && (mask & 0xFFFF0000) == 0 ) { - Node *ldus = new (phase->C) LoadUSNode(load->in(MemNode::Control), - load->in(MemNode::Memory), - load->in(MemNode::Address), - load->adr_type()); - ldus = phase->transform(ldus); - return new (phase->C) AndINode(ldus, phase->intcon(mask & 0xFFFF)); - } + if (can_reshape && + load->outcnt() == 1 && load->unique_out() == this) { + if (lop == Op_LoadS && (mask & 0xFFFF0000) == 0 ) { + Node *ldus = new (phase->C) LoadUSNode(load->in(MemNode::Control), + load->in(MemNode::Memory), + load->in(MemNode::Address), + load->adr_type()); + ldus = phase->transform(ldus); + return new (phase->C) AndINode(ldus, phase->intcon(mask & 0xFFFF)); + } - // Masking sign bits off of a Byte? Do an unsigned byte load plus - // an and. 
- if (lop == Op_LoadB && (mask & 0xFFFFFF00) == 0) { - Node* ldub = new (phase->C) LoadUBNode(load->in(MemNode::Control), - load->in(MemNode::Memory), - load->in(MemNode::Address), - load->adr_type()); - ldub = phase->transform(ldub); - return new (phase->C) AndINode(ldub, phase->intcon(mask)); + // Masking sign bits off of a Byte? Do an unsigned byte load plus + // an and. + if (lop == Op_LoadB && (mask & 0xFFFFFF00) == 0) { + Node* ldub = new (phase->C) LoadUBNode(load->in(MemNode::Control), + load->in(MemNode::Memory), + load->in(MemNode::Address), + load->adr_type()); + ldub = phase->transform(ldub); + return new (phase->C) AndINode(ldub, phase->intcon(mask)); + } } // Masking off sign bits? Dont make them! @@ -923,7 +926,9 @@ Node *RShiftINode::Ideal(PhaseGVN *phase, bool can_reshape) { set_req(2, phase->intcon(0)); return this; } - else if( ld->Opcode() == Op_LoadUS ) + else if( can_reshape && + ld->Opcode() == Op_LoadUS && + ld->outcnt() == 1 && ld->unique_out() == shl) // Replace zero-extension-load with sign-extension-load return new (phase->C) LoadSNode( ld->in(MemNode::Control), ld->in(MemNode::Memory), diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp index bb050533d22..51987e25e32 100644 --- a/hotspot/src/share/vm/opto/runtime.cpp +++ b/hotspot/src/share/vm/opto/runtime.cpp @@ -811,6 +811,48 @@ const TypeFunc* OptoRuntime::array_fill_Type() { return TypeFunc::make(domain, range); } +// for aescrypt encrypt/decrypt operations, just three pointers returning void (length is constant) +const TypeFunc* OptoRuntime::aescrypt_block_Type() { + // create input type (domain) + int num_args = 3; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // src + fields[argp++] = TypePtr::NOTNULL; // dest + fields[argp++] = TypePtr::NOTNULL; // k array + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = 
TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + +// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning void +const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { + // create input type (domain) + int num_args = 5; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // src + fields[argp++] = TypePtr::NOTNULL; // dest + fields[argp++] = TypePtr::NOTNULL; // k array + fields[argp++] = TypePtr::NOTNULL; // r array + fields[argp++] = TypeInt::INT; // src len + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + //------------- Interpreter state access for on stack replacement const TypeFunc* OptoRuntime::osr_end_Type() { // create input type (domain) diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp index c7077726761..13da255b742 100644 --- a/hotspot/src/share/vm/opto/runtime.hpp +++ b/hotspot/src/share/vm/opto/runtime.hpp @@ -280,6 +280,9 @@ private: static const TypeFunc* array_fill_Type(); + static const TypeFunc* aescrypt_block_Type(); + static const TypeFunc* cipherBlockChaining_aescrypt_Type(); + // leaf on stack replacement interpreter accessor types static const TypeFunc* osr_end_Type(); diff --git a/hotspot/src/share/vm/opto/superword.cpp b/hotspot/src/share/vm/opto/superword.cpp index ffc5394bb09..f53c6483750 100644 --- 
a/hotspot/src/share/vm/opto/superword.cpp +++ b/hotspot/src/share/vm/opto/superword.cpp @@ -1776,16 +1776,15 @@ void SuperWord::compute_vector_element_type() { set_velt_type(n, container_type(n)); } - // Propagate narrowed type backwards through operations + // Propagate integer narrowed type backwards through operations // that don't depend on higher order bits for (int i = _block.length() - 1; i >= 0; i--) { Node* n = _block.at(i); // Only integer types need be examined - const Type* vt = velt_type(n); - if (vt->basic_type() == T_INT) { + const Type* vtn = velt_type(n); + if (vtn->basic_type() == T_INT) { uint start, end; VectorNode::vector_operands(n, &start, &end); - const Type* vt = velt_type(n); for (uint j = start; j < end; j++) { Node* in = n->in(j); @@ -1801,6 +1800,24 @@ void SuperWord::compute_vector_element_type() { } } if (same_type) { + // For right shifts of small integer types (bool, byte, char, short) + // we need precise information about sign-ness. Only Load nodes have + // this information because Store nodes are the same for signed and + // unsigned values. And any arithmetic operation after a load may + // expand a value to signed Int so such right shifts can't be used + // because vector elements do not have upper bits of Int. + const Type* vt = vtn; + if (VectorNode::is_shift(in)) { + Node* load = in->in(1); + if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) { + vt = velt_type(load); + } else if (in->Opcode() != Op_LShiftI) { + // Widen type to Int to avoid creation of right shift vector + // (align + data_size(s1) check in stmts_can_pack() will fail). + // Note, left shifts work regardless type. 
+ vt = TypeInt::INT; + } + } set_velt_type(in, vt); } } @@ -1841,7 +1858,20 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) { // Smallest type containing range of values const Type* SuperWord::container_type(Node* n) { if (n->is_Mem()) { - return Type::get_const_basic_type(n->as_Mem()->memory_type()); + BasicType bt = n->as_Mem()->memory_type(); + if (n->is_Store() && (bt == T_CHAR)) { + // Use T_SHORT type instead of T_CHAR for stored values because any + // preceding arithmetic operation extends values to signed Int. + bt = T_SHORT; + } + if (n->Opcode() == Op_LoadUB) { + // Adjust type for unsigned byte loads, it is important for right shifts. + // T_BOOLEAN is used because there is no basic type representing type + // TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only + // size (one byte) and sign is important. + bt = T_BOOLEAN; + } + return Type::get_const_basic_type(bt); } const Type* t = _igvn.type(n); if (t->basic_type() == T_INT) { diff --git a/hotspot/src/share/vm/opto/type.cpp b/hotspot/src/share/vm/opto/type.cpp index f982799f634..1a8ee2597dd 100644 --- a/hotspot/src/share/vm/opto/type.cpp +++ b/hotspot/src/share/vm/opto/type.cpp @@ -61,7 +61,7 @@ Type::TypeInfo Type::_type_info[Type::lastype] = { { Bad, T_ILLEGAL, "tuple:", false, Node::NotAMachineReg, relocInfo::none }, // Tuple { Bad, T_ARRAY, "array:", false, Node::NotAMachineReg, relocInfo::none }, // Array -#if defined(IA32) || defined(AMD64) +#ifndef SPARC { Bad, T_ILLEGAL, "vectors:", false, Op_VecS, relocInfo::none }, // VectorS { Bad, T_ILLEGAL, "vectord:", false, Op_VecD, relocInfo::none }, // VectorD { Bad, T_ILLEGAL, "vectorx:", false, Op_VecX, relocInfo::none }, // VectorX diff --git a/hotspot/src/share/vm/opto/vectornode.cpp b/hotspot/src/share/vm/opto/vectornode.cpp index d0955a819e1..9660d4ed818 100644 --- a/hotspot/src/share/vm/opto/vectornode.cpp +++ b/hotspot/src/share/vm/opto/vectornode.cpp @@ -29,8 +29,7 @@ 
//------------------------------VectorNode-------------------------------------- // Return the vector operator for the specified scalar operation -// and vector length. Also used to check if the code generator -// supports the vector operation. +// and vector length. int VectorNode::opcode(int sopc, BasicType bt) { switch (sopc) { case Op_AddI: @@ -75,7 +74,7 @@ int VectorNode::opcode(int sopc, BasicType bt) { case T_BYTE: return 0; // Unimplemented case T_CHAR: case T_SHORT: return Op_MulVS; - case T_INT: return Matcher::match_rule_supported(Op_MulVI) ? Op_MulVI : 0; // SSE4_1 + case T_INT: return Op_MulVI; } ShouldNotReachHere(); case Op_MulF: @@ -104,9 +103,9 @@ int VectorNode::opcode(int sopc, BasicType bt) { return Op_LShiftVL; case Op_RShiftI: switch (bt) { - case T_BOOLEAN: + case T_BOOLEAN:return Op_URShiftVB; // boolean is unsigned value + case T_CHAR: return Op_URShiftVS; // char is unsigned value case T_BYTE: return Op_RShiftVB; - case T_CHAR: case T_SHORT: return Op_RShiftVS; case T_INT: return Op_RShiftVI; } @@ -116,10 +115,14 @@ int VectorNode::opcode(int sopc, BasicType bt) { return Op_RShiftVL; case Op_URShiftI: switch (bt) { - case T_BOOLEAN: - case T_BYTE: return Op_URShiftVB; - case T_CHAR: - case T_SHORT: return Op_URShiftVS; + case T_BOOLEAN:return Op_URShiftVB; + case T_CHAR: return Op_URShiftVS; + case T_BYTE: + case T_SHORT: return 0; // Vector logical right shift for signed short + // values produces incorrect Java result for + // negative data because java code should convert + // a short value into int value with sign + // extension before a shift. case T_INT: return Op_URShiftVI; } ShouldNotReachHere(); @@ -157,12 +160,14 @@ int VectorNode::opcode(int sopc, BasicType bt) { return 0; // Unimplemented } +// Also used to check if the code generator +// supports the vector operation. 
bool VectorNode::implemented(int opc, uint vlen, BasicType bt) { if (is_java_primitive(bt) && (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(bt, vlen)) { int vopc = VectorNode::opcode(opc, bt); - return vopc > 0 && Matcher::has_match_rule(vopc); + return vopc > 0 && Matcher::match_rule_supported(vopc); } return false; } diff --git a/hotspot/src/share/vm/prims/unsafe.cpp b/hotspot/src/share/vm/prims/unsafe.cpp index 051c85975e1..e3750e66f1c 100644 --- a/hotspot/src/share/vm/prims/unsafe.cpp +++ b/hotspot/src/share/vm/prims/unsafe.cpp @@ -124,6 +124,8 @@ inline void* index_oop_from_field_offset_long(oop p, jlong field_offset) { assert((void*)p->obj_field_addr((jint)byte_offset) == ptr_plus_disp, "raw [ptr+disp] must be consistent with oop::field_base"); } + jlong p_size = HeapWordSize * (jlong)(p->size()); + assert(byte_offset < p_size, err_msg("Unsafe access: offset " INT64_FORMAT " > object's size " INT64_FORMAT, byte_offset, p_size)); } #endif if (sizeof(char*) == sizeof(jint)) // (this constant folds!) 
diff --git a/hotspot/src/share/vm/runtime/globals.hpp b/hotspot/src/share/vm/runtime/globals.hpp index e783883ebe0..39e6a98ab5e 100644 --- a/hotspot/src/share/vm/runtime/globals.hpp +++ b/hotspot/src/share/vm/runtime/globals.hpp @@ -533,6 +533,9 @@ class CommandLineFlags { product(intx, UseSSE, 99, \ "Highest supported SSE instructions set on x86/x64") \ \ + product(bool, UseAES, false, \ + "Control whether AES instructions can be used on x86/x64") \ + \ product(uintx, LargePageSizeInBytes, 0, \ "Large page size (0 to let VM choose the page size") \ \ @@ -635,6 +638,9 @@ class CommandLineFlags { product(bool, UseSSE42Intrinsics, false, \ "SSE4.2 versions of intrinsics") \ \ + product(bool, UseAESIntrinsics, false, \ + "use intrinsics for AES versions of crypto") \ + \ develop(bool, TraceCallFixup, false, \ "traces all call fixups") \ \ diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp index 5ca4ba59916..98d428abdab 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.cpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp @@ -120,6 +120,10 @@ address StubRoutines::_arrayof_jbyte_fill; address StubRoutines::_arrayof_jshort_fill; address StubRoutines::_arrayof_jint_fill; +address StubRoutines::_aescrypt_encryptBlock = NULL; +address StubRoutines::_aescrypt_decryptBlock = NULL; +address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL; +address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL; double (* StubRoutines::_intrinsic_log )(double) = NULL; double (* StubRoutines::_intrinsic_log10 )(double) = NULL; diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp index 0e583aea0b1..91f273e6515 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.hpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp @@ -199,6 +199,11 @@ class StubRoutines: AllStatic { // zero heap space aligned to jlong (8 bytes) static address _zero_aligned_words; + static 
address _aescrypt_encryptBlock; + static address _aescrypt_decryptBlock; + static address _cipherBlockChaining_encryptAESCrypt; + static address _cipherBlockChaining_decryptAESCrypt; + // These are versions of the java.lang.Math methods which perform // the same operations as the intrinsic version. They are used for // constant folding in the compiler to ensure equivalence. If the @@ -330,6 +335,11 @@ class StubRoutines: AllStatic { static address arrayof_jshort_fill() { return _arrayof_jshort_fill; } static address arrayof_jint_fill() { return _arrayof_jint_fill; } + static address aescrypt_encryptBlock() { return _aescrypt_encryptBlock; } + static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; } + static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; } + static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; } + static address select_fill_function(BasicType t, bool aligned, const char* &name); static address zero_aligned_words() { return _zero_aligned_words; } diff --git a/hotspot/test/compiler/6340864/TestByteVect.java b/hotspot/test/compiler/6340864/TestByteVect.java index ec4ba9fcd0f..5db3687e67c 100644 --- a/hotspot/test/compiler/6340864/TestByteVect.java +++ b/hotspot/test/compiler/6340864/TestByteVect.java @@ -33,7 +33,7 @@ public class TestByteVect { private static final int ARRLEN = 997; private static final int ITERS = 11000; - private static final int ADD_INIT = 0; + private static final int ADD_INIT = 63; private static final int BIT_MASK = 0xB7; private static final int VALUE = 3; private static final int SHIFT = 8; @@ -76,6 +76,7 @@ public class TestByteVect { test_subc(a0, a1); test_subv(a0, a1, (byte)VALUE); test_suba(a0, a1, a2); + test_mulc(a0, a1); test_mulv(a0, a1, (byte)VALUE); test_mula(a0, a1, a2); @@ -88,6 +89,7 @@ public class TestByteVect { test_divc_n(a0, a1); test_divv(a0, a1, (byte)-VALUE); test_diva(a0, a1, a3); + test_andc(a0, 
a1); test_andv(a0, a1, (byte)BIT_MASK); test_anda(a0, a1, a4); @@ -97,30 +99,49 @@ public class TestByteVect { test_xorc(a0, a1); test_xorv(a0, a1, (byte)BIT_MASK); test_xora(a0, a1, a4); + test_sllc(a0, a1); test_sllv(a0, a1, VALUE); test_srlc(a0, a1); test_srlv(a0, a1, VALUE); test_srac(a0, a1); test_srav(a0, a1, VALUE); + test_sllc_n(a0, a1); test_sllv(a0, a1, -VALUE); test_srlc_n(a0, a1); test_srlv(a0, a1, -VALUE); test_srac_n(a0, a1); test_srav(a0, a1, -VALUE); + test_sllc_o(a0, a1); test_sllv(a0, a1, SHIFT); test_srlc_o(a0, a1); test_srlv(a0, a1, SHIFT); test_srac_o(a0, a1); test_srav(a0, a1, SHIFT); + test_sllc_on(a0, a1); test_sllv(a0, a1, -SHIFT); test_srlc_on(a0, a1); test_srlv(a0, a1, -SHIFT); test_srac_on(a0, a1); test_srav(a0, a1, -SHIFT); + + test_sllc_add(a0, a1); + test_sllv_add(a0, a1, ADD_INIT); + test_srlc_add(a0, a1); + test_srlv_add(a0, a1, ADD_INIT); + test_srac_add(a0, a1); + test_srav_add(a0, a1, ADD_INIT); + + test_sllc_and(a0, a1); + test_sllv_and(a0, a1, BIT_MASK); + test_srlc_and(a0, a1); + test_srlv_and(a0, a1, BIT_MASK); + test_srac_and(a0, a1); + test_srav_and(a0, a1, BIT_MASK); + test_pack2(p2, a1); test_unpack2(a0, p2); test_pack2_swap(p2, a1); @@ -369,6 +390,60 @@ public class TestByteVect { errn += verify("test_srav_on: ", i, a0[i], (byte)((byte)(ADD_INIT+i)>>(-SHIFT))); } + test_sllc_add(a0, a1); + for (int i=0; i>>VALUE)); + } + test_srlv_add(a0, a1, ADD_INIT); + for (int i=0; i>>VALUE)); + } + + test_srac_add(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_add(a0, a1, ADD_INIT); + for (int i=0; i>VALUE)); + } + + test_sllc_and(a0, a1); + for (int i=0; i>>VALUE)); + } + test_srlv_and(a0, a1, BIT_MASK); + for (int i=0; i>>VALUE)); + } + + test_srac_and(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_and(a0, a1, BIT_MASK); + for (int i=0; i>VALUE)); + } + test_pack2(p2, a1); for (int i=0; i>>b); } } + static void test_srlc_add(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)((a1[i] + 
ADD_INIT)>>>VALUE); + } + } + static void test_srlv_add(byte[] a0, byte[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)((a1[i] + b)>>>VALUE); + } + } + static void test_srlc_and(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)((a1[i] & BIT_MASK)>>>VALUE); + } + } + static void test_srlv_and(byte[] a0, byte[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)((a1[i] & b)>>>VALUE); + } + } static void test_srac(byte[] a0, byte[] a1) { for (int i = 0; i < a0.length; i+=1) { @@ -1088,6 +1281,26 @@ public class TestByteVect { a0[i] = (byte)(a1[i]>>b); } } + static void test_srac_add(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)((a1[i] + ADD_INIT)>>VALUE); + } + } + static void test_srav_add(byte[] a0, byte[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)((a1[i] + b)>>VALUE); + } + } + static void test_srac_and(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)((a1[i] & BIT_MASK)>>VALUE); + } + } + static void test_srav_and(byte[] a0, byte[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)((a1[i] & b)>>VALUE); + } + } static void test_pack2(short[] p2, byte[] a1) { if (p2.length*2 > a1.length) return; diff --git a/hotspot/test/compiler/6340864/TestIntVect.java b/hotspot/test/compiler/6340864/TestIntVect.java index 36e277f731b..5866b34a5af 100644 --- a/hotspot/test/compiler/6340864/TestIntVect.java +++ b/hotspot/test/compiler/6340864/TestIntVect.java @@ -74,6 +74,7 @@ public class TestIntVect { test_subc(a0, a1); test_subv(a0, a1, (int)VALUE); test_suba(a0, a1, a2); + test_mulc(a0, a1); test_mulv(a0, a1, (int)VALUE); test_mula(a0, a1, a2); @@ -86,6 +87,7 @@ public class TestIntVect { test_divc_n(a0, a1); test_divv(a0, a1, (int)-VALUE); test_diva(a0, a1, a3); + test_andc(a0, a1); test_andv(a0, a1, (int)BIT_MASK); test_anda(a0, a1, a4); @@ -95,30 +97,49 @@ public class TestIntVect { 
test_xorc(a0, a1); test_xorv(a0, a1, (int)BIT_MASK); test_xora(a0, a1, a4); + test_sllc(a0, a1); test_sllv(a0, a1, VALUE); test_srlc(a0, a1); test_srlv(a0, a1, VALUE); test_srac(a0, a1); test_srav(a0, a1, VALUE); + test_sllc_n(a0, a1); test_sllv(a0, a1, -VALUE); test_srlc_n(a0, a1); test_srlv(a0, a1, -VALUE); test_srac_n(a0, a1); test_srav(a0, a1, -VALUE); + test_sllc_o(a0, a1); test_sllv(a0, a1, SHIFT); test_srlc_o(a0, a1); test_srlv(a0, a1, SHIFT); test_srac_o(a0, a1); test_srav(a0, a1, SHIFT); + test_sllc_on(a0, a1); test_sllv(a0, a1, -SHIFT); test_srlc_on(a0, a1); test_srlv(a0, a1, -SHIFT); test_srac_on(a0, a1); test_srav(a0, a1, -SHIFT); + + test_sllc_add(a0, a1); + test_sllv_add(a0, a1, ADD_INIT); + test_srlc_add(a0, a1); + test_srlv_add(a0, a1, ADD_INIT); + test_srac_add(a0, a1); + test_srav_add(a0, a1, ADD_INIT); + + test_sllc_and(a0, a1); + test_sllv_and(a0, a1, BIT_MASK); + test_srlc_and(a0, a1); + test_srlv_and(a0, a1, BIT_MASK); + test_srac_and(a0, a1); + test_srav_and(a0, a1, BIT_MASK); + test_pack2(p2, a1); test_unpack2(a0, p2); test_pack2_swap(p2, a1); @@ -359,6 +380,60 @@ public class TestIntVect { errn += verify("test_srav_on: ", i, a0[i], (int)((int)(ADD_INIT+i)>>(-SHIFT))); } + test_sllc_add(a0, a1); + for (int i=0; i>>VALUE)); + } + test_srlv_add(a0, a1, ADD_INIT); + for (int i=0; i>>VALUE)); + } + + test_srac_add(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_add(a0, a1, ADD_INIT); + for (int i=0; i>VALUE)); + } + + test_sllc_and(a0, a1); + for (int i=0; i>>VALUE)); + } + test_srlv_and(a0, a1, BIT_MASK); + for (int i=0; i>>VALUE)); + } + + test_srac_and(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_and(a0, a1, BIT_MASK); + for (int i=0; i>VALUE)); + } + test_pack2(p2, a1); for (int i=0; i>>b); } } + static void test_srlc_add(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)((a1[i] + ADD_INIT)>>>VALUE); + } + } + static void test_srlv_add(int[] a0, int[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) 
{ + a0[i] = (int)((a1[i] + b)>>>VALUE); + } + } + static void test_srlc_and(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)((a1[i] & BIT_MASK)>>>VALUE); + } + } + static void test_srlv_and(int[] a0, int[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)((a1[i] & b)>>>VALUE); + } + } static void test_srac(int[] a0, int[] a1) { for (int i = 0; i < a0.length; i+=1) { @@ -960,6 +1153,26 @@ public class TestIntVect { a0[i] = (int)(a1[i]>>b); } } + static void test_srac_add(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)((a1[i] + ADD_INIT)>>VALUE); + } + } + static void test_srav_add(int[] a0, int[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)((a1[i] + b)>>VALUE); + } + } + static void test_srac_and(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)((a1[i] & BIT_MASK)>>VALUE); + } + } + static void test_srav_and(int[] a0, int[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)((a1[i] & b)>>VALUE); + } + } static void test_pack2(long[] p2, int[] a1) { if (p2.length*2 > a1.length) return; diff --git a/hotspot/test/compiler/6340864/TestLongVect.java b/hotspot/test/compiler/6340864/TestLongVect.java index 70b41f4b4cb..436a8472df8 100644 --- a/hotspot/test/compiler/6340864/TestLongVect.java +++ b/hotspot/test/compiler/6340864/TestLongVect.java @@ -73,6 +73,7 @@ public class TestLongVect { test_subc(a0, a1); test_subv(a0, a1, (long)VALUE); test_suba(a0, a1, a2); + test_mulc(a0, a1); test_mulv(a0, a1, (long)VALUE); test_mula(a0, a1, a2); @@ -85,6 +86,7 @@ public class TestLongVect { test_divc_n(a0, a1); test_divv(a0, a1, (long)-VALUE); test_diva(a0, a1, a3); + test_andc(a0, a1); test_andv(a0, a1, (long)BIT_MASK); test_anda(a0, a1, a4); @@ -94,30 +96,48 @@ public class TestLongVect { test_xorc(a0, a1); test_xorv(a0, a1, (long)BIT_MASK); test_xora(a0, a1, a4); + test_sllc(a0, a1); test_sllv(a0, a1, VALUE); test_srlc(a0, a1); 
test_srlv(a0, a1, VALUE); test_srac(a0, a1); test_srav(a0, a1, VALUE); + test_sllc_n(a0, a1); test_sllv(a0, a1, -VALUE); test_srlc_n(a0, a1); test_srlv(a0, a1, -VALUE); test_srac_n(a0, a1); test_srav(a0, a1, -VALUE); + test_sllc_o(a0, a1); test_sllv(a0, a1, SHIFT); test_srlc_o(a0, a1); test_srlv(a0, a1, SHIFT); test_srac_o(a0, a1); test_srav(a0, a1, SHIFT); + test_sllc_on(a0, a1); test_sllv(a0, a1, -SHIFT); test_srlc_on(a0, a1); test_srlv(a0, a1, -SHIFT); test_srac_on(a0, a1); test_srav(a0, a1, -SHIFT); + + test_sllc_add(a0, a1); + test_sllv_add(a0, a1, ADD_INIT); + test_srlc_add(a0, a1); + test_srlv_add(a0, a1, ADD_INIT); + test_srac_add(a0, a1); + test_srav_add(a0, a1, ADD_INIT); + + test_sllc_and(a0, a1); + test_sllv_and(a0, a1, BIT_MASK); + test_srlc_and(a0, a1); + test_srlv_and(a0, a1, BIT_MASK); + test_srac_and(a0, a1); + test_srav_and(a0, a1, BIT_MASK); } // Test and verify results System.out.println("Verification"); @@ -354,6 +374,60 @@ public class TestLongVect { errn += verify("test_srav_on: ", i, a0[i], (long)((long)(ADD_INIT+i)>>(-SHIFT))); } + test_sllc_add(a0, a1); + for (int i=0; i>>VALUE)); + } + test_srlv_add(a0, a1, ADD_INIT); + for (int i=0; i>>VALUE)); + } + + test_srac_add(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_add(a0, a1, ADD_INIT); + for (int i=0; i>VALUE)); + } + + test_sllc_and(a0, a1); + for (int i=0; i>>VALUE)); + } + test_srlv_and(a0, a1, BIT_MASK); + for (int i=0; i>>VALUE)); + } + + test_srac_and(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_and(a0, a1, BIT_MASK); + for (int i=0; i>VALUE)); + } + } if (errn > 0) @@ -696,6 +770,84 @@ public class TestLongVect { end = System.currentTimeMillis(); System.out.println("test_srav_on: " + (end - start)); + start = System.currentTimeMillis(); + for (int i=0; i>>b); } } + static void test_srlc_add(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)((a1[i] + ADD_INIT)>>>VALUE); + } + } + static void test_srlv_add(long[] a0, long[] a1, long b) { + 
for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)((a1[i] + b)>>>VALUE); + } + } + static void test_srlc_and(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)((a1[i] & BIT_MASK)>>>VALUE); + } + } + static void test_srlv_and(long[] a0, long[] a1, long b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)((a1[i] & b)>>>VALUE); + } + } static void test_srac(long[] a0, long[] a1) { for (int i = 0; i < a0.length; i+=1) { @@ -906,6 +1098,26 @@ public class TestLongVect { a0[i] = (long)(a1[i]>>b); } } + static void test_srac_add(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)((a1[i] + ADD_INIT)>>VALUE); + } + } + static void test_srav_add(long[] a0, long[] a1, long b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)((a1[i] + b)>>VALUE); + } + } + static void test_srac_and(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)((a1[i] & BIT_MASK)>>VALUE); + } + } + static void test_srav_and(long[] a0, long[] a1, long b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)((a1[i] & b)>>VALUE); + } + } static int verify(String text, int i, long elem, long val) { if (elem != val) { diff --git a/hotspot/test/compiler/6340864/TestShortVect.java b/hotspot/test/compiler/6340864/TestShortVect.java index a688e0de0e0..9f59c8f22cd 100644 --- a/hotspot/test/compiler/6340864/TestShortVect.java +++ b/hotspot/test/compiler/6340864/TestShortVect.java @@ -75,6 +75,7 @@ public class TestShortVect { test_subc(a0, a1); test_subv(a0, a1, (short)VALUE); test_suba(a0, a1, a2); + test_mulc(a0, a1); test_mulv(a0, a1, (short)VALUE); test_mula(a0, a1, a2); @@ -87,6 +88,7 @@ public class TestShortVect { test_divc_n(a0, a1); test_divv(a0, a1, (short)-VALUE); test_diva(a0, a1, a3); + test_andc(a0, a1); test_andv(a0, a1, (short)BIT_MASK); test_anda(a0, a1, a4); @@ -96,30 +98,49 @@ public class TestShortVect { test_xorc(a0, a1); test_xorv(a0, a1, (short)BIT_MASK); test_xora(a0, a1, 
a4); + test_sllc(a0, a1); test_sllv(a0, a1, VALUE); test_srlc(a0, a1); test_srlv(a0, a1, VALUE); test_srac(a0, a1); test_srav(a0, a1, VALUE); + test_sllc_n(a0, a1); test_sllv(a0, a1, -VALUE); test_srlc_n(a0, a1); test_srlv(a0, a1, -VALUE); test_srac_n(a0, a1); test_srav(a0, a1, -VALUE); + test_sllc_o(a0, a1); test_sllv(a0, a1, SHIFT); test_srlc_o(a0, a1); test_srlv(a0, a1, SHIFT); test_srac_o(a0, a1); test_srav(a0, a1, SHIFT); + test_sllc_on(a0, a1); test_sllv(a0, a1, -SHIFT); test_srlc_on(a0, a1); test_srlv(a0, a1, -SHIFT); test_srac_on(a0, a1); test_srav(a0, a1, -SHIFT); + + test_sllc_add(a0, a1); + test_sllv_add(a0, a1, ADD_INIT); + test_srlc_add(a0, a1); + test_srlv_add(a0, a1, ADD_INIT); + test_srac_add(a0, a1); + test_srav_add(a0, a1, ADD_INIT); + + test_sllc_and(a0, a1); + test_sllv_and(a0, a1, BIT_MASK); + test_srlc_and(a0, a1); + test_srlv_and(a0, a1, BIT_MASK); + test_srac_and(a0, a1); + test_srav_and(a0, a1, BIT_MASK); + test_pack2(p2, a1); test_unpack2(a0, p2); test_pack2_swap(p2, a1); @@ -364,6 +385,60 @@ public class TestShortVect { errn += verify("test_srav_on: ", i, a0[i], (short)((short)(ADD_INIT+i)>>(-SHIFT))); } + test_sllc_add(a0, a1); + for (int i=0; i>>VALUE)); + } + test_srlv_add(a0, a1, ADD_INIT); + for (int i=0; i>>VALUE)); + } + + test_srac_add(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_add(a0, a1, ADD_INIT); + for (int i=0; i>VALUE)); + } + + test_sllc_and(a0, a1); + for (int i=0; i>>VALUE)); + } + test_srlv_and(a0, a1, BIT_MASK); + for (int i=0; i>>VALUE)); + } + + test_srac_and(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_and(a0, a1, BIT_MASK); + for (int i=0; i>VALUE)); + } + test_pack2(p2, a1); for (int i=0; i>>b); } } + static void test_srlc_add(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)((a1[i] + ADD_INIT)>>>VALUE); + } + } + static void test_srlv_add(short[] a0, short[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)((a1[i] + b)>>>VALUE); + } + } + 
static void test_srlc_and(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)((a1[i] & BIT_MASK)>>>VALUE); + } + } + static void test_srlv_and(short[] a0, short[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)((a1[i] & b)>>>VALUE); + } + } static void test_srac(short[] a0, short[] a1) { for (int i = 0; i < a0.length; i+=1) { @@ -1020,6 +1213,26 @@ public class TestShortVect { a0[i] = (short)(a1[i]>>b); } } + static void test_srac_add(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)((a1[i] + ADD_INIT)>>VALUE); + } + } + static void test_srav_add(short[] a0, short[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)((a1[i] + b)>>VALUE); + } + } + static void test_srac_and(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)((a1[i] & BIT_MASK)>>VALUE); + } + } + static void test_srav_and(short[] a0, short[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)((a1[i] & b)>>VALUE); + } + } static void test_pack2(int[] p2, short[] a1) { if (p2.length*2 > a1.length) return; diff --git a/hotspot/test/compiler/7184394/TestAESBase.java b/hotspot/test/compiler/7184394/TestAESBase.java new file mode 100644 index 00000000000..ad6c835cc84 --- /dev/null +++ b/hotspot/test/compiler/7184394/TestAESBase.java @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/** + * @author Tom Deneau + */ + +import javax.crypto.Cipher; +import javax.crypto.KeyGenerator; +import javax.crypto.SecretKey; +import javax.crypto.spec.IvParameterSpec; +import javax.crypto.spec.SecretKeySpec; +import java.security.AlgorithmParameters; + +import java.util.Random; +import java.util.Arrays; + +abstract public class TestAESBase { + int msgSize = Integer.getInteger("msgSize", 646); + boolean checkOutput = Boolean.getBoolean("checkOutput"); + boolean noReinit = Boolean.getBoolean("noReinit"); + int keySize = Integer.getInteger("keySize", 128); + String algorithm = System.getProperty("algorithm", "AES"); + String mode = System.getProperty("mode", "CBC"); + byte[] input; + byte[] encode; + byte[] expectedEncode; + byte[] decode; + byte[] expectedDecode; + Random random = new Random(0); + Cipher cipher; + Cipher dCipher; + String paddingStr = "PKCS5Padding"; + AlgorithmParameters algParams; + SecretKey key; + int ivLen; + + static int numThreads = 0; + int threadId; + static synchronized int getThreadId() { + int id = numThreads; + numThreads++; + return id; + } + + abstract public void run(); + + public void prepare() { + try { + System.out.println("\nmsgSize=" + msgSize + ", key size=" + keySize + ", reInit=" + !noReinit + ", checkOutput=" + checkOutput); + + int keyLenBytes = (keySize == 0 ? 
16 : keySize/8); + byte keyBytes[] = new byte[keyLenBytes]; + if (keySize == 128) + keyBytes = new byte[] {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}; + else + random.nextBytes(keyBytes); + + key = new SecretKeySpec(keyBytes, algorithm); + if (threadId == 0) { + System.out.println("Algorithm: " + key.getAlgorithm() + "(" + + key.getEncoded().length * 8 + "bit)"); + } + input = new byte[msgSize]; + for (int i=0; i 0 ? Integer.valueOf(args[0]) : 1000000); + System.out.println(iters + " iterations"); + TestAESEncode etest = new TestAESEncode(); + etest.prepare(); + long start = System.nanoTime(); + for (int i=0; i> 16; } + static int loadUS_signExt_1 (char[] ca) { return (ca[0] << 16) >> 16; } + + static long loadB2L_mask8 (byte[] ba) { return ba[0] & 0x55; } + static long loadB2L_mask8_1 (byte[] ba) { return ba[0] & 0x55; } + + public static void main(String[] args) { + for (int i = Byte.MIN_VALUE; i < Byte.MAX_VALUE; i++) { + byte[] ba = new byte[] { (byte) i}; + + { long v1 = loadB2L_mask8(ba); + long v2 = loadB2L_mask8_1(ba); + if (v1 != v2) + throw new InternalError(String.format("loadB2L_mask8 failed: %x != %x", v1, v2)); } + } + + for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) { + short[] sa = new short[] { (short)i }; + char[] ca = new char[] { (char)i }; + + { long v1 = loadS2LmaskFF(sa); + long v2 = loadS2LmaskFF_1(sa); + if (v1 != v2) + throw new InternalError(String.format("loadS2LmaskFF failed: %x != %x", v1, v2)); } + + { long v1 = loadS2Lmask16(sa); + long v2 = loadS2Lmask16_1(sa); + if (v1 != v2) + throw new InternalError(String.format("loadS2Lmask16 failed: %x != %x", v1, v2)); } + + { long v1 = loadS2Lmask13(sa); + long v2 = loadS2Lmask13_1(sa); + if (v1 != v2) + throw new InternalError(String.format("loadS2Lmask13 failed: %x != %x", v1, v2)); } + + { int v1 = loadUS_signExt(ca); + int v2 = loadUS_signExt_1(ca); + if (v1 != v2) + throw new InternalError(String.format("loadUS_signExt failed: %x != %x", v1, v2)); } + } + + 
System.out.println("TEST PASSED."); + } +} diff --git a/hotspot/test/compiler/8001183/TestCharVect.java b/hotspot/test/compiler/8001183/TestCharVect.java new file mode 100644 index 00000000000..a6ff1e2b961 --- /dev/null +++ b/hotspot/test/compiler/8001183/TestCharVect.java @@ -0,0 +1,1332 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @test + * @bug 8001183 + * @summary incorrect results of char vectors right shift operaiton + * + * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestCharVect + */ + +public class TestCharVect { + private static final int ARRLEN = 997; + private static final int ITERS = 11000; + private static final int ADD_INIT = Character.MAX_VALUE-500; + private static final int BIT_MASK = 0xB731; + private static final int VALUE = 7; + private static final int SHIFT = 16; + + public static void main(String args[]) { + System.out.println("Testing Char vectors"); + int errn = test(); + if (errn > 0) { + System.err.println("FAILED: " + errn + " errors"); + System.exit(97); + } + System.out.println("PASSED"); + } + + static int test() { + char[] a0 = new char[ARRLEN]; + char[] a1 = new char[ARRLEN]; + short[] a2 = new short[ARRLEN]; + short[] a3 = new short[ARRLEN]; + short[] a4 = new short[ARRLEN]; + int[] p2 = new int[ARRLEN/2]; + long[] p4 = new long[ARRLEN/4]; + // Initialize + int gold_sum = 0; + for (int i=0; i>>VALUE)); + } + test_srlv(a0, a1, VALUE); + for (int i=0; i>>VALUE)); + } + + test_srac(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav(a0, a1, VALUE); + for (int i=0; i>VALUE)); + } + + test_sllc_n(a0, a1); + for (int i=0; i>>(-VALUE))); + } + test_srlv(a0, a1, -VALUE); + for (int i=0; i>>(-VALUE))); + } + + test_srac_n(a0, a1); + for (int i=0; i>(-VALUE))); + } + test_srav(a0, a1, -VALUE); + for (int i=0; i>(-VALUE))); + } + + test_sllc_o(a0, a1); + for (int i=0; i>>SHIFT)); + } + test_srlv(a0, a1, SHIFT); + for (int i=0; i>>SHIFT)); + } + + test_srac_o(a0, a1); + for (int i=0; i>SHIFT)); + } + test_srav(a0, a1, SHIFT); + for (int i=0; i>SHIFT)); + } + + test_sllc_on(a0, a1); + for (int i=0; i>>(-SHIFT))); + } + test_srlv(a0, a1, -SHIFT); + for (int i=0; i>>(-SHIFT))); + } + + test_srac_on(a0, a1); + for (int i=0; i>(-SHIFT))); + } + test_srav(a0, a1, -SHIFT); + for (int i=0; i>(-SHIFT))); + } + + test_sllc_add(a0, a1); + for (int i=0; 
i>>VALUE)); + } + test_srlv_add(a0, a1, ADD_INIT); + for (int i=0; i>>VALUE)); + } + + test_srac_add(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_add(a0, a1, ADD_INIT); + for (int i=0; i>VALUE)); + } + + test_sllc_and(a0, a1); + for (int i=0; i>>VALUE)); + } + test_srlv_and(a0, a1, BIT_MASK); + for (int i=0; i>>VALUE)); + } + + test_srac_and(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav_and(a0, a1, BIT_MASK); + for (int i=0; i>VALUE)); + } + + test_pack2(p2, a1); + for (int i=0; i 0) + return errn; + + System.out.println("Time"); + long start, end; + + start = System.currentTimeMillis(); + for (int i=0; i>>VALUE); + } + } + static void test_srlc_n(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)(a1[i]>>>(-VALUE)); + } + } + static void test_srlc_o(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)(a1[i]>>>SHIFT); + } + } + static void test_srlc_on(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)(a1[i]>>>(-SHIFT)); + } + } + static void test_srlv(char[] a0, char[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)(a1[i]>>>b); + } + } + static void test_srlc_add(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)((a1[i] + ADD_INIT)>>>VALUE); + } + } + static void test_srlv_add(char[] a0, char[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)((a1[i] + b)>>>VALUE); + } + } + static void test_srlc_and(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)((a1[i] & BIT_MASK)>>>VALUE); + } + } + static void test_srlv_and(char[] a0, char[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)((a1[i] & b)>>>VALUE); + } + } + + static void test_srac(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)(a1[i]>>VALUE); + } + } + static void test_srac_n(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = 
(char)(a1[i]>>(-VALUE)); + } + } + static void test_srac_o(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)(a1[i]>>SHIFT); + } + } + static void test_srac_on(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)(a1[i]>>(-SHIFT)); + } + } + static void test_srav(char[] a0, char[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)(a1[i]>>b); + } + } + static void test_srac_add(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)((a1[i] + ADD_INIT)>>VALUE); + } + } + static void test_srav_add(char[] a0, char[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)((a1[i] + b)>>VALUE); + } + } + static void test_srac_and(char[] a0, char[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)((a1[i] & BIT_MASK)>>VALUE); + } + } + static void test_srav_and(char[] a0, char[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (char)((a1[i] & b)>>VALUE); + } + } + + static void test_pack2(int[] p2, char[] a1) { + if (p2.length*2 > a1.length) return; + for (int i = 0; i < p2.length; i+=1) { + int l0 = (int)a1[i*2+0]; + int l1 = (int)a1[i*2+1]; + p2[i] = (l1 << 16) | (l0 & 0xFFFF); + } + } + static void test_unpack2(char[] a0, int[] p2) { + if (p2.length*2 > a0.length) return; + for (int i = 0; i < p2.length; i+=1) { + int l = p2[i]; + a0[i*2+0] = (char)(l & 0xFFFF); + a0[i*2+1] = (char)(l >> 16); + } + } + static void test_pack2_swap(int[] p2, char[] a1) { + if (p2.length*2 > a1.length) return; + for (int i = 0; i < p2.length; i+=1) { + int l0 = (int)a1[i*2+0]; + int l1 = (int)a1[i*2+1]; + p2[i] = (l0 << 16) | (l1 & 0xFFFF); + } + } + static void test_unpack2_swap(char[] a0, int[] p2) { + if (p2.length*2 > a0.length) return; + for (int i = 0; i < p2.length; i+=1) { + int l = p2[i]; + a0[i*2+0] = (char)(l >> 16); + a0[i*2+1] = (char)(l & 0xFFFF); + } + } + + static void test_pack4(long[] p4, char[] a1) { + if (p4.length*4 > 
a1.length) return; + for (int i = 0; i < p4.length; i+=1) { + long l0 = (long)a1[i*4+0]; + long l1 = (long)a1[i*4+1]; + long l2 = (long)a1[i*4+2]; + long l3 = (long)a1[i*4+3]; + p4[i] = (l0 & 0xFFFFl) | + ((l1 & 0xFFFFl) << 16) | + ((l2 & 0xFFFFl) << 32) | + ((l3 & 0xFFFFl) << 48); + } + } + static void test_unpack4(char[] a0, long[] p4) { + if (p4.length*4 > a0.length) return; + for (int i = 0; i < p4.length; i+=1) { + long l = p4[i]; + a0[i*4+0] = (char)(l & 0xFFFFl); + a0[i*4+1] = (char)(l >> 16); + a0[i*4+2] = (char)(l >> 32); + a0[i*4+3] = (char)(l >> 48); + } + } + static void test_pack4_swap(long[] p4, char[] a1) { + if (p4.length*4 > a1.length) return; + for (int i = 0; i < p4.length; i+=1) { + long l0 = (long)a1[i*4+0]; + long l1 = (long)a1[i*4+1]; + long l2 = (long)a1[i*4+2]; + long l3 = (long)a1[i*4+3]; + p4[i] = (l3 & 0xFFFFl) | + ((l2 & 0xFFFFl) << 16) | + ((l1 & 0xFFFFl) << 32) | + ((l0 & 0xFFFFl) << 48); + } + } + static void test_unpack4_swap(char[] a0, long[] p4) { + if (p4.length*4 > a0.length) return; + for (int i = 0; i < p4.length; i+=1) { + long l = p4[i]; + a0[i*4+0] = (char)(l >> 48); + a0[i*4+1] = (char)(l >> 32); + a0[i*4+2] = (char)(l >> 16); + a0[i*4+3] = (char)(l & 0xFFFFl); + } + } + + static int verify(String text, int i, int elem, int val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + elem + " != " + val); + return 1; + } + return 0; + } + + static int verify(String text, int i, long elem, long val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + Long.toHexString(elem) + " != " + Long.toHexString(val)); + return 1; + } + return 0; + } +}