8273297: AES/GCM non-AVX512+VAES CPUs suffer after 8267125

Reviewed-by: ascarpino, sviswanathan, aph
This commit is contained in:
Smita Kamath 2021-09-24 19:21:32 +00:00 committed by Anthony Scarpino
parent 753b25633b
commit 13e9ea9e92
16 changed files with 114 additions and 56 deletions

View File

@ -55,6 +55,9 @@
// No support for generic vector operands. // No support for generic vector operands.
static const bool supports_generic_vector_operands = false; static const bool supports_generic_vector_operands = false;
// No support for 48 extra htbl entries in aes-gcm intrinsic
static const int htbl_entries = -1;
static constexpr bool isSimpleConstant64(jlong value) { static constexpr bool isSimpleConstant64(jlong value) {
// Will one (StoreL ConL) be cheaper than two (StoreI ConI)?. // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?.
// Probably always true, even if a temp register is required. // Probably always true, even if a temp register is required.

View File

@ -56,6 +56,9 @@
// No support for generic vector operands. // No support for generic vector operands.
static const bool supports_generic_vector_operands = false; static const bool supports_generic_vector_operands = false;
// No support for 48 extra htbl entries in aes-gcm intrinsic
static const int htbl_entries = -1;
static constexpr bool isSimpleConstant64(jlong value) { static constexpr bool isSimpleConstant64(jlong value) {
// Will one (StoreL ConL) be cheaper than two (StoreI ConI)?. // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?.
return false; return false;

View File

@ -57,6 +57,9 @@
// No support for generic vector operands. // No support for generic vector operands.
static const bool supports_generic_vector_operands = false; static const bool supports_generic_vector_operands = false;
// No support for 48 extra htbl entries in aes-gcm intrinsic
static const int htbl_entries = -1;
static constexpr bool isSimpleConstant64(jlong value) { static constexpr bool isSimpleConstant64(jlong value) {
// Probably always true, even if a temp register is required. // Probably always true, even if a temp register is required.
return true; return true;

View File

@ -57,6 +57,9 @@
// No support for generic vector operands. // No support for generic vector operands.
static const bool supports_generic_vector_operands = false; static const bool supports_generic_vector_operands = false;
// No support for 48 extra htbl entries in aes-gcm intrinsic
static const int htbl_entries = -1;
static constexpr bool isSimpleConstant64(jlong value) { static constexpr bool isSimpleConstant64(jlong value) {
// Probably always true, even if a temp register is required. // Probably always true, even if a temp register is required.
return true; return true;

View File

@ -946,7 +946,7 @@ private:
void lastroundDec(XMMRegister key, int rnum); void lastroundDec(XMMRegister key, int rnum);
void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask); void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask);
void gfmul_avx512(XMMRegister ghash, XMMRegister hkey); void gfmul_avx512(XMMRegister ghash, XMMRegister hkey);
void generateHtbl_48_block_zmm(Register htbl); void generateHtbl_48_block_zmm(Register htbl, Register avx512_subkeyHtbl);
void ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, void ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx,
XMMRegister aad_hashx, Register in, Register out, Register data, Register pos, bool reduction, XMMRegister aad_hashx, Register in, Register out, Register data, Register pos, bool reduction,
XMMRegister addmask, bool no_ghash_input, Register rounds, Register ghash_pos, XMMRegister addmask, bool no_ghash_input, Register rounds, Register ghash_pos,
@ -957,7 +957,7 @@ public:
void aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter, void aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
Register len_reg, Register used, Register used_addr, Register saved_encCounter_start); Register len_reg, Register used, Register used_addr, Register saved_encCounter_start);
void aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key, void aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key,
Register state, Register subkeyHtbl, Register counter); Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter);
#endif #endif

View File

@ -1293,7 +1293,7 @@ void MacroAssembler::gfmul_avx512(XMMRegister GH, XMMRegister HK) {
vpternlogq(GH, 0x96, TMP1, TMP2, Assembler::AVX_512bit); vpternlogq(GH, 0x96, TMP1, TMP2, Assembler::AVX_512bit);
} }
void MacroAssembler::generateHtbl_48_block_zmm(Register htbl) { void MacroAssembler::generateHtbl_48_block_zmm(Register htbl, Register avx512_htbl) {
const XMMRegister HK = xmm6; const XMMRegister HK = xmm6;
const XMMRegister ZT5 = xmm4; const XMMRegister ZT5 = xmm4;
const XMMRegister ZT7 = xmm7; const XMMRegister ZT7 = xmm7;
@ -1320,48 +1320,48 @@ void MacroAssembler::generateHtbl_48_block_zmm(Register htbl) {
vpcmpeqd(xmm2, xmm2, xmm12, AVX_128bit); vpcmpeqd(xmm2, xmm2, xmm12, AVX_128bit);
vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit); vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit);
vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit); vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit);
movdqu(Address(htbl, 16 * 56), xmm6); // H ^ 2 movdqu(Address(avx512_htbl, 16 * 47), xmm6); // H ^ 2
// Compute the remaining three powers of H using XMM registers and all following powers using ZMM // Compute the remaining three powers of H using XMM registers and all following powers using ZMM
movdqu(ZT5, HK); movdqu(ZT5, HK);
vinserti32x4(ZT7, ZT7, HK, 3); vinserti32x4(ZT7, ZT7, HK, 3);
gfmul_avx512(ZT5, HK); gfmul_avx512(ZT5, HK);
movdqu(Address(htbl, 16 * 55), ZT5); // H ^ 2 * 2 movdqu(Address(avx512_htbl, 16 * 46), ZT5); // H ^ 2 * 2
vinserti32x4(ZT7, ZT7, ZT5, 2); vinserti32x4(ZT7, ZT7, ZT5, 2);
gfmul_avx512(ZT5, HK); gfmul_avx512(ZT5, HK);
movdqu(Address(htbl, 16 * 54), ZT5); // H ^ 2 * 3 movdqu(Address(avx512_htbl, 16 * 45), ZT5); // H ^ 2 * 3
vinserti32x4(ZT7, ZT7, ZT5, 1); vinserti32x4(ZT7, ZT7, ZT5, 1);
gfmul_avx512(ZT5, HK); gfmul_avx512(ZT5, HK);
movdqu(Address(htbl, 16 * 53), ZT5); // H ^ 2 * 4 movdqu(Address(avx512_htbl, 16 * 44), ZT5); // H ^ 2 * 4
vinserti32x4(ZT7, ZT7, ZT5, 0); vinserti32x4(ZT7, ZT7, ZT5, 0);
evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit); evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit);
evmovdquq(ZT8, ZT7, Assembler::AVX_512bit); evmovdquq(ZT8, ZT7, Assembler::AVX_512bit);
gfmul_avx512(ZT7, ZT5); gfmul_avx512(ZT7, ZT5);
evmovdquq(Address(htbl, 16 * 49), ZT7, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 40), ZT7, Assembler::AVX_512bit);
evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit); evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit);
gfmul_avx512(ZT8, ZT5); gfmul_avx512(ZT8, ZT5);
evmovdquq(Address(htbl, 16 * 45), ZT8, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 36), ZT8, Assembler::AVX_512bit);
gfmul_avx512(ZT7, ZT5); gfmul_avx512(ZT7, ZT5);
evmovdquq(Address(htbl, 16 * 41), ZT7, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 32), ZT7, Assembler::AVX_512bit);
gfmul_avx512(ZT8, ZT5); gfmul_avx512(ZT8, ZT5);
evmovdquq(Address(htbl, 16 * 37), ZT8, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 28), ZT8, Assembler::AVX_512bit);
gfmul_avx512(ZT7, ZT5); gfmul_avx512(ZT7, ZT5);
evmovdquq(Address(htbl, 16 * 33), ZT7, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 24), ZT7, Assembler::AVX_512bit);
gfmul_avx512(ZT8, ZT5); gfmul_avx512(ZT8, ZT5);
evmovdquq(Address(htbl, 16 * 29), ZT8, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 20), ZT8, Assembler::AVX_512bit);
gfmul_avx512(ZT7, ZT5); gfmul_avx512(ZT7, ZT5);
evmovdquq(Address(htbl, 16 * 25), ZT7, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 16), ZT7, Assembler::AVX_512bit);
gfmul_avx512(ZT8, ZT5); gfmul_avx512(ZT8, ZT5);
evmovdquq(Address(htbl, 16 * 21), ZT8, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 12), ZT8, Assembler::AVX_512bit);
gfmul_avx512(ZT7, ZT5); gfmul_avx512(ZT7, ZT5);
evmovdquq(Address(htbl, 16 * 17), ZT7, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 8), ZT7, Assembler::AVX_512bit);
gfmul_avx512(ZT8, ZT5); gfmul_avx512(ZT8, ZT5);
evmovdquq(Address(htbl, 16 * 13), ZT8, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 4), ZT8, Assembler::AVX_512bit);
gfmul_avx512(ZT7, ZT5); gfmul_avx512(ZT7, ZT5);
evmovdquq(Address(htbl, 16 * 9), ZT7, Assembler::AVX_512bit); evmovdquq(Address(avx512_htbl, 16 * 0), ZT7, Assembler::AVX_512bit);
ret(0); ret(0);
} }
@ -1477,8 +1477,8 @@ void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtb
// ZTMP19 & ZTMP20 used for loading hash key // ZTMP19 & ZTMP20 used for loading hash key
// Pre-load hash key // Pre-load hash key
evmovdquq(ZTMP19, Address(subkeyHtbl, i * 64 + 144), Assembler::AVX_512bit); evmovdquq(ZTMP19, Address(subkeyHtbl, i * 64), Assembler::AVX_512bit);
evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64 + 144), Assembler::AVX_512bit); evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit);
// Load data for computing ghash // Load data for computing ghash
evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit);
@ -1499,7 +1499,7 @@ void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtb
// GHASH 4 blocks // GHASH 4 blocks
carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP21, ZTMP19); carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP21, ZTMP19);
// Load the next hkey and Ghash data // Load the next hkey and Ghash data
evmovdquq(ZTMP19, Address(subkeyHtbl, ++i * 64 + 144), Assembler::AVX_512bit); evmovdquq(ZTMP19, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit);
evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit);
@ -1510,7 +1510,7 @@ void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtb
// GHASH 4 blocks(11 to 8) // GHASH 4 blocks(11 to 8)
carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20); carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20);
// Load the next hkey and GDATA // Load the next hkey and GDATA
evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64 + 144), Assembler::AVX_512bit); evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit);
evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit); vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit);
@ -1628,8 +1628,7 @@ void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtb
} }
void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key, void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key,
Register state, Register subkeyHtbl, Register counter) { Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter) {
Label ENC_DEC_DONE, GENERATE_HTBL_48_BLKS, AES_192, AES_256, STORE_CT, GHASH_LAST_32, Label ENC_DEC_DONE, GENERATE_HTBL_48_BLKS, AES_192, AES_256, STORE_CT, GHASH_LAST_32,
AES_32_BLOCKS, GHASH_AES_PARALLEL, LOOP, ACCUMULATE, GHASH_16_AES_16; AES_32_BLOCKS, GHASH_AES_PARALLEL, LOOP, ACCUMULATE, GHASH_16_AES_16;
const XMMRegister CTR_BLOCKx = xmm9; const XMMRegister CTR_BLOCKx = xmm9;
@ -1761,7 +1760,7 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi
// 2) No reduction -> accumulate multiplication values // 2) No reduction -> accumulate multiplication values
// 3) Final reduction post 48 blocks -> new ghash value is computed for the next round // 3) Final reduction post 48 blocks -> new ghash value is computed for the next round
// Reduction value = first time // Reduction value = first time
ghash16_encrypt16_parallel(key, subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
addl(pos, 256); addl(pos, 256);
addl(ghash_pos, 256); addl(ghash_pos, 256);
index += 4; index += 4;
@ -1778,12 +1777,12 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi
// Each call uses 4 subkeyHtbl values, so increment the index by 4. // Each call uses 4 subkeyHtbl values, so increment the index by 4.
bind(GHASH_16_AES_16); bind(GHASH_16_AES_16);
// Reduction value = no reduction // Reduction value = no reduction
ghash16_encrypt16_parallel(key, subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK); ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
addl(pos, 256); addl(pos, 256);
addl(ghash_pos, 256); addl(ghash_pos, 256);
index += 4; index += 4;
// Reduction value = final reduction means that the accumulated values have to be reduced as we have completed 48 blocks of ghash // Reduction value = final reduction means that the accumulated values have to be reduced as we have completed 48 blocks of ghash
ghash16_encrypt16_parallel(key, subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK); ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK);
addl(pos, 256); addl(pos, 256);
addl(ghash_pos, 256); addl(ghash_pos, 256);
// Calculated ghash value needs to be moved to AAD_HASHX so that we can restart the ghash16-aes16 pipeline // Calculated ghash value needs to be moved to AAD_HASHX so that we can restart the ghash16-aes16 pipeline
@ -1792,7 +1791,7 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi
// Restart the pipeline // Restart the pipeline
// Reduction value = first time // Reduction value = first time
ghash16_encrypt16_parallel(key, subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
addl(pos, 256); addl(pos, 256);
addl(ghash_pos, 256); addl(ghash_pos, 256);
index += 4; index += 4;
@ -1812,8 +1811,8 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi
vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit);
vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit);
// Load ghash keys // Load ghash keys
evmovdquq(ZTMP15, Address(subkeyHtbl, rbx, Address::times_1, 0 * 64 + 144), Assembler::AVX_512bit); evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit);
evmovdquq(ZTMP16, Address(subkeyHtbl, rbx, Address::times_1, 1 * 64 + 144), Assembler::AVX_512bit); evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit);
// Ghash blocks 0 - 3 // Ghash blocks 0 - 3
carrylessMultiply(ZTMP2, ZTMP3, ZTMP4, ZTMP1, ZTMP13, ZTMP15); carrylessMultiply(ZTMP2, ZTMP3, ZTMP4, ZTMP1, ZTMP13, ZTMP15);
@ -1837,8 +1836,8 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi
evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit);
vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit);
evmovdquq(ZTMP15, Address(subkeyHtbl, rbx, Address::times_1, 0 * 64 + 144), Assembler::AVX_512bit); evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit);
evmovdquq(ZTMP16, Address(subkeyHtbl, rbx, Address::times_1, 1 * 64 + 144), Assembler::AVX_512bit); evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit);
// ghash blocks 0 - 3 // ghash blocks 0 - 3
carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP13, ZTMP15); carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP13, ZTMP15);
@ -1884,7 +1883,7 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi
jmp(ENC_DEC_DONE); jmp(ENC_DEC_DONE);
bind(GENERATE_HTBL_48_BLKS); bind(GENERATE_HTBL_48_BLKS);
generateHtbl_48_block_zmm(subkeyHtbl); generateHtbl_48_block_zmm(subkeyHtbl, avx512_subkeyHtbl);
bind(ENC_DEC_DONE); bind(ENC_DEC_DONE);
movq(rax, pos); movq(rax, pos);

View File

@ -148,6 +148,9 @@
static const bool int_in_long = false; static const bool int_in_long = false;
#endif #endif
// Number of htbl entries for aes-gcm intrinsic
static const int htbl_entries = 96;
// Does the CPU supports vector variable shift instructions? // Does the CPU supports vector variable shift instructions?
static bool supports_vector_variable_shifts(void) { static bool supports_vector_variable_shifts(void) {
return (UseAVX >= 2); return (UseAVX >= 2);

View File

@ -4412,7 +4412,9 @@ class StubGenerator: public StubCodeGenerator {
const Register state = c_rarg5; const Register state = c_rarg5;
const Address subkeyH_mem(rbp, 2 * wordSize); const Address subkeyH_mem(rbp, 2 * wordSize);
const Register subkeyHtbl = r11; const Register subkeyHtbl = r11;
const Address counter_mem(rbp, 3 * wordSize); const Address avx512_subkeyH_mem(rbp, 3 * wordSize);
const Register avx512_subkeyHtbl = r13;
const Address counter_mem(rbp, 4 * wordSize);
const Register counter = r12; const Register counter = r12;
#else #else
const Address key_mem(rbp, 6 * wordSize); const Address key_mem(rbp, 6 * wordSize);
@ -4421,7 +4423,9 @@ class StubGenerator: public StubCodeGenerator {
const Register state = r13; const Register state = r13;
const Address subkeyH_mem(rbp, 8 * wordSize); const Address subkeyH_mem(rbp, 8 * wordSize);
const Register subkeyHtbl = r14; const Register subkeyHtbl = r14;
const Address counter_mem(rbp, 9 * wordSize); const Address avx512_subkeyH_mem(rbp, 9 * wordSize);
const Register avx512_subkeyHtbl = r12;
const Address counter_mem(rbp, 10 * wordSize);
const Register counter = rsi; const Register counter = rsi;
#endif #endif
__ enter(); __ enter();
@ -4438,9 +4442,10 @@ class StubGenerator: public StubCodeGenerator {
__ movptr(state, state_mem); __ movptr(state, state_mem);
#endif #endif
__ movptr(subkeyHtbl, subkeyH_mem); __ movptr(subkeyHtbl, subkeyH_mem);
__ movptr(avx512_subkeyHtbl, avx512_subkeyH_mem);
__ movptr(counter, counter_mem); __ movptr(counter, counter_mem);
__ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, counter); __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
// Restore state before leaving routine // Restore state before leaving routine
#ifdef _WIN64 #ifdef _WIN64

View File

@ -417,7 +417,7 @@ class methodHandle;
\ \
do_class(com_sun_crypto_provider_galoisCounterMode, "com/sun/crypto/provider/GaloisCounterMode") \ do_class(com_sun_crypto_provider_galoisCounterMode, "com/sun/crypto/provider/GaloisCounterMode") \
do_intrinsic(_galoisCounterMode_AESCrypt, com_sun_crypto_provider_galoisCounterMode, gcm_crypt_name, aes_gcm_signature, F_S) \ do_intrinsic(_galoisCounterMode_AESCrypt, com_sun_crypto_provider_galoisCounterMode, gcm_crypt_name, aes_gcm_signature, F_S) \
do_name(gcm_crypt_name, "implGCMCrypt") \ do_name(gcm_crypt_name, "implGCMCrypt0") \
do_signature(aes_gcm_signature, "([BII[BI[BILcom/sun/crypto/provider/GCTR;Lcom/sun/crypto/provider/GHASH;)I") \ do_signature(aes_gcm_signature, "([BII[BI[BILcom/sun/crypto/provider/GCTR;Lcom/sun/crypto/provider/GHASH;)I") \
\ \
/* support for sun.security.provider.MD5 */ \ /* support for sun.security.provider.MD5 */ \

View File

@ -2488,7 +2488,8 @@ Node* GraphKit::make_runtime_call(int flags,
Node* parm0, Node* parm1, Node* parm0, Node* parm1,
Node* parm2, Node* parm3, Node* parm2, Node* parm3,
Node* parm4, Node* parm5, Node* parm4, Node* parm5,
Node* parm6, Node* parm7) { Node* parm6, Node* parm7,
Node* parm8) {
assert(call_addr != NULL, "must not call NULL targets"); assert(call_addr != NULL, "must not call NULL targets");
// Slow-path call // Slow-path call
@ -2535,7 +2536,8 @@ Node* GraphKit::make_runtime_call(int flags,
if (parm5 != NULL) { call->init_req(TypeFunc::Parms+5, parm5); if (parm5 != NULL) { call->init_req(TypeFunc::Parms+5, parm5);
if (parm6 != NULL) { call->init_req(TypeFunc::Parms+6, parm6); if (parm6 != NULL) { call->init_req(TypeFunc::Parms+6, parm6);
if (parm7 != NULL) { call->init_req(TypeFunc::Parms+7, parm7); if (parm7 != NULL) { call->init_req(TypeFunc::Parms+7, parm7);
/* close each nested if ===> */ } } } } } } } } if (parm8 != NULL) { call->init_req(TypeFunc::Parms+8, parm8);
/* close each nested if ===> */ } } } } } } } } }
assert(call->in(call->req()-1) != NULL, "must initialize all parms"); assert(call->in(call->req()-1) != NULL, "must initialize all parms");
if (!is_leaf) { if (!is_leaf) {

View File

@ -802,7 +802,8 @@ class GraphKit : public Phase {
Node* parm0 = NULL, Node* parm1 = NULL, Node* parm0 = NULL, Node* parm1 = NULL,
Node* parm2 = NULL, Node* parm3 = NULL, Node* parm2 = NULL, Node* parm3 = NULL,
Node* parm4 = NULL, Node* parm5 = NULL, Node* parm4 = NULL, Node* parm5 = NULL,
Node* parm6 = NULL, Node* parm7 = NULL); Node* parm6 = NULL, Node* parm7 = NULL,
Node* parm8 = NULL);
Node* sign_extend_byte(Node* in); Node* sign_extend_byte(Node* in);
Node* sign_extend_short(Node* in); Node* sign_extend_short(Node* in);

View File

@ -6789,11 +6789,23 @@ bool LibraryCallKit::inline_galoisCounterMode_AESCrypt() {
Node* state_start = array_element_address(state, intcon(0), T_LONG); Node* state_start = array_element_address(state, intcon(0), T_LONG);
Node* subkeyHtbl_start = array_element_address(subkeyHtbl, intcon(0), T_LONG); Node* subkeyHtbl_start = array_element_address(subkeyHtbl, intcon(0), T_LONG);
ciKlass* klass = ciTypeArrayKlass::make(T_LONG);
Node* klass_node = makecon(TypeKlassPtr::make(klass));
// htbl entries is set to 96 only for x86-64
if (Matcher::htbl_entries == -1) return false;
// new array to hold 48 computed htbl entries
Node* subkeyHtbl_48_entries = new_array(klass_node, intcon(Matcher::htbl_entries), 0);
if (subkeyHtbl_48_entries == NULL) return false;
Node* subkeyHtbl_48_entries_start = array_element_address(subkeyHtbl_48_entries, intcon(0), T_LONG);
// Call the stub, passing params // Call the stub, passing params
Node* gcmCrypt = make_runtime_call(RC_LEAF|RC_NO_FP, Node* gcmCrypt = make_runtime_call(RC_LEAF|RC_NO_FP,
OptoRuntime::galoisCounterMode_aescrypt_Type(), OptoRuntime::galoisCounterMode_aescrypt_Type(),
stubAddr, stubName, TypePtr::BOTTOM, stubAddr, stubName, TypePtr::BOTTOM,
in_start, len, ct_start, out_start, k_start, state_start, subkeyHtbl_start, cnt_start); in_start, len, ct_start, out_start, k_start, state_start, subkeyHtbl_start, subkeyHtbl_48_entries_start, cnt_start);
// return cipher length (int) // return cipher length (int)
Node* retvalue = _gvn.transform(new ProjNode(gcmCrypt, TypeFunc::Parms)); Node* retvalue = _gvn.transform(new ProjNode(gcmCrypt, TypeFunc::Parms));

View File

@ -958,7 +958,7 @@ const TypeFunc* OptoRuntime::counterMode_aescrypt_Type() {
//for counterMode calls of aescrypt encrypt/decrypt, four pointers and a length, returning int //for counterMode calls of aescrypt encrypt/decrypt, four pointers and a length, returning int
const TypeFunc* OptoRuntime::galoisCounterMode_aescrypt_Type() { const TypeFunc* OptoRuntime::galoisCounterMode_aescrypt_Type() {
// create input type (domain) // create input type (domain)
int num_args = 8; int num_args = 9;
int argcnt = num_args; int argcnt = num_args;
const Type** fields = TypeTuple::fields(argcnt); const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms; int argp = TypeFunc::Parms;
@ -969,6 +969,7 @@ const TypeFunc* OptoRuntime::galoisCounterMode_aescrypt_Type() {
fields[argp++] = TypePtr::NOTNULL; // byte[] key from AESCrypt obj fields[argp++] = TypePtr::NOTNULL; // byte[] key from AESCrypt obj
fields[argp++] = TypePtr::NOTNULL; // long[] state from GHASH obj fields[argp++] = TypePtr::NOTNULL; // long[] state from GHASH obj
fields[argp++] = TypePtr::NOTNULL; // long[] subkeyHtbl from GHASH obj fields[argp++] = TypePtr::NOTNULL; // long[] subkeyHtbl from GHASH obj
fields[argp++] = TypePtr::NOTNULL; // long[] avx512_subkeyHtbl newly created
fields[argp++] = TypePtr::NOTNULL; // byte[] counter from GCTR obj fields[argp++] = TypePtr::NOTNULL; // byte[] counter from GCTR obj
assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); assert(argp == TypeFunc::Parms + argcnt, "correct decoding");

View File

@ -122,7 +122,7 @@ final class GHASH implements Cloneable, GCM {
/* subkeyHtbl and state are stored in long[] for GHASH intrinsic use */ /* subkeyHtbl and state are stored in long[] for GHASH intrinsic use */
// hashtable subkeyHtbl holds 2*57 powers of subkeyH computed using // hashtable subkeyHtbl holds 2*9 powers of subkeyH computed using
// carry-less multiplication // carry-less multiplication
private long[] subkeyHtbl; private long[] subkeyHtbl;
@ -143,9 +143,8 @@ final class GHASH implements Cloneable, GCM {
throw new ProviderException("Internal error"); throw new ProviderException("Internal error");
} }
state = new long[2]; state = new long[2];
// 48 keys for the interleaved implementation,
// 8 for avx-ghash implementation and 1 for the original key // 8 for avx-ghash implementation and 1 for the original key
subkeyHtbl = new long[2*57]; subkeyHtbl = new long[2*9];
subkeyHtbl[0] = (long)asLongView.get(subkeyH, 0); subkeyHtbl[0] = (long)asLongView.get(subkeyH, 0);
subkeyHtbl[1] = (long)asLongView.get(subkeyH, 8); subkeyHtbl[1] = (long)asLongView.get(subkeyH, 8);
} }
@ -266,7 +265,7 @@ final class GHASH implements Cloneable, GCM {
throw new RuntimeException("internal state has invalid length: " + throw new RuntimeException("internal state has invalid length: " +
st.length); st.length);
} }
if (subH.length != 114) { if (subH.length != 18) {
throw new RuntimeException("internal subkeyHtbl has invalid length: " + throw new RuntimeException("internal subkeyHtbl has invalid length: " +
subH.length); subH.length);
} }

View File

@ -86,7 +86,9 @@ abstract class GaloisCounterMode extends CipherSpi {
// data size when buffer is divided up to aid in intrinsics // data size when buffer is divided up to aid in intrinsics
private static final int TRIGGERLEN = 65536; // 64k private static final int TRIGGERLEN = 65536; // 64k
// x86-64 parallel intrinsic data size // x86-64 parallel intrinsic data size
private static final int PARALLEL_LEN = 768; private static final int PARALLEL_LEN = 8192;
// max data size for x86-64 intrinsic
private static final int SPLIT_LEN = 1048576; // 1MB
static final byte[] EMPTY_BUF = new byte[0]; static final byte[] EMPTY_BUF = new byte[0];
@ -570,6 +572,28 @@ abstract class GaloisCounterMode extends CipherSpi {
return j0; return j0;
} }
// Front end for the AES-GCM interleaved intrinsic (implGCMCrypt0).
// Inputs longer than SPLIT_LEN (1MB) are fed to the intrinsic in
// SPLIT_LEN-sized requests so that a single intrinsic call never has to
// encrypt more than that many bytes; whatever is left over afterwards is
// passed down in one final call.
//
// All offsets (in, ct, out) advance by the number of bytes the previous
// calls reported as processed. Returns the total number of processed bytes.
private static int implGCMCrypt(byte[] in, int inOfs, int inLen, byte[] ct,
        int ctOfs, byte[] out, int outOfs,
        GCTR gctr, GHASH ghash) {
    int done = 0;            // bytes reported as processed so far
    int remaining = inLen;   // bytes not yet consumed by implGCMCrypt0

    // Chunking is only entered for inputs strictly larger than SPLIT_LEN;
    // once entered, it keeps going while at least a full chunk is left.
    final boolean chunked = remaining > SPLIT_LEN;
    while (chunked && remaining >= SPLIT_LEN) {
        // Progress is measured by the callee's return value, not by the
        // requested SPLIT_LEN.
        int consumed = implGCMCrypt0(in, inOfs + done, SPLIT_LEN, ct,
                ctOfs + done, out, outOfs + done, gctr, ghash);
        done += consumed;
        remaining -= consumed;
    }

    // Nothing left over: the chunked calls covered the whole input.
    if (remaining <= 0) {
        return done;
    }

    // Tail (or the entire input when it was not larger than SPLIT_LEN).
    return done + implGCMCrypt0(in, inOfs + done, remaining, ct,
            ctOfs + done, out, outOfs + done, gctr, ghash);
}
/** /**
* Intrinsic for Vector AES Galois Counter Mode implementation. * Intrinsic for Vector AES Galois Counter Mode implementation.
* AES and GHASH operations are interleaved in the intrinsic implementation. * AES and GHASH operations are interleaved in the intrinsic implementation.
@ -590,7 +614,7 @@ abstract class GaloisCounterMode extends CipherSpi {
* @return number of processed bytes * @return number of processed bytes
*/ */
@IntrinsicCandidate @IntrinsicCandidate
private static int implGCMCrypt(byte[] in, int inOfs, int inLen, private static int implGCMCrypt0(byte[] in, int inOfs, int inLen,
byte[] ct, int ctOfs, byte[] out, int outOfs, byte[] ct, int ctOfs, byte[] out, int outOfs,
GCTR gctr, GHASH ghash) { GCTR gctr, GHASH ghash) {

View File

@ -98,25 +98,25 @@
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640
* -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
* compiler.codegen.aes.TestAESMain * compiler.codegen.aes.TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DmsgSize=2054 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DmsgSize=8320
* -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
* compiler.codegen.aes.TestAESMain * compiler.codegen.aes.TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DmsgSize=2054 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DmsgSize=8326
* -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
* compiler.codegen.aes.TestAESMain * compiler.codegen.aes.TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 -DmsgSize=2054 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 -DmsgSize=8326
* -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
* compiler.codegen.aes.TestAESMain * compiler.codegen.aes.TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 -DmsgSize=2054 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 -DmsgSize=8326
* -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
* compiler.codegen.aes.TestAESMain * compiler.codegen.aes.TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DmsgSize=2054 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DmsgSize=8326
* -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
* compiler.codegen.aes.TestAESMain * compiler.codegen.aes.TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DmsgSize=2054 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DmsgSize=8326
* -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
* compiler.codegen.aes.TestAESMain * compiler.codegen.aes.TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=2048 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=8320
* -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
* compiler.codegen.aes.TestAESMain * compiler.codegen.aes.TestAESMain
* *