From 13e9ea9e922030927775345b1abde1313a6ec03f Mon Sep 17 00:00:00 2001 From: Smita Kamath Date: Fri, 24 Sep 2021 19:21:32 +0000 Subject: [PATCH] 8273297: AES/GCM non-AVX512+VAES CPUs suffer after 8267125 Reviewed-by: ascarpino, sviswanathan, aph --- src/hotspot/cpu/aarch64/matcher_aarch64.hpp | 3 + src/hotspot/cpu/arm/matcher_arm.hpp | 3 + src/hotspot/cpu/ppc/matcher_ppc.hpp | 3 + src/hotspot/cpu/s390/matcher_s390.hpp | 3 + src/hotspot/cpu/x86/macroAssembler_x86.hpp | 4 +- .../cpu/x86/macroAssembler_x86_aes.cpp | 61 +++++++++---------- src/hotspot/cpu/x86/matcher_x86.hpp | 3 + src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 11 +++- src/hotspot/share/classfile/vmIntrinsics.hpp | 2 +- src/hotspot/share/opto/graphKit.cpp | 6 +- src/hotspot/share/opto/graphKit.hpp | 3 +- src/hotspot/share/opto/library_call.cpp | 14 ++++- src/hotspot/share/opto/runtime.cpp | 3 +- .../com/sun/crypto/provider/GHASH.java | 9 ++- .../crypto/provider/GaloisCounterMode.java | 28 ++++++++- .../compiler/codegen/aes/TestAESMain.java | 14 ++--- 16 files changed, 114 insertions(+), 56 deletions(-) diff --git a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp index 0a7f14fa23b..e5bee7990a6 100644 --- a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp @@ -55,6 +55,9 @@ // No support for generic vector operands. static const bool supports_generic_vector_operands = false; + // No support for 48 extra htbl entries in aes-gcm intrinsic + static const int htbl_entries = -1; + static constexpr bool isSimpleConstant64(jlong value) { // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?. // Probably always true, even if a temp register is required. diff --git a/src/hotspot/cpu/arm/matcher_arm.hpp b/src/hotspot/cpu/arm/matcher_arm.hpp index 56f77409a89..b7a9a3f5042 100644 --- a/src/hotspot/cpu/arm/matcher_arm.hpp +++ b/src/hotspot/cpu/arm/matcher_arm.hpp @@ -56,6 +56,9 @@ // No support for generic vector operands. 
static const bool supports_generic_vector_operands = false; + // No support for 48 extra htbl entries in aes-gcm intrinsic + static const int htbl_entries = -1; + static constexpr bool isSimpleConstant64(jlong value) { // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?. return false; diff --git a/src/hotspot/cpu/ppc/matcher_ppc.hpp b/src/hotspot/cpu/ppc/matcher_ppc.hpp index d1bb8b21dbf..df1672b3048 100644 --- a/src/hotspot/cpu/ppc/matcher_ppc.hpp +++ b/src/hotspot/cpu/ppc/matcher_ppc.hpp @@ -57,6 +57,9 @@ // No support for generic vector operands. static const bool supports_generic_vector_operands = false; + // No support for 48 extra htbl entries in aes-gcm intrinsic + static const int htbl_entries = -1; + static constexpr bool isSimpleConstant64(jlong value) { // Probably always true, even if a temp register is required. return true; diff --git a/src/hotspot/cpu/s390/matcher_s390.hpp b/src/hotspot/cpu/s390/matcher_s390.hpp index bc6956c445d..7577a7b2666 100644 --- a/src/hotspot/cpu/s390/matcher_s390.hpp +++ b/src/hotspot/cpu/s390/matcher_s390.hpp @@ -57,6 +57,9 @@ // No support for generic vector operands. static const bool supports_generic_vector_operands = false; + // No support for 48 extra htbl entries in aes-gcm intrinsic + static const int htbl_entries = -1; + static constexpr bool isSimpleConstant64(jlong value) { // Probably always true, even if a temp register is required. 
return true; diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 4ecbe6add71..b44f59a9ce3 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -946,7 +946,7 @@ private: void lastroundDec(XMMRegister key, int rnum); void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask); void gfmul_avx512(XMMRegister ghash, XMMRegister hkey); - void generateHtbl_48_block_zmm(Register htbl); + void generateHtbl_48_block_zmm(Register htbl, Register avx512_subkeyHtbl); void ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, XMMRegister aad_hashx, Register in, Register out, Register data, Register pos, bool reduction, XMMRegister addmask, bool no_ghash_input, Register rounds, Register ghash_pos, @@ -957,7 +957,7 @@ public: void aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter, Register len_reg, Register used, Register used_addr, Register saved_encCounter_start); void aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key, - Register state, Register subkeyHtbl, Register counter); + Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter); #endif diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp index d889ed64cbc..776eebaa684 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp @@ -1293,7 +1293,7 @@ void MacroAssembler::gfmul_avx512(XMMRegister GH, XMMRegister HK) { vpternlogq(GH, 0x96, TMP1, TMP2, Assembler::AVX_512bit); } -void MacroAssembler::generateHtbl_48_block_zmm(Register htbl) { +void MacroAssembler::generateHtbl_48_block_zmm(Register htbl, Register avx512_htbl) { const XMMRegister HK = xmm6; const XMMRegister ZT5 = xmm4; const XMMRegister ZT7 = xmm7; @@ -1320,48 +1320,48 @@ void 
MacroAssembler::generateHtbl_48_block_zmm(Register htbl) { vpcmpeqd(xmm2, xmm2, xmm12, AVX_128bit); vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit); vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit); - movdqu(Address(htbl, 16 * 56), xmm6); // H ^ 2 + movdqu(Address(avx512_htbl, 16 * 47), xmm6); // H ^ 2 // Compute the remaining three powers of H using XMM registers and all following powers using ZMM movdqu(ZT5, HK); vinserti32x4(ZT7, ZT7, HK, 3); gfmul_avx512(ZT5, HK); - movdqu(Address(htbl, 16 * 55), ZT5); // H ^ 2 * 2 + movdqu(Address(avx512_htbl, 16 * 46), ZT5); // H ^ 2 * 2 vinserti32x4(ZT7, ZT7, ZT5, 2); gfmul_avx512(ZT5, HK); - movdqu(Address(htbl, 16 * 54), ZT5); // H ^ 2 * 3 + movdqu(Address(avx512_htbl, 16 * 45), ZT5); // H ^ 2 * 3 vinserti32x4(ZT7, ZT7, ZT5, 1); gfmul_avx512(ZT5, HK); - movdqu(Address(htbl, 16 * 53), ZT5); // H ^ 2 * 4 + movdqu(Address(avx512_htbl, 16 * 44), ZT5); // H ^ 2 * 4 vinserti32x4(ZT7, ZT7, ZT5, 0); evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit); evmovdquq(ZT8, ZT7, Assembler::AVX_512bit); gfmul_avx512(ZT7, ZT5); - evmovdquq(Address(htbl, 16 * 49), ZT7, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 40), ZT7, Assembler::AVX_512bit); evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit); gfmul_avx512(ZT8, ZT5); - evmovdquq(Address(htbl, 16 * 45), ZT8, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 36), ZT8, Assembler::AVX_512bit); gfmul_avx512(ZT7, ZT5); - evmovdquq(Address(htbl, 16 * 41), ZT7, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 32), ZT7, Assembler::AVX_512bit); gfmul_avx512(ZT8, ZT5); - evmovdquq(Address(htbl, 16 * 37), ZT8, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 28), ZT8, Assembler::AVX_512bit); gfmul_avx512(ZT7, ZT5); - evmovdquq(Address(htbl, 16 * 33), ZT7, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 24), ZT7, Assembler::AVX_512bit); gfmul_avx512(ZT8, ZT5); - evmovdquq(Address(htbl, 16 * 29), ZT8, Assembler::AVX_512bit); + 
evmovdquq(Address(avx512_htbl, 16 * 20), ZT8, Assembler::AVX_512bit); gfmul_avx512(ZT7, ZT5); - evmovdquq(Address(htbl, 16 * 25), ZT7, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 16), ZT7, Assembler::AVX_512bit); gfmul_avx512(ZT8, ZT5); - evmovdquq(Address(htbl, 16 * 21), ZT8, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 12), ZT8, Assembler::AVX_512bit); gfmul_avx512(ZT7, ZT5); - evmovdquq(Address(htbl, 16 * 17), ZT7, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 8), ZT7, Assembler::AVX_512bit); gfmul_avx512(ZT8, ZT5); - evmovdquq(Address(htbl, 16 * 13), ZT8, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 4), ZT8, Assembler::AVX_512bit); gfmul_avx512(ZT7, ZT5); - evmovdquq(Address(htbl, 16 * 9), ZT7, Assembler::AVX_512bit); + evmovdquq(Address(avx512_htbl, 16 * 0), ZT7, Assembler::AVX_512bit); ret(0); } @@ -1477,8 +1477,8 @@ void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtb // ZTMP19 & ZTMP20 used for loading hash key // Pre-load hash key - evmovdquq(ZTMP19, Address(subkeyHtbl, i * 64 + 144), Assembler::AVX_512bit); - evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64 + 144), Assembler::AVX_512bit); + evmovdquq(ZTMP19, Address(subkeyHtbl, i * 64), Assembler::AVX_512bit); + evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); // Load data for computing ghash evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); @@ -1499,7 +1499,7 @@ void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtb // GHASH 4 blocks carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP21, ZTMP19); // Load the next hkey and Ghash data - evmovdquq(ZTMP19, Address(subkeyHtbl, ++i * 64 + 144), Assembler::AVX_512bit); + evmovdquq(ZTMP19, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); evmovdquq(ZTMP21, Address(data, ghash_pos, Address::times_1, 2 * 64), 
Assembler::AVX_512bit); vpshufb(ZTMP21, ZTMP21, xmm24, Assembler::AVX_512bit); @@ -1510,7 +1510,7 @@ void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtb // GHASH 4 blocks(11 to 8) carrylessMultiply(ZTMP10, ZTMP12, ZTMP11, ZTMP9, ZTMP22, ZTMP20); // Load the next hkey and GDATA - evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64 + 144), Assembler::AVX_512bit); + evmovdquq(ZTMP20, Address(subkeyHtbl, ++i * 64), Assembler::AVX_512bit); evmovdquq(ZTMP22, Address(data, ghash_pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); vpshufb(ZTMP22, ZTMP22, xmm24, Assembler::AVX_512bit); @@ -1628,8 +1628,7 @@ void MacroAssembler::ghash16_encrypt16_parallel(Register key, Register subkeyHtb } void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key, - Register state, Register subkeyHtbl, Register counter) { - + Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter) { Label ENC_DEC_DONE, GENERATE_HTBL_48_BLKS, AES_192, AES_256, STORE_CT, GHASH_LAST_32, AES_32_BLOCKS, GHASH_AES_PARALLEL, LOOP, ACCUMULATE, GHASH_16_AES_16; const XMMRegister CTR_BLOCKx = xmm9; @@ -1761,7 +1760,7 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi // 2) No reduction -> accumulate multiplication values // 3) Final reduction post 48 blocks -> new ghash value is computed for the next round // Reduction value = first time - ghash16_encrypt16_parallel(key, subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); + ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); addl(pos, 256); addl(ghash_pos, 256); index += 4; @@ -1778,12 +1777,12 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi // Each call uses 4 subkeyHtbl values, so increment the index by 4. 
bind(GHASH_16_AES_16); // Reduction value = no reduction - ghash16_encrypt16_parallel(key, subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK); + ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK); addl(pos, 256); addl(ghash_pos, 256); index += 4; // Reduction value = final reduction means that the accumulated values have to be reduced as we have completed 48 blocks of ghash - ghash16_encrypt16_parallel(key, subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK); + ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK); addl(pos, 256); addl(ghash_pos, 256); // Calculated ghash value needs to be moved to AAD_HASHX so that we can restart the ghash16-aes16 pipeline @@ -1792,7 +1791,7 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi // Restart the pipeline // Reduction value = first time - ghash16_encrypt16_parallel(key, subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); + ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK); addl(pos, 256); addl(ghash_pos, 256); index += 4; @@ -1812,8 +1811,8 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); // Load ghash keys - evmovdquq(ZTMP15, Address(subkeyHtbl, rbx, Address::times_1, 0 * 64 + 144), Assembler::AVX_512bit); - evmovdquq(ZTMP16, Address(subkeyHtbl, rbx, Address::times_1, 1 * 64 + 144), 
Assembler::AVX_512bit); + evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit); + evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit); // Ghash blocks 0 - 3 carrylessMultiply(ZTMP2, ZTMP3, ZTMP4, ZTMP1, ZTMP13, ZTMP15); @@ -1837,8 +1836,8 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi evmovdquq(ZTMP14, Address(ct, ghash_pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); vpshufb(ZTMP13, ZTMP13, xmm24, Assembler::AVX_512bit); vpshufb(ZTMP14, ZTMP14, xmm24, Assembler::AVX_512bit); - evmovdquq(ZTMP15, Address(subkeyHtbl, rbx, Address::times_1, 0 * 64 + 144), Assembler::AVX_512bit); - evmovdquq(ZTMP16, Address(subkeyHtbl, rbx, Address::times_1, 1 * 64 + 144), Assembler::AVX_512bit); + evmovdquq(ZTMP15, Address(avx512_subkeyHtbl, rbx, Address::times_1, 0 * 64), Assembler::AVX_512bit); + evmovdquq(ZTMP16, Address(avx512_subkeyHtbl, rbx, Address::times_1, 1 * 64), Assembler::AVX_512bit); // ghash blocks 0 - 3 carrylessMultiply(ZTMP6, ZTMP7, ZTMP8, ZTMP5, ZTMP13, ZTMP15); @@ -1884,7 +1883,7 @@ void MacroAssembler::aesgcm_encrypt(Register in, Register len, Register ct, Regi jmp(ENC_DEC_DONE); bind(GENERATE_HTBL_48_BLKS); - generateHtbl_48_block_zmm(subkeyHtbl); + generateHtbl_48_block_zmm(subkeyHtbl, avx512_subkeyHtbl); bind(ENC_DEC_DONE); movq(rax, pos); diff --git a/src/hotspot/cpu/x86/matcher_x86.hpp b/src/hotspot/cpu/x86/matcher_x86.hpp index f8c675c87bd..510395f121f 100644 --- a/src/hotspot/cpu/x86/matcher_x86.hpp +++ b/src/hotspot/cpu/x86/matcher_x86.hpp @@ -148,6 +148,9 @@ static const bool int_in_long = false; #endif + // Length in longs of the extra htbl array for aes-gcm intrinsic (48 entries, 2 longs each) + static const int htbl_entries = 96; + // Does the CPU supports vector variable shift instructions? 
static bool supports_vector_variable_shifts(void) { return (UseAVX >= 2); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index acbf0228dca..6ce1ea56479 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4412,7 +4412,9 @@ class StubGenerator: public StubCodeGenerator { const Register state = c_rarg5; const Address subkeyH_mem(rbp, 2 * wordSize); const Register subkeyHtbl = r11; - const Address counter_mem(rbp, 3 * wordSize); + const Address avx512_subkeyH_mem(rbp, 3 * wordSize); + const Register avx512_subkeyHtbl = r13; + const Address counter_mem(rbp, 4 * wordSize); const Register counter = r12; #else const Address key_mem(rbp, 6 * wordSize); @@ -4421,7 +4423,9 @@ class StubGenerator: public StubCodeGenerator { const Register state = r13; const Address subkeyH_mem(rbp, 8 * wordSize); const Register subkeyHtbl = r14; - const Address counter_mem(rbp, 9 * wordSize); + const Address avx512_subkeyH_mem(rbp, 9 * wordSize); + const Register avx512_subkeyHtbl = r12; + const Address counter_mem(rbp, 10 * wordSize); const Register counter = rsi; #endif __ enter(); @@ -4438,9 +4442,10 @@ class StubGenerator: public StubCodeGenerator { __ movptr(state, state_mem); #endif __ movptr(subkeyHtbl, subkeyH_mem); + __ movptr(avx512_subkeyHtbl, avx512_subkeyH_mem); __ movptr(counter, counter_mem); - __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, counter); + __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter); // Restore state before leaving routine #ifdef _WIN64 diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 02aa3cb5615..2a4f54880c9 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -417,7 +417,7 @@ class methodHandle; \ do_class(com_sun_crypto_provider_galoisCounterMode, 
"com/sun/crypto/provider/GaloisCounterMode") \ do_intrinsic(_galoisCounterMode_AESCrypt, com_sun_crypto_provider_galoisCounterMode, gcm_crypt_name, aes_gcm_signature, F_S) \ - do_name(gcm_crypt_name, "implGCMCrypt") \ + do_name(gcm_crypt_name, "implGCMCrypt0") \ do_signature(aes_gcm_signature, "([BII[BI[BILcom/sun/crypto/provider/GCTR;Lcom/sun/crypto/provider/GHASH;)I") \ \ /* support for sun.security.provider.MD5 */ \ diff --git a/src/hotspot/share/opto/graphKit.cpp b/src/hotspot/share/opto/graphKit.cpp index 7fcdd60f886..40d4e690b00 100644 --- a/src/hotspot/share/opto/graphKit.cpp +++ b/src/hotspot/share/opto/graphKit.cpp @@ -2488,7 +2488,8 @@ Node* GraphKit::make_runtime_call(int flags, Node* parm0, Node* parm1, Node* parm2, Node* parm3, Node* parm4, Node* parm5, - Node* parm6, Node* parm7) { + Node* parm6, Node* parm7, + Node* parm8) { assert(call_addr != NULL, "must not call NULL targets"); // Slow-path call @@ -2535,7 +2536,8 @@ Node* GraphKit::make_runtime_call(int flags, if (parm5 != NULL) { call->init_req(TypeFunc::Parms+5, parm5); if (parm6 != NULL) { call->init_req(TypeFunc::Parms+6, parm6); if (parm7 != NULL) { call->init_req(TypeFunc::Parms+7, parm7); - /* close each nested if ===> */ } } } } } } } } + if (parm8 != NULL) { call->init_req(TypeFunc::Parms+8, parm8); + /* close each nested if ===> */ } } } } } } } } } assert(call->in(call->req()-1) != NULL, "must initialize all parms"); if (!is_leaf) { diff --git a/src/hotspot/share/opto/graphKit.hpp b/src/hotspot/share/opto/graphKit.hpp index d2d1043c1ea..d2a6dd8d6c8 100644 --- a/src/hotspot/share/opto/graphKit.hpp +++ b/src/hotspot/share/opto/graphKit.hpp @@ -802,7 +802,8 @@ class GraphKit : public Phase { Node* parm0 = NULL, Node* parm1 = NULL, Node* parm2 = NULL, Node* parm3 = NULL, Node* parm4 = NULL, Node* parm5 = NULL, - Node* parm6 = NULL, Node* parm7 = NULL); + Node* parm6 = NULL, Node* parm7 = NULL, + Node* parm8 = NULL); Node* sign_extend_byte(Node* in); Node* sign_extend_short(Node* in); diff 
--git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 2dbd3dc1c4c..0ac5bf6d502 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -6789,11 +6789,23 @@ bool LibraryCallKit::inline_galoisCounterMode_AESCrypt() { Node* state_start = array_element_address(state, intcon(0), T_LONG); Node* subkeyHtbl_start = array_element_address(subkeyHtbl, intcon(0), T_LONG); + ciKlass* klass = ciTypeArrayKlass::make(T_LONG); + Node* klass_node = makecon(TypeKlassPtr::make(klass)); + + // htbl entries is set to 96 only for x86-64 + if (Matcher::htbl_entries == -1) return false; + + // new array to hold 48 computed htbl entries + Node* subkeyHtbl_48_entries = new_array(klass_node, intcon(Matcher::htbl_entries), 0); + if (subkeyHtbl_48_entries == NULL) return false; + + Node* subkeyHtbl_48_entries_start = array_element_address(subkeyHtbl_48_entries, intcon(0), T_LONG); + // Call the stub, passing params Node* gcmCrypt = make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::galoisCounterMode_aescrypt_Type(), stubAddr, stubName, TypePtr::BOTTOM, - in_start, len, ct_start, out_start, k_start, state_start, subkeyHtbl_start, cnt_start); + in_start, len, ct_start, out_start, k_start, state_start, subkeyHtbl_start, subkeyHtbl_48_entries_start, cnt_start); // return cipher length (int) Node* retvalue = _gvn.transform(new ProjNode(gcmCrypt, TypeFunc::Parms)); diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index 22fa5e48bd7..e3e7e690a50 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ b/src/hotspot/share/opto/runtime.cpp @@ -958,7 +958,7 @@ const TypeFunc* OptoRuntime::counterMode_aescrypt_Type() { //for counterMode calls of aescrypt encrypt/decrypt, four pointers and a length, returning int const TypeFunc* OptoRuntime::galoisCounterMode_aescrypt_Type() { // create input type (domain) - int num_args = 8; + int num_args = 9; int argcnt = num_args; const Type** fields = 
TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; @@ -969,6 +969,7 @@ const TypeFunc* OptoRuntime::galoisCounterMode_aescrypt_Type() { fields[argp++] = TypePtr::NOTNULL; // byte[] key from AESCrypt obj fields[argp++] = TypePtr::NOTNULL; // long[] state from GHASH obj fields[argp++] = TypePtr::NOTNULL; // long[] subkeyHtbl from GHASH obj + fields[argp++] = TypePtr::NOTNULL; // long[] avx512_subkeyHtbl newly created fields[argp++] = TypePtr::NOTNULL; // byte[] counter from GCTR obj assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); diff --git a/src/java.base/share/classes/com/sun/crypto/provider/GHASH.java b/src/java.base/share/classes/com/sun/crypto/provider/GHASH.java index 24cddd2b9f6..8a1a086e433 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/GHASH.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/GHASH.java @@ -122,7 +122,7 @@ final class GHASH implements Cloneable, GCM { /* subkeyHtbl and state are stored in long[] for GHASH intrinsic use */ - // hashtable subkeyHtbl holds 2*57 powers of subkeyH computed using + // hashtable subkeyHtbl holds 9 powers of subkeyH (2 longs each) computed using // carry-less multiplication private long[] subkeyHtbl; @@ -143,9 +143,8 @@ final class GHASH implements Cloneable, GCM { throw new ProviderException("Internal error"); } state = new long[2]; - // 48 keys for the interleaved implementation, - // 8 for avx-ghash implementation and 1 for the original key - subkeyHtbl = new long[2*57]; + // 8 for avx-ghash implementation and 1 for the original key + subkeyHtbl = new long[2*9]; subkeyHtbl[0] = (long)asLongView.get(subkeyH, 0); subkeyHtbl[1] = (long)asLongView.get(subkeyH, 8); } @@ -266,7 +265,7 @@ final class GHASH implements Cloneable, GCM { throw new RuntimeException("internal state has invalid length: " + st.length); } - if (subH.length != 114) { + if (subH.length != 18) { throw new RuntimeException("internal subkeyHtbl has invalid length: " + subH.length); } diff --git 
a/src/java.base/share/classes/com/sun/crypto/provider/GaloisCounterMode.java b/src/java.base/share/classes/com/sun/crypto/provider/GaloisCounterMode.java index 957823b886e..728d463068e 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/GaloisCounterMode.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/GaloisCounterMode.java @@ -86,7 +86,9 @@ abstract class GaloisCounterMode extends CipherSpi { // data size when buffer is divided up to aid in intrinsics private static final int TRIGGERLEN = 65536; // 64k // x86-64 parallel intrinsic data size - private static final int PARALLEL_LEN = 768; + private static final int PARALLEL_LEN = 8192; + // max data size for x86-64 intrinsic + private static final int SPLIT_LEN = 1048576; // 1MB static final byte[] EMPTY_BUF = new byte[0]; @@ -570,6 +572,28 @@ abstract class GaloisCounterMode extends CipherSpi { return j0; } + // Wrapper function around AES-GCM interleaved intrinsic that splits + // large chunks of data into 1MB sized chunks. This is to place + // an upper limit on the number of blocks encrypted in the intrinsic. + private static int implGCMCrypt(byte[] in, int inOfs, int inLen, byte[] ct, + int ctOfs, byte[] out, int outOfs, + GCTR gctr, GHASH ghash) { + + int len = 0; + if (inLen > SPLIT_LEN) { + while (inLen >= SPLIT_LEN) { + int partlen = implGCMCrypt0(in, inOfs + len, SPLIT_LEN, ct, + ctOfs + len, out, outOfs + len, gctr, ghash); + len += partlen; + inLen -= partlen; + } + } + if (inLen > 0) { + len += implGCMCrypt0(in, inOfs + len, inLen, ct, + ctOfs + len, out, outOfs + len, gctr, ghash); + } + return len; + } /** * Intrinsic for Vector AES Galois Counter Mode implementation. * AES and GHASH operations are interleaved in the intrinsic implementation. 
@@ -590,7 +614,7 @@ abstract class GaloisCounterMode extends CipherSpi { * @return number of processed bytes */ @IntrinsicCandidate - private static int implGCMCrypt(byte[] in, int inOfs, int inLen, + private static int implGCMCrypt0(byte[] in, int inOfs, int inLen, byte[] ct, int ctOfs, byte[] out, int outOfs, GCTR gctr, GHASH ghash) { diff --git a/test/hotspot/jtreg/compiler/codegen/aes/TestAESMain.java b/test/hotspot/jtreg/compiler/codegen/aes/TestAESMain.java index 8b76f5cc3d0..9ea3f88fc57 100644 --- a/test/hotspot/jtreg/compiler/codegen/aes/TestAESMain.java +++ b/test/hotspot/jtreg/compiler/codegen/aes/TestAESMain.java @@ -98,25 +98,25 @@ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * compiler.codegen.aes.TestAESMain - * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DmsgSize=2054 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DmsgSize=8320 * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * compiler.codegen.aes.TestAESMain - * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DmsgSize=2054 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DmsgSize=8326 * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * compiler.codegen.aes.TestAESMain - * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 -DmsgSize=2054 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 -DmsgSize=8326 * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. 
* compiler.codegen.aes.TestAESMain - * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 -DmsgSize=2054 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 -DmsgSize=8326 * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * compiler.codegen.aes.TestAESMain - * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DmsgSize=2054 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DmsgSize=8326 * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * compiler.codegen.aes.TestAESMain - * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DmsgSize=2054 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DmsgSize=8326 * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * compiler.codegen.aes.TestAESMain - * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=2048 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=8320 * -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. * compiler.codegen.aes.TestAESMain *