8337632: AES-GCM Algorithm optimization for x86_64

Reviewed-by: jbhateja, sviswanathan
This commit is contained in:
Smita Kamath 2024-09-30 17:00:13 +00:00
parent 5586f83e34
commit a6b318863f
8 changed files with 764 additions and 491 deletions

View File

@ -1919,6 +1919,11 @@ void Assembler::cmpb(Address dst, int imm8) {
emit_int8(imm8);
}
// Emits CMP r/m8, imm8 (opcode 0x80 /7 ib): compares an 8-bit register operand with imm8.
void Assembler::cmpb(Register dst, int imm8) {
// NOTE(review): prefix(dst) is called without any byte-instruction hint. Confirm that
// registers which need a REX prefix for 8-bit access (spl/bpl/sil/dil) are never passed here.
prefix(dst);
// 0xF8 is a ModRM byte with mod=11 and reg=/7 (CMP); presumably emit_arith_b ORs in
// dst's register encoding before emitting opcode, ModRM and imm8 -- verify against its definition.
emit_arith_b(0x80, 0xF8, dst, imm8);
}
void Assembler::cmpl(Address dst, int32_t imm32) {
InstructionMark im(this);
prefix(dst);
@ -9667,6 +9672,15 @@ void Assembler::vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src,
emit_int24(0x3A, (0xC0 | encode), imm8 & 0x01);
}
// VINSERTI64X2 (EVEX.66.0F3A.W1 38 /r ib), register-source form.
//   dst        - destination vector register
//   nds        - first source vector register (encoded in EVEX.vvvv)
//   src        - register supplying the 128-bit chunk to insert
//   imm8       - lane selector; only the low two bits are emitted
//   vector_len - vector length; anything other than AVX_512bit requires AVX512VL
void Assembler::evinserti64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len) {
  assert(VM_Version::supports_avx512dq(), "");
  assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");

  InstructionAttr attrs(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attrs.set_is_evex_instruction();

  // Emit the EVEX prefix and fetch the reg/rm bits for the ModRM byte.
  const int reg_rm_bits = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
                                                VEX_SIMD_66, VEX_OPCODE_0F_3A, &attrs);

  const int modrm_byte    = 0xC0 | reg_rm_bits;  // mod = 11: register-direct operand
  const int lane_selector = imm8 & 0x03;         // keep only the two selector bits
  emit_int24(0x38, modrm_byte, lane_selector);
}
// vinsertf forms
@ -11731,6 +11745,21 @@ void Assembler::vbroadcastf128(XMMRegister dst, Address src, int vector_len) {
emit_operand(dst, src, 0);
}
// VBROADCASTF64X2 (EVEX.66.0F38.W1 1A /r), memory-source form.
//   dst        - destination vector register (must not be xnoreg)
//   src        - memory operand to broadcast from
//   vector_len - vector length; anything other than AVX_512bit requires AVX512VL
void Assembler::evbroadcastf64x2(XMMRegister dst, Address src, int vector_len) {
  assert(VM_Version::supports_avx512dq(), "");
  assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
  assert(dst != xnoreg, "sanity");

  InstructionMark mark(this);

  InstructionAttr attrs(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  // Tuple type and element size are recorded on the attributes for the memory operand.
  attrs.set_address_attributes(/* tuple_type */ EVEX_T2, /* input_size_in_bits */ EVEX_64bit);
  attrs.set_is_evex_instruction();

  // The memory operand goes first and the destination last in the prefix call
  // (src and dst are swapped for encoding); no nds register is used, hence 0.
  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attrs);

  emit_int8(0x1A);            // opcode byte
  emit_operand(dst, src, 0);  // ModRM/SIB/displacement for the memory operand
}
// gpr source broadcast forms
// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL

View File

@ -1239,6 +1239,7 @@ private:
void cmpb(Address dst, int imm8);
void cmpb(Address dst, Register reg);
void cmpb(Register reg, Address dst);
// Compare an 8-bit register operand with an immediate byte (CMP r/m8, imm8).
void cmpb(Register reg, int imm8);
void cmpl(Address dst, int32_t imm32);
void cmpl(Register dst, int32_t imm32);
@ -2986,6 +2987,7 @@ private:
void vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
void vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
// EVEX-only VINSERTI64X2: asserts AVX512DQ, plus AVX512VL when vector_len is not AVX_512bit; only the low two bits of imm8 are encoded.
void evinserti64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len);
// vinsertf forms
void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
@ -3035,6 +3037,7 @@ private:
void vbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
void vbroadcastsd(XMMRegister dst, Address src, int vector_len);
void vbroadcastf128(XMMRegister dst, Address src, int vector_len);
// EVEX-only VBROADCASTF64X2 from memory: asserts AVX512DQ, plus AVX512VL when vector_len is not AVX_512bit.
void evbroadcastf64x2(XMMRegister dst, Address src, int vector_len);
// gpr sourced byte/word/dword/qword replicate
void evpbroadcastb(XMMRegister dst, Register src, int vector_len);

View File

@ -376,11 +376,22 @@ class StubGenerator: public StubCodeGenerator {
void roundDec(XMMRegister key, int rnum);
void lastroundDec(XMMRegister key, int rnum);
void gfmul_avx512(XMMRegister ghash, XMMRegister hkey);
void generateHtbl_48_block_zmm(Register htbl, Register avx512_subkeyHtbl, Register rscratch);
void ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx,
XMMRegister aad_hashx, Register in, Register out, Register data, Register pos, bool reduction,
XMMRegister addmask, bool no_ghash_input, Register rounds, Register ghash_pos,
bool final_reduction, int index, XMMRegister counter_inc_mask);
void ghash16_encrypt_parallel16_avx512(Register in, Register out, Register ct, Register pos, Register avx512_subkeyHtbl,
Register CTR_CHECK, Register NROUNDS, Register key, XMMRegister CTR, XMMRegister GHASH,
XMMRegister ADDBE_4x4, XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHUF_MASK,
bool hk_broadcast, bool is_hash_start, bool do_hash_reduction, bool do_hash_hxor,
bool no_ghash_in, int ghashin_offset, int aesout_offset, int hashkey_offset);
void generateHtbl_32_blocks_avx512(Register htbl, Register avx512_htbl);
void initial_blocks_16_avx512(Register in, Register out, Register ct, Register pos, Register key, Register avx512_subkeyHtbl,
Register CTR_CHECK, Register rounds, XMMRegister CTR, XMMRegister GHASH, XMMRegister ADDBE_4x4,
XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHUF_MASK, int stack_offset);
void gcm_enc_dec_last_avx512(Register len, Register in, Register pos, XMMRegister HASH, XMMRegister SHUFM, Register subkeyHtbl,
int ghashin_offset, int hashkey_offset, bool start_ghash, bool do_reduction);
void ghash16_avx512(bool start_ghash, bool do_reduction, bool uload_shuffle, bool hk_broadcast, bool do_hxor,
Register in, Register pos, Register subkeyHtbl, XMMRegister HASH, XMMRegister SHUFM, int in_offset,
int in_disp, int displacement, int hashkey_offset);
void aesgcm_avx512(Register in, Register len, Register ct, Register out, Register key,
Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter);
// AVX2 AES-GCM related functions
void initial_blocks_avx2(XMMRegister ctr, Register rounds, Register key, Register len,
Register in, Register out, Register ct, XMMRegister aad_hashx, Register pos);

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, 2021, Intel Corporation. All rights reserved.
* Copyright (c) 2019, 2024, Intel Corporation. All rights reserved.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -57,7 +57,10 @@ address StubGenerator::ghash_byte_swap_mask_addr() {
// Polynomial x^128+x^127+x^126+x^121+1
ATTRIBUTE_ALIGNED(16) static const uint64_t GHASH_POLYNOMIAL[] = {
0x0000000000000001UL, 0xC200000000000000UL,
0x0000000000000001ULL, 0xC200000000000000ULL,
0x0000000000000001ULL, 0xC200000000000000ULL,
0x0000000000000001ULL, 0xC200000000000000ULL,
0x0000000000000001ULL, 0xC200000000000000ULL
};
address StubGenerator::ghash_polynomial_addr() {
return (address)GHASH_POLYNOMIAL;

View File

@ -72,7 +72,7 @@ abstract class GaloisCounterMode extends CipherSpi {
// data size when buffer is divided up to aid in intrinsics
private static final int TRIGGERLEN = 65536; // 64k
// x86-64 parallel intrinsic data size
private static final int PARALLEL_LEN = 7680;
private static final int PARALLEL_LEN = 512;
// max data size for x86-64 intrinsic
private static final int SPLIT_LEN = 1048576; // 1MB

View File

@ -35,7 +35,7 @@ import javax.crypto.spec.GCMParameterSpec;
public class AESGCMBench extends BenchBase {
@Param({"128"})
@Param({"128", "192", "256"})
int keyLength;
public static final int IV_MODULO = 16;

View File

@ -45,7 +45,7 @@ public abstract class BenchBase extends CryptoBase {
int keyLength = 256;
// Default data sizes for full tests
@Param({"1024", "1500", "4096", "16384"})
@Param({"128", "256", "512", "1024", "1500", "4096", "16384"})
int dataSize;
static final int IV_BUFFER_SIZE = 36;