8288047: Accelerate Poly1305 on x86_64 using AVX512 instructions

Reviewed-by: sviswanathan, vlivanov
This commit is contained in:
Volodymyr Paprotski 2022-11-21 21:01:25 +00:00 committed by Sandhya Viswanathan
parent cd6a203a3e
commit f12710e938
32 changed files with 1857 additions and 36 deletions

View File

@ -5008,6 +5008,40 @@ assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
emit_int16(0x04, (0xC0 | encode));
}
// Unmasked VPMADD52LUQ: forwards to the opmask overload, passing k0 as the
// mask register and merge == false.
void Assembler::evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
  const bool merge = false;
  evpmadd52luq(dst, k0, src1, src2, merge, vector_len);
}
// Emits the register-register form of EVEX-encoded VPMADD52LUQ (66 prefix,
// 0F 38 opcode map, opcode byte 0xB4) with an embedded opmask.
//   dst        - destination vector register
//   mask       - opmask register placed in the EVEX prefix
//   src1, src2 - source vector registers
//   merge      - when true the clear-context attribute is reset
//                (merge-masking rather than zeroing-masking)
//   vector_len - vector length encoding, e.g. Assembler::AVX_512bit
// Requires AVX512_IFMA; vector lengths other than 512 bits additionally
// require AVX512VL.
void Assembler::evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  assert(VM_Version::supports_avx512ifma(), "requires AVX512IFMA");
  // Same vector-length guard used by the other EVEX emitters in this file:
  // only the 512-bit form is available without AVX512VL.
  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_is_evex_instruction();
  attributes.set_embedded_opmask_register_specifier(mask);
  if (merge) {
    attributes.reset_is_clear_context();
  }
  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  // Opcode byte followed by a register-direct ModRM byte.
  emit_int16((unsigned char)0xB4, (0xC0 | encode));
}
// Unmasked VPMADD52HUQ: forwards to the opmask overload, passing k0 as the
// mask register and merge == false.
void Assembler::evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
  const bool merge = false;
  evpmadd52huq(dst, k0, src1, src2, merge, vector_len);
}
// Emits the register-register form of EVEX-encoded VPMADD52HUQ (66 prefix,
// 0F 38 opcode map, opcode byte 0xB5) with an embedded opmask.
//   dst        - destination vector register
//   mask       - opmask register placed in the EVEX prefix
//   src1, src2 - source vector registers
//   merge      - when true the clear-context attribute is reset
//                (merge-masking rather than zeroing-masking)
//   vector_len - vector length encoding, e.g. Assembler::AVX_512bit
// Requires AVX512_IFMA; vector lengths other than 512 bits additionally
// require AVX512VL.
void Assembler::evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  assert(VM_Version::supports_avx512ifma(), "requires AVX512IFMA");
  // Same vector-length guard used by the other EVEX emitters in this file:
  // only the 512-bit form is available without AVX512VL.
  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_is_evex_instruction();
  attributes.set_embedded_opmask_register_specifier(mask);
  if (merge) {
    attributes.reset_is_clear_context();
  }
  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  // Opcode byte followed by a register-direct ModRM byte.
  emit_int16((unsigned char)0xB5, (0xC0 | encode));
}
void Assembler::evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(VM_Version::supports_avx512_vnni(), "must support vnni");
@ -5425,6 +5459,42 @@ void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
emit_int16(0x6C, (0xC0 | encode));
}
// Unmasked EVEX form of "Interleave Low Quadwords": forwards to the opmask
// overload, passing k0 as the mask register and merge == false.
void Assembler::evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
  const bool merge = false;
  evpunpcklqdq(dst, k0, src1, src2, merge, vector_len);
}
// EVEX-encoded register-register "Interleave Low Quadwords" with an embedded
// opmask (66 prefix, 0F opcode map, opcode byte 0x6C).
// Asserts AVX512F, plus AVX512VL for any vector length other than 512 bits.
// When merge is true the clear-context attribute is reset before encoding.
void Assembler::evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  assert(VM_Version::supports_evex(), "requires AVX512F");
  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");

  InstructionAttr attrs(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attrs.set_is_evex_instruction();
  attrs.set_embedded_opmask_register_specifier(mask);
  if (merge) {
    attrs.reset_is_clear_context();
  }

  // Register-direct ModRM byte built from the encoded register pair.
  const int modrm = 0xC0 | vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(),
                                                 VEX_SIMD_66, VEX_OPCODE_0F, &attrs);
  emit_int16(0x6C, modrm);
}
// Unmasked EVEX form of "Interleave High Quadwords": forwards to the opmask
// overload, passing k0 as the mask register and merge == false.
void Assembler::evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
  const bool merge = false;
  evpunpckhqdq(dst, k0, src1, src2, merge, vector_len);
}
// EVEX-encoded register-register "Interleave High Quadwords" with an embedded
// opmask (66 prefix, 0F opcode map, opcode byte 0x6D).
// Asserts AVX512F, plus AVX512VL for any vector length other than 512 bits.
// When merge is true the clear-context attribute is reset before encoding.
void Assembler::evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  assert(VM_Version::supports_evex(), "requires AVX512F");
  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");

  InstructionAttr attrs(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attrs.set_is_evex_instruction();
  attrs.set_embedded_opmask_register_specifier(mask);
  if (merge) {
    attrs.reset_is_clear_context();
  }

  // Register-direct ModRM byte built from the encoded register pair.
  const int modrm = 0xC0 | vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(),
                                                 VEX_SIMD_66, VEX_OPCODE_0F, &attrs);
  emit_int16(0x6D, modrm);
}
void Assembler::push(int32_t imm32) {
// in 64bits we push 64bits onto the stack but only
// take a 32bit immediate
@ -5869,6 +5939,18 @@ void Assembler::shrdl(Register dst, Register src, int8_t imm8) {
emit_int32(0x0F, (unsigned char)0xAC, (0xC0 | encode), imm8);
}
#ifdef _LP64
// SHLD with an immediate shift count, quadword form.
// Emits the prefix produced by prefixq_and_encode, the opcode bytes 0F A4,
// a register-direct ModRM byte and finally imm8.
// Note the operand order handed to prefixq_and_encode: src first, dst second.
void Assembler::shldq(Register dst, Register src, int8_t imm8) {
  const int modrm = 0xC0 | prefixq_and_encode(src->encoding(), dst->encoding());
  emit_int32(0x0F, (unsigned char)0xA4, modrm, imm8);
}
// SHRD with an immediate shift count, quadword form.
// Emits the prefix produced by prefixq_and_encode, the opcode bytes 0F AC,
// a register-direct ModRM byte and finally imm8.
// Note the operand order handed to prefixq_and_encode: src first, dst second.
void Assembler::shrdq(Register dst, Register src, int8_t imm8) {
  const int modrm = 0xC0 | prefixq_and_encode(src->encoding(), dst->encoding());
  emit_int32(0x0F, (unsigned char)0xAC, modrm, imm8);
}
#endif
// copies a single word from [esi] to [edi]
void Assembler::smovl() {
emit_int8((unsigned char)0xA5);
@ -7740,11 +7822,12 @@ void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_
emit_operand(dst, src, 0);
}
void Assembler::vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16((unsigned char)0xDB, (0xC0 | encode));
// Unmasked evpandq with a register source: forwards to the opmask overload,
// passing k0 as the mask register and merge == false.
void Assembler::evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  const bool merge = false;
  evpandq(dst, k0, nds, src, merge, vector_len);
}
// Unmasked evpandq with a memory source: forwards to the opmask overload,
// passing k0 as the mask register and merge == false.
void Assembler::evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  const bool merge = false;
  evpandq(dst, k0, nds, src, merge, vector_len);
}
//Variable Shift packed integers logically left.
@ -7857,13 +7940,13 @@ void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_l
emit_operand(dst, src, 0);
}
void Assembler::vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16((unsigned char)0xEB, (0xC0 | encode));
// Unmasked evporq with a register source: forwards to the opmask overload,
// passing k0 as the mask register and merge == false.
void Assembler::evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  const bool merge = false;
  evporq(dst, k0, nds, src, merge, vector_len);
}
// Unmasked evporq with a memory source: forwards to the opmask overload,
// passing k0 as the mask register and merge == false.
void Assembler::evporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  const bool merge = false;
  evporq(dst, k0, nds, src, merge, vector_len);
}
void Assembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
@ -8004,7 +8087,8 @@ void Assembler::evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, Addres
}
void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
@ -8016,7 +8100,8 @@ void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMReg
}
void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit);
@ -8031,7 +8116,8 @@ void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Addres
}
void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
@ -8043,7 +8129,8 @@ void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegi
}
void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit);
@ -8201,8 +8288,8 @@ void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address
}
void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len) {
assert(VM_Version::supports_evex(), "requires EVEX support");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src3->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@ -8211,6 +8298,20 @@ void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegis
emit_int8(imm8);
}
// vpternlogq with a memory third operand.
// Emits an EVEX prefix (66, 0F 3A opcode map, W set), the opcode byte 0x25,
// the memory operand for src3 and finally the imm8 byte.
// Asserts EVEX support, AVX512VL for vector lengths other than 512 bits, and
// that dst is a real register.
void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len) {
  assert(VM_Version::supports_evex(), "requires EVEX support");
  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
  assert(dst != xnoreg, "sanity");

  InstructionMark im(this);
  InstructionAttr attrs(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attrs.set_is_evex_instruction();
  attrs.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);

  vex_prefix(src3, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attrs);
  emit_int8(0x25);
  // NOTE(review): the trailing 1 presumably accounts for the single imm8 byte
  // emitted after the operand - confirm against emit_operand.
  emit_operand(dst, src3, 1);
  emit_int8(imm8);
}
void Assembler::evexpandps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
@ -13452,6 +13553,13 @@ void Assembler::vzeroupper() {
emit_copy(code_section(), vzup_code, vzup_len);
}
void Assembler::vzeroall() {
assert(VM_Version::supports_avx(), "requires AVX");
InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
(void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
emit_int8(0x77);
}
void Assembler::pushq(Address src) {
InstructionMark im(this);
emit_int16(get_prefixq(src), (unsigned char)0xFF);

View File

@ -1891,6 +1891,10 @@ private:
void pmaddwd(XMMRegister dst, XMMRegister src);
void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
// Integer vector fused multiply-add on 52-bit operands (AVX512_IFMA):
// evpmadd52luq accumulates the low half of the product, evpmadd52huq the
// high half. The overloads without a KRegister forward to the masked
// overloads with k0 and merge == false.
void evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
void evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
void evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
void evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
// Multiply add accumulate
void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@ -1990,6 +1994,11 @@ private:
// Interleave Low Quadwords
void punpcklqdq(XMMRegister dst, XMMRegister src);
// EVEX-encoded Interleave Low Quadwords (unmasked and opmask forms)
void evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
void evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
// EVEX-encoded Interleave High Quadwords (unmasked and opmask forms)
void evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
void evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
// Vector sum of absolute difference.
void vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@ -2092,6 +2101,10 @@ private:
void shldl(Register dst, Register src, int8_t imm8);
void shrdl(Register dst, Register src);
void shrdl(Register dst, Register src, int8_t imm8);
// Quadword SHLD/SHRD with an immediate shift count; only declared for
// _LP64 builds.
#ifdef _LP64
void shldq(Register dst, Register src, int8_t imm8);
void shrdq(Register dst, Register src, int8_t imm8);
#endif
void shll(Register dst, int imm8);
void shll(Register dst);
@ -2616,7 +2629,8 @@ private:
void pand(XMMRegister dst, XMMRegister src);
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Andn packed integers
void pandn(XMMRegister dst, XMMRegister src);
@ -2626,7 +2640,8 @@ private:
void por(XMMRegister dst, XMMRegister src);
void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Xor packed integers
void pxor(XMMRegister dst, XMMRegister src);
@ -2640,6 +2655,7 @@ private:
void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len);
void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len);
// Vector compress/expand instructions.
void evpcompressb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
@ -2753,6 +2769,8 @@ private:
// runtime code and native libraries.
void vzeroupper();
void vzeroall();
// Vector double compares
void vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
void evcmppd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,

View File

@ -5255,7 +5255,7 @@ void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMReg
// Get the reverse bit sequence of lower nibble of each byte.
vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
vpandq(dst, xtmp2, src, vec_enc);
evpandq(dst, xtmp2, src, vec_enc);
vpshufb(dst, xtmp1, dst, vec_enc);
vpsllq(dst, dst, 4, vec_enc);
@ -5266,7 +5266,7 @@ void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMReg
// Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
// right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
vporq(xtmp2, dst, xtmp2, vec_enc);
evporq(xtmp2, dst, xtmp2, vec_enc);
vector_reverse_byte(bt, dst, xtmp2, vec_enc);
} else if(vec_enc == Assembler::AVX_512bit) {
@ -5321,11 +5321,11 @@ void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, X
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
XMMRegister xtmp1, Register rtmp, int vec_enc) {
vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
vpandq(dst, xtmp1, src, vec_enc);
evpandq(dst, xtmp1, src, vec_enc);
vpsllq(dst, dst, nbits, vec_enc);
vpandn(xtmp1, xtmp1, src, vec_enc);
vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
vporq(dst, dst, xtmp1, vec_enc);
evporq(dst, dst, xtmp1, vec_enc);
}
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,

View File

@ -1217,6 +1217,19 @@ void MacroAssembler::andptr(Register dst, int32_t imm32) {
LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}
#ifdef _LP64
// andq with an AddressLiteral memory operand.
// If the literal is reachable it is used directly as an Address; otherwise
// its address is first loaded into rscratch and the operand becomes
// [rscratch + 0]. A scratch register must be supplied unless the literal is
// always reachable.
void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (!reachable(src)) {
    // Far literal: materialize the address in the scratch register.
    lea(rscratch, src);
    andq(dst, Address(rscratch, 0));
    return;
  }
  andq(dst, as_Address(src));
}
#endif
void MacroAssembler::atomic_incl(Address counter_addr) {
lock();
incrementl(counter_addr);
@ -9105,6 +9118,40 @@ void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMM
fatal("Unexpected type argument %s", type2name(type)); break;
}
}
// evpandq with an AddressLiteral memory operand.
// If the literal is reachable it is used directly as an Address; otherwise
// its address is first loaded into rscratch and the operand becomes
// [rscratch + 0]. A scratch register must be supplied unless the literal is
// always reachable.
void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (!reachable(src)) {
    // Far literal: materialize the address in the scratch register.
    lea(rscratch, src);
    evpandq(dst, nds, Address(rscratch, 0), vector_len);
    return;
  }
  evpandq(dst, nds, as_Address(src), vector_len);
}
// evporq with an AddressLiteral memory operand.
// If the literal is reachable it is used directly as an Address; otherwise
// its address is first loaded into rscratch and the operand becomes
// [rscratch + 0]. A scratch register must be supplied unless the literal is
// always reachable.
void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (!reachable(src)) {
    // Far literal: materialize the address in the scratch register.
    lea(rscratch, src);
    evporq(dst, nds, Address(rscratch, 0), vector_len);
    return;
  }
  evporq(dst, nds, as_Address(src), vector_len);
}
// vpternlogq whose third operand is an AddressLiteral.
// If the literal is reachable it is used directly as an Address; otherwise
// its address is first loaded into rscratch and the operand becomes
// [rscratch + 0]. A scratch register must be supplied unless the literal is
// always reachable.
void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src3), "missing");

  if (!reachable(src3)) {
    // Far literal: materialize the address in the scratch register.
    lea(rscratch, src3);
    vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
    return;
  }
  vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
}
#if COMPILER2_OR_JVMCI
void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,

View File

@ -730,6 +730,11 @@ public:
void andptr(Register dst, int32_t src);
void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; }
#ifdef _LP64
// Keep the Assembler::andq overloads visible next to the AddressLiteral
// overload declared here.
using Assembler::andq;
// andq with an AddressLiteral operand; rscratch receives the literal's
// address when it is not directly reachable (it may be noreg only if the
// literal is always reachable).
void andq(Register dst, AddressLiteral src, Register rscratch = noreg);
#endif
void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg);
// renamed to drag out the casting of address to int32_t/intptr_t
@ -1754,6 +1759,15 @@ public:
void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
// AddressLiteral overloads of evpandq / evporq / vpternlogq. Each one uses
// the literal directly when it is reachable and otherwise loads its address
// into rscratch first; rscratch may be noreg only if the literal is always
// reachable. The using-declarations keep the Assembler overloads visible.
using Assembler::evpandq;
void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
using Assembler::evporq;
void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
using Assembler::vpternlogq;
void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg);
void alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch);
void anytrue(Register dst, uint masklen, KRegister src, KRegister kscratch);

View File

@ -2519,7 +2519,7 @@ address StubGenerator::generate_base64_decodeBlock() {
// Decode all bytes within our merged input
__ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
__ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
__ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
__ evporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
// Check for error. Compare (decoded | initial) to all invalid.
// If any bytes have their high-order bit set, then we have an error.
@ -3709,6 +3709,10 @@ void StubGenerator::generate_initial() {
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
if (UsePolyIntrinsics) {
StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
}
if (UseCRC32CIntrinsics) {
bool supports_clmul = VM_Version::supports_clmul();
StubRoutines::x86::generate_CRC32C_table(supports_clmul);

View File

@ -387,6 +387,24 @@ class StubGenerator: public StubCodeGenerator {
// Ghash single and multi block operations using AVX instructions
address generate_avx_ghash_processBlocks();
// Poly1305 multiblock using IFMA instructions
address generate_poly1305_processBlocks();
void poly1305_process_blocks_avx512(const Register input, const Register length,
const Register A0, const Register A1, const Register A2,
const Register R0, const Register R1, const Register C1);
void poly1305_multiply_scalar(const Register a0, const Register a1, const Register a2,
const Register r0, const Register r1, const Register c1, bool only128,
const Register t0, const Register t1, const Register t2,
const Register mulql, const Register mulqh);
void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P,
const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H,
const XMMRegister TMP, const Register rscratch);
void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, const Register t0, const Register t1);
void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs, const Register t0, const Register t1);
void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1,
const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG,
const XMMRegister TMP, const Register rscratch);
// BASE64 stubs

File diff suppressed because it is too large Load Diff

View File

@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_
enum platform_dependent_constants {
code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small)
code_size2 = 35300 LP64_ONLY(+35000) WINDOWS_ONLY(+2048) // simply increase if too small (assembler will crash if too small)
code_size2 = 35300 LP64_ONLY(+45000) WINDOWS_ONLY(+2048) // simply increase if too small (assembler will crash if too small)
};
class x86 {

View File

@ -947,6 +947,7 @@ void VM_Version::get_processor_features() {
_features &= ~CPU_AVX512_VBMI;
_features &= ~CPU_AVX512_VBMI2;
_features &= ~CPU_AVX512_BITALG;
_features &= ~CPU_AVX512_IFMA;
}
if (UseAVX < 2)
@ -978,6 +979,7 @@ void VM_Version::get_processor_features() {
_features &= ~CPU_FLUSHOPT;
_features &= ~CPU_GFNI;
_features &= ~CPU_AVX512_BITALG;
_features &= ~CPU_AVX512_IFMA;
}
}
@ -1330,6 +1332,18 @@ void VM_Version::get_processor_features() {
}
#endif // COMPILER2 && ASSERT
#ifdef _LP64
if (supports_avx512ifma() && supports_avx512vlbw() && MaxVectorSize >= 64) {
if (FLAG_IS_DEFAULT(UsePolyIntrinsics)) {
FLAG_SET_DEFAULT(UsePolyIntrinsics, true);
}
} else
#endif
if (UsePolyIntrinsics) {
warning("Intrinsics for Poly1305 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UsePolyIntrinsics, false);
}
#ifdef _LP64
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
UseMultiplyToLenIntrinsic = true;
@ -2894,6 +2908,8 @@ uint64_t VM_Version::feature_flags() {
result |= CPU_AVX512CD;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512dq != 0)
result |= CPU_AVX512DQ;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512ifma != 0)
result |= CPU_AVX512_IFMA;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512pf != 0)
result |= CPU_AVX512PF;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512er != 0)

View File

@ -223,7 +223,9 @@ class VM_Version : public Abstract_VM_Version {
avx512dq : 1,
: 1,
adx : 1,
: 3,
: 1,
avx512ifma : 1,
: 1,
clflushopt : 1,
clwb : 1,
: 1,
@ -387,7 +389,8 @@ protected:
decl(PKU, "pku", 54) /* Protection keys for user-mode pages */ \
decl(OSPKE, "ospke", 55) /* OS enables protection keys */ \
decl(CET_IBT, "cet_ibt", 56) /* Control Flow Enforcement - Indirect Branch Tracking */ \
decl(CET_SS, "cet_ss", 57) /* Control Flow Enforcement - Shadow Stack */
decl(CET_SS, "cet_ss", 57) /* Control Flow Enforcement - Shadow Stack */ \
decl(AVX512_IFMA, "avx512_ifma", 58) /* Integer Vector FMA instructions*/
#define DECLARE_CPU_FEATURE_FLAG(id, name, bit) CPU_##id = (1ULL << bit),
CPU_FEATURE_FLAGS(DECLARE_CPU_FEATURE_FLAG)
@ -667,6 +670,7 @@ public:
static bool supports_adx() { return (_features & CPU_ADX) != 0; }
static bool supports_evex() { return (_features & CPU_AVX512F) != 0; }
static bool supports_avx512dq() { return (_features & CPU_AVX512DQ) != 0; }
static bool supports_avx512ifma() { return (_features & CPU_AVX512_IFMA) != 0; }
static bool supports_avx512pf() { return (_features & CPU_AVX512PF) != 0; }
static bool supports_avx512er() { return (_features & CPU_AVX512ER) != 0; }
static bool supports_avx512cd() { return (_features & CPU_AVX512CD) != 0; }

View File

@ -479,6 +479,9 @@ bool vmIntrinsics::disabled_by_jvm_flags(vmIntrinsics::ID id) {
case vmIntrinsics::_base64_decodeBlock:
if (!UseBASE64Intrinsics) return true;
break;
case vmIntrinsics::_poly1305_processBlocks:
if (!UsePolyIntrinsics) return true;
break;
case vmIntrinsics::_updateBytesCRC32C:
case vmIntrinsics::_updateDirectByteBufferCRC32C:
if (!UseCRC32CIntrinsics) return true;

View File

@ -519,7 +519,7 @@ class methodHandle;
do_class(java_util_Base64_Decoder, "java/util/Base64$Decoder") \
do_intrinsic(_base64_decodeBlock, java_util_Base64_Decoder, decodeBlock_name, decodeBlock_signature, F_R) \
do_name(decodeBlock_name, "decodeBlock") \
do_signature(decodeBlock_signature, "([BII[BIZZ)I") \
do_signature(decodeBlock_signature, "([BII[BIZZ)I") \
\
/* support for com.sun.crypto.provider.GHASH */ \
do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \
@ -527,6 +527,11 @@ class methodHandle;
do_name(processBlocks_name, "processBlocks") \
do_signature(ghash_processBlocks_signature, "([BII[J[J)V") \
\
/* support for com.sun.crypto.provider.Poly1305 */ \
do_class(com_sun_crypto_provider_Poly1305, "com/sun/crypto/provider/Poly1305") \
do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, ghash_processBlocks_signature, F_R) \
do_name(processMultipleBlocks_name, "processMultipleBlocks") \
\
/* support for java.util.zip */ \
do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \
do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \

View File

@ -739,6 +739,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
case vmIntrinsics::_ghash_processBlocks:
case vmIntrinsics::_base64_encodeBlock:
case vmIntrinsics::_base64_decodeBlock:
case vmIntrinsics::_poly1305_processBlocks:
case vmIntrinsics::_updateCRC32:
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:

View File

@ -1166,6 +1166,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
strcmp(call->as_CallLeaf()->_name, "electronicCodeBook_decryptAESCrypt") == 0 ||
strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 ||
strcmp(call->as_CallLeaf()->_name, "galoisCounterMode_AESCrypt") == 0 ||
strcmp(call->as_CallLeaf()->_name, "poly1305_processBlocks") == 0 ||
strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
strcmp(call->as_CallLeaf()->_name, "encodeBlock") == 0 ||
strcmp(call->as_CallLeaf()->_name, "decodeBlock") == 0 ||

View File

@ -612,6 +612,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
return inline_base64_encodeBlock();
case vmIntrinsics::_base64_decodeBlock:
return inline_base64_decodeBlock();
case vmIntrinsics::_poly1305_processBlocks:
return inline_poly1305_processBlocks();
case vmIntrinsics::_encodeISOArray:
case vmIntrinsics::_encodeByteISOArray:
@ -6962,6 +6964,42 @@ bool LibraryCallKit::inline_base64_decodeBlock() {
return true;
}
bool LibraryCallKit::inline_poly1305_processBlocks() {
address stubAddr;
const char *stubName;
assert(UsePolyIntrinsics, "need Poly intrinsics support");
assert(callee()->signature()->size() == 5, "poly1305_processBlocks has %d parameters", callee()->signature()->size());
stubAddr = StubRoutines::poly1305_processBlocks();
stubName = "poly1305_processBlocks";
if (!stubAddr) return false;
null_check_receiver(); // null-check receiver
if (stopped()) return true;
Node* input = argument(1);
Node* input_offset = argument(2);
Node* len = argument(3);
Node* alimbs = argument(4);
Node* rlimbs = argument(5);
input = must_be_not_null(input, true);
alimbs = must_be_not_null(alimbs, true);
rlimbs = must_be_not_null(rlimbs, true);
Node* input_start = array_element_address(input, input_offset, T_BYTE);
assert(input_start, "input array is NULL");
Node* acc_start = array_element_address(alimbs, intcon(0), T_LONG);
assert(acc_start, "acc array is NULL");
Node* r_start = array_element_address(rlimbs, intcon(0), T_LONG);
assert(r_start, "r array is NULL");
Node* call = make_runtime_call(RC_LEAF | RC_NO_FP,
OptoRuntime::poly1305_processBlocks_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
input_start, len, acc_start, r_start);
return true;
}
//------------------------------inline_digestBase_implCompress-----------------------
//
// Calculate MD5 for single-block byte[] array.

View File

@ -293,6 +293,7 @@ class LibraryCallKit : public GraphKit {
bool inline_ghash_processBlocks();
bool inline_base64_encodeBlock();
bool inline_base64_decodeBlock();
bool inline_poly1305_processBlocks();
bool inline_digestBase_implCompress(vmIntrinsics::ID id);
bool inline_digestBase_implCompressMB(int predicate);
bool inline_digestBase_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass,

View File

@ -1266,6 +1266,26 @@ const TypeFunc* OptoRuntime::base64_decodeBlock_Type() {
return TypeFunc::make(domain, range);
}
// Call signature of the Poly1305 processMultipleBlocks stub:
//   (input pointer, int length, accumulator pointer, r pointer) -> void
const TypeFunc* OptoRuntime::poly1305_processBlocks_Type() {
  const int argcnt = 4;

  // Domain: the four stub arguments, in call order.
  const Type** domain_fields = TypeTuple::fields(argcnt);
  int argp = TypeFunc::Parms;
  domain_fields[argp++] = TypePtr::NOTNULL;  // input array
  domain_fields[argp++] = TypeInt::INT;      // input length
  domain_fields[argp++] = TypePtr::NOTNULL;  // accumulator array
  domain_fields[argp++] = TypePtr::NOTNULL;  // r array
  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, domain_fields);

  // Range: a result slot is still allocated, but it is left NULL (void).
  const Type** range_fields = TypeTuple::fields(1);
  range_fields[TypeFunc::Parms + 0] = NULL;  // void
  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, range_fields);

  return TypeFunc::make(domain, range);
}
//------------- Interpreter state access for on stack replacement
const TypeFunc* OptoRuntime::osr_end_Type() {
// create input type (domain)

View File

@ -280,6 +280,7 @@ private:
static const TypeFunc* ghash_processBlocks_Type();
static const TypeFunc* base64_encodeBlock_Type();
static const TypeFunc* base64_decodeBlock_Type();
static const TypeFunc* poly1305_processBlocks_Type();
static const TypeFunc* updateBytesCRC32_Type();
static const TypeFunc* updateBytesCRC32C_Type();

View File

@ -238,6 +238,9 @@ const int ObjectAlignmentInBytes = 8;
product(bool, UseBASE64Intrinsics, false, \
"Use intrinsics for java.util.Base64") \
\
product(bool, UsePolyIntrinsics, false, DIAGNOSTIC, \
"Use intrinsics for sun.security.util.math.intpoly") \
\
product(size_t, LargePageSizeInBytes, 0, \
"Maximum large page size used (0 will use the default large " \
"page size for the environment as the maximum)") \

View File

@ -130,6 +130,7 @@ address StubRoutines::_galoisCounterMode_AESCrypt = NULL;
address StubRoutines::_ghash_processBlocks = NULL;
address StubRoutines::_base64_encodeBlock = NULL;
address StubRoutines::_base64_decodeBlock = NULL;
address StubRoutines::_poly1305_processBlocks = NULL;
address StubRoutines::_md5_implCompress = NULL;
address StubRoutines::_md5_implCompressMB = NULL;

View File

@ -211,6 +211,7 @@ class StubRoutines: AllStatic {
static address _ghash_processBlocks;
static address _base64_encodeBlock;
static address _base64_decodeBlock;
static address _poly1305_processBlocks;
static address _md5_implCompress;
static address _md5_implCompressMB;
@ -384,6 +385,7 @@ class StubRoutines: AllStatic {
static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; }
static address electronicCodeBook_encryptAESCrypt() { return _electronicCodeBook_encryptAESCrypt; }
static address electronicCodeBook_decryptAESCrypt() { return _electronicCodeBook_decryptAESCrypt; }
static address poly1305_processBlocks() { return _poly1305_processBlocks; }
static address counterMode_AESCrypt() { return _counterMode_AESCrypt; }
static address ghash_processBlocks() { return _ghash_processBlocks; }
static address base64_encodeBlock() { return _base64_encodeBlock; }

View File

@ -544,6 +544,7 @@
static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _base64_encodeBlock, address) \
static_field(StubRoutines, _base64_decodeBlock, address) \
static_field(StubRoutines, _poly1305_processBlocks, address) \
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _crc32c_table_addr, address) \

View File

@ -34,6 +34,8 @@ import java.util.Objects;
import sun.security.util.math.*;
import sun.security.util.math.intpoly.*;
import jdk.internal.vm.annotation.IntrinsicCandidate;
import jdk.internal.vm.annotation.ForceInline;
/**
* This class represents the Poly1305 function defined in RFC 7539.
@ -59,8 +61,10 @@ final class Poly1305 {
private IntegerModuloP s;
private MutableIntegerModuloP a;
private final MutableIntegerModuloP n = ipl1305.get1().mutable();
private final boolean checkWeakKey;
Poly1305() { }
Poly1305() { this(true); }
Poly1305(boolean checkKey) { checkWeakKey = checkKey; }
/**
* Initialize the Poly1305 object
@ -165,11 +169,15 @@ final class Poly1305 {
blockOffset = 0;
}
}
while (len >= BLOCK_LENGTH) {
processBlock(input, offset, BLOCK_LENGTH);
offset += BLOCK_LENGTH;
len -= BLOCK_LENGTH;
}
int blockMultipleLength = len & (~(BLOCK_LENGTH-1));
long[] aLimbs = a.getLimbs();
long[] rLimbs = r.getLimbs();
processMultipleBlocksCheck(input, offset, blockMultipleLength, aLimbs, rLimbs);
processMultipleBlocks(input, offset, blockMultipleLength, aLimbs, rLimbs);
offset += blockMultipleLength;
len -= blockMultipleLength;
if (len > 0) { // and len < BLOCK_LENGTH
System.arraycopy(input, offset, block, 0, len);
blockOffset = len;
@ -235,12 +243,35 @@ final class Poly1305 {
a.setProduct(r); // a = (a * r) % p
}
// Intrinsified method. aLimbs and rLimbs are not read by the Java code below;
// they exist for the intrinsic and correspond to this.a and this.r respectively.
@ForceInline
@IntrinsicCandidate
private void processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) {
    // Java fallback: feed the input to processBlock one full block at a time,
    // stopping as soon as less than BLOCK_LENGTH bytes remain.
    for (int remaining = length; remaining >= BLOCK_LENGTH; remaining -= BLOCK_LENGTH) {
        processBlock(input, offset, BLOCK_LENGTH);
        offset += BLOCK_LENGTH;
    }
}
/**
 * Validates the arguments handed to processMultipleBlocks.
 *
 * @param input  message bytes
 * @param offset start of the range inside {@code input}
 * @param length number of bytes in the range
 * @param aLimbs accumulator limbs; must have exactly 5 entries
 * @param rLimbs R limbs; must have exactly 5 entries
 * @throws IndexOutOfBoundsException if [offset, offset+length) is not inside {@code input}
 * @throws RuntimeException if either limb array does not have 5 entries
 */
private static void processMultipleBlocksCheck(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) {
    // The range check comes first, then the accumulator, then R.
    Objects.checkFromIndexSize(offset, length, input.length);

    final int expectedLimbs = 5; // Intrinsic expects exactly 5 limbs

    final int accumulatorLimbs = aLimbs.length;
    if (accumulatorLimbs != expectedLimbs) {
        throw new RuntimeException("invalid accumulator length: " + accumulatorLimbs);
    }

    final int rLimbCount = rLimbs.length;
    if (rLimbCount != expectedLimbs) {
        throw new RuntimeException("invalid R length: " + rLimbCount);
    }
}
/**
* Partition the authentication key into the R and S components, clamp
* the R value, and instantiate IntegerModuloP objects to R and S's
* numeric values.
*/
private void setRSVals() {
private void setRSVals() throws InvalidKeyException {
// Clamp the bytes in the "r" half of the key.
keyBytes[3] &= 15;
keyBytes[7] &= 15;
@ -250,6 +281,24 @@ final class Poly1305 {
keyBytes[8] &= (byte)252;
keyBytes[12] &= (byte)252;
if (checkWeakKey) {
byte keyIsZero = 0;
for (int i = 0; i < RS_LENGTH; i++) {
keyIsZero |= keyBytes[i];
}
if (keyIsZero == 0) {
throw new InvalidKeyException("R is set to zero");
}
keyIsZero = 0;
for (int i = RS_LENGTH; i < 2*RS_LENGTH; i++) {
keyIsZero |= keyBytes[i];
}
if (keyIsZero == 0) {
throw new InvalidKeyException("S is set to zero");
}
}
// Create IntegerModuloP elements from the r and s values
r = ipl1305.getElement(keyBytes, 0, RS_LENGTH, (byte)0);
s = ipl1305.getElement(keyBytes, RS_LENGTH, RS_LENGTH, (byte)0);

View File

@ -153,6 +153,11 @@ public interface IntegerModuloP {
*/
void asByteArray(byte[] result);
/**
* Break encapsulation, used for IntrinsicCandidate functions.
*
* @return the limbs of this element as a long array.
*         NOTE(review): whether this is a copy or the live backing array
*         is decided by the implementing class - confirm before mutating
*         the returned array.
*/
long[] getLimbs();
/**
* Compute the multiplicative inverse of this field element.
*

View File

@ -626,6 +626,10 @@ public abstract sealed class IntegerPolynomial implements IntegerFieldModuloP
}
limbsToByteArray(limbs, result);
}
// Hands out the limbs field itself; no copy is made, so the caller
// shares the array with this element.
public long[] getLimbs() {
return limbs;
}
}
protected class MutableElement extends Element

View File

@ -231,6 +231,7 @@ public class AMD64 extends Architecture {
OSPKE,
CET_IBT,
CET_SS,
AVX512_IFMA,
}
private final EnumSet<CPUFeature> features;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -28,3 +28,41 @@
* @run main java.base/com.sun.crypto.provider.Poly1305UnitTest
* @summary Unit test for com.sun.crypto.provider.Poly1305.
*/
/*
* @test
* @key randomness
* @modules java.base/com.sun.crypto.provider
* @run main java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest
* @summary Unit test for com.sun.crypto.provider.Poly1305.
*/
/*
* @test
* @modules java.base/com.sun.crypto.provider
* @run main java.base/com.sun.crypto.provider.Poly1305KAT
* @summary Unit test for com.sun.crypto.provider.Poly1305.
*/
/*
* @test
* @key randomness
* @modules java.base/com.sun.crypto.provider
* @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305.
* @run main/othervm -Xcomp -XX:-TieredCompilation java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest
*/
/*
* @test
* @modules java.base/com.sun.crypto.provider
* @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305.
* @run main/othervm -Xcomp -XX:-TieredCompilation java.base/com.sun.crypto.provider.Poly1305KAT
*/
package com.sun.crypto.provider.Cipher.ChaCha20;
/**
 * Placeholder class for the jtreg test descriptions in this file's header
 * comments. Its own entry point does no testing; it only prints a status line.
 */
public class Poly1305UnitTestDriver {
    public static void main(String[] args) {
        final String status = "Passed";
        System.out.println(status);
    }
}

View File

@ -0,0 +1,95 @@
/*
* Copyright (c) 2022, Intel Corporation. All rights reserved.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package com.sun.crypto.provider;
import java.nio.ByteBuffer;
import java.util.Arrays;
import javax.crypto.spec.SecretKeySpec;
// This test case relies on the fact that single-byte Poly1305.engineUpdate(byte) does not have an intrinsic
// In this way we can compare if the intrinsic and pure java produce same result
// This test case is NOT entirely deterministic, it uses a random seed for pseudo-random number generator
// If a failure occurs, hardcode the seed to make the test case deterministic
public class Poly1305IntrinsicFuzzTest {
    /**
     * Runs the randomized comparison a fixed number of times.
     *
     * @param args unused
     * @throws Exception if any iteration fails
     */
    public static void main(String[] args) throws Exception {
        //Note: it might be useful to increase this number during development of new Poly1305 intrinsics
        final int repeat = 100;
        for (int i = 0; i < repeat; i++) {
            run();
        }
        System.out.println("Fuzz Success");
    }

    /**
     * One fuzz iteration: feeds the same random key and random message to two
     * Poly1305 instances, one through the array-based engineUpdate and one
     * byte-by-byte through slowUpdate, and compares the resulting tags.
     *
     * @throws RuntimeException if the MAC length is not 16 or the two tags differ
     *         (the mismatch message carries the seed of this iteration)
     * @throws Exception if initialising the authenticators fails
     */
    public static void run() throws Exception {
        java.util.Random rnd = new java.util.Random();
        // Draw the seed from the generator itself so it can be reported on failure.
        long seed = rnd.nextLong();
        rnd.setSeed(seed);

        byte[] key = new byte[32];
        rnd.nextBytes(key);
        int msgLen = rnd.nextInt(128, 4096); // x86_64 intrinsic requires 256 bytes minimum
        byte[] message = new byte[msgLen];
        // Randomize the message content as well; without this every iteration
        // would authenticate an all-zero message and only the lengths would vary.
        rnd.nextBytes(message);

        Poly1305 authenticator = new Poly1305();
        Poly1305 authenticatorSlow = new Poly1305();
        if (authenticator.engineGetMacLength() != 16) {
            throw new RuntimeException("The length of Poly1305 MAC must be 16-bytes.");
        }

        authenticator.engineInit(new SecretKeySpec(key, 0, 32, "Poly1305"), null);
        authenticatorSlow.engineInit(new SecretKeySpec(key, 0, 32, "Poly1305"), null);

        if (rnd.nextBoolean()) {
            // Prime just the buffer and/or accumulator (buffer can keep at most 16 bytes from previous engineUpdate)
            int initDataLen = rnd.nextInt(8, 24);
            authenticator.engineUpdate(message, 0, initDataLen);
            slowUpdate(authenticatorSlow, message, 0, initDataLen);
        }

        if (rnd.nextBoolean()) {
            // Multiple calls to engineUpdate
            authenticator.engineUpdate(message, 0, message.length);
            slowUpdate(authenticatorSlow, message, 0, message.length);
        }

        authenticator.engineUpdate(message, 0, message.length);
        slowUpdate(authenticatorSlow, message, 0, message.length);

        byte[] tag = authenticator.engineDoFinal();
        byte[] tagSlow = authenticatorSlow.engineDoFinal();

        if (!Arrays.equals(tag, tagSlow)) {
            throw new RuntimeException("[Seed "+seed+"] Tag mismatch: " + Arrays.toString(tag) + " != " + Arrays.toString(tagSlow));
        }
    }

    /**
     * Feeds a slice of the message to the authenticator one byte at a time
     * through the single-byte engineUpdate.
     *
     * @param authenticator the Poly1305 instance to update
     * @param message source bytes
     * @param offset index of the first byte to feed
     * @param len number of bytes requested; the slice is clipped at message.length
     */
    static void slowUpdate(Poly1305 authenticator, byte[] message, int offset, int len) {
        // From here on len is the exclusive end index, not a count.
        len = Math.min(message.length, offset + len);
        for (int i = offset; i < len; i++) {
            authenticator.engineUpdate(message[i]);
        }
    }
}

View File

@ -0,0 +1,199 @@
/*
* Copyright (c) 2022, Intel Corporation. All rights reserved.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package com.sun.crypto.provider;
import java.util.*;
import java.nio.ByteBuffer;
import java.util.Arrays;
import javax.crypto.spec.SecretKeySpec;
public class Poly1305KAT {
/**
 * One known-answer test vector. The key, input and expected output are
 * supplied as hex strings and stored as decoded byte arrays.
 */
public static class TestData {
    public final String testName;
    public final byte[] key;
    public final byte[] input;
    public final byte[] expOutput;

    /**
     * @param name     description of the vector
     * @param keyStr   key as a hex string
     * @param inputStr message as a hex string
     * @param outStr   expected tag as a hex string
     * @throws NullPointerException if any argument is null
     */
    public TestData(String name, String keyStr, String inputStr, String outStr) {
        final HexFormat parser = HexFormat.of();
        this.testName = Objects.requireNonNull(name);
        this.key = parser.parseHex(Objects.requireNonNull(keyStr));
        this.input = parser.parseHex(Objects.requireNonNull(inputStr));
        this.expOutput = parser.parseHex(Objects.requireNonNull(outStr));
    }
}
public static final List<TestData> testList = new LinkedList<TestData>() {{
add(new TestData("RFC 7539 A.3 Test Vector #1",
"0000000000000000000000000000000000000000000000000000000000000000",
"0000000000000000000000000000000000000000000000000000000000000000" +
"0000000000000000000000000000000000000000000000000000000000000000",
"00000000000000000000000000000000"));
add(new TestData("RFC 7539 A.3 Test Vector #2",
"0000000000000000000000000000000036e5f6b5c5e06070f0efca96227a863e",
"416e79207375626d697373696f6e20746f20746865204945544620696e74656e" +
"6465642062792074686520436f6e7472696275746f7220666f72207075626c69" +
"636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" +
"20496e7465726e65742d4472616674206f722052464320616e6420616e792073" +
"746174656d656e74206d6164652077697468696e2074686520636f6e74657874" +
"206f6620616e204945544620616374697669747920697320636f6e7369646572" +
"656420616e20224945544620436f6e747269627574696f6e222e205375636820" +
"73746174656d656e747320696e636c756465206f72616c2073746174656d656e" +
"747320696e20494554462073657373696f6e732c2061732077656c6c20617320" +
"7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" +
"74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" +
"207768696368206172652061646472657373656420746f",
"36e5f6b5c5e06070f0efca96227a863e"));
add(new TestData("RFC 7539 A.3 Test Vector #3",
"36e5f6b5c5e06070f0efca96227a863e00000000000000000000000000000000",
"416e79207375626d697373696f6e20746f20746865204945544620696e74656e" +
"6465642062792074686520436f6e7472696275746f7220666f72207075626c69" +
"636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" +
"20496e7465726e65742d4472616674206f722052464320616e6420616e792073" +
"746174656d656e74206d6164652077697468696e2074686520636f6e74657874" +
"206f6620616e204945544620616374697669747920697320636f6e7369646572" +
"656420616e20224945544620436f6e747269627574696f6e222e205375636820" +
"73746174656d656e747320696e636c756465206f72616c2073746174656d656e" +
"747320696e20494554462073657373696f6e732c2061732077656c6c20617320" +
"7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" +
"74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" +
"207768696368206172652061646472657373656420746f",
"f3477e7cd95417af89a6b8794c310cf0"));
add(new TestData("RFC 7539 A.3 Test Vector #4",
"1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0",
"2754776173206272696c6c69672c20616e642074686520736c6974687920746f" +
"7665730a446964206779726520616e642067696d626c6520696e207468652077" +
"6162653a0a416c6c206d696d737920776572652074686520626f726f676f7665" +
"732c0a416e6420746865206d6f6d65207261746873206f757467726162652e",
"4541669a7eaaee61e708dc7cbcc5eb62"));
add(new TestData("RFC 7539 A.3 Test Vector #5: If one uses 130-bit partial reduction, does the code handle the case where partially reducedfinal result is not fully reduced?",
"0200000000000000000000000000000000000000000000000000000000000000",
"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
"03000000000000000000000000000000"));
add(new TestData("RFC 7539 A.3 Test Vector #6: What happens if addition of s overflows modulo 2^128?",
"02000000000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
"02000000000000000000000000000000",
"03000000000000000000000000000000"));
add(new TestData("RFC 7539 A.3 Test Vector #7: What happens if data limb is all ones and there is carry from lower limb?",
"0100000000000000000000000000000000000000000000000000000000000000",
"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF" +
"11000000000000000000000000000000",
"05000000000000000000000000000000"));
add(new TestData("RFC 7539 A.3 Test Vector #8: What happens if final result from polynomial part is exactly 2^130-5?",
"0100000000000000000000000000000000000000000000000000000000000000",
"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFEFEFEFEFEFEFEFEFEFEFEFEFEFEFE" +
"01010101010101010101010101010101",
"00000000000000000000000000000000"));
add(new TestData("RFC 7539 A.3 Test Vector #9: What happens if final result from polynomial part is exactly 2^130-6?",
"0200000000000000000000000000000000000000000000000000000000000000",
"FDFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
"FAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"));
add(new TestData("RFC 7539 A.3 Test Vector #10: What happens if 5*H+L-type reduction produces 131-bit intermediate result?",
"0100000000000000040000000000000000000000000000000000000000000000",
"E33594D7505E43B900000000000000003394D7505E4379CD0100000000000000" +
"0000000000000000000000000000000001000000000000000000000000000000",
"14000000000000005500000000000000"));
add(new TestData("RFC 7539 A.3 Test Vector #11: What happens if 5*H+L-type reduction produces 131-bit final result?",
"0100000000000000040000000000000000000000000000000000000000000000",
"E33594D7505E43B900000000000000003394D7505E4379CD0100000000000000" +
"00000000000000000000000000000000",
"13000000000000000000000000000000"));
}};
/**
 * Runs every vector in testList, printing a banner line per vector.
 *
 * @param args unused
 * @throws RuntimeException if at least one vector did not pass
 * @throws Exception if a test run itself fails
 */
public static void main(String[] args) throws Exception {
    int executed = 0;
    int failures = 0;

    for (TestData vector : testList) {
        executed++;
        System.out.println("*** Test " + executed + ": " +
                vector.testName);
        if (!runSingleTest(vector)) {
            failures++;
        }
    }
    System.out.println();

    // Every vector is run before the overall verdict is given.
    if (failures > 0) {
        throw new RuntimeException("One or more tests failed. " +
                "Check output for details");
    }
}
/**
 * Computes the tag for one test vector and compares it with the expected
 * output, printing both values on a mismatch.
 *
 * @param testData the vector to run
 * @return true if the computed tag equals the expected output, false otherwise
 * @throws Exception if initialising the authenticator fails
 */
private static boolean runSingleTest(TestData testData) throws Exception {
    // Constructed with checkKey == false.
    final Poly1305 mac = new Poly1305(false);
    final byte[] keyBytes = testData.key;
    mac.engineInit(new SecretKeySpec(keyBytes, 0, keyBytes.length, "Poly1305"), null);

    final byte[] message = testData.input;
    mac.engineUpdate(message, 0, message.length);

    final byte[] actual = mac.engineDoFinal();
    final byte[] expected = testData.expOutput;
    if (Arrays.equals(actual, expected)) {
        return true;
    }

    System.out.println("ERROR - Output Mismatch!");
    System.out.println("Expected:\n" +
            dumpHexBytes(expected, expected.length, "\n", " "));
    System.out.println("Actual:\n" +
            dumpHexBytes(actual, actual.length, "\n", " "));
    System.out.println();
    return false;
}
/**
 * Render a byte array as upper-case, two-digit hex values.
 *
 * @param data the bytes to render
 * @param itemsPerLine the number of bytes after which {@code lineDelim}
 *        is inserted; it is used as a modulus, so it must not be zero
 *        when {@code data} is non-empty
 * @param lineDelim the delimiter placed between lines
 * @param itemDelim the delimiter appended after every byte
 *
 * @return the hex dump of the byte array
 */
private static String dumpHexBytes(byte[] data, int itemsPerLine,
        String lineDelim, String itemDelim) {
    final ByteBuffer wrapped = ByteBuffer.wrap(data);
    return dumpHexBytes(wrapped, itemsPerLine, lineDelim, itemDelim);
}

/**
 * Render the remaining bytes of a buffer as upper-case, two-digit hex
 * values. The buffer's position is marked before reading and reset
 * afterwards. A null buffer yields an empty string.
 *
 * @param data the buffer to render, may be null
 * @param itemsPerLine the number of bytes after which {@code lineDelim}
 *        is inserted
 * @param lineDelim the delimiter placed between lines
 * @param itemDelim the delimiter appended after every byte
 *
 * @return the hex dump of the buffer's remaining bytes
 */
private static String dumpHexBytes(ByteBuffer data, int itemsPerLine,
        String lineDelim, String itemDelim) {
    final StringBuilder out = new StringBuilder();
    if (data == null) {
        return out.toString();
    }

    data.mark();
    for (int count = 0; data.remaining() > 0; count++) {
        // Start a new line before every itemsPerLine-th byte except the first.
        if (count % itemsPerLine == 0 && count != 0) {
            out.append(lineDelim);
        }
        out.append(String.format("%02X", data.get()));
        out.append(itemDelim);
    }
    data.reset();

    return out.toString();
}
}

View File

@ -65,7 +65,7 @@ public class CPUInfoTest {
"avx512_vbmi2", "avx512_vbmi", "rdtscp", "rdpid",
"hv", "fsrm", "avx512_bitalg", "gfni",
"f16c", "pku", "ospke", "cet_ibt",
"cet_ss"
"cet_ss", "avx512_ifma"
);
// @formatter:on
// Checkstyle: resume

View File

@ -0,0 +1,97 @@
/*
* Copyright (c) 2022, Intel Corporation. All rights reserved.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.javax.crypto.full;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Setup;
import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.reflect.Method;
import java.lang.reflect.Constructor;
import java.security.Key;
import java.security.spec.AlgorithmParameterSpec;
import javax.crypto.spec.SecretKeySpec;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.annotations.Measurement;
/**
 * JMH benchmark for com.sun.crypto.provider.Poly1305, driven through method
 * handles obtained reflectively. Each invocation runs engineInit,
 * engineUpdate and engineDoFinal on one pre-generated random buffer.
 */
@Measurement(iterations = 3, time = 10)
@Warmup(iterations = 3, time = 10)
@Fork(value = 1, jvmArgsAppend = {"--add-opens", "java.base/com.sun.crypto.provider=ALL-UNNAMED"})
public class Poly1305DigestBench extends CryptoBase {
    public static final int SET_SIZE = 128;

    @Param({"64", "256", "1024", "" + 16*1024, "" + 1024*1024})
    int dataSize;

    private byte[][] data;
    int index = 0;
    private static MethodHandle polyEngineInit, polyEngineUpdate, polyEngineFinal;
    private static Object polyObj;

    static {
        try {
            final MethodHandles.Lookup lookup = MethodHandles.lookup();
            final Class<?> polyClazz = Class.forName("com.sun.crypto.provider.Poly1305");

            // One Poly1305 instance, created via its no-arg constructor, is shared by all invocations.
            final Constructor<?> noArgCtor = polyClazz.getDeclaredConstructor();
            noArgCtor.setAccessible(true);
            polyObj = noArgCtor.newInstance();

            polyEngineInit = accessibleHandle(lookup, polyClazz, "engineInit",
                    Key.class, AlgorithmParameterSpec.class);
            polyEngineUpdate = accessibleHandle(lookup, polyClazz, "engineUpdate",
                    byte[].class, int.class, int.class);
            polyEngineFinal = accessibleHandle(lookup, polyClazz, "engineDoFinal");
        } catch (Throwable ex) {
            throw new RuntimeException(ex);
        }
    }

    // Looks up a declared method, makes it accessible and converts it to a method handle.
    private static MethodHandle accessibleHandle(MethodHandles.Lookup lookup, Class<?> owner,
            String methodName, Class<?>... parameterTypes) throws ReflectiveOperationException {
        final Method target = owner.getDeclaredMethod(methodName, parameterTypes);
        target.setAccessible(true);
        return lookup.unreflect(target);
    }

    /** Prepares SET_SIZE random buffers of dataSize bytes each. */
    @Setup
    public void setup() {
        setupProvider();
        data = fillRandom(new byte[SET_SIZE][dataSize]);
    }

    /**
     * Computes one Poly1305 tag over the next buffer in the set; the first
     * 32 bytes of that buffer are also used as the key.
     *
     * @return the tag returned by engineDoFinal
     */
    @Benchmark
    public byte[] digest() {
        try {
            final byte[] d = data[index];
            // Advance round-robin through the pre-generated buffers.
            index = (index + 1) % SET_SIZE;

            polyEngineInit.invoke(polyObj, new SecretKeySpec(d, 0, 32, "Poly1305"), null);
            polyEngineUpdate.invoke(polyObj, d, 0, d.length);
            return (byte[])polyEngineFinal.invoke(polyObj);
        } catch (Throwable ex) {
            throw new RuntimeException(ex);
        }
    }
}