8245512: CRC32 optimization using AVX512 instructions

Reviewed-by: kvn
Shravya Rukmannagari 2020-06-04 17:36:58 -07:00
parent 9a7f519aee
commit 9d5f388498
7 changed files with 469 additions and 14 deletions
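For context, the value every path of this change must reproduce is the standard reflected CRC-32 behind java.util.zip.CRC32::updateBytes(). A minimal bitwise C++ reference (not part of the patch; the function name here is made up) that any vectorized kernel has to agree with:

#include <cstdint>
#include <cstddef>

// Reference CRC-32: reflected polynomial 0xEDB88320, with the value
// inverted on entry and on exit, matching java.util.zip.CRC32 / zlib.
static uint32_t crc32_reference(uint32_t crc, const uint8_t* buf, size_t len) {
  crc = ~crc;
  for (size_t i = 0; i < len; i++) {
    crc ^= buf[i];
    for (int bit = 0; bit < 8; bit++) {
      crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    }
  }
  return ~crc;
}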

View File

@@ -6241,6 +6241,17 @@ void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address
emit_int8(imm8);
}
void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len) {
assert(VM_Version::supports_evex(), "requires EVEX support");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src3->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x25);
emit_int8((unsigned char)(0xC0 | encode));
emit_int8(imm8);
}
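VPTERNLOG interprets imm8 as an 8-entry truth table indexed by the three source bits at each bit position, which is why the CRC kernel below can use imm8 = 0x96 as a single-instruction three-way XOR. A per-bit C++ model (illustrative only; ternlog64 is not a HotSpot name):

#include <cstdint>

// Bits (a, b, c) at each position form a 3-bit index into imm8.
// For imm8 = 0x96 (0b10010110) the selected bit is 1 exactly when an odd
// number of the three inputs are 1, so the result equals a ^ b ^ c.
static uint64_t ternlog64(uint64_t a, uint64_t b, uint64_t c, uint8_t imm8) {
  uint64_t r = 0;
  for (int i = 0; i < 64; i++) {
    int idx = (int)((((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1));
    r |= (uint64_t)((imm8 >> idx) & 1) << i;
  }
  return r;
}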
// vinserti forms
void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
@@ -6693,6 +6704,21 @@ void Assembler::vpbroadcastq(XMMRegister dst, Address src, int vector_len) {
emit_int8(0x59);
emit_operand(dst, src);
}
void Assembler::evbroadcasti32x4(XMMRegister dst, Address src, int vector_len) {
assert(vector_len != Assembler::AVX_128bit, "");
assert(VM_Version::supports_avx512dq(), "");
assert(dst != xnoreg, "sanity");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_rex_vex_w_reverted();
attributes.set_address_attributes(/* tuple_type */ EVEX_T2, /* input_size_in_bits */ EVEX_64bit);
// swap src<->dst for encoding
vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x5A);
emit_operand(dst, src);
}
void Assembler::evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len) {
assert(vector_len != Assembler::AVX_128bit, "");
assert(VM_Version::supports_avx512dq(), "");
@@ -7587,6 +7613,15 @@ void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop
emit_int24((unsigned char)0xC2, (0xC0 | encode), (0xF & cop));
}
void Assembler::blendvpb(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
assert(VM_Version::supports_avx(), "");
assert(vector_len <= AVX_256bit, "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
int src2_enc = src2->encoding();
emit_int24(0x4C, (0xC0 | encode), (0xF0 & src2_enc << 4));
}
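blendvpb emits a VEX-encoded variable byte blend whose fourth register operand is carried in the upper nibble of the immediate; semantically, each result byte is chosen by the sign bit of the corresponding mask byte. A rough scalar sketch of that blend rule (assumed VPBLENDVB-style semantics; which data operand maps to a or b depends on the exact operand order, so treat this as illustrative only):

#include <cstdint>

// Where the mask byte has its most significant bit set, take the byte from b,
// otherwise from a. The CRC tail handling below combines this with the
// 0x80808080 row of the AVX512 mask table to splice the last partial block
// onto the folded state.
static void blendv_bytes(uint8_t* dst, const uint8_t* a, const uint8_t* b,
                         const uint8_t* mask, int n) {
  for (int i = 0; i < n; i++) {
    dst[i] = (mask[i] & 0x80) ? b[i] : a[i];
  }
}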
void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
assert(VM_Version::supports_avx(), "");
assert(vector_len <= AVX_256bit, "");

View File

@@ -2201,6 +2201,7 @@ private:
// Ternary logic instruction.
void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len);
void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
// vinserti forms
void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
@@ -2245,6 +2246,7 @@ private:
void vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
void vpbroadcastq(XMMRegister dst, Address src, int vector_len);
void evbroadcasti32x4(XMMRegister dst, Address src, int vector_len);
void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
@@ -2274,6 +2276,7 @@ private:
void vzeroupper();
// AVX support for vectorized conditional move (float/double). The following two instructions are always used together.
void blendvpb(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
void cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);

View File

@@ -6479,16 +6479,6 @@ void MacroAssembler::update_byte_crc32(Register crc, Register val, Register tabl
xorl(crc, Address(table, val, Address::times_4, 0));
}
/**
* Fold four 128-bit data chunks
*/
void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [123:64]
evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
}
/**
* Fold 128-bit data chunk
*/
@@ -6692,6 +6682,372 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
}
#ifdef _LP64
// Helper function for AVX 512 CRC32
// Fold 512-bit data chunks
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
Register pos, int offset) {
evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
}
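The fold step is carry-less (GF(2)) arithmetic: each 64-bit half of a lane is multiplied by a precomputed constant congruent to a power of x modulo the CRC polynomial, and the two products are XORed with the next 64 bytes of input, which keeps the running value congruent to the message polynomial. A scalar model of the 64x64 carry-less multiply that one PCLMULQDQ lane performs (illustrative only; clmul64 is not a HotSpot helper):

#include <cstdint>
#include <utility>

// 64x64 -> 128-bit carry-less multiply: polynomial multiplication over GF(2),
// i.e. an integer multiply with all carries suppressed. Folding replaces a
// 128-bit lane X by clmul(X.lo, k1) ^ clmul(X.hi, k2) ^ next_data, with
// k1, k2 taken from the rk constant table.
static std::pair<uint64_t, uint64_t> clmul64(uint64_t a, uint64_t b) {
  uint64_t lo = 0, hi = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {
      lo ^= a << i;
      if (i != 0) hi ^= a >> (64 - i);
    }
  }
  return {lo, hi};
}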
// Helper function for AVX 512 CRC32
// Compute CRC32 for < 256B buffers
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
// check if there is enough buffer to be able to fold 16B at a time
cmpl(len, 32);
jcc(Assembler::less, L_less_than_32);
// if there is, load the constants
movdqu(xmm10, Address(key, 1 * 16)); //rk1 and rk2 in xmm10
movdl(xmm0, crc); // get the initial crc value
movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
pxor(xmm7, xmm0);
// update the buffer pointer
addl(pos, 16);
// update the counter; subtract 32 instead of 16 to save one instruction from the loop
subl(len, 32);
jmp(L_16B_reduction_loop);
bind(L_less_than_32);
// mov initial crc to the return value. this is necessary for zero-length buffers.
movl(rax, crc);
testl(len, len);
jcc(Assembler::equal, L_cleanup);
movdl(xmm0, crc); //get the initial crc value
cmpl(len, 16);
jcc(Assembler::equal, L_exact_16_left);
jcc(Assembler::less, L_less_than_16_left);
movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
pxor(xmm7, xmm0); //xor the initial crc value
addl(pos, 16);
subl(len, 16);
movdqu(xmm10, Address(key, 1 * 16)); // rk1 and rk2 in xmm10
jmp(L_get_last_two_xmms);
bind(L_less_than_16_left);
// use stack space to load data less than 16 bytes; zero out the 16B in memory first.
pxor(xmm1, xmm1);
movptr(tmp1, rsp);
movdqu(Address(tmp1, 0 * 16), xmm1);
cmpl(len, 4);
jcc(Assembler::less, L_only_less_than_4);
//backup the counter value
movl(tmp2, len);
cmpl(len, 8);
jcc(Assembler::less, L_less_than_8_left);
//load 8 Bytes
movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
movq(Address(tmp1, 0 * 16), rax);
addptr(tmp1, 8);
subl(len, 8);
addl(pos, 8);
bind(L_less_than_8_left);
cmpl(len, 4);
jcc(Assembler::less, L_less_than_4_left);
//load 4 Bytes
movl(rax, Address(buf, pos, Address::times_1, 0));
movl(Address(tmp1, 0 * 16), rax);
addptr(tmp1, 4);
subl(len, 4);
addl(pos, 4);
bind(L_less_than_4_left);
cmpl(len, 2);
jcc(Assembler::less, L_less_than_2_left);
// load 2 Bytes
movw(rax, Address(buf, pos, Address::times_1, 0));
movl(Address(tmp1, 0 * 16), rax);
addptr(tmp1, 2);
subl(len, 2);
addl(pos, 2);
bind(L_less_than_2_left);
cmpl(len, 1);
jcc(Assembler::less, L_zero_left);
// load 1 Byte
movb(rax, Address(buf, pos, Address::times_1, 0));
movb(Address(tmp1, 0 * 16), rax);
bind(L_zero_left);
movdqu(xmm7, Address(rsp, 0));
pxor(xmm7, xmm0); //xor the initial crc value
lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
movdqu(xmm0, Address(rax, tmp2));
pshufb(xmm7, xmm0);
jmp(L_128_done);
bind(L_exact_16_left);
movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
pxor(xmm7, xmm0); //xor the initial crc value
jmp(L_128_done);
bind(L_only_less_than_4);
cmpl(len, 3);
jcc(Assembler::less, L_only_less_than_3);
// load 3 Bytes
movb(rax, Address(buf, pos, Address::times_1, 0));
movb(Address(tmp1, 0), rax);
movb(rax, Address(buf, pos, Address::times_1, 1));
movb(Address(tmp1, 1), rax);
movb(rax, Address(buf, pos, Address::times_1, 2));
movb(Address(tmp1, 2), rax);
movdqu(xmm7, Address(rsp, 0));
pxor(xmm7, xmm0); //xor the initial crc value
pslldq(xmm7, 0x5);
jmp(L_barrett);
bind(L_only_less_than_3);
cmpl(len, 2);
jcc(Assembler::less, L_only_less_than_2);
// load 2 Bytes
movb(rax, Address(buf, pos, Address::times_1, 0));
movb(Address(tmp1, 0), rax);
movb(rax, Address(buf, pos, Address::times_1, 1));
movb(Address(tmp1, 1), rax);
movdqu(xmm7, Address(rsp, 0));
pxor(xmm7, xmm0); //xor the initial crc value
pslldq(xmm7, 0x6);
jmp(L_barrett);
bind(L_only_less_than_2);
//load 1 Byte
movb(rax, Address(buf, pos, Address::times_1, 0));
movb(Address(tmp1, 0), rax);
movdqu(xmm7, Address(rsp, 0));
pxor(xmm7, xmm0); //xor the initial crc value
pslldq(xmm7, 0x7);
}
/**
* Compute CRC32 using AVX512 instructions
* @param crc  register containing existing CRC (32-bit)
* @param buf  register pointing to input byte buffer (byte*)
* @param len  register containing number of bytes
* @param key  scratch register; loaded with the address of the CRC constant table
* @param tmp1 scratch register
* @param tmp2 scratch register
* @return rax result register
*/
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) {
assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax);
Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
const Register pos = r12;
push(r12);
subptr(rsp, 16 * 2 + 8);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
notl(crc);
movl(pos, 0);
// check if smaller than 256B
cmpl(len, 256);
jcc(Assembler::less, L_less_than_256);
// load the initial crc value
movdl(xmm10, crc);
// receive the initial 64B data, xor the initial crc value
evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
subl(len, 256);
cmpl(len, 256);
jcc(Assembler::less, L_fold_128_B_loop);
evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
subl(len, 256);
bind(L_fold_256_B_loop);
addl(pos, 256);
fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
subl(len, 256);
jcc(Assembler::greaterEqual, L_fold_256_B_loop);
// Fold 256 into 128
addl(pos, 256);
evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
addl(len, 128);
jmp(L_fold_128_B_register);
// at this section of the code, there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
// will fold 128B at a time until 128 + y bytes of buffer remain,
// folding two zmm registers (xmm0 and xmm4) in parallel.
bind(L_fold_128_B_loop);
addl(pos, 128);
fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
subl(len, 128);
jcc(Assembler::greaterEqual, L_fold_128_B_loop);
addl(pos, 128);
// at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
// the 128B of folded data is in two zmm registers: xmm0 and xmm4
bind(L_fold_128_B_register);
evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
// save the last 128-bit lane, which has no multiplicand
vextracti64x2(xmm7, xmm4, 3);
evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
// Needed later in reduction loop
movdqu(xmm10, Address(key, 1 * 16));
vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
// Swap 1,0,3,2 - 01 00 11 10
evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
vextracti128(xmm5, xmm8, 1);
evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
// instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
// instead of a cmp instruction, we use the negative flag with the jl instruction
addl(len, 128 - 16);
jcc(Assembler::less, L_final_reduction_for_128);
bind(L_16B_reduction_loop);
vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
addl(pos, 16);
subl(len, 16);
jcc(Assembler::greaterEqual, L_16B_reduction_loop);
bind(L_final_reduction_for_128);
addl(len, 16);
jcc(Assembler::equal, L_128_done);
bind(L_get_last_two_xmms);
movdqu(xmm2, xmm7);
addl(pos, len);
movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
subl(pos, len);
// get rid of the extra data that was loaded before
// load the shift constant
lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
movdqu(xmm0, Address(rax, len));
addl(rax, len);
vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
//Change mask to 512
vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
bind(L_128_done);
// compute crc of a 128-bit value
movdqu(xmm10, Address(key, 3 * 16));
movdqu(xmm0, xmm7);
// 64b fold
vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
// 32b fold
movdqu(xmm0, xmm7);
vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
jmp(L_barrett);
bind(L_less_than_256);
kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
//barrett reduction
bind(L_barrett);
vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
movdqu(xmm1, xmm7);
movdqu(xmm2, xmm7);
movdqu(xmm10, Address(key, 4 * 16));
pclmulqdq(xmm7, xmm10, 0x0);
pxor(xmm7, xmm2);
vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
movdqu(xmm2, xmm7);
pclmulqdq(xmm7, xmm10, 0x10);
pxor(xmm7, xmm2);
pxor(xmm7, xmm1);
pextrd(crc, xmm7, 2);
bind(L_cleanup);
notl(crc); // ~c
addptr(rsp, 16 * 2 + 8);
pop(r12);
}
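The closing L_barrett block turns the remaining folded value into the final 32-bit CRC without a division, via Barrett reduction using the constant pair loaded from key + 4 * 16 (a quotient constant mu and a form of the polynomial P). In the non-reflected formulation from the Intel carry-less-multiplication CRC white paper (the stub implements the bit-reflected variant of the same identity), with \( \mu(x) = \lfloor x^{64} / P(x) \rfloor \):

\[ T_1(x) = \left\lfloor \frac{R(x)}{x^{32}} \right\rfloor \cdot \mu(x), \qquad T_2(x) = \left\lfloor \frac{T_1(x)}{x^{32}} \right\rfloor \cdot P(x), \qquad \mathrm{CRC} = \bigl(R(x) \oplus T_2(x)\bigr) \bmod x^{32}. \]

Both multiplications are carry-less, which is why two pclmulqdq steps plus the masks and a final pextrd are enough.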
// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].

View File

@@ -1658,6 +1658,15 @@ public:
// CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
void update_byte_crc32(Register crc, Register val, Register table);
void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
#ifdef _LP64
void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2);
void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
#endif // _LP64
// CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
// Note on a naming convention:
// Prefix w = register only used on a Westmere+ architecture
@@ -1694,10 +1703,13 @@ public:
// Fold 128-bit data chunk
void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
#ifdef _LP64
// Fold 512-bit data chunk
void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset);
#endif // _LP64
// Fold 8-bit data
void fold_8bit_crc32(Register crc, Register table, Register tmp);
void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
void fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
// Compress char[] array to byte[].
void char_array_compress(Register src, Register dst, Register len,

View File

@@ -5325,13 +5325,20 @@ address generate_avx_ghash_processBlocks() {
const Register buf = c_rarg1; // source java byte array address
const Register len = c_rarg2; // length
const Register table = c_rarg3; // crc_table address (reuse register)
const Register tmp = r11;
assert_different_registers(crc, buf, len, table, tmp, rax);
const Register tmp1 = r11;
const Register tmp2 = r10;
assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
__ kernel_crc32(crc, buf, len, table, tmp);
if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
VM_Version::supports_avx512bw() &&
VM_Version::supports_avx512vl()) {
__ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
} else {
__ kernel_crc32(crc, buf, len, table, tmp1);
}
__ movl(rax, crc);
__ vzeroupper();

View File

@@ -184,6 +184,38 @@ juint StubRoutines::x86::_crc_table[] =
0x2d02ef8dUL
};
#ifdef _LP64
juint StubRoutines::x86::_crc_table_avx512[] =
{
0xe95c1271UL, 0x00000000UL, 0xce3371cbUL, 0x00000000UL,
0xccaa009eUL, 0x00000000UL, 0x751997d0UL, 0x00000001UL,
0x4a7fe880UL, 0x00000001UL, 0xe88ef372UL, 0x00000001UL,
0xccaa009eUL, 0x00000000UL, 0x63cd6124UL, 0x00000001UL,
0xf7011640UL, 0x00000001UL, 0xdb710640UL, 0x00000001UL,
0xd7cfc6acUL, 0x00000001UL, 0xea89367eUL, 0x00000001UL,
0x8cb44e58UL, 0x00000001UL, 0xdf068dc2UL, 0x00000000UL,
0xae0b5394UL, 0x00000000UL, 0xc7569e54UL, 0x00000001UL,
0xc6e41596UL, 0x00000001UL, 0x54442bd4UL, 0x00000001UL,
0x74359406UL, 0x00000001UL, 0x3db1ecdcUL, 0x00000000UL,
0x5a546366UL, 0x00000001UL, 0xf1da05aaUL, 0x00000000UL,
0xccaa009eUL, 0x00000000UL, 0x751997d0UL, 0x00000001UL,
0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL
};
juint StubRoutines::x86::_crc_by128_masks_avx512[] =
{
0xffffffffUL, 0xffffffffUL, 0x00000000UL, 0x00000000UL,
0x00000000UL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL,
0x80808080UL, 0x80808080UL, 0x80808080UL, 0x80808080UL
};
juint StubRoutines::x86::_shuf_table_crc32_avx512[] =
{
0x83828100UL, 0x87868584UL, 0x8b8a8988UL, 0x8f8e8d8cUL,
0x03020100UL, 0x07060504UL, 0x0b0a0908UL, 0x000e0d0cUL
};
#endif // _LP64
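The shuffle table relies on PSHUFB's selector rule: a control byte with its high bit set zeroes the destination byte, while low values pick a source byte by index. Loading 16 control bytes from this table at an offset determined by the tail length therefore shifts the valid bytes into place and zeroes the rest in one instruction. A scalar model of that rule (illustrative only; pshufb16 is not a HotSpot name):

#include <cstdint>

// PSHUFB semantics per byte: control values 0x80 and above produce 0,
// otherwise the low 4 bits select a byte from the 16-byte source.
static void pshufb16(uint8_t* dst, const uint8_t* src, const uint8_t* ctrl) {
  for (int i = 0; i < 16; i++) {
    dst[i] = (ctrl[i] & 0x80) ? 0 : src[ctrl[i] & 0x0F];
  }
}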
#define D 32
#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)

View File

@@ -120,6 +120,11 @@ class x86 {
// masks and table for CRC32
static uint64_t _crc_by128_masks[];
static juint _crc_table[];
#ifdef _LP64
static juint _crc_by128_masks_avx512[];
static juint _crc_table_avx512[];
static juint _shuf_table_crc32_avx512[];
#endif // _LP64
// table for CRC32C
static juint* _crc32c_table;
// swap mask for ghash
@@ -210,6 +215,11 @@ class x86 {
static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
static address counter_shuffle_mask_addr() { return _counter_shuffle_mask_addr; }
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
#ifdef _LP64
static address crc_by128_masks_avx512_addr() { return (address)_crc_by128_masks_avx512; }
static address shuf_table_crc32_avx512_addr() { return (address)_shuf_table_crc32_avx512; }
static address crc_table_avx512_addr() { return (address)_crc_table_avx512; }
#endif // _LP64
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
static address ghash_shufflemask_addr() { return _ghash_shuffmask_addr; }