From 9d5f388498e18e5823d09b53ed66ff0025e661fc Mon Sep 17 00:00:00 2001
From: Shravya Rukmannagari
Date: Thu, 4 Jun 2020 17:36:58 -0700
Subject: [PATCH] 8245512: CRC32 optimization using AVX512 instructions

Reviewed-by: kvn
---
 src/hotspot/cpu/x86/assembler_x86.cpp        |  35 ++
 src/hotspot/cpu/x86/assembler_x86.hpp        |   3 +
 src/hotspot/cpu/x86/macroAssembler_x86.cpp   | 376 ++++++++++++++++++-
 src/hotspot/cpu/x86/macroAssembler_x86.hpp   |  14 +-
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp |  13 +-
 src/hotspot/cpu/x86/stubRoutines_x86.cpp     |  32 ++
 src/hotspot/cpu/x86/stubRoutines_x86.hpp     |  10 +
 7 files changed, 469 insertions(+), 14 deletions(-)

diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp
index e8c5f951878..1cb1df54968 100644
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@@ -6241,6 +6241,17 @@ void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address
   emit_int8(imm8);
 }
 
+void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len) { // register-register form, vex_w set (quadword variant of vpternlogd)
+  assert(VM_Version::supports_evex(), "requires EVEX support");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src3->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x25);                           // opcode
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM, register-direct
+  emit_int8(imm8);                           // truth table selecting the three-input boolean function
+}
+
 // vinserti forms
 
 void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
@@ -6693,6 +6704,21 @@ void Assembler::vpbroadcastq(XMMRegister dst, Address src, int vector_len) {
   emit_int8(0x59);
   emit_operand(dst, src);
 }
+
+void Assembler::evbroadcasti32x4(XMMRegister dst, Address src, int vector_len) { // broadcast a 128-bit memory operand to every 128-bit lane of dst
+  assert(vector_len != Assembler::AVX_128bit, "");
+  assert(VM_Version::supports_evex(), "requires EVEX support"); // VBROADCASTI32X4 is an AVX512F instruction; it does not need AVX512DQ (unlike evbroadcasti64x2)
+  assert(dst != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T2, /* input_size_in_bits */ EVEX_64bit);
+  // swap src<->dst for encoding
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x5A);
+  emit_operand(dst, src);
+}
+
 void Assembler::evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(vector_len != Assembler::AVX_128bit, "");
   assert(VM_Version::supports_avx512dq(), "");
@@ -7587,6 +7613,15 @@ void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop
   emit_int24((unsigned char)0xC2, (0xC0 | encode), (0xF & cop));
 }
 
+void Assembler::blendvpb(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) { // byte-granular variable blend; src2 is the selector register
+  assert(VM_Version::supports_avx(), "");
+  assert(vector_len <= AVX_256bit, "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int src2_enc = src2->encoding();
+  emit_int24(0x4C, (0xC0 | encode), (0xF0 & src2_enc << 4)); // '<<' binds tighter than '&': the selector register number lands in imm8[7:4]
+}
+
 void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(vector_len <= AVX_256bit, "");
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp
index d7c548381b0..01d92f896f3 100644
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@@ -2201,6 +2201,7 @@ private:
   // Ternary logic instruction.
void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len); void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len); + void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len); // vinserti forms void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8); @@ -2245,6 +2246,7 @@ private: void vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len); void vpbroadcastq(XMMRegister dst, Address src, int vector_len); + void evbroadcasti32x4(XMMRegister dst, Address src, int vector_len); void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len); void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len); @@ -2274,6 +2276,7 @@ private: void vzeroupper(); // AVX support for vectorized conditional move (float/double). The following two instructions used only coupled. + void blendvpb(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len); void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len); void cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index b5753410fd4..522ddf71817 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -6479,16 +6479,6 @@ void MacroAssembler::update_byte_crc32(Register crc, Register val, Register tabl xorl(crc, Address(table, val, Address::times_4, 0)); } -/** -* Fold four 128-bit data chunks -*/ -void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { - evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [123:64] - evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // 
[63:0] - evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */); - evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */); -} - /** * Fold 128-bit data chunk */ @@ -6692,6 +6682,372 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi } #ifdef _LP64 +// Helper function for AVX 512 CRC32 +// Fold 512-bit data chunks +void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, + Register pos, int offset) { + evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit); + evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64] + evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0] + evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */); + evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */); +} + +// Helper function for AVX 512 CRC32 +// Compute CRC32 for < 256B buffers +void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos, + Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop, + Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) { + + Label L_less_than_32, L_exact_16_left, L_less_than_16_left; + Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left; + Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2; + + // check if there is enough buffer to be able to fold 16B at a time + cmpl(len, 32); + jcc(Assembler::less, L_less_than_32); + + // if there is, load the constants + movdqu(xmm10, Address(key, 1 * 16)); //rk1 and rk2 in xmm10 + movdl(xmm0, crc); // get the initial crc value + movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext + pxor(xmm7, xmm0); + + // update the buffer pointer + addl(pos, 16); + //update the counter.subtract 32 instead of 16 to save one instruction from the loop + subl(len, 32); + 
jmp(L_16B_reduction_loop); + + bind(L_less_than_32); + //mov initial crc to the return value. this is necessary for zero - length buffers. + movl(rax, crc); + testl(len, len); + jcc(Assembler::equal, L_cleanup); + + movdl(xmm0, crc); //get the initial crc value + + cmpl(len, 16); + jcc(Assembler::equal, L_exact_16_left); + jcc(Assembler::less, L_less_than_16_left); + + movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext + pxor(xmm7, xmm0); //xor the initial crc value + addl(pos, 16); + subl(len, 16); + movdqu(xmm10, Address(key, 1 * 16)); // rk1 and rk2 in xmm10 + jmp(L_get_last_two_xmms); + + bind(L_less_than_16_left); + //use stack space to load data less than 16 bytes, zero - out the 16B in memory first. + pxor(xmm1, xmm1); + movptr(tmp1, rsp); + movdqu(Address(tmp1, 0 * 16), xmm1); + + cmpl(len, 4); + jcc(Assembler::less, L_only_less_than_4); + + //backup the counter value + movl(tmp2, len); + cmpl(len, 8); + jcc(Assembler::less, L_less_than_8_left); + + //load 8 Bytes + movq(rax, Address(buf, pos, Address::times_1, 0 * 16)); + movq(Address(tmp1, 0 * 16), rax); + addptr(tmp1, 8); + subl(len, 8); + addl(pos, 8); + + bind(L_less_than_8_left); + cmpl(len, 4); + jcc(Assembler::less, L_less_than_4_left); + + //load 4 Bytes + movl(rax, Address(buf, pos, Address::times_1, 0)); + movl(Address(tmp1, 0 * 16), rax); + addptr(tmp1, 4); + subl(len, 4); + addl(pos, 4); + + bind(L_less_than_4_left); + cmpl(len, 2); + jcc(Assembler::less, L_less_than_2_left); + + // load 2 Bytes + movw(rax, Address(buf, pos, Address::times_1, 0)); + movl(Address(tmp1, 0 * 16), rax); + addptr(tmp1, 2); + subl(len, 2); + addl(pos, 2); + + bind(L_less_than_2_left); + cmpl(len, 1); + jcc(Assembler::less, L_zero_left); + + // load 1 Byte + movb(rax, Address(buf, pos, Address::times_1, 0)); + movb(Address(tmp1, 0 * 16), rax); + + bind(L_zero_left); + movdqu(xmm7, Address(rsp, 0)); + pxor(xmm7, xmm0); //xor the initial crc value + + lea(rax, 
ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); + movdqu(xmm0, Address(rax, tmp2)); + pshufb(xmm7, xmm0); + jmp(L_128_done); + + bind(L_exact_16_left); + movdqu(xmm7, Address(buf, pos, Address::times_1, 0)); + pxor(xmm7, xmm0); //xor the initial crc value + jmp(L_128_done); + + bind(L_only_less_than_4); + cmpl(len, 3); + jcc(Assembler::less, L_only_less_than_3); + + // load 3 Bytes + movb(rax, Address(buf, pos, Address::times_1, 0)); + movb(Address(tmp1, 0), rax); + + movb(rax, Address(buf, pos, Address::times_1, 1)); + movb(Address(tmp1, 1), rax); + + movb(rax, Address(buf, pos, Address::times_1, 2)); + movb(Address(tmp1, 2), rax); + + movdqu(xmm7, Address(rsp, 0)); + pxor(xmm7, xmm0); //xor the initial crc value + + pslldq(xmm7, 0x5); + jmp(L_barrett); + bind(L_only_less_than_3); + cmpl(len, 2); + jcc(Assembler::less, L_only_less_than_2); + + // load 2 Bytes + movb(rax, Address(buf, pos, Address::times_1, 0)); + movb(Address(tmp1, 0), rax); + + movb(rax, Address(buf, pos, Address::times_1, 1)); + movb(Address(tmp1, 1), rax); + + movdqu(xmm7, Address(rsp, 0)); + pxor(xmm7, xmm0); //xor the initial crc value + + pslldq(xmm7, 0x6); + jmp(L_barrett); + + bind(L_only_less_than_2); + //load 1 Byte + movb(rax, Address(buf, pos, Address::times_1, 0)); + movb(Address(tmp1, 0), rax); + + movdqu(xmm7, Address(rsp, 0)); + pxor(xmm7, xmm0); //xor the initial crc value + + pslldq(xmm7, 0x7); +} + +/** +* Compute CRC32 using AVX512 instructions +* param crc register containing existing CRC (32-bit) +* param buf register pointing to input byte buffer (byte*) +* param len register containing number of bytes +* param tmp1 scratch register +* param tmp2 scratch register +* return rax result register +*/ +void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) { + assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax); + + Label L_tail, L_tail_restore, L_tail_loop, L_exit, 
L_align_loop, L_aligned; + Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; + Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop; + Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop; + Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup; + + const Register pos = r12; + push(r12); + subptr(rsp, 16 * 2 + 8); + + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. + lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr())); + notl(crc); + movl(pos, 0); + + // check if smaller than 256B + cmpl(len, 256); + jcc(Assembler::less, L_less_than_256); + + // load the initial crc value + movdl(xmm10, crc); + + // receive the initial 64B data, xor the initial crc value + evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); + evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); + evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit); + evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4 + + subl(len, 256); + cmpl(len, 256); + jcc(Assembler::less, L_fold_128_B_loop); + + evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); + evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); + evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2 + subl(len, 256); + + bind(L_fold_256_B_loop); + addl(pos, 256); + fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64); + fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64); + fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64); + fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64); + + subl(len, 256); + jcc(Assembler::greaterEqual, 
L_fold_256_B_loop); + + // Fold 256 into 128 + addl(pos, 256); + evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit); + evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit); + vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC + + evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit); + evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit); + vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC + + evmovdquq(xmm0, xmm7, Assembler::AVX_512bit); + evmovdquq(xmm4, xmm8, Assembler::AVX_512bit); + + addl(len, 128); + jmp(L_fold_128_B_register); + + // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop + // loop will fold 128B at a time until we have 128 + y Bytes of buffer + + // fold 128B at a time.This section of the code folds 8 xmm registers in parallel + bind(L_fold_128_B_loop); + addl(pos, 128); + fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64); + fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64); + + subl(len, 128); + jcc(Assembler::greaterEqual, L_fold_128_B_loop); + + addl(pos, 128); + + // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 + // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + bind(L_fold_128_B_register); + evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16 + evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0 + evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit); + evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit); + // save last that has no multiplicand + vextracti64x2(xmm7, xmm4, 3); + + evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit); + evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit); + // Needed later in reduction loop + movdqu(xmm10, Address(key, 1 * 16)); + vpternlogq(xmm1, 
0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC + vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC + + // Swap 1,0,3,2 - 01 00 11 10 + evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit); + evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit); + vextracti128(xmm5, xmm8, 1); + evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit); + + // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop + // instead of a cmp instruction, we use the negative flag with the jl instruction + addl(len, 128 - 16); + jcc(Assembler::less, L_final_reduction_for_128); + + bind(L_16B_reduction_loop); + vpclmulqdq(xmm8, xmm7, xmm10, 0x1); + vpclmulqdq(xmm7, xmm7, xmm10, 0x10); + vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); + movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16)); + vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); + addl(pos, 16); + subl(len, 16); + jcc(Assembler::greaterEqual, L_16B_reduction_loop); + + bind(L_final_reduction_for_128); + addl(len, 16); + jcc(Assembler::equal, L_128_done); + + bind(L_get_last_two_xmms); + movdqu(xmm2, xmm7); + addl(pos, len); + movdqu(xmm1, Address(buf, pos, Address::times_1, -16)); + subl(pos, len); + + // get rid of the extra data that was loaded before + // load the shift constant + lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); + movdqu(xmm0, Address(rax, len)); + addl(rax, len); + + vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit); + //Change mask to 512 + vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2); + vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit); + + blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit); + vpclmulqdq(xmm8, xmm7, xmm10, 0x1); + vpclmulqdq(xmm7, xmm7, xmm10, 0x10); + vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); + vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit); + + bind(L_128_done); + // compute crc of a 128-bit value + movdqu(xmm10, 
Address(key, 3 * 16)); + movdqu(xmm0, xmm7); + + // 64b fold + vpclmulqdq(xmm7, xmm7, xmm10, 0x0); + vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit); + vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); + + // 32b fold + movdqu(xmm0, xmm7); + vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit); + vpclmulqdq(xmm7, xmm7, xmm10, 0x10); + vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); + jmp(L_barrett); + + bind(L_less_than_256); + kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup); + + //barrett reduction + bind(L_barrett); + vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2); + movdqu(xmm1, xmm7); + movdqu(xmm2, xmm7); + movdqu(xmm10, Address(key, 4 * 16)); + + pclmulqdq(xmm7, xmm10, 0x0); + pxor(xmm7, xmm2); + vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2); + movdqu(xmm2, xmm7); + pclmulqdq(xmm7, xmm10, 0x10); + pxor(xmm7, xmm2); + pxor(xmm7, xmm1); + pextrd(crc, xmm7, 2); + + bind(L_cleanup); + notl(crc); // ~c + addptr(rsp, 16 * 2 + 8); + pop(r12); +} + // S. Gueron / Information Processing Letters 112 (2012) 184 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. // Input: A 32 bit value B = [byte3, byte2, byte1, byte0]. diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 58b9e1f77bb..a939e7794d4 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -1658,6 +1658,15 @@ public: // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic. 
void update_byte_crc32(Register crc, Register val, Register table); void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp); + + +#ifdef _LP64 + void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2); + void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos, + Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop, + Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup); +#endif // _LP64 + // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic // Note on a naming convention: // Prefix w = register only used on a Westmere+ architecture @@ -1694,10 +1703,13 @@ public: // Fold 128-bit data chunk void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset); void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf); +#ifdef _LP64 + // Fold 512-bit data chunk + void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset); +#endif // _LP64 // Fold 8-bit data void fold_8bit_crc32(Register crc, Register table, Register tmp); void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp); - void fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset); // Compress char[] array to byte[]. 
   void char_array_compress(Register src, Register dst, Register len,
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 9de5886755a..b1a98598128 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -5325,13 +5325,20 @@ address generate_avx_ghash_processBlocks() {
     const Register buf = c_rarg1;  // source java byte array address
     const Register len = c_rarg2;  // length
     const Register table = c_rarg3;  // crc_table address (reuse register)
-    const Register tmp = r11;
-    assert_different_registers(crc, buf, len, table, tmp, rax);
+    const Register tmp1 = r11;
+    const Register tmp2 = r10; // second scratch register, only needed by the AVX512 kernel
+    assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
 
     BLOCK_COMMENT("Entry:");
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
-    __ kernel_crc32(crc, buf, len, table, tmp);
+    if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
+        VM_Version::supports_avx512bw() &&
+        VM_Version::supports_avx512vl()) {
+      __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2); // 'table' is used as the kernel's 'key' register
+    } else {
+      __ kernel_crc32(crc, buf, len, table, tmp1); // previous code path, unchanged apart from the tmp -> tmp1 rename
+    }
 
     __ movl(rax, crc);
     __ vzeroupper();
diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.cpp b/src/hotspot/cpu/x86/stubRoutines_x86.cpp
index 4749a276d5f..5d93d118e7b 100644
--- a/src/hotspot/cpu/x86/stubRoutines_x86.cpp
+++ b/src/hotspot/cpu/x86/stubRoutines_x86.cpp
@@ -184,6 +184,38 @@ juint StubRoutines::x86::_crc_table[] =
     0x2d02ef8dUL
 };
 
+#ifdef _LP64
+juint StubRoutines::x86::_crc_table_avx512[] = // folding constants; each row below is 16 bytes, addressed as key + row * 16 by kernel_crc32_avx512
+{
+  0xe95c1271UL, 0x00000000UL, 0xce3371cbUL, 0x00000000UL, // key + 0 * 16: rk-1, rk-2 (256B fold loop)
+  0xccaa009eUL, 0x00000000UL, 0x751997d0UL, 0x00000001UL, // key + 1 * 16: rk1, rk2 (16B reduction loop)
+  0x4a7fe880UL, 0x00000001UL, 0xe88ef372UL, 0x00000001UL, // key + 2 * 16: rk3, rk4 (128B fold loop)
+  0xccaa009eUL, 0x00000000UL, 0x63cd6124UL, 0x00000001UL, // key + 3 * 16: loaded at L_128_done for the 64b/32b folds
+  0xf7011640UL, 0x00000001UL, 0xdb710640UL, 0x00000001UL, // key + 4 * 16: loaded at L_barrett for the Barrett reduction
+  0xd7cfc6acUL, 0x00000001UL, 0xea89367eUL, 0x00000001UL, // key + 5 * 16 .. key + 8 * 16: rk9-rk16, loaded as one 64-byte vector
+  0x8cb44e58UL, 0x00000001UL, 0xdf068dc2UL, 0x00000000UL,
+  0xae0b5394UL, 0x00000000UL, 0xc7569e54UL, 0x00000001UL,
+  0xc6e41596UL, 0x00000001UL, 0x54442bd4UL, 0x00000001UL,
+  0x74359406UL, 0x00000001UL, 0x3db1ecdcUL, 0x00000000UL, // key + 9 * 16 .. key + 12 * 16: rk17-rk20, rk1, rk2, 0, 0, loaded as one 64-byte vector
+  0x5a546366UL, 0x00000001UL, 0xf1da05aaUL, 0x00000000UL,
+  0xccaa009eUL, 0x00000000UL, 0x751997d0UL, 0x00000001UL,
+  0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL
+};
+
+juint StubRoutines::x86::_crc_by128_masks_avx512[] = // three 16-byte masks
+{
+  0xffffffffUL, 0xffffffffUL, 0x00000000UL, 0x00000000UL, // + 0 * 16: second vpand in the Barrett reduction
+  0x00000000UL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, // + 1 * 16: first vpand in the Barrett reduction
+  0x80808080UL, 0x80808080UL, 0x80808080UL, 0x80808080UL  // + 2 * 16: xor-ed into the shuffle mask at L_get_last_two_xmms
+};
+
+juint StubRoutines::x86::_shuf_table_crc32_avx512[] = // 32 bytes; the kernel loads 16 bytes starting at a byte offset equal to the remaining length
+{
+  0x83828100UL, 0x87868584UL, 0x8b8a8988UL, 0x8f8e8d8cUL,
+  0x03020100UL, 0x07060504UL, 0x0b0a0908UL, 0x000e0d0cUL
+};
+#endif // _LP64
+
 #define D 32
 #define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)
 
diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp
index f68656d8aa1..a23ee3666a6 100644
--- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp
+++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp
@@ -120,6 +120,11 @@ class x86 {
   // masks and table for CRC32
   static uint64_t _crc_by128_masks[];
   static juint    _crc_table[];
+#ifdef _LP64
+  static juint    _crc_by128_masks_avx512[];  // masks used by kernel_crc32_avx512
+  static juint    _crc_table_avx512[];        // folding constants used by kernel_crc32_avx512
+  static juint    _shuf_table_crc32_avx512[]; // byte-shuffle masks for partial 16-byte blocks
+#endif // _LP64
   // table for CRC32C
   static juint* _crc32c_table;
   // swap mask for ghash
@@ -210,6 +215,11 @@ class x86 {
   static address key_shuffle_mask_addr()     { return _key_shuffle_mask_addr; }
   static address counter_shuffle_mask_addr() { return _counter_shuffle_mask_addr; }
   static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
+#ifdef _LP64
+  static address crc_by128_masks_avx512_addr()  { return (address)_crc_by128_masks_avx512; }
+  static address shuf_table_crc32_avx512_addr()  { return (address)_shuf_table_crc32_avx512; }
+  static address crc_table_avx512_addr()  { return (address)_crc_table_avx512; }
+#endif // _LP64
   static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
   static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
   static address ghash_shufflemask_addr() { return _ghash_shuffmask_addr; }