diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index b44e8ee15c8..05a36a8195d 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -7030,7 +7030,7 @@ void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, X // Helper function for AVX 512 CRC32 // Compute CRC32 for < 256B buffers -void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos, +void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos, Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop, Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) { @@ -7043,7 +7043,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist jcc(Assembler::less, L_less_than_32); // if there is, load the constants - movdqu(xmm10, Address(key, 1 * 16)); //rk1 and rk2 in xmm10 + movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10 movdl(xmm0, crc); // get the initial crc value movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext pxor(xmm7, xmm0); @@ -7070,7 +7070,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist pxor(xmm7, xmm0); //xor the initial crc value addl(pos, 16); subl(len, 16); - movdqu(xmm10, Address(key, 1 * 16)); // rk1 and rk2 in xmm10 + movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10 jmp(L_get_last_two_xmms); bind(L_less_than_16_left); @@ -7190,12 +7190,17 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist * param crc register containing existing CRC (32-bit) * param buf register pointing to input byte buffer (byte*) * param len register containing number of bytes +* param table address of crc or crc32c table * param tmp1 scratch register * param tmp2 scratch register * return rax result 
register +* +* This routine is identical for crc32c with the exception of the precomputed constant +* table which will be passed as the table argument. The calculation steps are +* the same for both variants. */ -void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) { - assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax); +void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) { + assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12); Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; @@ -7210,8 +7215,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge // context for the registers used, where all instructions below are using 128-bit mode // On EVEX without VL and BW, these instructions will all be AVX. 
- lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr())); - notl(crc); movl(pos, 0); // check if smaller than 256B @@ -7225,7 +7228,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit); - evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4 + evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4 subl(len, 256); cmpl(len, 256); @@ -7233,7 +7236,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); - evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2 + evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2 subl(len, 256); bind(L_fold_256_B_loop); @@ -7279,8 +7282,8 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 bind(L_fold_128_B_register); - evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16 - evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0 + evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16 + evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit); 
evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit); // save last that has no multiplicand @@ -7289,7 +7292,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit); evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit); // Needed later in reduction loop - movdqu(xmm10, Address(key, 1 * 16)); + movdqu(xmm10, Address(table, 1 * 16)); vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC @@ -7305,7 +7308,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le jcc(Assembler::less, L_final_reduction_for_128); bind(L_16B_reduction_loop); - vpclmulqdq(xmm8, xmm7, xmm10, 0x1); + vpclmulqdq(xmm8, xmm7, xmm10, 0x01); vpclmulqdq(xmm7, xmm7, xmm10, 0x10); vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16)); @@ -7336,14 +7339,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit); blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit); - vpclmulqdq(xmm8, xmm7, xmm10, 0x1); + vpclmulqdq(xmm8, xmm7, xmm10, 0x01); vpclmulqdq(xmm7, xmm7, xmm10, 0x10); vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit); bind(L_128_done); // compute crc of a 128-bit value - movdqu(xmm10, Address(key, 3 * 16)); + movdqu(xmm10, Address(table, 3 * 16)); movdqu(xmm0, xmm7); // 64b fold @@ -7359,14 +7362,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le jmp(L_barrett); bind(L_less_than_256); - kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup); + kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup); 
//barrett reduction bind(L_barrett); vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2); movdqu(xmm1, xmm7); movdqu(xmm2, xmm7); - movdqu(xmm10, Address(key, 4 * 16)); + movdqu(xmm10, Address(table, 4 * 16)); pclmulqdq(xmm7, xmm10, 0x0); pxor(xmm7, xmm2); @@ -7378,7 +7381,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le pextrd(crc, xmm7, 2); bind(L_cleanup); - notl(crc); // ~c addptr(rsp, 16 * 2 + 8); pop(r12); } diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index f5ef24ddf4c..9e08504b117 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -6528,7 +6528,13 @@ address generate_avx_ghash_processBlocks() { if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) { + // The constants used in the CRC32 algorithm require the 1's complement of the initial crc value. + // However, the constant table for CRC32-C assumes the original crc value. Account for this + // difference before calling and after returning. 
+ __ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr())); + __ notl(crc); __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2); + __ notl(crc); } else { __ kernel_crc32(crc, buf, len, table, tmp1); } @@ -6580,20 +6586,27 @@ address generate_avx_ghash_processBlocks() { BLOCK_COMMENT("Entry:"); __ enter(); // required for proper stackwalking of RuntimeStub frame + if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() && + VM_Version::supports_avx512bw() && + VM_Version::supports_avx512vl()) { + __ lea(j, ExternalAddress(StubRoutines::x86::crc32c_table_avx512_addr())); + __ kernel_crc32_avx512(crc, buf, len, j, l, k); + } else { #ifdef _WIN64 - __ push(y); - __ push(z); + __ push(y); + __ push(z); #endif - __ crc32c_ipl_alg2_alt2(crc, buf, len, - a, j, k, - l, y, z, - c_farg0, c_farg1, c_farg2, - is_pclmulqdq_supported); + __ crc32c_ipl_alg2_alt2(crc, buf, len, + a, j, k, + l, y, z, + c_farg0, c_farg1, c_farg2, + is_pclmulqdq_supported); +#ifdef _WIN64 + __ pop(z); + __ pop(y); +#endif + } __ movl(rax, crc); -#ifdef _WIN64 - __ pop(z); - __ pop(y); -#endif __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.cpp b/src/hotspot/cpu/x86/stubRoutines_x86.cpp index ec5a5d0f143..81362c76bd6 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.cpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.cpp @@ -221,6 +221,23 @@ juint StubRoutines::x86::_crc_table_avx512[] = 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL }; +juint StubRoutines::x86::_crc32c_table_avx512[] = +{ + 0xb9e02b86UL, 0x00000000UL, 0xdcb17aa4UL, 0x00000000UL, + 0x493c7d27UL, 0x00000000UL, 0xc1068c50UL, 0x0000000eUL, + 0x06e38d70UL, 0x00000002UL, 0x6992cea2UL, 0x00000000UL, + 0x493c7d27UL, 0x00000000UL, 0xdd45aab8UL, 0x00000000UL, + 0xdea713f0UL, 0x00000000UL, 0x05ec76f0UL, 0x00000001UL, + 0x47db8317UL, 0x00000000UL, 0x2ad91c30UL, 0x00000000UL, + 0x0715ce53UL, 
0x00000000UL, 0xc49f4f67UL, 0x00000000UL, + 0x39d3b296UL, 0x00000000UL, 0x083a6eecUL, 0x00000000UL, + 0x9e4addf8UL, 0x00000000UL, 0x740eef02UL, 0x00000000UL, + 0xddc0152bUL, 0x00000000UL, 0x1c291d04UL, 0x00000000UL, + 0xba4fc28eUL, 0x00000000UL, 0x3da6d0cbUL, 0x00000000UL, + 0x493c7d27UL, 0x00000000UL, 0xc1068c50UL, 0x0000000eUL, + 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL +}; + juint StubRoutines::x86::_crc_by128_masks_avx512[] = { 0xffffffffUL, 0xffffffffUL, 0x00000000UL, 0x00000000UL, diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp index 1ef8377dfc2..e4dd9550ce2 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp @@ -137,6 +137,7 @@ class x86 { #ifdef _LP64 static juint _crc_by128_masks_avx512[]; static juint _crc_table_avx512[]; + static juint _crc32c_table_avx512[]; static juint _shuf_table_crc32_avx512[]; static juint _adler32_shuf0_table[]; static juint _adler32_shuf1_table[]; @@ -256,6 +257,7 @@ class x86 { static address crc_by128_masks_avx512_addr() { return (address)_crc_by128_masks_avx512; } static address shuf_table_crc32_avx512_addr() { return (address)_shuf_table_crc32_avx512; } static address crc_table_avx512_addr() { return (address)_crc_table_avx512; } + static address crc32c_table_avx512_addr() { return (address)_crc32c_table_avx512; } static address ghash_polynomial512_addr() { return _ghash_poly512_addr; } #endif // _LP64 static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; } diff --git a/test/micro/org/openjdk/bench/java/util/TestCRC32C.java b/test/micro/org/openjdk/bench/java/util/TestCRC32C.java new file mode 100644 index 00000000000..0c3b39fc59a --- /dev/null +++ b/test/micro/org/openjdk/bench/java/util/TestCRC32C.java @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.util; + +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.zip.CRC32C; +import org.openjdk.jmh.annotations.*; + +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +@Fork(value = 2) + +public class TestCRC32C { + + private CRC32C crc32c; + private Random random; + private byte[] bytes; + + @Param({"64", "128", "256", "512", "1024", "2048", "4096", "8192", "16384", "32768", "65536"}) + private int count; + + public TestCRC32C() { + crc32c = new CRC32C(); + random = new Random(2147483648L); + bytes = new byte[1000000]; + random.nextBytes(bytes); + } + + @Setup(Level.Iteration) + public void setupBytes() { + crc32c.reset(); + } + + @Benchmark + public void testCRC32CUpdate() { + crc32c.update(bytes, 0, count); + } +}