8277358: Accelerate CRC32-C
Co-authored-by: Greg Tucker <greg.b.tucker@intel.com> Co-authored-by: Scott Gibbons <sgibbons@openjdk.org> Reviewed-by: kvn, sviswanathan, ecaspole
This commit is contained in:
parent
73a9654c26
commit
e0f1fc783c
@ -7030,7 +7030,7 @@ void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, X
|
||||
|
||||
// Helper function for AVX 512 CRC32
|
||||
// Compute CRC32 for < 256B buffers
|
||||
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
|
||||
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
|
||||
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
|
||||
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
|
||||
|
||||
@ -7043,7 +7043,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
|
||||
jcc(Assembler::less, L_less_than_32);
|
||||
|
||||
// if there is, load the constants
|
||||
movdqu(xmm10, Address(key, 1 * 16)); //rk1 and rk2 in xmm10
|
||||
movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
|
||||
movdl(xmm0, crc); // get the initial crc value
|
||||
movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
|
||||
pxor(xmm7, xmm0);
|
||||
@ -7070,7 +7070,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
|
||||
pxor(xmm7, xmm0); //xor the initial crc value
|
||||
addl(pos, 16);
|
||||
subl(len, 16);
|
||||
movdqu(xmm10, Address(key, 1 * 16)); // rk1 and rk2 in xmm10
|
||||
movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
|
||||
jmp(L_get_last_two_xmms);
|
||||
|
||||
bind(L_less_than_16_left);
|
||||
@ -7190,12 +7190,17 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
|
||||
* param crc register containing existing CRC (32-bit)
|
||||
* param buf register pointing to input byte buffer (byte*)
|
||||
* param len register containing number of bytes
|
||||
* param table address of crc or crc32c table
|
||||
* param tmp1 scratch register
|
||||
* param tmp2 scratch register
|
||||
* return rax result register
|
||||
*
|
||||
* This routine is identical for crc32c with the exception of the precomputed constant
|
||||
* table which will be passed as the table argument. The calculation steps are
|
||||
* the same for both variants.
|
||||
*/
|
||||
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) {
|
||||
assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax);
|
||||
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
|
||||
assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
|
||||
|
||||
Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
|
||||
Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
|
||||
@ -7210,8 +7215,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
|
||||
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
|
||||
// context for the registers used, where all instructions below are using 128-bit mode
|
||||
// On EVEX without VL and BW, these instructions will all be AVX.
|
||||
lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
|
||||
notl(crc);
|
||||
movl(pos, 0);
|
||||
|
||||
// check if smaller than 256B
|
||||
@ -7225,7 +7228,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
|
||||
evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
|
||||
evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
|
||||
evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
|
||||
evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
|
||||
evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
|
||||
|
||||
subl(len, 256);
|
||||
cmpl(len, 256);
|
||||
@ -7233,7 +7236,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
|
||||
|
||||
evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
|
||||
evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
|
||||
evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
|
||||
evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
|
||||
subl(len, 256);
|
||||
|
||||
bind(L_fold_256_B_loop);
|
||||
@ -7279,8 +7282,8 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
|
||||
// at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
|
||||
// the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
|
||||
bind(L_fold_128_B_register);
|
||||
evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
|
||||
evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
|
||||
evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
|
||||
evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
|
||||
evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
|
||||
evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
|
||||
// save last that has no multiplicand
|
||||
@ -7289,7 +7292,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
|
||||
evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
|
||||
evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
|
||||
// Needed later in reduction loop
|
||||
movdqu(xmm10, Address(key, 1 * 16));
|
||||
movdqu(xmm10, Address(table, 1 * 16));
|
||||
vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
|
||||
vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
|
||||
|
||||
@ -7305,7 +7308,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
|
||||
jcc(Assembler::less, L_final_reduction_for_128);
|
||||
|
||||
bind(L_16B_reduction_loop);
|
||||
vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
|
||||
vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
|
||||
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
|
||||
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
|
||||
movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
|
||||
@ -7336,14 +7339,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
|
||||
vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
|
||||
|
||||
blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
|
||||
vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
|
||||
vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
|
||||
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
|
||||
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
|
||||
vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
|
||||
|
||||
bind(L_128_done);
|
||||
// compute crc of a 128-bit value
|
||||
movdqu(xmm10, Address(key, 3 * 16));
|
||||
movdqu(xmm10, Address(table, 3 * 16));
|
||||
movdqu(xmm0, xmm7);
|
||||
|
||||
// 64b fold
|
||||
@ -7359,14 +7362,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
|
||||
jmp(L_barrett);
|
||||
|
||||
bind(L_less_than_256);
|
||||
kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
|
||||
kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
|
||||
|
||||
//barrett reduction
|
||||
bind(L_barrett);
|
||||
vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
|
||||
movdqu(xmm1, xmm7);
|
||||
movdqu(xmm2, xmm7);
|
||||
movdqu(xmm10, Address(key, 4 * 16));
|
||||
movdqu(xmm10, Address(table, 4 * 16));
|
||||
|
||||
pclmulqdq(xmm7, xmm10, 0x0);
|
||||
pxor(xmm7, xmm2);
|
||||
@ -7378,7 +7381,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
|
||||
pextrd(crc, xmm7, 2);
|
||||
|
||||
bind(L_cleanup);
|
||||
notl(crc); // ~c
|
||||
addptr(rsp, 16 * 2 + 8);
|
||||
pop(r12);
|
||||
}
|
||||
|
@ -6528,7 +6528,13 @@ address generate_avx_ghash_processBlocks() {
|
||||
if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
|
||||
VM_Version::supports_avx512bw() &&
|
||||
VM_Version::supports_avx512vl()) {
|
||||
// The constants used in the CRC32 algorithm require the 1's complement of the initial crc value.
|
||||
// However, the constant table for CRC32-C assumes the original crc value. Account for this
|
||||
// difference before calling and after returning.
|
||||
__ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
|
||||
__ notl(crc);
|
||||
__ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
|
||||
__ notl(crc);
|
||||
} else {
|
||||
__ kernel_crc32(crc, buf, len, table, tmp1);
|
||||
}
|
||||
@ -6580,6 +6586,12 @@ address generate_avx_ghash_processBlocks() {
|
||||
|
||||
BLOCK_COMMENT("Entry:");
|
||||
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
||||
if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
|
||||
VM_Version::supports_avx512bw() &&
|
||||
VM_Version::supports_avx512vl()) {
|
||||
__ lea(j, ExternalAddress(StubRoutines::x86::crc32c_table_avx512_addr()));
|
||||
__ kernel_crc32_avx512(crc, buf, len, j, l, k);
|
||||
} else {
|
||||
#ifdef _WIN64
|
||||
__ push(y);
|
||||
__ push(z);
|
||||
@ -6589,11 +6601,12 @@ address generate_avx_ghash_processBlocks() {
|
||||
l, y, z,
|
||||
c_farg0, c_farg1, c_farg2,
|
||||
is_pclmulqdq_supported);
|
||||
__ movl(rax, crc);
|
||||
#ifdef _WIN64
|
||||
__ pop(z);
|
||||
__ pop(y);
|
||||
#endif
|
||||
}
|
||||
__ movl(rax, crc);
|
||||
__ vzeroupper();
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ ret(0);
|
||||
|
@ -221,6 +221,23 @@ juint StubRoutines::x86::_crc_table_avx512[] =
|
||||
0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL
|
||||
};
|
||||
|
||||
// Constant table for the AVX-512 CRC32-C kernel. Its address is handed to
// kernel_crc32_avx512 as the `table` argument by the CRC32-C stub; the kernel
// reads it at the same 16-byte row offsets it uses for _crc_table_avx512.
// Row roles below are taken from the comments at the kernel's load sites
// (each row is four juint values; presumably 16 bytes per row - confirm juint width).
juint StubRoutines::x86::_crc32c_table_avx512[] =
|
||||
{
|
||||
0xb9e02b86UL, 0x00000000UL, 0xdcb17aa4UL, 0x00000000UL, // row 0  (table + 0*16): rk-1, rk-2
|
||||
0x493c7d27UL, 0x00000000UL, 0xc1068c50UL, 0x0000000eUL, // row 1  (table + 1*16): rk1, rk2
|
||||
0x06e38d70UL, 0x00000002UL, 0x6992cea2UL, 0x00000000UL, // row 2  (table + 2*16): rk3, rk4
|
||||
0x493c7d27UL, 0x00000000UL, 0xdd45aab8UL, 0x00000000UL, // row 3  (table + 3*16): loaded when computing the crc of the final 128-bit value
|
||||
0xdea713f0UL, 0x00000000UL, 0x05ec76f0UL, 0x00000001UL, // row 4  (table + 4*16): loaded for the Barrett reduction
|
||||
0x47db8317UL, 0x00000000UL, 0x2ad91c30UL, 0x00000000UL, // rows 5-8 (512-bit load from table + 5*16): rk9-rk16
|
||||
0x0715ce53UL, 0x00000000UL, 0xc49f4f67UL, 0x00000000UL,
|
||||
0x39d3b296UL, 0x00000000UL, 0x083a6eecUL, 0x00000000UL,
|
||||
0x9e4addf8UL, 0x00000000UL, 0x740eef02UL, 0x00000000UL,
|
||||
0xddc0152bUL, 0x00000000UL, 0x1c291d04UL, 0x00000000UL, // rows 9-12 (512-bit load from table + 9*16): rk17-rk20, rk1, rk2, 0, 0
|
||||
0xba4fc28eUL, 0x00000000UL, 0x3da6d0cbUL, 0x00000000UL,
|
||||
0x493c7d27UL, 0x00000000UL, 0xc1068c50UL, 0x0000000eUL, // repeats row 1 (rk1, rk2)
|
||||
0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL  // zero padding that completes the 512-bit load
|
||||
};
|
||||
|
||||
juint StubRoutines::x86::_crc_by128_masks_avx512[] =
|
||||
{
|
||||
0xffffffffUL, 0xffffffffUL, 0x00000000UL, 0x00000000UL,
|
||||
|
@ -137,6 +137,7 @@ class x86 {
|
||||
#ifdef _LP64
|
||||
static juint _crc_by128_masks_avx512[];
|
||||
static juint _crc_table_avx512[];
|
||||
static juint _crc32c_table_avx512[];
|
||||
static juint _shuf_table_crc32_avx512[];
|
||||
static juint _adler32_shuf0_table[];
|
||||
static juint _adler32_shuf1_table[];
|
||||
@ -256,6 +257,7 @@ class x86 {
|
||||
static address crc_by128_masks_avx512_addr() { return (address)_crc_by128_masks_avx512; }
|
||||
static address shuf_table_crc32_avx512_addr() { return (address)_shuf_table_crc32_avx512; }
|
||||
static address crc_table_avx512_addr() { return (address)_crc_table_avx512; }
|
||||
static address crc32c_table_avx512_addr() { return (address)_crc32c_table_avx512; }
|
||||
static address ghash_polynomial512_addr() { return _ghash_poly512_addr; }
|
||||
#endif // _LP64
|
||||
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
|
||||
|
60
test/micro/org/openjdk/bench/java/util/TestCRC32C.java
Normal file
60
test/micro/org/openjdk/bench/java/util/TestCRC32C.java
Normal file
@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
package org.openjdk.bench.java.util;
|
||||
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.zip.CRC32C;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
@Fork(value = 2)
|
||||
|
||||
public class TestCRC32C {
|
||||
|
||||
private CRC32C crc32c;
|
||||
private Random random;
|
||||
private byte[] bytes;
|
||||
|
||||
@Param({"64", "128", "256", "512", "1024", "2048", "4096", "8192", "16384", "32768", "65536"})
|
||||
private int count;
|
||||
|
||||
public TestCRC32C() {
|
||||
crc32c = new CRC32C();
|
||||
random = new Random(2147483648L);
|
||||
bytes = new byte[1000000];
|
||||
random.nextBytes(bytes);
|
||||
}
|
||||
|
||||
@Setup(Level.Iteration)
|
||||
public void setupBytes() {
|
||||
crc32c.reset();
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void testCRC32CUpdate() {
|
||||
crc32c.update(bytes, 0, count);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user