8277358: Accelerate CRC32-C

Co-authored-by: Greg Tucker <greg.b.tucker@intel.com>
Co-authored-by: Scott Gibbons <sgibbons@openjdk.org>
Reviewed-by: kvn, sviswanathan, ecaspole
This commit is contained in:
Scott Gibbons 2021-12-02 20:06:05 +00:00 committed by Sandhya Viswanathan
parent 73a9654c26
commit e0f1fc783c
5 changed files with 123 additions and 29 deletions

View File

@ -7030,7 +7030,7 @@ void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, X
// Helper function for AVX 512 CRC32
// Compute CRC32 for < 256B buffers
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
@ -7043,7 +7043,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
jcc(Assembler::less, L_less_than_32);
// if there is, load the constants
movdqu(xmm10, Address(key, 1 * 16)); //rk1 and rk2 in xmm10
movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
movdl(xmm0, crc); // get the initial crc value
movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
pxor(xmm7, xmm0);
@ -7070,7 +7070,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
pxor(xmm7, xmm0); //xor the initial crc value
addl(pos, 16);
subl(len, 16);
movdqu(xmm10, Address(key, 1 * 16)); // rk1 and rk2 in xmm10
movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
jmp(L_get_last_two_xmms);
bind(L_less_than_16_left);
@ -7190,12 +7190,17 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
* param crc register containing existing CRC (32-bit)
* param buf register pointing to input byte buffer (byte*)
* param len register containing number of bytes
* param table address of crc or crc32c table
* param tmp1 scratch register
* param tmp2 scratch register
* return rax result register
*
* This routine is identical for crc32c with the exception of the precomputed constant
* table which will be passed as the table argument. The calculation steps are
* the same for both variants.
*/
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) {
assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax);
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
@ -7210,8 +7215,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
notl(crc);
movl(pos, 0);
// check if smaller than 256B
@ -7225,7 +7228,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
subl(len, 256);
cmpl(len, 256);
@ -7233,7 +7236,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
subl(len, 256);
bind(L_fold_256_B_loop);
@ -7279,8 +7282,8 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
// at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
// the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
bind(L_fold_128_B_register);
evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
// save last that has no multiplicand
@ -7289,7 +7292,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
// Needed later in reduction loop
movdqu(xmm10, Address(key, 1 * 16));
movdqu(xmm10, Address(table, 1 * 16));
vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
@ -7305,7 +7308,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
jcc(Assembler::less, L_final_reduction_for_128);
bind(L_16B_reduction_loop);
vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
@ -7336,14 +7339,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
bind(L_128_done);
// compute crc of a 128-bit value
movdqu(xmm10, Address(key, 3 * 16));
movdqu(xmm10, Address(table, 3 * 16));
movdqu(xmm0, xmm7);
// 64b fold
@ -7359,14 +7362,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
jmp(L_barrett);
bind(L_less_than_256);
kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
//barrett reduction
bind(L_barrett);
vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
movdqu(xmm1, xmm7);
movdqu(xmm2, xmm7);
movdqu(xmm10, Address(key, 4 * 16));
movdqu(xmm10, Address(table, 4 * 16));
pclmulqdq(xmm7, xmm10, 0x0);
pxor(xmm7, xmm2);
@ -7378,7 +7381,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
pextrd(crc, xmm7, 2);
bind(L_cleanup);
notl(crc); // ~c
addptr(rsp, 16 * 2 + 8);
pop(r12);
}

View File

@ -6528,7 +6528,13 @@ address generate_avx_ghash_processBlocks() {
if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
VM_Version::supports_avx512bw() &&
VM_Version::supports_avx512vl()) {
// The constants used in the CRC32 algorithm require the 1's complement of the initial crc value.
// However, the constant table for CRC32-C assumes the original crc value. Account for this
// difference before calling and after returning.
__ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
__ notl(crc);
__ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
__ notl(crc);
} else {
__ kernel_crc32(crc, buf, len, table, tmp1);
}
@ -6580,6 +6586,12 @@ address generate_avx_ghash_processBlocks() {
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
VM_Version::supports_avx512bw() &&
VM_Version::supports_avx512vl()) {
__ lea(j, ExternalAddress(StubRoutines::x86::crc32c_table_avx512_addr()));
__ kernel_crc32_avx512(crc, buf, len, j, l, k);
} else {
#ifdef _WIN64
__ push(y);
__ push(z);
@ -6589,11 +6601,12 @@ address generate_avx_ghash_processBlocks() {
l, y, z,
c_farg0, c_farg1, c_farg2,
is_pclmulqdq_supported);
__ movl(rax, crc);
#ifdef _WIN64
__ pop(z);
__ pop(y);
#endif
}
__ movl(rax, crc);
__ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);

View File

@ -221,6 +221,23 @@ juint StubRoutines::x86::_crc_table_avx512[] =
0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL
};
// Constant table for the AVX-512 CRC32-C stub. Its address is exposed through
// crc32c_table_avx512_addr() and handed to MacroAssembler::kernel_crc32_avx512
// as the `table` argument. The kernel addresses the table at offsets of the
// form n * 16, so each line of four juint values below corresponds to one such
// slot (assuming juint is 32 bits wide). The rk names in the row annotations
// are the ones used in the kernel's own comments at the matching load sites.
juint StubRoutines::x86::_crc32c_table_avx512[] =
{
0xb9e02b86UL, 0x00000000UL, 0xdcb17aa4UL, 0x00000000UL, // slot 0: rk-1, rk-2 (broadcast for the 256-byte fold loop)
0x493c7d27UL, 0x00000000UL, 0xc1068c50UL, 0x0000000eUL, // slot 1: rk1, rk2 (16-byte reduction loop)
0x06e38d70UL, 0x00000002UL, 0x6992cea2UL, 0x00000000UL, // slot 2: rk3, rk4 (broadcast before the fold loops)
0x493c7d27UL, 0x00000000UL, 0xdd45aab8UL, 0x00000000UL, // slot 3: loaded at L_128_done to reduce the final 128-bit value
0xdea713f0UL, 0x00000000UL, 0x05ec76f0UL, 0x00000001UL, // slot 4: loaded at L_barrett for the Barrett reduction
0x47db8317UL, 0x00000000UL, 0x2ad91c30UL, 0x00000000UL, // slots 5-8: rk9-rk16, read as one 512-bit vector
0x0715ce53UL, 0x00000000UL, 0xc49f4f67UL, 0x00000000UL,
0x39d3b296UL, 0x00000000UL, 0x083a6eecUL, 0x00000000UL,
0x9e4addf8UL, 0x00000000UL, 0x740eef02UL, 0x00000000UL,
0xddc0152bUL, 0x00000000UL, 0x1c291d04UL, 0x00000000UL, // slots 9-12: rk17-rk20, rk1, rk2, 0, 0, read as one 512-bit vector
0xba4fc28eUL, 0x00000000UL, 0x3da6d0cbUL, 0x00000000UL,
0x493c7d27UL, 0x00000000UL, 0xc1068c50UL, 0x0000000eUL, // repeats slot 1 (rk1, rk2)
0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL  // zero padding completing the 512-bit read from slot 9
};
juint StubRoutines::x86::_crc_by128_masks_avx512[] =
{
0xffffffffUL, 0xffffffffUL, 0x00000000UL, 0x00000000UL,

View File

@ -137,6 +137,7 @@ class x86 {
#ifdef _LP64
static juint _crc_by128_masks_avx512[];
static juint _crc_table_avx512[];
static juint _crc32c_table_avx512[];
static juint _shuf_table_crc32_avx512[];
static juint _adler32_shuf0_table[];
static juint _adler32_shuf1_table[];
@ -256,6 +257,7 @@ class x86 {
static address crc_by128_masks_avx512_addr() { return (address)_crc_by128_masks_avx512; }
static address shuf_table_crc32_avx512_addr() { return (address)_shuf_table_crc32_avx512; }
static address crc_table_avx512_addr() { return (address)_crc_table_avx512; }
static address crc32c_table_avx512_addr() { return (address)_crc32c_table_avx512; }
static address ghash_polynomial512_addr() { return _ghash_poly512_addr; }
#endif // _LP64
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }

View File

@ -0,0 +1,60 @@
/*
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.util;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.zip.CRC32C;
import org.openjdk.jmh.annotations.*;
/**
 * JMH throughput benchmark for {@link CRC32C#update(byte[], int, int)}.
 *
 * A single pseudo-random buffer is filled once when the state object is
 * constructed; each benchmark invocation feeds the first {@code count} bytes
 * of that buffer into a shared {@link CRC32C} instance, which is reset at the
 * start of every iteration.
 */
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Fork(value = 2)
public class TestCRC32C {

    /** Length of the backing buffer; larger than every {@code count} value. */
    private static final int BUFFER_LENGTH = 1000000;

    /** Fixed seed so every run checksums the same data. */
    private static final long RANDOM_SEED = 2147483648L;

    private CRC32C crc32c = new CRC32C();
    private Random random = new Random(RANDOM_SEED);
    private byte[] bytes = new byte[BUFFER_LENGTH];

    /** Number of leading bytes of the buffer passed to each update call. */
    @Param({"64", "128", "256", "512", "1024", "2048", "4096", "8192", "16384", "32768", "65536"})
    private int count;

    /** Fills the backing buffer with deterministic pseudo-random data. */
    public TestCRC32C() {
        random.nextBytes(bytes);
    }

    /** Clears the running checksum before each measurement iteration. */
    @Setup(Level.Iteration)
    public void setupBytes() {
        crc32c.reset();
    }

    /** Measured operation: fold {@code count} bytes into the running checksum. */
    @Benchmark
    public void testCRC32CUpdate() {
        crc32c.update(bytes, 0, count);
    }
}