8300208: Optimize Adler32 stub for AVX-512 targets.
Reviewed-by: kvn, sviswanathan
This commit is contained in:
parent
af564e46b0
commit
6475501a01
@ -4870,6 +4870,23 @@ void Assembler::evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vec
|
||||
emit_operand(dst, src, 0);
|
||||
}
|
||||
|
||||
void Assembler::evpmovzxbd(XMMRegister dst, KRegister mask, Address src, int vector_len) {
|
||||
assert(VM_Version::supports_avx512vl(), "");
|
||||
assert(dst != xnoreg, "sanity");
|
||||
InstructionMark im(this);
|
||||
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
|
||||
attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
|
||||
attributes.set_embedded_opmask_register_specifier(mask);
|
||||
attributes.set_is_evex_instruction();
|
||||
vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
|
||||
emit_int8(0x31);
|
||||
emit_operand(dst, src, 0);
|
||||
}
|
||||
|
||||
void Assembler::evpmovzxbd(XMMRegister dst, Address src, int vector_len) {
|
||||
evpmovzxbd(dst, k0, src, vector_len);
|
||||
}
|
||||
|
||||
void Assembler::evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
|
||||
assert(VM_Version::supports_evex(), "");
|
||||
// Encoding: EVEX.NDS.XXX.66.0F.W0 DB /r
|
||||
|
@ -1873,6 +1873,8 @@ private:
|
||||
void pmovzxdq(XMMRegister dst, XMMRegister src);
|
||||
void vpmovzxdq(XMMRegister dst, XMMRegister src, int vector_len);
|
||||
void evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len);
|
||||
void evpmovzxbd(XMMRegister dst, KRegister mask, Address src, int vector_len);
|
||||
void evpmovzxbd(XMMRegister dst, Address src, int vector_len);
|
||||
|
||||
// Sign extend moves
|
||||
void pmovsxbd(XMMRegister dst, XMMRegister src);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2021, Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
|
||||
*
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
@ -26,6 +26,7 @@
|
||||
#include "precompiled.hpp"
|
||||
#include "asm/assembler.hpp"
|
||||
#include "asm/assembler.inline.hpp"
|
||||
#include "utilities/globalDefinitions.hpp"
|
||||
#include "runtime/globals.hpp"
|
||||
#include "runtime/stubRoutines.hpp"
|
||||
#include "macroAssembler_x86.hpp"
|
||||
@ -33,9 +34,11 @@
|
||||
|
||||
#define __ _masm->
|
||||
|
||||
ATTRIBUTE_ALIGNED(32) juint ADLER32_ASCALE_TABLE[] = {
|
||||
ATTRIBUTE_ALIGNED(64) juint ADLER32_ASCALE_TABLE[] = {
|
||||
0x00000000UL, 0x00000001UL, 0x00000002UL, 0x00000003UL,
|
||||
0x00000004UL, 0x00000005UL, 0x00000006UL, 0x00000007UL
|
||||
0x00000004UL, 0x00000005UL, 0x00000006UL, 0x00000007UL,
|
||||
0x00000008UL, 0x00000009UL, 0x0000000AUL, 0x0000000BUL,
|
||||
0x0000000CUL, 0x0000000DUL, 0x0000000EUL, 0x0000000FUL
|
||||
};
|
||||
|
||||
ATTRIBUTE_ALIGNED(32) juint ADLER32_SHUF0_TABLE[] = {
|
||||
@ -44,8 +47,8 @@ ATTRIBUTE_ALIGNED(32) juint ADLER32_SHUF0_TABLE[] = {
|
||||
};
|
||||
|
||||
ATTRIBUTE_ALIGNED(32) juint ADLER32_SHUF1_TABLE[] = {
|
||||
0xFFFFFF08UL, 0xFFFFFF09, 0xFFFFFF0AUL, 0xFFFFFF0BUL,
|
||||
0xFFFFFF0CUL, 0xFFFFFF0D, 0xFFFFFF0EUL, 0xFFFFFF0FUL
|
||||
0xFFFFFF08UL, 0xFFFFFF09UL, 0xFFFFFF0AUL, 0xFFFFFF0BUL,
|
||||
0xFFFFFF0CUL, 0xFFFFFF0DUL, 0xFFFFFF0EUL, 0xFFFFFF0FUL
|
||||
};
|
||||
|
||||
|
||||
@ -67,12 +70,14 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
|
||||
address start = __ pc();
|
||||
|
||||
// Choose an appropriate LIMIT for inner loop based on the granularity
|
||||
// of intermediate results. For int, LIMIT of 5552 will ensure intermediate
|
||||
// results does not overflow Integer.MAX_VALUE before modulo operations.
|
||||
const int LIMIT = 5552;
|
||||
const int BASE = 65521;
|
||||
const int CHUNKSIZE = 16;
|
||||
const int CHUNKSIZE_M1 = CHUNKSIZE - 1;
|
||||
|
||||
|
||||
const Register init_d = c_rarg0;
|
||||
const Register data = r9;
|
||||
const Register size = r10;
|
||||
@ -101,17 +106,22 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
const XMMRegister xtmp1 = xmm3;
|
||||
const XMMRegister xsa = xmm4;
|
||||
const XMMRegister xtmp2 = xmm5;
|
||||
const XMMRegister xtmp3 = xmm8;
|
||||
const XMMRegister xtmp4 = xmm9;
|
||||
const XMMRegister xtmp5 = xmm10;
|
||||
|
||||
Label SLOOP1, SLOOP1A, SKIP_LOOP_1A, FINISH, LT64, DO_FINAL, FINAL_LOOP, ZERO_SIZE, END;
|
||||
Label SLOOP1, SPRELOOP1A_AVX2, SLOOP1A_AVX2, SLOOP1A_AVX3, AVX3_REDUCE, SKIP_LOOP_1A;
|
||||
Label SKIP_LOOP_1A_AVX3, FINISH, LT64, DO_FINAL, FINAL_LOOP, ZERO_SIZE, END;
|
||||
|
||||
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
||||
|
||||
__ push(r12);
|
||||
__ push(r13);
|
||||
__ push(r14);
|
||||
__ movq(xtmp3, r12);
|
||||
__ movq(xtmp4, r13);
|
||||
__ movq(xtmp5, r14);
|
||||
|
||||
__ vmovdqu(yshuf0, ExternalAddress((address)ADLER32_SHUF0_TABLE), r14 /*rscratch*/);
|
||||
__ vmovdqu(yshuf1, ExternalAddress((address)ADLER32_SHUF1_TABLE), r14 /*rscratch*/);
|
||||
|
||||
__ movptr(data, c_rarg1); //data
|
||||
__ movl(size, c_rarg2); //length
|
||||
|
||||
@ -121,7 +131,6 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
__ cmpl(size, 32);
|
||||
__ jcc(Assembler::below, LT64);
|
||||
__ movdl(xa, init_d); //vmovd - 32bit
|
||||
__ vpxor(yb, yb, yb, Assembler::AVX_256bit);
|
||||
|
||||
__ bind(SLOOP1);
|
||||
__ movl(s, LIMIT);
|
||||
@ -132,7 +141,88 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
__ jcc(Assembler::aboveEqual, SKIP_LOOP_1A);
|
||||
|
||||
__ align32();
|
||||
__ bind(SLOOP1A);
|
||||
if (VM_Version::supports_avx512vl()) {
|
||||
// AVX2 performs better for smaller inputs because of leaner post loop reduction sequence..
|
||||
__ cmpl(s, MAX2(128, VM_Version::avx3_threshold()));
|
||||
__ jcc(Assembler::belowEqual, SPRELOOP1A_AVX2);
|
||||
|
||||
__ lea(end, Address(s, data, Address::times_1, - (2*CHUNKSIZE -1)));
|
||||
__ vpxor(yb, yb, yb, Assembler::AVX_512bit);
|
||||
|
||||
// Some notes on vectorized main loop algorithm.
|
||||
// Additions are performed in slices of 16 bytes in the main loop.
|
||||
// input size : 64 bytes (a0 - a63).
|
||||
// Iteration0 : ya = [a0 - a15]
|
||||
// yb = [a0 - a15]
|
||||
// Iteration1 : ya = [a0 - a15] + [a16 - a31]
|
||||
// yb = 2 x [a0 - a15] + [a16 - a31]
|
||||
// Iteration2 : ya = [a0 - a15] + [a16 - a31] + [a32 - a47]
|
||||
// yb = 3 x [a0 - a15] + 2 x [a16 - a31] + [a32 - a47]
|
||||
// Iteration4 : ya = [a0 - a15] + [a16 - a31] + [a32 - a47] + [a48 - a63]
|
||||
// yb = 4 x [a0 - a15] + 3 x [a16 - a31] + 2 x [a32 - a47] + [a48 - a63]
|
||||
// Before performing reduction we must scale the intermediate result appropriately.
|
||||
// Since addition was performed in chunks of 16 bytes, thus to match the scalar implementation
|
||||
// Oth lane element must be repeatedly added 16 times, 1st element 15 times and so on so forth.
|
||||
// Thus we first multiply yb by 16 followed by subtracting appropriately scaled ya value.
|
||||
// yb = 16 x yb - [0 - 15] x ya
|
||||
// = 64 x [0 - 15] + 48 x [16 - 31] + 32 x [32 - 47] + 16 x [48 - 63] - [0 - 15] x ya
|
||||
// = 64 x a0 + 63 x a1 + 62 x a2 ...... + a63
|
||||
__ bind(SLOOP1A_AVX3);
|
||||
__ evpmovzxbd(ydata0, Address(data, 0), Assembler::AVX_512bit);
|
||||
__ evpmovzxbd(ydata1, Address(data, CHUNKSIZE), Assembler::AVX_512bit);
|
||||
__ vpaddd(ya, ya, ydata0, Assembler::AVX_512bit);
|
||||
__ vpaddd(yb, yb, ya, Assembler::AVX_512bit);
|
||||
__ vpaddd(ya, ya, ydata1, Assembler::AVX_512bit);
|
||||
__ vpaddd(yb, yb, ya, Assembler::AVX_512bit);
|
||||
__ addptr(data, 2*CHUNKSIZE);
|
||||
__ cmpptr(data, end);
|
||||
__ jcc(Assembler::below, SLOOP1A_AVX3);
|
||||
|
||||
__ addptr(end, CHUNKSIZE);
|
||||
__ cmpptr(data, end);
|
||||
__ jcc(Assembler::aboveEqual, AVX3_REDUCE);
|
||||
|
||||
__ evpmovzxbd(ydata0, Address(data, 0), Assembler::AVX_512bit);
|
||||
__ vpaddd(ya, ya, ydata0, Assembler::AVX_512bit);
|
||||
__ vpaddd(yb, yb, ya, Assembler::AVX_512bit);
|
||||
__ addptr(data, CHUNKSIZE);
|
||||
|
||||
__ bind(AVX3_REDUCE);
|
||||
__ vpslld(yb, yb, 4, Assembler::AVX_512bit); //b is scaled by 16(avx512))
|
||||
__ vpmulld(ysa, ya, ExternalAddress((address)ADLER32_ASCALE_TABLE), Assembler::AVX_512bit, r14 /*rscratch*/);
|
||||
|
||||
// compute horizontal sums of ya, yb, ysa
|
||||
__ vextracti64x4(xtmp0, ya, 1);
|
||||
__ vextracti64x4(xtmp1, yb, 1);
|
||||
__ vextracti64x4(xtmp2, ysa, 1);
|
||||
__ vpaddd(xtmp0, xtmp0, ya, Assembler::AVX_256bit);
|
||||
__ vpaddd(xtmp1, xtmp1, yb, Assembler::AVX_256bit);
|
||||
__ vpaddd(xtmp2, xtmp2, ysa, Assembler::AVX_256bit);
|
||||
__ vextracti128(xa, xtmp0, 1);
|
||||
__ vextracti128(xb, xtmp1, 1);
|
||||
__ vextracti128(xsa, xtmp2, 1);
|
||||
__ vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
|
||||
__ vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
|
||||
__ vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
|
||||
__ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
|
||||
__ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
|
||||
__ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
|
||||
__ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
|
||||
__ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
|
||||
__ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
|
||||
|
||||
__ vpsubd(xb, xb, xsa, Assembler::AVX_128bit);
|
||||
|
||||
__ addptr(end, CHUNKSIZE_M1);
|
||||
__ testl(s, CHUNKSIZE_M1);
|
||||
__ jcc(Assembler::notEqual, DO_FINAL);
|
||||
__ jmp(SKIP_LOOP_1A_AVX3);
|
||||
}
|
||||
|
||||
__ align32();
|
||||
__ bind(SPRELOOP1A_AVX2);
|
||||
__ vpxor(yb, yb, yb, Assembler::AVX_256bit);
|
||||
__ bind(SLOOP1A_AVX2);
|
||||
__ vbroadcastf128(ydata, Address(data, 0), Assembler::AVX_256bit);
|
||||
__ addptr(data, CHUNKSIZE);
|
||||
__ vpshufb(ydata0, ydata, yshuf0, Assembler::AVX_256bit);
|
||||
@ -142,18 +232,12 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
__ vpaddd(ya, ya, ydata1, Assembler::AVX_256bit);
|
||||
__ vpaddd(yb, yb, ya, Assembler::AVX_256bit);
|
||||
__ cmpptr(data, end);
|
||||
__ jcc(Assembler::below, SLOOP1A);
|
||||
__ jcc(Assembler::below, SLOOP1A_AVX2);
|
||||
|
||||
__ bind(SKIP_LOOP_1A);
|
||||
__ addptr(end, CHUNKSIZE_M1);
|
||||
__ testl(s, CHUNKSIZE_M1);
|
||||
__ jcc(Assembler::notEqual, DO_FINAL);
|
||||
|
||||
// either we're done, or we just did LIMIT
|
||||
__ subl(size, s);
|
||||
|
||||
// reduce
|
||||
__ vpslld(yb, yb, 3, Assembler::AVX_256bit); //b is scaled by 8
|
||||
__ vpslld(yb, yb, 3, Assembler::AVX_256bit); //b is scaled by 8(avx)
|
||||
__ vpmulld(ysa, ya, ExternalAddress((address)ADLER32_ASCALE_TABLE), Assembler::AVX_256bit, r14 /*rscratch*/);
|
||||
|
||||
// compute horizontal sums of ya, yb, ysa
|
||||
@ -170,13 +254,22 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
__ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
|
||||
__ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
|
||||
|
||||
__ vpsubd(xb, xb, xsa, Assembler::AVX_128bit);
|
||||
|
||||
__ addptr(end, CHUNKSIZE_M1);
|
||||
__ testl(s, CHUNKSIZE_M1);
|
||||
__ jcc(Assembler::notEqual, DO_FINAL);
|
||||
|
||||
__ bind(SKIP_LOOP_1A_AVX3);
|
||||
// either we're done, or we just did LIMIT
|
||||
__ subl(size, s);
|
||||
|
||||
__ movdl(rax, xa);
|
||||
__ xorl(rdx, rdx);
|
||||
__ movl(rcx, BASE);
|
||||
__ divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
|
||||
__ movl(a_d, rdx);
|
||||
|
||||
__ vpsubd(xb, xb, xsa, Assembler::AVX_128bit);
|
||||
__ movdl(rax, xb);
|
||||
__ addl(rax, b_d);
|
||||
__ xorl(rdx, rdx);
|
||||
@ -189,7 +282,6 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
|
||||
// continue loop
|
||||
__ movdl(xa, a_d);
|
||||
__ vpxor(yb, yb, yb, Assembler::AVX_256bit);
|
||||
__ jmp(SLOOP1);
|
||||
|
||||
__ bind(FINISH);
|
||||
@ -205,26 +297,7 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
__ jcc(Assembler::notZero, FINAL_LOOP);
|
||||
__ jmp(ZERO_SIZE);
|
||||
|
||||
// handle remaining 1...15 bytes
|
||||
__ bind(DO_FINAL);
|
||||
// reduce
|
||||
__ vpslld(yb, yb, 3, Assembler::AVX_256bit); //b is scaled by 8
|
||||
__ vpmulld(ysa, ya, ExternalAddress((address)ADLER32_ASCALE_TABLE), Assembler::AVX_256bit, r14 /*rscratch*/); //scaled a
|
||||
|
||||
__ vextracti128(xtmp0, ya, 1);
|
||||
__ vextracti128(xtmp1, yb, 1);
|
||||
__ vextracti128(xtmp2, ysa, 1);
|
||||
__ vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
|
||||
__ vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
|
||||
__ vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
|
||||
__ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
|
||||
__ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
|
||||
__ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
|
||||
__ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
|
||||
__ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
|
||||
__ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
|
||||
__ vpsubd(xb, xb, xsa, Assembler::AVX_128bit);
|
||||
|
||||
__ movdl(a_d, xa);
|
||||
__ movdl(rax, xb);
|
||||
__ addl(b_d, rax);
|
||||
@ -255,9 +328,10 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
__ movl(rax, rdx);
|
||||
|
||||
__ bind(END);
|
||||
__ pop(r14);
|
||||
__ pop(r13);
|
||||
__ pop(r12);
|
||||
|
||||
__ movq(r14, xtmp5);
|
||||
__ movq(r13, xtmp4);
|
||||
__ movq(r12, xtmp3);
|
||||
|
||||
__ leave();
|
||||
__ ret(0);
|
||||
|
@ -146,7 +146,7 @@ public class TestAdler32 {
|
||||
if (adler0.getValue() != adler1.getValue()) {
|
||||
System.err.printf("ERROR: adler0 = %08x, adler1 = %08x\n",
|
||||
adler0.getValue(), adler1.getValue());
|
||||
return false;
|
||||
throw new AssertionError("TEST FAILED", null);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -27,20 +27,16 @@ import java.util.concurrent.TimeUnit;
|
||||
import java.util.zip.Adler32;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.MICROSECONDS)
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
@Fork(value = 2)
|
||||
@Warmup(iterations = 2, time = 30, timeUnit = TimeUnit.SECONDS)
|
||||
@Measurement(iterations = 3, time = 60, timeUnit = TimeUnit.SECONDS)
|
||||
|
||||
public class TestAdler32 {
|
||||
|
||||
private Adler32 adler32;
|
||||
private Random random;
|
||||
private byte[] bytes;
|
||||
|
||||
@Param({"64", "128", "256", "512", /* "1024", */ "2048", /* "4096", "8192", */ "16384", /* "32768", */ "65536"})
|
||||
@Param({"64", "128", "256", "512", "1024", "2048", "5012", "8192", "16384", "32768", "65536"})
|
||||
private int count;
|
||||
|
||||
public TestAdler32() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user