8266332: Adler32 intrinsic for x86 64-bit platforms

Co-authored-by: Xubo Zhang <xubo.zhang@intel.com>
Co-authored-by: Greg B Tucker <greg.b.tucker@intel.com>
Co-authored-by: Pengfei Li <pli@openjdk.org>
Reviewed-by: sviswanathan, jbhateja, kvn, neliasso
This commit is contained in:
Xubo Zhang 2021-05-19 23:44:23 +00:00 committed by Vladimir Kozlov
parent b961f2535c
commit 8e3549fc73
13 changed files with 394 additions and 6 deletions

View File

@ -8030,6 +8030,18 @@ void Assembler::vbroadcastsd(XMMRegister dst, Address src, int vector_len) {
emit_operand(dst, src);
}
// Emits VBROADCASTF128 with a memory source (opcode byte 0x1A, 66-prefixed, 0F 38 map).
//   dst        - destination vector register; must not be xnoreg
//   src        - memory operand that is broadcast
//   vector_len - must be AVX_256bit; no other length is accepted by the assert below
void Assembler::vbroadcastf128(XMMRegister dst, Address src, int vector_len) {
assert(VM_Version::supports_avx(), "");
assert(vector_len == AVX_256bit, "");
assert(dst != xnoreg, "sanity");
// Keep the mark alive for the whole instruction (prefix, opcode and operand bytes).
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
// NOTE(review): tuple type / input size presumably only matter if an EVEX encoding is selected - confirm.
attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
// swap src<->dst for encoding
vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x1A);
emit_operand(dst, src);
}
// gpr source broadcast forms

View File

@ -2442,11 +2442,12 @@ private:
void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
// scalar single/double precision replicate
// scalar single/double precision replicate, plus 128-bit replicate (memory source only)
void vbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
void vbroadcastss(XMMRegister dst, Address src, int vector_len);
void vbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
void vbroadcastsd(XMMRegister dst, Address src, int vector_len);
void vbroadcastf128(XMMRegister dst, Address src, int vector_len);
// gpr sourced byte/word/dword/qword replicate
void evpbroadcastb(XMMRegister dst, Register src, int vector_len);

View File

@ -3231,6 +3231,16 @@ void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int
Assembler::vpmullw(dst, nds, src, vector_len);
}
// vpmulld whose second source operand is an AddressLiteral.
// If the literal cannot be addressed directly, its address is first loaded into
// scratch_reg (which is overwritten in that case) and the operand is read through it.
void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
  assert((UseAVX > 0), "AVX support is needed");
  if (!reachable(src)) {
    // Not reachable: materialize the address and use a register-indirect operand.
    lea(scratch_reg, src);
    Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
    return;
  }
  Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
}
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
Assembler::vpsubb(dst, nds, src, vector_len);

View File

@ -1307,6 +1307,13 @@ public:
void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// vpmulld overloads. The Address and register forms forward unchanged to the
// Assembler; the AddressLiteral form is defined out of line and takes a scratch
// register for literals that are not directly addressable.
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  Assembler::vpmulld(dst, nds, src, vector_len);
}
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  Assembler::vpmulld(dst, nds, src, vector_len);
}
void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg);
void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
@ -1764,6 +1771,7 @@ public:
void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
// Adler32 update kernel (declared inside the _LP64-only section).
// Note: the out-of-line definition names these parameters
// init_d, data, size, yshuf0, yshuf1 and ascaletab, in this order.
void updateBytesAdler32(Register adler32, Register buf, Register length, XMMRegister shuf0, XMMRegister shuf1, ExternalAddress scale);
#endif // _LP64
// CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic

View File

@ -0,0 +1,211 @@
/*
* Copyright (c) 2021, Intel Corporation.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"
#ifdef _LP64
// Emits the AVX2 kernel that updates an Adler-32 checksum over a byte range.
//
// The checksum is the pair (a, b), packed as (b << 16) | a, with both halves
// reduced modulo BASE before they are returned.
//
// Arguments:
//   init_d    - incoming checksum (b in bits 31..16, a in bits 15..0); masked in place
//   data      - address of the first input byte; advanced as bytes are consumed
//   size      - number of input bytes (32-bit value); decremented by the vector path
//   yshuf0,
//   yshuf1    - vpshufb control masks applied to every 16-byte chunk to produce the
//               two dword vectors that are accumulated per loop iteration
//   ascaletab - dword table multiplied lane-wise into the 'a' accumulator (vpmulld)
//               when the vector accumulators are reduced
//
// Result: eax = (b << 16) | a.
//
// Registers: r12, r13 and r14 are pushed on entry and popped on exit. rax, rcx, rdx,
// r8, r11 and xmm0-xmm5 are overwritten, as are init_d, data and size. r14 is used
// only as the scratch register handed to vpmulld for addressing ascaletab.
void MacroAssembler::updateBytesAdler32(Register init_d, Register data, Register size, XMMRegister yshuf0, XMMRegister yshuf1, ExternalAddress ascaletab)
{
  const int LIMIT = 5552;    // max bytes accumulated in vectors before reducing mod BASE
  const int BASE = 65521;    // Adler-32 modulus
  const int CHUNKSIZE = 16;  // bytes consumed per vector-loop iteration
  const int CHUNKSIZE_M1 = CHUNKSIZE - 1;

  const Register s = r11;    // length of the current batch: min(size, LIMIT)
  const Register a_d = r12;  //r12d
  const Register b_d = r8;   //r8d
  const Register end = r13;  // loop bound for the current batch / tail

  // 256-bit views
  const XMMRegister ya = xmm0;
  const XMMRegister yb = xmm1;
  const XMMRegister ydata0 = xmm2;
  const XMMRegister ydata1 = xmm3;
  const XMMRegister ysa = xmm4;
  const XMMRegister ydata = ysa;   // shares xmm4: ydata is only live inside the vector loop
  // 128-bit views of the same registers, used by the horizontal reduction
  const XMMRegister xa = xmm0;
  const XMMRegister xb = xmm1;
  const XMMRegister xtmp0 = xmm2;
  const XMMRegister xtmp1 = xmm3;
  const XMMRegister xsa = xmm4;
  const XMMRegister xtmp2 = xmm5;

  assert_different_registers(init_d, data, size, s, a_d, b_d, end, rax);
  // rcx and rdx are overwritten by every modulo step and r14 may be overwritten as the
  // vpmulld scratch register, all while data and size are still live. init_d is left
  // out on purpose: its last read comes before the first write to any of the three.
  assert_different_registers(data, size, rcx, rdx, r14);

  Label SLOOP1, SLOOP1A, SKIP_LOOP_1A, FINISH, LT64, DO_FINAL, FINAL_LOOP, ZERO_SIZE, END;

  push(r12);
  push(r13);
  push(r14);

  // Split the incoming checksum: b_d = high 16 bits, init_d = low 16 bits (a).
  movl(b_d, init_d); //adler
  shrl(b_d, 16);
  andl(init_d, 0xFFFF);

  // Inputs shorter than 32 bytes skip the vector path (despite the label's name).
  cmpl(size, 32);
  jcc(Assembler::below, LT64);

  // Seed the vector accumulators: a goes into the lowest dword of ya, yb starts at zero.
  movdl(xa, init_d); //vmovd - 32bit
  vpxor(yb, yb, yb, Assembler::AVX_256bit);

  // Outer loop: one batch of at most LIMIT bytes per pass.
  bind(SLOOP1);
  movl(s, LIMIT);
  cmpl(s, size);
  cmovl(Assembler::above, s, size); // s = min(size, LIMIT)
  // end = data + s - (CHUNKSIZE - 1): the vector loop runs only while a whole chunk remains.
  lea(end, Address(s, data, Address::times_1, -CHUNKSIZE_M1));
  cmpptr(data, end);
  jcc(Assembler::aboveEqual, SKIP_LOOP_1A);

  // Inner loop: 16 bytes per iteration, accumulated as two vectors of eight dwords.
  align(32);
  bind(SLOOP1A);
  vbroadcastf128(ydata, Address(data, 0), Assembler::AVX_256bit);
  addptr(data, CHUNKSIZE);
  vpshufb(ydata0, ydata, yshuf0, Assembler::AVX_256bit);
  vpaddd(ya, ya, ydata0, Assembler::AVX_256bit);
  vpaddd(yb, yb, ya, Assembler::AVX_256bit);
  vpshufb(ydata1, ydata, yshuf1, Assembler::AVX_256bit);
  vpaddd(ya, ya, ydata1, Assembler::AVX_256bit);
  vpaddd(yb, yb, ya, Assembler::AVX_256bit);
  cmpptr(data, end);
  jcc(Assembler::below, SLOOP1A);

  bind(SKIP_LOOP_1A);
  // Undo the bias: end now points one past the last byte of this batch.
  addptr(end, CHUNKSIZE_M1);
  // A batch length that is not a multiple of CHUNKSIZE leaves 1..15 tail bytes.
  testl(s, CHUNKSIZE_M1);
  jcc(Assembler::notEqual, DO_FINAL);

  // either we're done, or we just did LIMIT
  subl(size, s);

  // reduce
  vpslld(yb, yb, 3, Assembler::AVX_256bit); //b is scaled by 8
  vpmulld(ysa, ya, ascaletab, Assembler::AVX_256bit, r14);

  // compute horizontal sums of ya, yb, ysa
  vextracti128(xtmp0, ya, 1);
  vextracti128(xtmp1, yb, 1);
  vextracti128(xtmp2, ysa, 1);
  vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
  vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
  vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);

  // a_d = (sum of ya) mod BASE
  movdl(rax, xa);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
  movl(a_d, rdx);

  // b_d = (8 * sum(yb) - sum(ysa) + previous b_d) mod BASE
  vpsubd(xb, xb, xsa, Assembler::AVX_128bit);
  movdl(rax, xb);
  addl(rax, b_d);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
  movl(b_d, rdx);

  testl(size, size);
  jcc(Assembler::zero, FINISH);

  // continue loop
  movdl(xa, a_d);
  vpxor(yb, yb, yb, Assembler::AVX_256bit);
  jmp(SLOOP1);

  // All input consumed by whole chunks: pack the already reduced halves.
  bind(FINISH);
  movl(rax, b_d);
  shll(rax, 16);
  orl(rax, a_d);
  jmp(END);

  // Short input (size < 32): scalar only. An empty input goes straight to the final modulo.
  bind(LT64);
  movl(a_d, init_d);
  lea(end, Address(data, size, Address::times_1));
  testl(size, size);
  jcc(Assembler::notZero, FINAL_LOOP);
  jmp(ZERO_SIZE);

  // handle remaining 1...15 bytes
  bind(DO_FINAL);
  // reduce
  vpslld(yb, yb, 3, Assembler::AVX_256bit); //b is scaled by 8
  vpmulld(ysa, ya, ascaletab, Assembler::AVX_256bit, r14); //scaled a

  vextracti128(xtmp0, ya, 1);
  vextracti128(xtmp1, yb, 1);
  vextracti128(xtmp2, ysa, 1);
  vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
  vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
  vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  vpsubd(xb, xb, xsa, Assembler::AVX_128bit);

  // Hand the (not yet reduced) sums to the scalar tail; the modulo happens at ZERO_SIZE.
  movdl(a_d, xa);
  movdl(rax, xb);
  addl(b_d, rax);

  // Scalar tail: one byte per iteration until data reaches end.
  align(32);
  bind(FINAL_LOOP);
  movzbl(rax, Address(data, 0)); //movzx eax, byte[data]
  addl(a_d, rax);
  addptr(data, 1);
  addl(b_d, a_d);
  cmpptr(data, end);
  jcc(Assembler::below, FINAL_LOOP);

  // Final reduction for the scalar paths: rax = ((b mod BASE) << 16) | (a mod BASE).
  bind(ZERO_SIZE);
  movl(rax, a_d);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // div ecx -- divide edx:eax by ecx, quot->eax, rem->edx
  movl(a_d, rdx);

  movl(rax, b_d);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
  shll(rdx, 16);
  orl(rdx, a_d);
  movl(rax, rdx);

  bind(END);
  pop(r14);
  pop(r13);
  pop(r12);
}
#endif

View File

@ -5790,6 +5790,47 @@ address generate_avx_ghash_processBlocks() {
return start;
}
/**
 * Generates the updateBytesAdler32 stub.
 *
 * Inputs:
 *   c_rarg0   - int   adler
 *   c_rarg1   - byte* buff
 *   c_rarg2   - int   len
 *
 * Output:
 *   rax       - int   adler result
 *
 * Returns the entry address of the generated code.
 */
address generate_updateBytesAdler32() {
  assert(UseAdler32Intrinsics, "need AVX2");

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
  address start = __ pc();

  // Working copies of the buffer pointer and length, and the two shuffle-mask registers.
  const Register data = r9;
  const Register size = r10;
  const XMMRegister yshuf0 = xmm6;
  const XMMRegister yshuf1 = xmm7;
  assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  ExternalAddress shuf0_table((address) StubRoutines::x86::_adler32_shuf0_table);
  ExternalAddress shuf1_table((address) StubRoutines::x86::_adler32_shuf1_table);
  ExternalAddress ascale_table((address) StubRoutines::x86::_adler32_ascale_table);

  // data (r9) serves as the scratch register for both table loads; it only
  // receives the buffer pointer afterwards.
  __ vmovdqu(yshuf0, shuf0_table, data);
  __ vmovdqu(yshuf1, shuf1_table, data);

  __ movptr(data, c_rarg1); //data
  __ movl(size, c_rarg2); //length
  __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ascale_table);

  __ leave();
  __ ret(0);
  return start;
}
/**
* Arguments:
*
@ -6754,6 +6795,11 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
}
if (UseAdler32Intrinsics) {
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
}
if (UseLibmIntrinsic && InlineIntrinsics) {
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||

View File

@ -224,6 +224,25 @@ juint StubRoutines::x86::_shuf_table_crc32_avx512[] =
0x83828100UL, 0x87868584UL, 0x8b8a8988UL, 0x8f8e8d8cUL,
0x03020100UL, 0x07060504UL, 0x0b0a0908UL, 0x000e0d0cUL
};
// Eight dword factors, 0 through 7, one per 32-bit lane of a 256-bit vector.
// The Adler32 kernel multiplies its 'a' accumulator by this table (vpmulld)
// when it reduces the vector accumulators.
juint StubRoutines::x86::_adler32_ascale_table[] =
{
  0x00000000UL,  // lane 0
  0x00000001UL,  // lane 1
  0x00000002UL,  // lane 2
  0x00000003UL,  // lane 3
  0x00000004UL,  // lane 4
  0x00000005UL,  // lane 5
  0x00000006UL,  // lane 6
  0x00000007UL   // lane 7
};
// vpshufb control mask for the first shuffle in the Adler32 kernel.
// Each dword is 0xFFFFFFkk: its low byte is the source-byte index kk (0x00..0x07)
// and its upper three bytes are 0xFF.
juint StubRoutines::x86::_adler32_shuf0_table[] =
{
  0xFFFFFF00UL,  // dword 0 <- source byte 0
  0xFFFFFF01UL,  // dword 1 <- source byte 1
  0xFFFFFF02UL,  // dword 2 <- source byte 2
  0xFFFFFF03UL,  // dword 3 <- source byte 3
  0xFFFFFF04UL,  // dword 4 <- source byte 4
  0xFFFFFF05UL,  // dword 5 <- source byte 5
  0xFFFFFF06UL,  // dword 6 <- source byte 6
  0xFFFFFF07UL   // dword 7 <- source byte 7
};
// vpshufb control mask for the second shuffle in the Adler32 kernel.
// Each dword is 0xFFFFFFkk: its low byte is the source-byte index kk (0x08..0x0F)
// and its upper three bytes are 0xFF.
// Every entry carries the UL suffix, matching the other tables in this file.
juint StubRoutines::x86::_adler32_shuf1_table[] =
{
  0xFFFFFF08UL, 0xFFFFFF09UL, 0xFFFFFF0AUL, 0xFFFFFF0BUL,
  0xFFFFFF0CUL, 0xFFFFFF0DUL, 0xFFFFFF0EUL, 0xFFFFFF0FUL
};
#endif // _LP64
#define D 32

View File

@ -119,6 +119,9 @@ class x86 {
static juint _crc_by128_masks_avx512[];
static juint _crc_table_avx512[];
static juint _shuf_table_crc32_avx512[];
// Adler32 stub data: two vpshufb control masks and the per-lane scale factors
// that the kernel multiplies into its 'a' accumulator.
static juint _adler32_shuf0_table[];
static juint _adler32_shuf1_table[];
static juint _adler32_ascale_table[];
#endif // _LP64
// table for CRC32C
static juint* _crc32c_table;

View File

@ -898,6 +898,24 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
}
#ifdef _LP64
// Adler32 intrinsic: switched on by default when AVX2 is available. Without AVX2
// it is forced off, with a warning only if the flag was set explicitly.
if (supports_avx2()) {
if (FLAG_IS_DEFAULT(UseAdler32Intrinsics)) {
UseAdler32Intrinsics = true;
}
} else if (UseAdler32Intrinsics) {
if (!FLAG_IS_DEFAULT(UseAdler32Intrinsics)) {
warning("Adler32 Intrinsics requires avx2 instructions (not available on this CPU)");
}
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
}
#else
// Non-LP64 builds: the flag is always forced off, with a warning whenever it is set.
if (UseAdler32Intrinsics) {
warning("Adler32Intrinsics not available on this CPU.");
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
}
#endif
if (supports_sse4_2() && supports_clmul()) {
if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
UseCRC32CIntrinsics = true;
@ -993,11 +1011,6 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseSHA, false);
}
if (UseAdler32Intrinsics) {
warning("Adler32Intrinsics not available on this CPU.");
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
}
if (!supports_rtm() && UseRTMLocking) {
// Can't continue because UseRTMLocking affects UseBiasedLocking flag
// setting during arguments processing. See use_biased_locking().

View File

@ -93,6 +93,7 @@ bool vmIntrinsics::preserves_state(vmIntrinsics::ID id) {
case vmIntrinsics::_updateCRC32:
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:
case vmIntrinsics::_updateBytesAdler32:
case vmIntrinsics::_vectorizedMismatch:
case vmIntrinsics::_fmaD:
case vmIntrinsics::_fmaF:

View File

@ -231,6 +231,7 @@ JVMCIObjectArray CompilerToVM::initialize_intrinsics(JVMCI_TRAPS) {
X86_ONLY(do_intx_flag(UseAVX)) \
do_bool_flag(UseBiasedLocking) \
do_bool_flag(UseCRC32Intrinsics) \
do_bool_flag(UseAdler32Intrinsics) \
do_bool_flag(UseCompressedClassPointers) \
do_bool_flag(UseCompressedOops) \
X86_ONLY(do_bool_flag(UseCountLeadingZerosInstruction)) \

View File

@ -574,6 +574,7 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _crc32c_table_addr, address) \
static_field(StubRoutines, _updateBytesCRC32C, address) \
static_field(StubRoutines, _updateBytesAdler32, address) \
static_field(StubRoutines, _multiplyToLen, address) \
static_field(StubRoutines, _squareToLen, address) \
static_field(StubRoutines, _bigIntegerRightShiftWorker, address) \

View File

@ -0,0 +1,62 @@
/*
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.util;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.zip.Adler32;
import org.openjdk.jmh.annotations.*;
/**
 * JMH benchmark for {@link Adler32#update(byte[], int, int)}.
 *
 * <p>One buffer of pseudo-random bytes is created when the state object is
 * constructed; each benchmark invocation feeds its first {@code count} bytes
 * to a shared {@link Adler32} instance. The checksum is reset before every
 * iteration.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Fork(value = 2)
@Warmup(iterations = 2, time = 30, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 3, time = 60, timeUnit = TimeUnit.SECONDS)
public class TestAdler32 {

    /** Fixed seed, so every run checksums the same data. */
    private static final long SEED = 2147483648L;

    /** Size of the backing buffer; larger than the largest {@code count}. */
    private static final int BUFFER_LENGTH = 1000000;

    private Adler32 adler32;
    private Random random;
    private byte[] bytes;

    /** Number of leading bytes of the buffer checksummed per invocation. */
    @Param({"64", "128", "256", "512", "1024", "2048", "4096", "8192", "16384", "32768", "65536"})
    private int count;

    public TestAdler32() {
        this.adler32 = new Adler32();
        this.random = new Random(SEED);
        this.bytes = new byte[BUFFER_LENGTH];
        this.random.nextBytes(this.bytes);
    }

    /** Resets the checksum state at the start of each iteration. */
    @Setup(Level.Iteration)
    public void setupBytes() {
        this.adler32.reset();
    }

    /** Measured operation: update the checksum with the first {@code count} bytes. */
    @Benchmark
    public void testAdler32Update() {
        this.adler32.update(this.bytes, 0, this.count);
    }
}