8281375: Accelerate bitCount operation for AVX2 and AVX512 target.

Reviewed-by: sviswanathan, thartmann
This commit is contained in:
Jatin Bhateja 2022-03-14 07:11:23 +00:00
parent 3cf83a671e
commit fde3149896
16 changed files with 269 additions and 13 deletions

View File

@ -163,4 +163,10 @@
// Implements a variant of EncodeISOArrayNode that encodes ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated size of a vector operation.
// AArch64: no vector op carries an extra pre-selection size penalty,
// so the estimate is unconditionally zero.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
}
#endif // CPU_AARCH64_MATCHER_AARCH64_HPP

View File

@ -155,4 +155,9 @@
// Implements a variant of EncodeISOArrayNode that encodes ASCII only
static const bool supports_encode_ascii_array = false;
// Returns pre-selection estimated cost of a vector operation.
// ARM32: no vector op carries an extra pre-selection cost penalty,
// so the estimate is unconditionally zero.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
}
#endif // CPU_ARM_MATCHER_ARM_HPP

View File

@ -164,4 +164,10 @@
// Implements a variant of EncodeISOArrayNode that encodes ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated cost of a vector operation.
// PPC: no vector op carries an extra pre-selection cost penalty,
// so the estimate is unconditionally zero.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
}
#endif // CPU_PPC_MATCHER_PPC_HPP

View File

@ -153,4 +153,9 @@
// Implements a variant of EncodeISOArrayNode that encodes ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated cost of a vector operation.
// S390: no vector op carries an extra pre-selection cost penalty,
// so the estimate is unconditionally zero.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
}
#endif // CPU_S390_MATCHER_S390_HPP

View File

@ -8317,8 +8317,28 @@ void Assembler::vpbroadcastw(XMMRegister dst, Address src, int vector_len) {
emit_operand(dst, src);
}
// xmm/mem sourced byte/word/dword/qword replicate
// Vector sum of absolute byte differences (PSADBW, opcode 0xF6),
// register-to-register form.
void Assembler::vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attr(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int enc = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attr);
emit_int16((unsigned char)0xF6, (0xC0 | enc));
}
// Interleave high doublewords (PUNPCKHDQ, opcode 0x6A),
// register-to-register form.
void Assembler::vpunpckhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attr(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int enc = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attr);
emit_int16(0x6A, (0xC0 | enc));
}
// Interleave low doublewords (PUNPCKLDQ, opcode 0x62),
// register-to-register form.
void Assembler::vpunpckldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attr(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int enc = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attr);
emit_int16(0x62, (0xC0 | enc));
}
// xmm/mem sourced byte/word/dword/qword replicate
void Assembler::evpaddb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);

View File

@ -1933,10 +1933,17 @@ private:
// Interleave Low Doublewords
void punpckldq(XMMRegister dst, XMMRegister src);
void punpckldq(XMMRegister dst, Address src);
void vpunpckldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
// Interleave High Doublewords
void vpunpckhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
// Interleave Low Quadwords
void punpcklqdq(XMMRegister dst, XMMRegister src);
// Vector sum of absolute difference.
void vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
#ifndef _LP64 // no 32bit push/pop on amd64
void pushl(Address src);
#endif

View File

@ -4321,6 +4321,94 @@ void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, in
}
}
//
// Following is lookup table based popcount computation algorithm:-
//           Index   Bit set count
//     [ 0000 -> 0,
//       0001 -> 1,
//       0010 -> 1,
//       0011 -> 2,
//       0100 -> 1,
//       0101 -> 2,
//       0110 -> 2,
//       0111 -> 3,
//       1000 -> 1,
//       1001 -> 2,
//       1010 -> 2,
//       1011 -> 3,
//       1100 -> 2,
//       1101 -> 3,
//       1110 -> 3,
//       1111 -> 4 ]
//  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  d. Add the bitset count of upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute sum of absolute difference of bitset
//     count of all the bytes of a quadword.
//  f. Perform step e. for upper 128bit vector lane.
//  g. Pack the bitset count of quadwords back to double word.
//  h. Unpacking and packing operations are not needed for 64bit vector lane.
// Per-element popcount for a vector of ints. Uses VPOPCNTD directly when the
// CPU supports it; otherwise emulates it with the nibble lookup-table scheme
// described above. xtmp1-3 and rtmp are clobbered only on the emulation path.
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
int vec_enc) {
if (VM_Version::supports_avx512_vpopcntdq()) {
// Hardware popcount available: a single instruction suffices.
vpopcntd(dst, src, vec_enc);
} else {
assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
// Broadcast the nibble mask 0x0F into every byte of xtmp1.
movl(rtmp, 0x0F0F0F0F);
movdl(xtmp1, rtmp);
vpbroadcastd(xtmp1, xtmp1, vec_enc);
// Load the 16-entry popcount lookup table (replicated per 128-bit lane).
if (Assembler::AVX_512bit == vec_enc) {
evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), false, vec_enc, rtmp);
} else {
vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp);
}
// Popcount of each low nibble: mask, then table lookup via shuffle.
vpand(xtmp3, src, xtmp1, vec_enc);
vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
// Popcount of each high nibble: shift right by 4, mask, table lookup.
vpsrlw(dst, src, 4, vec_enc);
vpand(dst, dst, xtmp1, vec_enc);
vpshufb(dst, xtmp2, dst, vec_enc);
// Per-byte popcount = low-nibble count + high-nibble count.
vpaddb(xtmp3, dst, xtmp3, vec_enc);
// Horizontal reduction: unpack dwords against zero (xtmp1), sum bytes per
// quadword with vpsadbw, then pack the quadword sums back down to dwords.
vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
vpunpckhdq(dst, xtmp3, xtmp1, vec_enc);
vpsadbw(dst, dst, xtmp1, vec_enc);
vpunpckldq(xtmp2, xtmp3, xtmp1, vec_enc);
vpsadbw(xtmp2, xtmp2, xtmp1, vec_enc);
vpackuswb(dst, xtmp2, dst, vec_enc);
}
}
// Per-element popcount for a vector of longs. Uses VPOPCNTQ directly when the
// CPU supports it; otherwise emulates via the same nibble lookup-table scheme
// as vector_popcount_int, restricted to 512-bit vectors. The quadword counts
// are finally narrowed to dwords with evpmovqd on both paths.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
int vec_enc) {
if (VM_Version::supports_avx512_vpopcntdq()) {
// Hardware popcount available: a single instruction suffices.
vpopcntq(dst, src, vec_enc);
} else if (vec_enc == Assembler::AVX_512bit) {
assert(VM_Version::supports_avx512bw(), "");
// Broadcast the nibble mask 0x0F into every byte of xtmp1.
movl(rtmp, 0x0F0F0F0F);
movdl(xtmp1, rtmp);
vpbroadcastd(xtmp1, xtmp1, vec_enc);
// Load the 16-entry popcount lookup table (merge variant of the move).
evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), true, vec_enc, rtmp);
// Popcount of each low nibble: mask, then table lookup via shuffle.
vpandq(xtmp3, src, xtmp1, vec_enc);
vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
// Popcount of each high nibble: shift right by 4, mask, table lookup.
vpsrlw(dst, src, 4, vec_enc);
vpandq(dst, dst, xtmp1, vec_enc);
vpshufb(dst, xtmp2, dst, vec_enc);
// Per-byte popcount = low-nibble count + high-nibble count.
vpaddb(xtmp3, dst, xtmp3, vec_enc);
// vpsadbw against zero sums the 8 byte-counts of each quadword directly,
// so no dword unpack/pack is needed on the long path.
vpxorq(xtmp1, xtmp1, xtmp1, vec_enc);
vpsadbw(dst, xtmp3, xtmp1, vec_enc);
} else {
// We do not see any performance benefit of running
// above instruction sequence on 256 bit vector which
// can operate over maximum 4 long elements.
ShouldNotReachHere();
}
// Narrow quadword counts to dwords (a 64-bit popcount fits in 7 bits).
evpmovqd(dst, dst, vec_enc);
}
#ifndef _LP64
void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
assert(VM_Version::supports_avx512bw(), "");

View File

@ -317,4 +317,12 @@ public:
void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
bool merge, BasicType bt, int vlen_enc);
void vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
int vec_enc);
void vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
int vec_enc);
#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP

View File

@ -183,4 +183,13 @@
// Implements a variant of EncodeISOArrayNode that encodes ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated cost of a vector operation.
// x86: vector popcount is emulated with a multi-instruction LUT sequence
// when VPOPCNTDQ is absent, so report a non-zero size estimate for those
// opcodes (used by the loop unrolling cost model); everything else is free.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
if (vopc == Op_PopCountVI) {
return VM_Version::supports_avx512_vpopcntdq() ? 0 : 50;
}
if (vopc == Op_PopCountVL) {
return VM_Version::supports_avx512_vpopcntdq() ? 0 : 40;
}
return 0;
}
#endif // CPU_X86_MATCHER_X86_HPP

View File

@ -588,6 +588,30 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Emits the 64-byte popcount nibble lookup table: the 16-byte table
// { popcount(0) .. popcount(15) } replicated once per 128-bit lane.
address generate_popcount_avx_lut(const char *stub_name) {
__ align64();
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
// Four identical 16-byte lanes, emitted as dwords (little-endian bytes:
// 00 01 01 02 | 01 02 02 03 | 01 02 02 03 | 02 03 03 04).
for (int lane = 0; lane < 4; lane++) {
__ emit_data(0x02010100, relocInfo::none, 0);
__ emit_data(0x03020201, relocInfo::none, 0);
__ emit_data(0x03020201, relocInfo::none, 0);
__ emit_data(0x04030302, relocInfo::none, 0);
}
return start;
}
address generate_iota_indices(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
@ -4004,6 +4028,11 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x00000001);
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
}
// support for verify_oop (must happen after universe_init)
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

View File

@ -795,6 +795,21 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Emits the 64-byte popcount nibble lookup table: the 16-byte table
// { popcount(0) .. popcount(15) } replicated once per 128-bit lane.
address generate_popcount_avx_lut(const char *stub_name) {
__ align64();
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
// Four identical 16-byte lanes, emitted as quadwords (little-endian bytes:
// 00 01 01 02 01 02 02 03 | 01 02 02 03 02 03 03 04).
for (int lane = 0; lane < 4; lane++) {
__ emit_data64(0x0302020102010100, relocInfo::none);
__ emit_data64(0x0403030203020201, relocInfo::none);
}
return start;
}
address generate_iota_indices(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
@ -7713,6 +7728,11 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
}
// support for verify_oop (must happen after universe_init)
if (VerifyOops) {
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

View File

@ -59,6 +59,7 @@ address StubRoutines::x86::_vector_double_sign_flip = NULL;
address StubRoutines::x86::_vector_byte_perm_mask = NULL;
address StubRoutines::x86::_vector_long_sign_mask = NULL;
address StubRoutines::x86::_vector_iota_indices = NULL;
address StubRoutines::x86::_vector_popcount_lut = NULL;
address StubRoutines::x86::_vector_32_bit_mask = NULL;
address StubRoutines::x86::_vector_64_bit_mask = NULL;
#ifdef _LP64

View File

@ -177,6 +177,7 @@ class x86 {
static address _vector_short_shuffle_mask;
static address _vector_long_shuffle_mask;
static address _vector_iota_indices;
static address _vector_popcount_lut;
#ifdef _LP64
static juint _k256_W[];
static address _k256_W_adr;
@ -340,6 +341,9 @@ class x86 {
return _vector_iota_indices;
}
// Address of the 64-byte popcount nibble lookup table; NULL unless the
// stub generator produced it (AVX2 popcount emulation path).
static address vector_popcount_lut() {
return _vector_popcount_lut;
}
#ifdef _LP64
static address k256_W_addr() { return _k256_W_adr; }
static address k512_W_addr() { return _k512_W_addr; }

View File

@ -1405,8 +1405,12 @@ const bool Matcher::match_rule_supported(int opcode) {
}
break;
case Op_PopCountVI:
if (!UsePopCountInstruction || (UseAVX < 2)) {
return false;
}
break;
case Op_PopCountVL:
if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
if (!UsePopCountInstruction || (UseAVX <= 2)) {
return false;
}
break;
@ -1861,6 +1865,18 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false;
}
break;
case Op_PopCountVI:
if (!VM_Version::supports_avx512_vpopcntdq() &&
(vlen == 16) && !VM_Version::supports_avx512bw()) {
return false;
}
break;
case Op_PopCountVL:
if (!VM_Version::supports_avx512_vpopcntdq() &&
((vlen <= 4) || ((vlen == 8) && !VM_Version::supports_avx512bw()))) {
return false;
}
break;
}
return true; // Per default match rules are supported.
}
@ -8571,28 +8587,54 @@ instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
// --------------------------------- PopCount --------------------------------------
instruct vpopcountI(vec dst, vec src) %{
instruct vpopcountI_popcntd(vec dst, vec src) %{
predicate(VM_Version::supports_avx512_vpopcntdq());
match(Set dst (PopCountVI src));
format %{ "vpopcntd $dst,$src\t! vector popcount packedI" %}
format %{ "vector_popcount_int $dst, $src\t! vector popcount packedI" %}
ins_encode %{
assert(UsePopCountInstruction, "not enabled");
int vlen_enc = vector_length_encoding(this);
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vpopcountL(vec dst, vec src) %{
match(Set dst (PopCountVL src));
format %{ "vpopcntq $dst,$src\t! vector popcount packedL" %}
instruct vpopcountI(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
predicate(!VM_Version::supports_avx512_vpopcntdq());
match(Set dst (PopCountVI src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
format %{ "vector_popcount_int $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
ins_encode %{
assert(UsePopCountInstruction, "not enabled");
int vlen_enc = vector_length_encoding(this);
__ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
$xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vpopcountL_popcntd(vec dst, vec src) %{
predicate(VM_Version::supports_avx512_vpopcntdq());
match(Set dst (PopCountVL src));
format %{ "vector_popcount_long $dst, $src\t! vector popcount packedL" %}
ins_encode %{
assert(UsePopCountInstruction, "not enabled");
int vlen_enc = vector_length_encoding(this, $src);
__ vpopcntq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
__ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vpopcountL(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
predicate(!VM_Version::supports_avx512_vpopcntdq());
match(Set dst (PopCountVL src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
format %{ "vector_popcount_long $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
ins_encode %{
assert(UsePopCountInstruction, "not enabled");
int vlen_enc = vector_length_encoding(this, $src);
__ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
$xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

View File

@ -956,6 +956,8 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
(stride_con < 0 && ((max_jint + stride_con) < limit_type->_lo)))
return false; // overflow
// Rudimentary cost model to estimate loop unrolling
// factor.
// Adjust body_size to determine if we unroll or not
uint body_size = _body.size();
// Key test to unroll loop in CRC32 java code
@ -968,6 +970,11 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
case Op_ModL: body_size += 30; break;
case Op_DivL: body_size += 30; break;
case Op_MulL: body_size += 10; break;
case Op_PopCountVI:
case Op_PopCountVL: {
const TypeVect* vt = n->bottom_type()->is_vect();
body_size += Matcher::vector_op_pre_select_sz_estimate(n->Opcode(), vt->element_basic_type(), vt->length());
} break;
case Op_StrComp:
case Op_StrEquals:
case Op_StrIndexOf:

View File

@ -24,9 +24,8 @@
/**
* @test
* @summary Test vectorization of popcount for Long
* @requires vm.cpu.features ~= ".*avx512dq.*"
* @requires vm.cpu.features ~= ".*vpopcntdq.*"
* @requires vm.compiler2.enabled
* @requires vm.cpu.features ~= ".*avx512bw.*"
* @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64"
* @library /test/lib /
* @run driver compiler.vectorization.TestPopCountVectorLong