8281375: Accelerate bitCount operation for AVX2 and AVX512 target.
Reviewed-by: sviswanathan, thartmann
This commit is contained in:
parent
3cf83a671e
commit
fde3149896
@ -163,4 +163,10 @@
|
||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||
static const bool supports_encode_ascii_array = true;
|
||||
|
||||
// Returns pre-selection estimated size of a vector operation.
|
||||
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif // CPU_AARCH64_MATCHER_AARCH64_HPP
|
||||
|
@ -155,4 +155,9 @@
|
||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||
static const bool supports_encode_ascii_array = false;
|
||||
|
||||
// Returns pre-selection estimated cost of a vector operation.
|
||||
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // CPU_ARM_MATCHER_ARM_HPP
|
||||
|
@ -164,4 +164,10 @@
|
||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||
static const bool supports_encode_ascii_array = true;
|
||||
|
||||
// Returns pre-selection estimated cost of a vector operation.
|
||||
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif // CPU_PPC_MATCHER_PPC_HPP
|
||||
|
@ -153,4 +153,9 @@
|
||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||
static const bool supports_encode_ascii_array = true;
|
||||
|
||||
// Returns pre-selection estimated cost of a vector operation.
|
||||
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // CPU_S390_MATCHER_S390_HPP
|
||||
|
@ -8317,8 +8317,28 @@ void Assembler::vpbroadcastw(XMMRegister dst, Address src, int vector_len) {
|
||||
emit_operand(dst, src);
|
||||
}
|
||||
|
||||
// xmm/mem sourced byte/word/dword/qword replicate
|
||||
void Assembler::vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
|
||||
assert(UseAVX > 0, "requires some form of AVX");
|
||||
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
|
||||
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
|
||||
emit_int16((unsigned char)0xF6, (0xC0 | encode));
|
||||
}
|
||||
|
||||
// Interleave high doublewords, register-register form.
// Encoded with the 66 SIMD prefix in the 0F opcode map, opcode byte 0x6A.
void Assembler::vpunpckhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(UseAVX > 0, "requires some form of AVX");
  InstructionAttr attr(vector_len,
                       /* vex_w */ false,
                       /* legacy_mode */ false,
                       /* no_mask_reg */ true,
                       /* uses_vl */ true);
  const int enc = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
                                        VEX_SIMD_66, VEX_OPCODE_0F, &attr);
  // Opcode byte, then a ModRM byte with both operands in registers.
  emit_int16(0x6A, (0xC0 | enc));
}
|
||||
|
||||
// Interleave low doublewords, register-register form.
// Encoded with the 66 SIMD prefix in the 0F opcode map, opcode byte 0x62.
void Assembler::vpunpckldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(UseAVX > 0, "requires some form of AVX");
  InstructionAttr attr(vector_len,
                       /* vex_w */ false,
                       /* legacy_mode */ false,
                       /* no_mask_reg */ true,
                       /* uses_vl */ true);
  const int enc = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
                                        VEX_SIMD_66, VEX_OPCODE_0F, &attr);
  // Opcode byte, then a ModRM byte with both operands in registers.
  emit_int16(0x62, (0xC0 | enc));
}
|
||||
|
||||
// xmm/mem sourced byte/word/dword/qword replicate
|
||||
void Assembler::evpaddb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
|
||||
assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), "");
|
||||
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
|
||||
|
@ -1933,10 +1933,17 @@ private:
|
||||
// Interleave Low Doublewords
|
||||
void punpckldq(XMMRegister dst, XMMRegister src);
|
||||
void punpckldq(XMMRegister dst, Address src);
|
||||
void vpunpckldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
|
||||
// Interleave High Doublewords
|
||||
void vpunpckhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
|
||||
// Interleave Low Quadwords
|
||||
void punpcklqdq(XMMRegister dst, XMMRegister src);
|
||||
|
||||
// Vector sum of absolute difference.
|
||||
void vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
|
||||
#ifndef _LP64 // no 32bit push/pop on amd64
|
||||
void pushl(Address src);
|
||||
#endif
|
||||
|
@ -4321,6 +4321,94 @@ void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, in
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Following is lookup table based popcount computation algorithm:-
|
||||
// Index Bit set count
|
||||
// [ 0000 -> 0,
|
||||
// 0001 -> 1,
|
||||
// 0010 -> 1,
|
||||
// 0011 -> 2,
|
||||
// 0100 -> 1,
|
||||
// 0101 -> 2,
|
||||
// 0110 -> 2,
|
||||
// 0111 -> 3,
|
||||
// 1000 -> 1,
|
||||
// 1001 -> 2,
|
||||
// 1010 -> 2,
|
||||
// 1011 -> 3,
|
||||
// 1100 -> 2,
|
||||
// 1101 -> 3,
|
||||
// 1110 -> 3,
// 1111 -> 4 ]
|
||||
// a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
|
||||
// shuffle indices for lookup table access.
|
||||
// b. Right shift each byte of vector lane by 4 positions.
|
||||
// c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
|
||||
// shuffle indices for lookup table access.
|
||||
// d. Add the bitset count of upper and lower 4 bits of each byte.
|
||||
// e. Unpack double words to quad words and compute sum of absolute difference of bitset
|
||||
// count of all the bytes of a quadword.
|
||||
// f. Perform step e. for upper 128bit vector lane.
|
||||
// g. Pack the bitset count of quadwords back to double word.
|
||||
// h. Unpacking and packing operations are not needed for 64bit vector lane.
|
||||
// Emits code computing the population count of every doubleword of `src`
// into the corresponding doubleword of `dst`.
//
// When AVX512_VPOPCNTDQ is available a single vpopcntd is emitted and the
// temporaries (xtmp1..xtmp3, rtmp) are not used. Otherwise a nibble lookup
// table sequence is emitted that clobbers all of them.
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
                                            int vec_enc) {
  if (VM_Version::supports_avx512_vpopcntdq()) {
    vpopcntd(dst, src, vec_enc);
    return;
  }

  const bool is_512bit = (Assembler::AVX_512bit == vec_enc);
  assert((is_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");

  // xtmp1 <- 0x0F in every byte (mask selecting the low nibble).
  movl(rtmp, 0x0F0F0F0F);
  movdl(xtmp1, rtmp);
  vpbroadcastd(xtmp1, xtmp1, vec_enc);

  // xtmp2 <- nibble popcount lookup table; rtmp serves as address scratch.
  const address lut = StubRoutines::x86::vector_popcount_lut();
  if (is_512bit) {
    evmovdqul(xtmp2, k0, ExternalAddress(lut), false, vec_enc, rtmp);
  } else {
    vmovdqu(xtmp2, ExternalAddress(lut), rtmp);
  }

  // xtmp3 <- table lookup on the low nibble of each byte.
  vpand(xtmp3, src, xtmp1, vec_enc);
  vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);

  // dst <- table lookup on the high nibble of each byte.
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);

  // xtmp3 <- per-byte bit count (low nibble count + high nibble count).
  vpaddb(xtmp3, dst, xtmp3, vec_enc);

  // Interleave the doublewords with zero so each occupies its own quadword,
  // then sum the bytes of every quadword via sum-of-absolute-differences
  // against zero: high halves into dst, low halves into xtmp2.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpunpckhdq(dst, xtmp3, xtmp1, vec_enc);
  vpsadbw(dst, dst, xtmp1, vec_enc);
  vpunpckldq(xtmp2, xtmp3, xtmp1, vec_enc);
  vpsadbw(xtmp2, xtmp2, xtmp1, vec_enc);

  // Pack the quadword sums back down into doubleword positions.
  vpackuswb(dst, xtmp2, dst, vec_enc);
}
|
||||
|
||||
// Emits code computing the population count of every quadword of `src`,
// followed by an evpmovqd of the result in `dst`.
//
// With AVX512_VPOPCNTDQ a single vpopcntq is emitted and the temporaries are
// not used. Without it only 512-bit vectors are handled (requires AVX512BW),
// using a nibble lookup table sequence that clobbers xtmp1..xtmp3 and rtmp.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
                                             int vec_enc) {
  if (VM_Version::supports_avx512_vpopcntdq()) {
    vpopcntq(dst, src, vec_enc);
  } else if (vec_enc != Assembler::AVX_512bit) {
    // We do not see any performance benefit of running
    // the lookup table sequence on 256 bit vector which
    // can operate over maximum 4 long elements.
    ShouldNotReachHere();
  } else {
    assert(VM_Version::supports_avx512bw(), "");

    // xtmp1 <- 0x0F in every byte (mask selecting the low nibble).
    movl(rtmp, 0x0F0F0F0F);
    movdl(xtmp1, rtmp);
    vpbroadcastd(xtmp1, xtmp1, vec_enc);

    // xtmp2 <- nibble popcount lookup table; rtmp serves as address scratch.
    evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), true, vec_enc, rtmp);

    // xtmp3 <- table lookup on the low nibble of each byte.
    vpandq(xtmp3, src, xtmp1, vec_enc);
    vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);

    // dst <- table lookup on the high nibble of each byte.
    vpsrlw(dst, src, 4, vec_enc);
    vpandq(dst, dst, xtmp1, vec_enc);
    vpshufb(dst, xtmp2, dst, vec_enc);

    // xtmp3 <- per-byte bit count; then sum the bytes of every quadword via
    // sum-of-absolute-differences against zero.
    vpaddb(xtmp3, dst, xtmp3, vec_enc);
    vpxorq(xtmp1, xtmp1, xtmp1, vec_enc);
    vpsadbw(dst, xtmp3, xtmp1, vec_enc);
  }

  // Common tail for every path: quadword-to-doubleword move of the counts.
  evpmovqd(dst, dst, vec_enc);
}
|
||||
|
||||
#ifndef _LP64
|
||||
void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
|
||||
assert(VM_Version::supports_avx512bw(), "");
|
||||
|
@ -317,4 +317,12 @@ public:
|
||||
void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
|
||||
bool merge, BasicType bt, int vlen_enc);
|
||||
|
||||
void vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
|
||||
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
|
||||
int vec_enc);
|
||||
|
||||
void vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
|
||||
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
|
||||
int vec_enc);
|
||||
|
||||
#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP
|
||||
|
@ -183,4 +183,13 @@
|
||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||
static const bool supports_encode_ascii_array = true;
|
||||
|
||||
// Returns pre-selection estimated cost of a vector operation.
|
||||
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||
switch(vopc) {
|
||||
default: return 0;
|
||||
case Op_PopCountVI: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 50;
|
||||
case Op_PopCountVL: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 40;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CPU_X86_MATCHER_X86_HPP
|
||||
|
@ -588,6 +588,30 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
// Emits the 64-byte nibble popcount lookup table and returns its address.
// Byte i (0..15) of each 16-byte group holds the number of set bits in i;
// e.g. 0x02010100 lays down the bytes 0,1,1,2 for indices 0..3. The group
// is repeated four times to fill 64 bytes (presumably one copy per 128-bit
// lane for the byte shuffle that indexes it).
address generate_popcount_avx_lut(const char *stub_name) {
  __ align64();
  StubCodeMark mark(this, "StubRoutines", stub_name);
  address start = __ pc();
  for (int group = 0; group < 4; group++) {
    __ emit_data(0x02010100, relocInfo::none, 0);  // indices  0..3
    __ emit_data(0x03020201, relocInfo::none, 0);  // indices  4..7
    __ emit_data(0x03020201, relocInfo::none, 0);  // indices  8..11
    __ emit_data(0x04030302, relocInfo::none, 0);  // indices 12..15
  }
  return start;
}
|
||||
|
||||
|
||||
address generate_iota_indices(const char *stub_name) {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||
@ -4004,6 +4028,11 @@ class StubGenerator: public StubCodeGenerator {
|
||||
StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x00000001);
|
||||
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
|
||||
|
||||
if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
|
||||
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hacker's Delight.
|
||||
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
|
||||
}
|
||||
|
||||
// support for verify_oop (must happen after universe_init)
|
||||
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
|
||||
|
||||
|
@ -795,6 +795,21 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
// Emits the 64-byte nibble popcount lookup table and returns its address.
// Byte i (0..15) of each 16-byte group holds the number of set bits in i;
// the two 64-bit words below cover indices 0..7 and 8..15 respectively.
// The group is repeated four times to fill 64 bytes (presumably one copy
// per 128-bit lane for the byte shuffle that indexes it).
address generate_popcount_avx_lut(const char *stub_name) {
  __ align64();
  StubCodeMark mark(this, "StubRoutines", stub_name);
  address start = __ pc();
  for (int group = 0; group < 4; group++) {
    __ emit_data64(0x0302020102010100, relocInfo::none);  // indices 0..7
    __ emit_data64(0x0403030203020201, relocInfo::none);  // indices 8..15
  }
  return start;
}
|
||||
|
||||
address generate_iota_indices(const char *stub_name) {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||
@ -7713,6 +7728,11 @@ address generate_avx_ghash_processBlocks() {
|
||||
StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
|
||||
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
|
||||
|
||||
if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
|
||||
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hacker's Delight.
|
||||
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
|
||||
}
|
||||
|
||||
// support for verify_oop (must happen after universe_init)
|
||||
if (VerifyOops) {
|
||||
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
|
||||
|
@ -59,6 +59,7 @@ address StubRoutines::x86::_vector_double_sign_flip = NULL;
|
||||
address StubRoutines::x86::_vector_byte_perm_mask = NULL;
|
||||
address StubRoutines::x86::_vector_long_sign_mask = NULL;
|
||||
address StubRoutines::x86::_vector_iota_indices = NULL;
|
||||
address StubRoutines::x86::_vector_popcount_lut = NULL;
|
||||
address StubRoutines::x86::_vector_32_bit_mask = NULL;
|
||||
address StubRoutines::x86::_vector_64_bit_mask = NULL;
|
||||
#ifdef _LP64
|
||||
|
@ -177,6 +177,7 @@ class x86 {
|
||||
static address _vector_short_shuffle_mask;
|
||||
static address _vector_long_shuffle_mask;
|
||||
static address _vector_iota_indices;
|
||||
static address _vector_popcount_lut;
|
||||
#ifdef _LP64
|
||||
static juint _k256_W[];
|
||||
static address _k256_W_adr;
|
||||
@ -340,6 +341,9 @@ class x86 {
|
||||
return _vector_iota_indices;
|
||||
}
|
||||
|
||||
// Address of the vector popcount lookup table stub, as currently stored in
// _vector_popcount_lut.
static address vector_popcount_lut() { return _vector_popcount_lut; }
|
||||
#ifdef _LP64
|
||||
static address k256_W_addr() { return _k256_W_adr; }
|
||||
static address k512_W_addr() { return _k512_W_addr; }
|
||||
|
@ -1405,8 +1405,12 @@ const bool Matcher::match_rule_supported(int opcode) {
|
||||
}
|
||||
break;
|
||||
case Op_PopCountVI:
|
||||
if (!UsePopCountInstruction || (UseAVX < 2)) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_PopCountVL:
|
||||
if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
|
||||
if (!UsePopCountInstruction || (UseAVX <= 2)) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
@ -1861,6 +1865,18 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_PopCountVI:
|
||||
if (!VM_Version::supports_avx512_vpopcntdq() &&
|
||||
(vlen == 16) && !VM_Version::supports_avx512bw()) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_PopCountVL:
|
||||
if (!VM_Version::supports_avx512_vpopcntdq() &&
|
||||
((vlen <= 4) || ((vlen == 8) && !VM_Version::supports_avx512bw()))) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return true; // Per default match rules are supported.
|
||||
}
|
||||
@ -8571,28 +8587,54 @@ instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
|
||||
|
||||
// --------------------------------- PopCount --------------------------------------
|
||||
|
||||
instruct vpopcountI(vec dst, vec src) %{
|
||||
instruct vpopcountI_popcntd(vec dst, vec src) %{
|
||||
predicate(VM_Version::supports_avx512_vpopcntdq());
|
||||
match(Set dst (PopCountVI src));
|
||||
format %{ "vpopcntd $dst,$src\t! vector popcount packedI" %}
|
||||
format %{ "vector_popcount_int $dst, $src\t! vector popcount packedI" %}
|
||||
ins_encode %{
|
||||
assert(UsePopCountInstruction, "not enabled");
|
||||
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
|
||||
__ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vpopcountL(vec dst, vec src) %{
|
||||
match(Set dst (PopCountVL src));
|
||||
format %{ "vpopcntq $dst,$src\t! vector popcount packedL" %}
|
||||
instruct vpopcountI(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
|
||||
predicate(!VM_Version::supports_avx512_vpopcntdq());
|
||||
match(Set dst (PopCountVI src));
|
||||
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
|
||||
format %{ "vector_popcount_int $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
|
||||
ins_encode %{
|
||||
assert(UsePopCountInstruction, "not enabled");
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
__ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
|
||||
$xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vpopcountL_popcntd(vec dst, vec src) %{
|
||||
predicate(VM_Version::supports_avx512_vpopcntdq());
|
||||
match(Set dst (PopCountVL src));
|
||||
format %{ "vector_popcount_long $dst, $src\t! vector popcount packedL" %}
|
||||
ins_encode %{
|
||||
assert(UsePopCountInstruction, "not enabled");
|
||||
int vlen_enc = vector_length_encoding(this, $src);
|
||||
__ vpopcntq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
|
||||
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
|
||||
__ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vpopcountL(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
|
||||
predicate(!VM_Version::supports_avx512_vpopcntdq());
|
||||
match(Set dst (PopCountVL src));
|
||||
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
|
||||
format %{ "vector_popcount_long $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
|
||||
ins_encode %{
|
||||
assert(UsePopCountInstruction, "not enabled");
|
||||
int vlen_enc = vector_length_encoding(this, $src);
|
||||
__ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
|
||||
$xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
@ -956,6 +956,8 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
(stride_con < 0 && ((max_jint + stride_con) < limit_type->_lo)))
|
||||
return false; // overflow
|
||||
|
||||
// Rudimentary cost model to estimate loop unrolling
|
||||
// factor.
|
||||
// Adjust body_size to determine if we unroll or not
|
||||
uint body_size = _body.size();
|
||||
// Key test to unroll loop in CRC32 java code
|
||||
@ -968,6 +970,11 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
case Op_ModL: body_size += 30; break;
|
||||
case Op_DivL: body_size += 30; break;
|
||||
case Op_MulL: body_size += 10; break;
|
||||
case Op_PopCountVI:
|
||||
case Op_PopCountVL: {
|
||||
const TypeVect* vt = n->bottom_type()->is_vect();
|
||||
body_size += Matcher::vector_op_pre_select_sz_estimate(n->Opcode(), vt->element_basic_type(), vt->length());
|
||||
} break;
|
||||
case Op_StrComp:
|
||||
case Op_StrEquals:
|
||||
case Op_StrIndexOf:
|
||||
|
@ -24,9 +24,8 @@
|
||||
/**
|
||||
* @test
|
||||
* @summary Test vectorization of popcount for Long
|
||||
* @requires vm.cpu.features ~= ".*avx512dq.*"
|
||||
* @requires vm.cpu.features ~= ".*vpopcntdq.*"
|
||||
* @requires vm.compiler2.enabled
|
||||
* @requires vm.cpu.features ~= ".*avx512bw.*"
|
||||
* @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64"
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.vectorization.TestPopCountVectorLong
|
||||
|
Loading…
x
Reference in New Issue
Block a user