8281375: Accelerate bitCount operation for AVX2 and AVX512 target.
Reviewed-by: sviswanathan, thartmann
This commit is contained in:
parent
3cf83a671e
commit
fde3149896
@ -163,4 +163,10 @@
|
|||||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||||
static const bool supports_encode_ascii_array = true;
|
static const bool supports_encode_ascii_array = true;
|
||||||
|
|
||||||
|
// Returns pre-selection estimated size of a vector operation.
|
||||||
|
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif // CPU_AARCH64_MATCHER_AARCH64_HPP
|
#endif // CPU_AARCH64_MATCHER_AARCH64_HPP
|
||||||
|
@ -155,4 +155,9 @@
|
|||||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||||
static const bool supports_encode_ascii_array = false;
|
static const bool supports_encode_ascii_array = false;
|
||||||
|
|
||||||
|
// Returns pre-selection estimated cost of a vector operation.
|
||||||
|
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
#endif // CPU_ARM_MATCHER_ARM_HPP
|
#endif // CPU_ARM_MATCHER_ARM_HPP
|
||||||
|
@ -164,4 +164,10 @@
|
|||||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||||
static const bool supports_encode_ascii_array = true;
|
static const bool supports_encode_ascii_array = true;
|
||||||
|
|
||||||
|
// Returns pre-selection estimated cost of a vector operation.
|
||||||
|
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif // CPU_PPC_MATCHER_PPC_HPP
|
#endif // CPU_PPC_MATCHER_PPC_HPP
|
||||||
|
@ -153,4 +153,9 @@
|
|||||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||||
static const bool supports_encode_ascii_array = true;
|
static const bool supports_encode_ascii_array = true;
|
||||||
|
|
||||||
|
// Returns pre-selection estimated cost of a vector operation.
|
||||||
|
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
#endif // CPU_S390_MATCHER_S390_HPP
|
#endif // CPU_S390_MATCHER_S390_HPP
|
||||||
|
@ -8317,8 +8317,28 @@ void Assembler::vpbroadcastw(XMMRegister dst, Address src, int vector_len) {
|
|||||||
emit_operand(dst, src);
|
emit_operand(dst, src);
|
||||||
}
|
}
|
||||||
|
|
||||||
// xmm/mem sourced byte/word/dword/qword replicate
|
void Assembler::vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
|
||||||
|
assert(UseAVX > 0, "requires some form of AVX");
|
||||||
|
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
|
||||||
|
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
|
||||||
|
emit_int16((unsigned char)0xF6, (0xC0 | encode));
|
||||||
|
}
|
||||||
|
|
||||||
|
void Assembler::vpunpckhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
|
||||||
|
assert(UseAVX > 0, "requires some form of AVX");
|
||||||
|
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
|
||||||
|
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
|
||||||
|
emit_int16(0x6A, (0xC0 | encode));
|
||||||
|
}
|
||||||
|
|
||||||
|
void Assembler::vpunpckldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
|
||||||
|
assert(UseAVX > 0, "requires some form of AVX");
|
||||||
|
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
|
||||||
|
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
|
||||||
|
emit_int16(0x62, (0xC0 | encode));
|
||||||
|
}
|
||||||
|
|
||||||
|
// xmm/mem sourced byte/word/dword/qword replicate
|
||||||
void Assembler::evpaddb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
|
void Assembler::evpaddb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
|
||||||
assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), "");
|
assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), "");
|
||||||
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
|
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
|
||||||
|
@ -1933,10 +1933,17 @@ private:
|
|||||||
// Interleave Low Doublewords
|
// Interleave Low Doublewords
|
||||||
void punpckldq(XMMRegister dst, XMMRegister src);
|
void punpckldq(XMMRegister dst, XMMRegister src);
|
||||||
void punpckldq(XMMRegister dst, Address src);
|
void punpckldq(XMMRegister dst, Address src);
|
||||||
|
void vpunpckldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||||
|
|
||||||
|
// Interleave High Doublewords
|
||||||
|
void vpunpckhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||||
|
|
||||||
// Interleave Low Quadwords
|
// Interleave Low Quadwords
|
||||||
void punpcklqdq(XMMRegister dst, XMMRegister src);
|
void punpcklqdq(XMMRegister dst, XMMRegister src);
|
||||||
|
|
||||||
|
// Vector sum of absolute difference.
|
||||||
|
void vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||||
|
|
||||||
#ifndef _LP64 // no 32bit push/pop on amd64
|
#ifndef _LP64 // no 32bit push/pop on amd64
|
||||||
void pushl(Address src);
|
void pushl(Address src);
|
||||||
#endif
|
#endif
|
||||||
|
@ -4321,6 +4321,94 @@ void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, in
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Following is lookup table based popcount computation algorithm:-
|
||||||
|
// Index Bit set count
|
||||||
|
// [ 0000 -> 0,
|
||||||
|
// 0001 -> 1,
|
||||||
|
// 0010 -> 1,
|
||||||
|
// 0011 -> 2,
|
||||||
|
// 0100 -> 1,
|
||||||
|
// 0101 -> 2,
|
||||||
|
// 0110 -> 2,
|
||||||
|
// 0111 -> 3,
|
||||||
|
// 1000 -> 1,
|
||||||
|
// 1001 -> 2,
|
||||||
|
// 1010 -> 2,
|
||||||
|
// 1011 -> 3,
|
||||||
|
// 1100 -> 2,
|
||||||
|
// 1101 -> 3,
// 1110 -> 3,
|
||||||
|
// 1111 -> 4 ]
|
||||||
|
// a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
|
||||||
|
// shuffle indices for lookup table access.
|
||||||
|
// b. Right shift each byte of vector lane by 4 positions.
|
||||||
|
// c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
|
||||||
|
// shuffle indices for lookup table access.
|
||||||
|
// d. Add the bitset count of upper and lower 4 bits of each byte.
|
||||||
|
// e. Unpack double words to quad words and compute sum of absolute difference of bitset
|
||||||
|
// count of all the bytes of a quadword.
|
||||||
|
// f. Perform step e. for upper 128bit vector lane.
|
||||||
|
// g. Pack the bitset count of quadwords back to double word.
|
||||||
|
// h. Unpacking and packing operations are not needed for 64bit vector lane.
|
||||||
|
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
|
||||||
|
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
|
||||||
|
int vec_enc) {
|
||||||
|
if (VM_Version::supports_avx512_vpopcntdq()) {
|
||||||
|
vpopcntd(dst, src, vec_enc);
|
||||||
|
} else {
|
||||||
|
assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
|
||||||
|
movl(rtmp, 0x0F0F0F0F);
|
||||||
|
movdl(xtmp1, rtmp);
|
||||||
|
vpbroadcastd(xtmp1, xtmp1, vec_enc);
|
||||||
|
if (Assembler::AVX_512bit == vec_enc) {
|
||||||
|
evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), false, vec_enc, rtmp);
|
||||||
|
} else {
|
||||||
|
vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp);
|
||||||
|
}
|
||||||
|
vpand(xtmp3, src, xtmp1, vec_enc);
|
||||||
|
vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
|
||||||
|
vpsrlw(dst, src, 4, vec_enc);
|
||||||
|
vpand(dst, dst, xtmp1, vec_enc);
|
||||||
|
vpshufb(dst, xtmp2, dst, vec_enc);
|
||||||
|
vpaddb(xtmp3, dst, xtmp3, vec_enc);
|
||||||
|
vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
|
||||||
|
vpunpckhdq(dst, xtmp3, xtmp1, vec_enc);
|
||||||
|
vpsadbw(dst, dst, xtmp1, vec_enc);
|
||||||
|
vpunpckldq(xtmp2, xtmp3, xtmp1, vec_enc);
|
||||||
|
vpsadbw(xtmp2, xtmp2, xtmp1, vec_enc);
|
||||||
|
vpackuswb(dst, xtmp2, dst, vec_enc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
|
||||||
|
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
|
||||||
|
int vec_enc) {
|
||||||
|
if (VM_Version::supports_avx512_vpopcntdq()) {
|
||||||
|
vpopcntq(dst, src, vec_enc);
|
||||||
|
} else if (vec_enc == Assembler::AVX_512bit) {
|
||||||
|
assert(VM_Version::supports_avx512bw(), "");
|
||||||
|
movl(rtmp, 0x0F0F0F0F);
|
||||||
|
movdl(xtmp1, rtmp);
|
||||||
|
vpbroadcastd(xtmp1, xtmp1, vec_enc);
|
||||||
|
evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), true, vec_enc, rtmp);
|
||||||
|
vpandq(xtmp3, src, xtmp1, vec_enc);
|
||||||
|
vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
|
||||||
|
vpsrlw(dst, src, 4, vec_enc);
|
||||||
|
vpandq(dst, dst, xtmp1, vec_enc);
|
||||||
|
vpshufb(dst, xtmp2, dst, vec_enc);
|
||||||
|
vpaddb(xtmp3, dst, xtmp3, vec_enc);
|
||||||
|
vpxorq(xtmp1, xtmp1, xtmp1, vec_enc);
|
||||||
|
vpsadbw(dst, xtmp3, xtmp1, vec_enc);
|
||||||
|
} else {
|
||||||
|
// We do not see any performance benefit of running
|
||||||
|
// above instruction sequence on 256 bit vector which
|
||||||
|
// can operate over maximum 4 long elements.
|
||||||
|
ShouldNotReachHere();
|
||||||
|
}
|
||||||
|
evpmovqd(dst, dst, vec_enc);
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef _LP64
|
#ifndef _LP64
|
||||||
void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
|
void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
|
||||||
assert(VM_Version::supports_avx512bw(), "");
|
assert(VM_Version::supports_avx512bw(), "");
|
||||||
|
@ -317,4 +317,12 @@ public:
|
|||||||
void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
|
void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
|
||||||
bool merge, BasicType bt, int vlen_enc);
|
bool merge, BasicType bt, int vlen_enc);
|
||||||
|
|
||||||
|
void vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
|
||||||
|
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
|
||||||
|
int vec_enc);
|
||||||
|
|
||||||
|
void vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
|
||||||
|
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
|
||||||
|
int vec_enc);
|
||||||
|
|
||||||
#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP
|
#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP
|
||||||
|
@ -183,4 +183,13 @@
|
|||||||
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
// Implements a variant of EncodeISOArrayNode that encode ASCII only
|
||||||
static const bool supports_encode_ascii_array = true;
|
static const bool supports_encode_ascii_array = true;
|
||||||
|
|
||||||
|
// Returns pre-selection estimated cost of a vector operation.
|
||||||
|
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
|
||||||
|
switch(vopc) {
|
||||||
|
default: return 0;
|
||||||
|
case Op_PopCountVI: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 50;
|
||||||
|
case Op_PopCountVL: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 40;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#endif // CPU_X86_MATCHER_X86_HPP
|
#endif // CPU_X86_MATCHER_X86_HPP
|
||||||
|
@ -588,6 +588,30 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
address generate_popcount_avx_lut(const char *stub_name) {
|
||||||
|
__ align64();
|
||||||
|
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||||
|
address start = __ pc();
|
||||||
|
__ emit_data(0x02010100, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x03020201, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x03020201, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x04030302, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x02010100, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x03020201, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x03020201, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x04030302, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x02010100, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x03020201, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x03020201, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x04030302, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x02010100, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x03020201, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x03020201, relocInfo::none, 0);
|
||||||
|
__ emit_data(0x04030302, relocInfo::none, 0);
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
address generate_iota_indices(const char *stub_name) {
|
address generate_iota_indices(const char *stub_name) {
|
||||||
__ align(CodeEntryAlignment);
|
__ align(CodeEntryAlignment);
|
||||||
StubCodeMark mark(this, "StubRoutines", stub_name);
|
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||||
@ -4004,6 +4028,11 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x00000001);
|
StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x00000001);
|
||||||
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
|
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
|
||||||
|
|
||||||
|
if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
|
||||||
|
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hacker's Delight.
|
||||||
|
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
|
||||||
|
}
|
||||||
|
|
||||||
// support for verify_oop (must happen after universe_init)
|
// support for verify_oop (must happen after universe_init)
|
||||||
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
|
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
|
||||||
|
|
||||||
|
@ -795,6 +795,21 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
address generate_popcount_avx_lut(const char *stub_name) {
|
||||||
|
__ align64();
|
||||||
|
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||||
|
address start = __ pc();
|
||||||
|
__ emit_data64(0x0302020102010100, relocInfo::none);
|
||||||
|
__ emit_data64(0x0403030203020201, relocInfo::none);
|
||||||
|
__ emit_data64(0x0302020102010100, relocInfo::none);
|
||||||
|
__ emit_data64(0x0403030203020201, relocInfo::none);
|
||||||
|
__ emit_data64(0x0302020102010100, relocInfo::none);
|
||||||
|
__ emit_data64(0x0403030203020201, relocInfo::none);
|
||||||
|
__ emit_data64(0x0302020102010100, relocInfo::none);
|
||||||
|
__ emit_data64(0x0403030203020201, relocInfo::none);
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
|
||||||
address generate_iota_indices(const char *stub_name) {
|
address generate_iota_indices(const char *stub_name) {
|
||||||
__ align(CodeEntryAlignment);
|
__ align(CodeEntryAlignment);
|
||||||
StubCodeMark mark(this, "StubRoutines", stub_name);
|
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||||
@ -7713,6 +7728,11 @@ address generate_avx_ghash_processBlocks() {
|
|||||||
StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
|
StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
|
||||||
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
|
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
|
||||||
|
|
||||||
|
if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
|
||||||
|
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hacker's Delight.
|
||||||
|
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
|
||||||
|
}
|
||||||
|
|
||||||
// support for verify_oop (must happen after universe_init)
|
// support for verify_oop (must happen after universe_init)
|
||||||
if (VerifyOops) {
|
if (VerifyOops) {
|
||||||
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
|
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
|
||||||
|
@ -59,6 +59,7 @@ address StubRoutines::x86::_vector_double_sign_flip = NULL;
|
|||||||
address StubRoutines::x86::_vector_byte_perm_mask = NULL;
|
address StubRoutines::x86::_vector_byte_perm_mask = NULL;
|
||||||
address StubRoutines::x86::_vector_long_sign_mask = NULL;
|
address StubRoutines::x86::_vector_long_sign_mask = NULL;
|
||||||
address StubRoutines::x86::_vector_iota_indices = NULL;
|
address StubRoutines::x86::_vector_iota_indices = NULL;
|
||||||
|
address StubRoutines::x86::_vector_popcount_lut = NULL;
|
||||||
address StubRoutines::x86::_vector_32_bit_mask = NULL;
|
address StubRoutines::x86::_vector_32_bit_mask = NULL;
|
||||||
address StubRoutines::x86::_vector_64_bit_mask = NULL;
|
address StubRoutines::x86::_vector_64_bit_mask = NULL;
|
||||||
#ifdef _LP64
|
#ifdef _LP64
|
||||||
|
@ -177,6 +177,7 @@ class x86 {
|
|||||||
static address _vector_short_shuffle_mask;
|
static address _vector_short_shuffle_mask;
|
||||||
static address _vector_long_shuffle_mask;
|
static address _vector_long_shuffle_mask;
|
||||||
static address _vector_iota_indices;
|
static address _vector_iota_indices;
|
||||||
|
static address _vector_popcount_lut;
|
||||||
#ifdef _LP64
|
#ifdef _LP64
|
||||||
static juint _k256_W[];
|
static juint _k256_W[];
|
||||||
static address _k256_W_adr;
|
static address _k256_W_adr;
|
||||||
@ -340,6 +341,9 @@ class x86 {
|
|||||||
return _vector_iota_indices;
|
return _vector_iota_indices;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static address vector_popcount_lut() {
|
||||||
|
return _vector_popcount_lut;
|
||||||
|
}
|
||||||
#ifdef _LP64
|
#ifdef _LP64
|
||||||
static address k256_W_addr() { return _k256_W_adr; }
|
static address k256_W_addr() { return _k256_W_adr; }
|
||||||
static address k512_W_addr() { return _k512_W_addr; }
|
static address k512_W_addr() { return _k512_W_addr; }
|
||||||
|
@ -1405,8 +1405,12 @@ const bool Matcher::match_rule_supported(int opcode) {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case Op_PopCountVI:
|
case Op_PopCountVI:
|
||||||
|
if (!UsePopCountInstruction || (UseAVX < 2)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
break;
|
||||||
case Op_PopCountVL:
|
case Op_PopCountVL:
|
||||||
if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
|
if (!UsePopCountInstruction || (UseAVX <= 2)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -1861,6 +1865,18 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case Op_PopCountVI:
|
||||||
|
if (!VM_Version::supports_avx512_vpopcntdq() &&
|
||||||
|
(vlen == 16) && !VM_Version::supports_avx512bw()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case Op_PopCountVL:
|
||||||
|
if (!VM_Version::supports_avx512_vpopcntdq() &&
|
||||||
|
((vlen <= 4) || ((vlen == 8) && !VM_Version::supports_avx512bw()))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
return true; // Per default match rules are supported.
|
return true; // Per default match rules are supported.
|
||||||
}
|
}
|
||||||
@ -8571,28 +8587,54 @@ instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
|
|||||||
|
|
||||||
// --------------------------------- PopCount --------------------------------------
|
// --------------------------------- PopCount --------------------------------------
|
||||||
|
|
||||||
instruct vpopcountI(vec dst, vec src) %{
|
instruct vpopcountI_popcntd(vec dst, vec src) %{
|
||||||
|
predicate(VM_Version::supports_avx512_vpopcntdq());
|
||||||
match(Set dst (PopCountVI src));
|
match(Set dst (PopCountVI src));
|
||||||
format %{ "vpopcntd $dst,$src\t! vector popcount packedI" %}
|
format %{ "vector_popcount_int $dst, $src\t! vector popcount packedI" %}
|
||||||
ins_encode %{
|
ins_encode %{
|
||||||
assert(UsePopCountInstruction, "not enabled");
|
assert(UsePopCountInstruction, "not enabled");
|
||||||
|
|
||||||
int vlen_enc = vector_length_encoding(this);
|
int vlen_enc = vector_length_encoding(this);
|
||||||
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
|
__ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
|
||||||
%}
|
%}
|
||||||
ins_pipe( pipe_slow );
|
ins_pipe( pipe_slow );
|
||||||
%}
|
%}
|
||||||
|
|
||||||
instruct vpopcountL(vec dst, vec src) %{
|
instruct vpopcountI(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
|
||||||
match(Set dst (PopCountVL src));
|
predicate(!VM_Version::supports_avx512_vpopcntdq());
|
||||||
format %{ "vpopcntq $dst,$src\t! vector popcount packedL" %}
|
match(Set dst (PopCountVI src));
|
||||||
|
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
|
||||||
|
format %{ "vector_popcount_int $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
|
||||||
ins_encode %{
|
ins_encode %{
|
||||||
assert(UsePopCountInstruction, "not enabled");
|
assert(UsePopCountInstruction, "not enabled");
|
||||||
|
int vlen_enc = vector_length_encoding(this);
|
||||||
|
__ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
|
||||||
|
$xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
|
||||||
|
%}
|
||||||
|
ins_pipe( pipe_slow );
|
||||||
|
%}
|
||||||
|
|
||||||
|
instruct vpopcountL_popcntd(vec dst, vec src) %{
|
||||||
|
predicate(VM_Version::supports_avx512_vpopcntdq());
|
||||||
|
match(Set dst (PopCountVL src));
|
||||||
|
format %{ "vector_popcount_long $dst, $src\t! vector popcount packedL" %}
|
||||||
|
ins_encode %{
|
||||||
|
assert(UsePopCountInstruction, "not enabled");
|
||||||
int vlen_enc = vector_length_encoding(this, $src);
|
int vlen_enc = vector_length_encoding(this, $src);
|
||||||
__ vpopcntq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
|
__ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
|
||||||
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
|
%}
|
||||||
|
ins_pipe( pipe_slow );
|
||||||
|
%}
|
||||||
|
|
||||||
|
instruct vpopcountL(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
|
||||||
|
predicate(!VM_Version::supports_avx512_vpopcntdq());
|
||||||
|
match(Set dst (PopCountVL src));
|
||||||
|
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
|
||||||
|
format %{ "vector_popcount_long $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
|
||||||
|
ins_encode %{
|
||||||
|
assert(UsePopCountInstruction, "not enabled");
|
||||||
|
int vlen_enc = vector_length_encoding(this, $src);
|
||||||
|
__ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
|
||||||
|
$xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
|
||||||
%}
|
%}
|
||||||
ins_pipe( pipe_slow );
|
ins_pipe( pipe_slow );
|
||||||
%}
|
%}
|
||||||
|
@ -956,6 +956,8 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
|||||||
(stride_con < 0 && ((max_jint + stride_con) < limit_type->_lo)))
|
(stride_con < 0 && ((max_jint + stride_con) < limit_type->_lo)))
|
||||||
return false; // overflow
|
return false; // overflow
|
||||||
|
|
||||||
|
// Rudimentary cost model to estimate loop unrolling
|
||||||
|
// factor.
|
||||||
// Adjust body_size to determine if we unroll or not
|
// Adjust body_size to determine if we unroll or not
|
||||||
uint body_size = _body.size();
|
uint body_size = _body.size();
|
||||||
// Key test to unroll loop in CRC32 java code
|
// Key test to unroll loop in CRC32 java code
|
||||||
@ -968,6 +970,11 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
|||||||
case Op_ModL: body_size += 30; break;
|
case Op_ModL: body_size += 30; break;
|
||||||
case Op_DivL: body_size += 30; break;
|
case Op_DivL: body_size += 30; break;
|
||||||
case Op_MulL: body_size += 10; break;
|
case Op_MulL: body_size += 10; break;
|
||||||
|
case Op_PopCountVI:
|
||||||
|
case Op_PopCountVL: {
|
||||||
|
const TypeVect* vt = n->bottom_type()->is_vect();
|
||||||
|
body_size += Matcher::vector_op_pre_select_sz_estimate(n->Opcode(), vt->element_basic_type(), vt->length());
|
||||||
|
} break;
|
||||||
case Op_StrComp:
|
case Op_StrComp:
|
||||||
case Op_StrEquals:
|
case Op_StrEquals:
|
||||||
case Op_StrIndexOf:
|
case Op_StrIndexOf:
|
||||||
|
@ -24,9 +24,8 @@
|
|||||||
/**
|
/**
|
||||||
* @test
|
* @test
|
||||||
* @summary Test vectorization of popcount for Long
|
* @summary Test vectorization of popcount for Long
|
||||||
* @requires vm.cpu.features ~= ".*avx512dq.*"
|
|
||||||
* @requires vm.cpu.features ~= ".*vpopcntdq.*"
|
|
||||||
* @requires vm.compiler2.enabled
|
* @requires vm.compiler2.enabled
|
||||||
|
* @requires vm.cpu.features ~= ".*avx512bw.*"
|
||||||
* @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64"
|
* @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64"
|
||||||
* @library /test/lib /
|
* @library /test/lib /
|
||||||
* @run driver compiler.vectorization.TestPopCountVectorLong
|
* @run driver compiler.vectorization.TestPopCountVectorLong
|
||||||
|
Loading…
x
Reference in New Issue
Block a user