8283232: x86: Improve vector broadcast operations
Reviewed-by: kvn, jbhateja
commit 92d2982f3f
parent 966ab219b4
@@ -896,6 +896,8 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
      tail_size = 1;
      break;

    case 0x10: // movups
    case 0x11: // movups
    case 0x12: // movlps
    case 0x28: // movaps
    case 0x2E: // ucomiss

@@ -2561,10 +2563,22 @@ void Assembler::movddup(XMMRegister dst, XMMRegister src) {
  emit_int16(0x12, 0xC0 | encode);
}

void Assembler::movddup(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse3(), ""));
  InstructionMark im(this);
  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_DUP, /* input_size_in_bits */ EVEX_64bit);
  attributes.set_rex_vex_w_reverted();
  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
  emit_int8(0x12);
  emit_operand(dst, src);
}

void Assembler::vmovddup(XMMRegister dst, Address src, int vector_len) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_DUP, /* input_size_in_bits */ EVEX_64bit);
  attributes.set_rex_vex_w_reverted();
  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
  emit_int8(0x12);
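The new movddup(XMMRegister, Address) and vmovddup forms load a single 64-bit element and duplicate it across the destination register, which is what the C2 changes further down use to expand an 8-byte constant into a full vector. A standalone sketch of the same semantics with compiler intrinsics (illustrative only, not part of the patch; assumes an SSE3/AVX-capable x86-64 toolchain):

#include <immintrin.h>
#include <stdio.h>

int main() {
  double x = 3.5;
  __m128d dup128 = _mm_loaddup_pd(&x);       // SSE3 movddup: low 64 bits duplicated across 128 bits
  __m256d dup256 = _mm256_broadcast_sd(&x);  // AVX vbroadcastsd: one element duplicated across 256 bits
  double lo[2], hi[4];
  _mm_storeu_pd(lo, dup128);
  _mm256_storeu_pd(hi, dup256);
  printf("%f %f | %f %f %f %f\n", lo[0], lo[1], hi[0], hi[1], hi[2], hi[3]);
  return 0;
}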
@@ -3505,6 +3519,46 @@ void Assembler::movswl(Register dst, Register src) { // movsxw
  emit_int24(0x0F, (unsigned char)0xBF, (0xC0 | encode));
}

void Assembler::movups(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_32bit);
  simd_prefix(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
  emit_int8(0x10);
  emit_operand(dst, src);
}

void Assembler::vmovups(XMMRegister dst, Address src, int vector_len) {
  assert(vector_len == AVX_512bit ? VM_Version::supports_evex() : VM_Version::supports_avx(), "");
  InstructionMark im(this);
  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_32bit);
  simd_prefix(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
  emit_int8(0x10);
  emit_operand(dst, src);
}

void Assembler::movups(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_32bit);
  simd_prefix(src, xnoreg, dst, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
  emit_int8(0x11);
  emit_operand(src, dst);
}

void Assembler::vmovups(Address dst, XMMRegister src, int vector_len) {
  assert(vector_len == AVX_512bit ? VM_Version::supports_evex() : VM_Version::supports_avx(), "");
  InstructionMark im(this);
  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_32bit);
  simd_prefix(src, xnoreg, dst, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
  emit_int8(0x11);
  emit_operand(src, dst);
}

void Assembler::movw(Address dst, int imm16) {
  InstructionMark im(this);

@@ -5156,7 +5210,7 @@ void Assembler::evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, i
  emit_int24(0x43, (0xC0 | encode), imm8 & 0xFF);
}

void Assembler::pshufpd(XMMRegister dst, XMMRegister src, int imm8) {
void Assembler::shufpd(XMMRegister dst, XMMRegister src, int imm8) {
  assert(isByte(imm8), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);

@@ -5164,14 +5218,14 @@ void Assembler::pshufpd(XMMRegister dst, XMMRegister src, int imm8) {
  emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF);
}

void Assembler::vpshufpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
void Assembler::vshufpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
  InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attributes.set_rex_vex_w_reverted();
  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
  emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF);
}

void Assembler::pshufps(XMMRegister dst, XMMRegister src, int imm8) {
void Assembler::shufps(XMMRegister dst, XMMRegister src, int imm8) {
  assert(isByte(imm8), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);

@@ -5179,7 +5233,7 @@ void Assembler::pshufps(XMMRegister dst, XMMRegister src, int imm8) {
  emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF);
}

void Assembler::vpshufps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
void Assembler::vshufps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
  emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF);
@@ -1492,6 +1492,7 @@ private:
  void movb(Register dst, Address src);

  void movddup(XMMRegister dst, XMMRegister src);
  void movddup(XMMRegister dst, Address src);
  void vmovddup(XMMRegister dst, Address src, int vector_len);

  void kandbl(KRegister dst, KRegister src1, KRegister src2);

@@ -1663,6 +1664,11 @@ private:
  void movswq(Register dst, Register src);
#endif

  void movups(XMMRegister dst, Address src);
  void vmovups(XMMRegister dst, Address src, int vector_len);
  void movups(Address dst, XMMRegister src);
  void vmovups(Address dst, XMMRegister src, int vector_len);

  void movw(Address dst, int imm16);
  void movw(Register dst, Address src);
  void movw(Address dst, Register src);

@@ -1942,10 +1948,10 @@ private:
  void pshuflw(XMMRegister dst, Address src, int mode);

  //shuffle floats and doubles
  void pshufps(XMMRegister, XMMRegister, int);
  void pshufpd(XMMRegister, XMMRegister, int);
  void vpshufps(XMMRegister, XMMRegister, XMMRegister, int, int);
  void vpshufpd(XMMRegister, XMMRegister, XMMRegister, int, int);
  void shufps(XMMRegister, XMMRegister, int);
  void shufpd(XMMRegister, XMMRegister, int);
  void vshufps(XMMRegister, XMMRegister, XMMRegister, int, int);
  void vshufpd(XMMRegister, XMMRegister, XMMRegister, int, int);

  // Shuffle packed values at 128 bit granularity
  void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
@@ -1643,12 +1643,12 @@ void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegi

void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case 4: movdl(dst, src); break;
    case 8: movq(dst, src); break;
    case 16: movdqu(dst, src); break;
    case 32: vmovdqu(dst, src); break;
    case 64: evmovdquq(dst, src, Assembler::AVX_512bit); break;
    default: ShouldNotReachHere();
    case 4: movdl(dst, src); break;
    case 8: movq(dst, src); break;
    case 16: movdqu(dst, src); break;
    case 32: vmovdqu(dst, src); break;
    case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
    default: ShouldNotReachHere();
  }
}

@@ -1661,6 +1661,38 @@ void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vle
  }
}

void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc, noreg);
      }
    } else if (bt == T_DOUBLE) {
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc, noreg);
      }
    } else {
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc, noreg);
      } else {
        vbroadcastss(dst, src, vlen_enc, noreg);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    movq(dst, src);
    if (vlen == 16) {
      punpcklqdq(dst, dst);
    }
  }
}

void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
  if (vlen_in_bytes <= 4) {
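load_constant_vector is the new selection helper: given the element type and vector length it emits the cheapest broadcast the CPU supports (vpbroadcastq/vpbroadcastd on AVX2, vbroadcastsd/vbroadcastss or vmovddup on AVX, movddup on SSE3, movq plus punpcklqdq otherwise), so only the scalar constant has to live in the constant table. The same idea, sketched outside HotSpot with intrinsics (illustrative; the helper name and the AVX2 assumption are mine, not the patch's code):

#include <immintrin.h>

// Broadcast a single 32-bit constant into a 256-bit vector (assumes AVX2),
// analogous to storing only 4 bytes in the constant table and expanding them
// with vpbroadcastd instead of loading a full 32-byte constant.
static inline __m256i broadcast_int32(int value) {
  __m128i scalar = _mm_cvtsi32_si128(value);   // movd: scalar into the low lane
  return _mm256_broadcastd_epi32(scalar);      // vpbroadcastd: replicate lane 0
}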
@@ -2317,9 +2349,9 @@ void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      movdqu(dst, src);
      pshufps(dst, dst, eindex);
      shufps(dst, dst, eindex);
    } else {
      vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
    }
  } else {
    if (UseAVX == 0) {

@@ -159,6 +159,7 @@ public:

  void load_vector(XMMRegister dst, Address src, int vlen_in_bytes);
  void load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch = rscratch1);
  void load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen);
  void load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes);

  // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
@@ -2732,6 +2732,15 @@ void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
  }
}

void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
  if (reachable(src)) {
    Assembler::movddup(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movddup(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  if (reachable(src)) {
    Assembler::vmovddup(dst, as_Address(src), vector_len);

@@ -3288,9 +3297,13 @@ void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src,
  }
}

void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpbroadcastw(dst, src, vector_len);
void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  if (reachable(src)) {
    Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
  }
}

void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {

@@ -3311,6 +3324,15 @@ void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vecto
  }
}

void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  if (reachable(src)) {
    Assembler::vbroadcastss(dst, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
  }
}

void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqb(dst, nds, src, vector_len);

@@ -4354,10 +4376,14 @@ void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file,

void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
  if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
    // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without
    // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog
    vpternlogd(dst, 0xFF, dst, dst, vector_len);
  } else if (VM_Version::supports_avx()) {
    vpcmpeqd(dst, dst, dst, vector_len);
  } else {
    assert(UseAVX > 0, "");
    vpcmpeqb(dst, dst, dst, vector_len);
    assert(VM_Version::supports_sse2(), "");
    pcmpeqd(dst, dst);
  }
}
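vallones now also covers plain SSE2: comparing a register with itself sets every lane to all-ones, and on AVX-512 the same result comes from vpternlogd with immediate 0xFF, which avoids a false dependency on the previous register contents. A standalone intrinsics sketch of both idioms (illustrative, not part of the patch):

#include <immintrin.h>

static inline __m128i allones_sse2(void) {
  __m128i x = _mm_undefined_si128();
  return _mm_cmpeq_epi32(x, x);                      // pcmpeqd dst, dst -> all bits set
}

#ifdef __AVX512F__
static inline __m512i allones_avx512(void) {
  __m512i x = _mm512_undefined_epi32();
  return _mm512_ternarylogic_epi32(x, x, x, 0xFF);   // vpternlogd imm8=0xFF -> all bits set
}
#endif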
@@ -1114,6 +1114,12 @@ public:
  void addpd(XMMRegister dst, Address src) { Assembler::addpd(dst, src); }
  void addpd(XMMRegister dst, AddressLiteral src);

  using Assembler::vbroadcastsd;
  void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);

  using Assembler::vbroadcastss;
  void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);

  void divsd(XMMRegister dst, XMMRegister src) { Assembler::divsd(dst, src); }
  void divsd(XMMRegister dst, Address src) { Assembler::divsd(dst, src); }
  void divsd(XMMRegister dst, AddressLiteral src);

@@ -1150,6 +1156,11 @@ public:
  void kmov(Register dst, KRegister src);
  void kmov(KRegister dst, Register src);

  using Assembler::movddup;
  void movddup(XMMRegister dst, AddressLiteral src, Register rscratch = rscratch1);
  using Assembler::vmovddup;
  void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);

  // AVX Unaligned forms
  void vmovdqu(Address dst, XMMRegister src);
  void vmovdqu(XMMRegister dst, Address src);

@@ -1157,7 +1168,6 @@ public:
  void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
  void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len);

  // AVX512 Unaligned
  void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len);
  void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len);

@@ -1229,9 +1239,6 @@ public:
  void movsd(XMMRegister dst, Address src) { Assembler::movsd(dst, src); }
  void movsd(XMMRegister dst, AddressLiteral src);

  using Assembler::vmovddup;
  void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);

  void mulpd(XMMRegister dst, XMMRegister src) { Assembler::mulpd(dst, src); }
  void mulpd(XMMRegister dst, Address src) { Assembler::mulpd(dst, src); }
  void mulpd(XMMRegister dst, AddressLiteral src);

@@ -1337,16 +1344,11 @@ public:
  void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
  void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);

  void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
  void vpbroadcastw(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastw(dst, src, vector_len); }
  using Assembler::vpbroadcastd;
  void vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);

  using Assembler::vbroadcastsd;
  void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);
  using Assembler::vpbroadcastq;
  void vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);
  void vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpbroadcastq(dst, src, vector_len); }
  void vpbroadcastq(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastq(dst, src, vector_len); }

  void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -4107,37 +4107,43 @@ instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rReg
// ====================REPLICATE=======================================

// Replicate byte scalar to be vector
instruct ReplB_reg(vec dst, rRegI src) %{
instruct vReplB_reg(vec dst, rRegI src) %{
  predicate(UseAVX >= 2);
  match(Set dst (ReplicateB src));
  format %{ "replicateB $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
      assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
      int vlen_enc = vector_length_encoding(this);
      __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
    } else if (VM_Version::supports_avx2()) {
      int vlen_enc = vector_length_encoding(this);
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
      __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
      if (vlen >= 16) {
        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
        if (vlen >= 32) {
          assert(vlen == 32, "sanity");
          __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
        }
      }
      __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplB_reg(vec dst, rRegI src) %{
  predicate(UseAVX < 2);
  match(Set dst (ReplicateB src));
  format %{ "replicateB $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    if (vlen >= 16) {
      assert(vlen == 16, "");
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplB_mem(vec dst, memory mem) %{
  predicate(VM_Version::supports_avx2());
  predicate(UseAVX >= 2);
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "replicateB $dst,$mem" %}
  ins_encode %{

@@ -4147,48 +4153,45 @@ instruct ReplB_mem(vec dst, memory mem) %{
  ins_pipe( pipe_slow );
%}

instruct ReplB_imm(vec dst, immI con) %{
  match(Set dst (ReplicateB con));
  format %{ "replicateB $dst,$con" %}
  ins_encode %{
    InternalAddress addr = $constantaddress(T_BYTE, vreplicate_imm(T_BYTE, $con$$constant, Matcher::vector_length(this)));
    __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
  %}
  ins_pipe( pipe_slow );
%}

// ====================ReplicateS=======================================

instruct ReplS_reg(vec dst, rRegI src) %{
instruct vReplS_reg(vec dst, rRegI src) %{
  predicate(UseAVX >= 2);
  match(Set dst (ReplicateS src));
  format %{ "replicateS $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
      assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
      int vlen_enc = vector_length_encoding(this);
      __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
    } else if (VM_Version::supports_avx2()) {
      int vlen_enc = vector_length_encoding(this);
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
      if (vlen >= 8) {
        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
        if (vlen >= 16) {
          assert(vlen == 16, "sanity");
          __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
        }
      }
      __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplS_reg(vec dst, rRegI src) %{
  predicate(UseAVX < 2);
  match(Set dst (ReplicateS src));
  format %{ "replicateS $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    if (vlen >= 8) {
      assert(vlen == 8, "");
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplS_mem(vec dst, memory mem) %{
  predicate(VM_Version::supports_avx2());
  predicate(UseAVX >= 2);
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "replicateS $dst,$mem" %}
  ins_encode %{

@@ -4198,16 +4201,6 @@ instruct ReplS_mem(vec dst, memory mem) %{
  ins_pipe( pipe_slow );
%}

instruct ReplS_imm(vec dst, immI con) %{
  match(Set dst (ReplicateS con));
  format %{ "replicateS $dst,$con" %}
  ins_encode %{
    InternalAddress addr = $constantaddress(T_SHORT, vreplicate_imm(T_SHORT, $con$$constant, Matcher::vector_length(this)));
    __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
  %}
  ins_pipe( pipe_slow );
%}

// ====================ReplicateI=======================================

instruct ReplI_reg(vec dst, rRegI src) %{
@@ -4215,20 +4208,15 @@ instruct ReplI_reg(vec dst, rRegI src) %{
  format %{ "replicateI $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
      int vlen_enc = vector_length_encoding(this);
      __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
    } else if (VM_Version::supports_avx2()) {
      int vlen_enc = vector_length_encoding(this);
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
      if (vlen >= 8) {
        assert(vlen == 8, "sanity");
        __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );

@@ -4238,25 +4226,32 @@ instruct ReplI_mem(vec dst, memory mem) %{
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "replicateI $dst,$mem" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen <= 4) {
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_avx2()) {
      __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
    } else if (VM_Version::supports_avx()) {
      __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
    } else {
      __ movdl($dst$$XMMRegister, $mem$$Address);
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    } else {
      assert(VM_Version::supports_avx2(), "sanity");
      int vlen_enc = vector_length_encoding(this);
      __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplI_imm(vec dst, immI con) %{
  match(Set dst (ReplicateB con));
  match(Set dst (ReplicateS con));
  match(Set dst (ReplicateI con));
  format %{ "replicateI $dst,$con" %}
  ins_encode %{
    InternalAddress addr = $constantaddress(T_INT, vreplicate_imm(T_INT, $con$$constant, Matcher::vector_length(this)));
    __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
    InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
            vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
                           (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
                               type2aelembytes(Matcher::vector_element_basic_type(this))));
    BasicType bt = Matcher::vector_element_basic_type(this);
    int vlen = Matcher::vector_length_in_bytes(this);
    __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
  %}
  ins_pipe( pipe_slow );
%}
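The rewritten ReplI_imm rule now also matches ReplicateB and ReplicateS constants, and it stores only the smallest constant group the chosen broadcast can consume: 4 bytes of elements when AVX is available (for vpbroadcastd/vbroadcastss), otherwise 8 bytes (for movddup or movq plus punpcklqdq). A small runnable illustration of that sizing expression (the CPU-feature values are hard-coded assumptions for the example, not queried):

#include <cstdio>

int main() {
  const bool supports_sse3 = true, supports_avx = true;   // assumed: AVX-capable host
  const int group_bytes = supports_sse3 ? (supports_avx ? 4 : 8) : 8;
  const int elem_bytes[] = {1, 2, 4};                     // T_BYTE, T_SHORT, T_INT
  for (int eb : elem_bytes) {
    std::printf("element size %d -> %d constant elements stored\n", eb, group_bytes / eb);
  }
  return 0;                                               // prints 4, 2 and 1 on AVX hosts
}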
@@ -4268,23 +4263,21 @@ instruct ReplI_zero(vec dst, immI_0 zero) %{
  match(Set dst (ReplicateI zero));
  format %{ "replicateI $dst,$zero" %}
  ins_encode %{
    uint vsize = Matcher::vector_length_in_bytes(this);
    if (vsize <= 16) {
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct ReplI_M1(vec dst, immI_M1 con) %{
  predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) >= 16);
  predicate(UseSSE >= 2);
  match(Set dst (ReplicateB con));
  match(Set dst (ReplicateS con));
  match(Set dst (ReplicateI con));
  effect(TEMP dst);
  format %{ "vallones $dst" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);

@@ -4301,23 +4294,16 @@ instruct ReplL_reg(vec dst, rRegL src) %{
  match(Set dst (ReplicateL src));
  format %{ "replicateL $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen == 2) {
      __ movdq($dst$$XMMRegister, $src$$Register);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
      int vlen_enc = vector_length_encoding(this);
    int vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
      __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
    } else if (VM_Version::supports_avx2()) {
      assert(vlen == 4, "sanity");
      int vlen_enc = vector_length_encoding(this);
      __ movdq($dst$$XMMRegister, $src$$Register);
      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      assert(vlen == 4, "sanity");
      __ movdq($dst$$XMMRegister, $src$$Register);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );

@@ -4382,14 +4368,14 @@ instruct ReplL_mem(vec dst, memory mem) %{
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "replicateL $dst,$mem" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen == 2) {
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_avx2()) {
      __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
    } else if (VM_Version::supports_sse3()) {
      __ movddup($dst$$XMMRegister, $mem$$Address);
    } else {
      __ movq($dst$$XMMRegister, $mem$$Address);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      assert(VM_Version::supports_avx2(), "sanity");
      int vlen_enc = vector_length_encoding(this);
      __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );

@@ -4400,8 +4386,9 @@ instruct ReplL_imm(vec dst, immL con) %{
  match(Set dst (ReplicateL con));
  format %{ "replicateL $dst,$con" %}
  ins_encode %{
    InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, Matcher::vector_length(this)));
    __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
    InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
    int vlen = Matcher::vector_length_in_bytes(this);
    __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
  %}
  ins_pipe( pipe_slow );
%}

@@ -4410,21 +4397,19 @@ instruct ReplL_zero(vec dst, immL0 zero) %{
  match(Set dst (ReplicateL zero));
  format %{ "replicateL $dst,$zero" %}
  ins_encode %{
    int vlen = Matcher::vector_length(this);
    if (vlen == 2) {
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct ReplL_M1(vec dst, immL_M1 con) %{
  predicate(UseAVX > 0);
  predicate(UseSSE >= 2);
  match(Set dst (ReplicateL con));
  effect(TEMP dst);
  format %{ "vallones $dst" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
@@ -4435,38 +4420,43 @@ instruct ReplL_M1(vec dst, immL_M1 con) %{

// ====================ReplicateF=======================================

instruct ReplF_reg(vec dst, vlRegF src) %{
instruct vReplF_reg(vec dst, vlRegF src) %{
  predicate(UseAVX > 0);
  match(Set dst (ReplicateF src));
  format %{ "replicateF $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (vlen <= 4) {
      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
    } else if (VM_Version::supports_avx2()) {
      int vlen_enc = vector_length_encoding(this);
      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
    } else if (VM_Version::supports_avx2()) {
      __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
    } else {
      assert(vlen == 8, "sanity");
      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplF_reg(vec dst, vlRegF src) %{
  predicate(UseAVX == 0);
  match(Set dst (ReplicateF src));
  format %{ "replicateF $dst,$src" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplF_mem(vec dst, memory mem) %{
  predicate(UseAVX > 0);
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "replicateF $dst,$mem" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen <= 4) {
      __ movdl($dst$$XMMRegister, $mem$$Address);
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    } else {
      assert(VM_Version::supports_avx(), "sanity");
      int vlen_enc = vector_length_encoding(this);
      __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
    }
    int vlen_enc = vector_length_encoding(this);
    __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

@@ -4476,8 +4466,10 @@ instruct ReplF_imm(vec dst, immF con) %{
  match(Set dst (ReplicateF con));
  format %{ "replicateF $dst,$con" %}
  ins_encode %{
    InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant, Matcher::vector_length(this)));
    __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
    InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
        VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
    int vlen = Matcher::vector_length_in_bytes(this);
    __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
  %}
  ins_pipe( pipe_slow );
%}

@@ -4486,12 +4478,11 @@ instruct ReplF_zero(vec dst, immF0 zero) %{
  match(Set dst (ReplicateF zero));
  format %{ "replicateF $dst,$zero" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen <= 4) {
      __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
      __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( fpu_reg_reg );

@@ -4500,37 +4491,46 @@ instruct ReplF_zero(vec dst, immF0 zero) %{

// ====================ReplicateD=======================================

// Replicate double (8 bytes) scalar to be vector
instruct ReplD_reg(vec dst, vlRegD src) %{
instruct vReplD_reg(vec dst, vlRegD src) %{
  predicate(UseSSE >= 3);
  match(Set dst (ReplicateD src));
  format %{ "replicateD $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen == 2) {
      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
    int vlen_enc = vector_length_encoding(this);
    if (vlen <= 2) {
      __ movddup($dst$$XMMRegister, $src$$XMMRegister);
    } else if (VM_Version::supports_avx2()) {
      int vlen_enc = vector_length_encoding(this);
      __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
    } else {
      assert(vlen == 4, "sanity");
      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
      __ movddup($dst$$XMMRegister, $src$$XMMRegister);
      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplD_reg(vec dst, vlRegD src) %{
  predicate(UseSSE < 3);
  match(Set dst (ReplicateD src));
  format %{ "replicateD $dst,$src" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
  %}
  ins_pipe( pipe_slow );
%}

instruct ReplD_mem(vec dst, memory mem) %{
  predicate(UseSSE >= 3);
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "replicateD $dst,$mem" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen == 2) {
      __ movq($dst$$XMMRegister, $mem$$Address);
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
    } else {
      assert(VM_Version::supports_avx(), "sanity");
    if (Matcher::vector_length(this) >= 4) {
      int vlen_enc = vector_length_encoding(this);
      __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
    } else {
      __ movddup($dst$$XMMRegister, $mem$$Address);
    }
  %}
  ins_pipe( pipe_slow );

@@ -4541,8 +4541,9 @@ instruct ReplD_imm(vec dst, immD con) %{
  match(Set dst (ReplicateD con));
  format %{ "replicateD $dst,$con" %}
  ins_encode %{
    InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, Matcher::vector_length(this)));
    __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
    InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
    int vlen = Matcher::vector_length_in_bytes(this);
    __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
  %}
  ins_pipe( pipe_slow );
%}

@@ -4551,12 +4552,11 @@ instruct ReplD_zero(vec dst, immD0 zero) %{
  match(Set dst (ReplicateD zero));
  format %{ "replicateD $dst,$zero" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen == 2) {
      __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
      __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( fpu_reg_reg );

@@ -8335,7 +8335,7 @@ instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
  effect(TEMP_DEF dst, TEMP vtmp);
  ins_encode %{
    int vlen_enc = Assembler::AVX_128bit;
    __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
    __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
    __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
    __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
    __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
@@ -3334,20 +3334,20 @@ void ADLParser::constant_parse_expression(EncClass* encoding, char* ec_name) {
    if (_curchar == '(') {
      parens_depth++;
      encoding->add_code("(");
      next_char();
      next_char_or_line();
    }
    else if (_curchar == ')') {
      parens_depth--;
      if (parens_depth > 0)
        encoding->add_code(")");
      next_char();
      next_char_or_line();
    }
    else {
      // (1)
      // Check if there is a string to pass through to output
      char *start = _ptr;       // Record start of the next string
      while ((_curchar != '$') && (_curchar != '(') && (_curchar != ')')) {
        next_char();
        next_char_or_line();
      }
      // If a string was found, terminate it and record in EncClass
      if (start != _ptr) {
@@ -65,7 +65,7 @@ InstructForm::InstructForm(const char *id, InstructForm *instr, MatchRule *rule)
  : _ident(id), _ideal_only(false),
    _localNames(instr->_localNames),
    _effects(instr->_effects),
    _is_mach_constant(false),
    _is_mach_constant(instr->_is_mach_constant),
    _needs_constant_base(false),
    _has_call(false)
{

@@ -4090,12 +4090,6 @@ int MatchRule::is_expensive() const {
        strcmp(opType,"ReverseBytesL")==0 ||
        strcmp(opType,"ReverseBytesUS")==0 ||
        strcmp(opType,"ReverseBytesS")==0 ||
        strcmp(opType,"ReplicateB")==0 ||
        strcmp(opType,"ReplicateS")==0 ||
        strcmp(opType,"ReplicateI")==0 ||
        strcmp(opType,"ReplicateL")==0 ||
        strcmp(opType,"ReplicateF")==0 ||
        strcmp(opType,"ReplicateD")==0 ||
        strcmp(opType,"PopulateIndex")==0 ||
        strcmp(opType,"AddReductionVI")==0 ||
        strcmp(opType,"AddReductionVL")==0 ||

@@ -4111,8 +4105,9 @@ int MatchRule::is_expensive() const {
        strcmp(opType,"OrReductionV")==0 ||
        strcmp(opType,"XorReductionV")==0 ||
        strcmp(opType,"MaskAll")==0 ||
        0 /* 0 to line up columns nicely */ )
        0 /* 0 to line up columns nicely */ ) {
      return 1;
    }
  }
  return 0;
}
@@ -429,11 +429,11 @@ class AbstractAssembler : public ResourceObj {
    }
    return ptr;
  }
  address array_constant(BasicType bt, GrowableArray<jvalue>* c) {
  address array_constant(BasicType bt, GrowableArray<jvalue>* c, int alignment) {
    CodeSection* c1 = _code_section;
    int len = c->length();
    int size = type2aelembytes(bt) * len;
    address ptr = start_a_const(size, MIN2(round_up_power_of_2(size), 8));
    address ptr = start_a_const(size, alignment);
    if (ptr != NULL) {
      for (int i = 0; i < len; i++) {
        jvalue e = c->at(i);
@@ -36,7 +36,30 @@ bool ConstantTable::Constant::operator==(const Constant& other) {
  if (type()          != other.type()         )  return false;
  if (can_be_reused() != other.can_be_reused())  return false;
  if (is_array() || other.is_array()) {
    return is_array() && other.is_array() && _v._array == other._v._array;
    if (is_array() != other.is_array() ||
        get_array()->length() != other.get_array()->length()) {
      return false;
    }
    for (int i = 0; i < get_array()->length(); i++) {
      jvalue ele1 = get_array()->at(i);
      jvalue ele2 = other.get_array()->at(i);
      bool is_eq;
      switch (type()) {
        case T_BOOLEAN: is_eq = ele1.z == ele2.z; break;
        case T_BYTE:    is_eq = ele1.b == ele2.b; break;
        case T_CHAR:    is_eq = ele1.c == ele2.c; break;
        case T_SHORT:   is_eq = ele1.s == ele2.s; break;
        case T_INT:     is_eq = ele1.i == ele2.i; break;
        case T_LONG:    is_eq = ele1.j == ele2.j; break;
        case T_FLOAT:   is_eq = jint_cast(ele1.f) == jint_cast(ele2.f); break;
        case T_DOUBLE:  is_eq = jlong_cast(ele1.d) == jlong_cast(ele2.d); break;
        default: ShouldNotReachHere(); is_eq = false;
      }
      if (!is_eq) {
        return false;
      }
    }
    return true;
  }
  // For floating point values we compare the bit pattern.
  switch (type()) {

@@ -104,7 +127,7 @@ void ConstantTable::calculate_offsets_and_size() {
    // Align offset for type.
    int typesize = constant_size(con);
    assert(typesize <= 8 || con->is_array(), "sanity");
    offset = align_up(offset, MIN2(round_up_power_of_2(typesize), 8));
    offset = align_up(offset, con->alignment());
    con->set_offset(offset);   // set constant's offset

    if (con->type() == T_VOID) {

@@ -127,7 +150,7 @@ bool ConstantTable::emit(CodeBuffer& cb) const {
    Constant con = _constants.at(i);
    address constant_addr = NULL;
    if (con.is_array()) {
      constant_addr = _masm.array_constant(con.type(), con.get_array());
      constant_addr = _masm.array_constant(con.type(), con.get_array(), con.alignment());
    } else {
      switch (con.type()) {
        case T_INT: constant_addr = _masm.int_constant( con.get_jint() ); break;

@@ -229,12 +252,18 @@ ConstantTable::Constant ConstantTable::add(Metadata* metadata) {
  return con;
}

ConstantTable::Constant ConstantTable::add(MachConstantNode* n, BasicType bt, GrowableArray<jvalue>* array) {
  Constant con(bt, array);
ConstantTable::Constant ConstantTable::add(MachConstantNode* n, BasicType bt,
                                           GrowableArray<jvalue>* array, int alignment) {
  Constant con(bt, array, alignment);
  add(con);
  return con;
}

ConstantTable::Constant ConstantTable::add(MachConstantNode* n, BasicType bt,
                                           GrowableArray<jvalue>* array) {
  return add(n, bt, array, array->length() * type2aelembytes(bt));
}

ConstantTable::Constant ConstantTable::add(MachConstantNode* n, MachOper* oper) {
  jvalue value;
  BasicType type = oper->type()->basic_type();
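The element-wise comparison added to ConstantTable::Constant::operator== matches floating-point entries by bit pattern (jint_cast/jlong_cast) rather than by value, so constants such as 0.0 and -0.0 stay distinct while identical NaN encodings can still be shared. A standalone illustration of how the bit-pattern compare differs from == (not HotSpot code):

#include <cstdint>
#include <cstring>
#include <cstdio>
#include <limits>

// Compare two floats by their raw IEEE-754 encoding.
static bool same_bits(float a, float b) {
  uint32_t ba, bb;
  std::memcpy(&ba, &a, sizeof ba);
  std::memcpy(&bb, &b, sizeof bb);
  return ba == bb;
}

int main() {
  float nan = std::numeric_limits<float>::quiet_NaN();
  std::printf("0.0f == -0.0f       : %d\n", 0.0f == -0.0f);           // 1: value compare merges them
  std::printf("same_bits(0, -0)    : %d\n", same_bits(0.0f, -0.0f));  // 0: bit compare keeps them apart
  std::printf("nan == nan          : %d\n", nan == nan);              // 0
  std::printf("same_bits(nan, nan) : %d\n", same_bits(nan, nan));     // 1
  return 0;
}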
@@ -39,6 +39,7 @@ public:
  private:
    BasicType _type;
    bool      _is_array;
    int       _alignment;
    union {
      jvalue    _value;
      Metadata* _metadata;

@@ -49,7 +50,7 @@ public:
    bool    _can_be_reused; // true (default) if the value can be shared with other users.

  public:
    Constant() : _type(T_ILLEGAL), _is_array(false), _offset(-1), _freq(0.0f), _can_be_reused(true) { _v._value.l = 0; }
    Constant() : _type(T_ILLEGAL), _is_array(false), _alignment(-1), _offset(-1), _freq(0.0f), _can_be_reused(true) { _v._value.l = 0; }
    Constant(BasicType type, jvalue value, float freq = 0.0f, bool can_be_reused = true) :
      _type(type),
      _is_array(false),

@@ -59,24 +60,28 @@ public:
    {
      assert(type != T_METADATA, "wrong constructor");
      _v._value = value;
      _alignment = type == T_VOID ? sizeof(jobject) : type2aelembytes(type);
    }
    Constant(Metadata* metadata, bool can_be_reused = true) :
      _type(T_METADATA),
      _is_array(false),
      _alignment(sizeof(Metadata*)),
      _offset(-1),
      _freq(0.0f),
      _can_be_reused(can_be_reused)
    {
      _v._metadata = metadata;
    }
    Constant(BasicType type, GrowableArray<jvalue>* array) :
    Constant(BasicType type, GrowableArray<jvalue>* array, int alignment, bool can_be_reused = true) :
      _type(type),
      _is_array(true),
      _alignment(alignment),
      _offset(-1),
      _freq(0.0f),
      _can_be_reused(false)
      _can_be_reused(can_be_reused)
    {
      assert(is_java_primitive(type), "not applicable for %s", type2name(type));
      assert(is_power_of_2(alignment), "invalid alignment %d", alignment);
      _v._array = new GrowableArray<jvalue>(array->length());
      for (jvalue ele : *array) {
        _v._array->append(ele);

@@ -87,6 +92,7 @@ public:

    BasicType type()      const    { return _type; }
    bool      is_array()  const    { return _is_array; }
    int       alignment() const    { return _alignment; }

    jint    get_jint()    const    { return _v._value.i; }
    jlong   get_jlong()   const    { return _v._value.j; }

@@ -145,6 +151,7 @@ public:
  Constant add(MachConstantNode* n, BasicType type, jvalue value);
  Constant add(Metadata* metadata);
  Constant add(MachConstantNode* n, BasicType bt, GrowableArray<jvalue>* array);
  Constant add(MachConstantNode* n, BasicType bt, GrowableArray<jvalue>* array, int alignment);
  Constant add(MachConstantNode* n, MachOper* oper);
  Constant add(MachConstantNode* n, jint i) {
    jvalue value; value.i = i;
@@ -474,14 +474,15 @@ bool MachNode::rematerialize() const {
  }

  // Stretching lots of inputs - don't do it.
  if (req() > 2) {
  // A MachConstant has the last input being the constant base
  if (req() > (is_MachConstant() ? 3U : 2U)) {
    return false;
  }

  if (req() == 2 && in(1) && in(1)->ideal_reg() == Op_RegFlags) {
  if (req() >= 2 && in(1) && in(1)->ideal_reg() == Op_RegFlags) {
    // In(1) will be rematerialized, too.
    // Stretching lots of inputs - don't do it.
    if (in(1)->req() > 2) {
    if (in(1)->req() > (in(1)->is_MachConstant() ? 3U : 2U)) {
      return false;
    }
  }

@@ -491,7 +492,7 @@ bool MachNode::rematerialize() const {
  uint idx = oper_input_base();
  if (req() > idx) {
    const RegMask &rm = in_RegMask(idx);
    if (rm.is_bound(ideal_reg())) {
    if (rm.is_NotEmpty() && rm.is_bound(ideal_reg())) {
      return false;
    }
  }
@@ -0,0 +1,119 @@
/*
 * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package org.openjdk.bench.jdk.incubator.vector;

import java.util.concurrent.TimeUnit;
import jdk.incubator.vector.DoubleVector;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.LongVector;
import org.openjdk.jmh.annotations.*;

@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
public class SpiltReplicate {
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public long broadcastInt() {
        var species = IntVector.SPECIES_PREFERRED;
        var sum = IntVector.zero(species);
        return sum.add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8)
                .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16)
                .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24)
                .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32)
                .add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8)
                .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16)
                .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24)
                .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32)
                .reinterpretAsLongs()
                .lane(0);
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public long broadcastLong() {
        var species = LongVector.SPECIES_PREFERRED;
        var sum = LongVector.zero(species);
        return sum.add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8)
                .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16)
                .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24)
                .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32)
                .add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8)
                .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16)
                .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24)
                .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32)
                .reinterpretAsLongs()
                .lane(0);
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public long broadcastFloat() {
        var species = FloatVector.SPECIES_PREFERRED;
        var sum = FloatVector.zero(species);
        return sum.add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8)
                .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16)
                .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24)
                .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32)
                .add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8)
                .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16)
                .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24)
                .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32)
                .reinterpretAsLongs()
                .lane(0);
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public long broadcastDouble() {
        var species = DoubleVector.SPECIES_PREFERRED;
        var sum = DoubleVector.zero(species);
        return sum.add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8)
                .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16)
                .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24)
                .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32)
                .add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8)
                .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16)
                .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24)
                .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32)
                .reinterpretAsLongs()
                .lane(0);
    }

    @Benchmark
    public void testInt() {
        broadcastInt();
    }

    @Benchmark
    public void testLong() {
        broadcastLong();
    }

    @Benchmark
    public void testFloat() {
        broadcastFloat();
    }

    @Benchmark
    public void testDouble() {
        broadcastDouble();
    }
}