8294865: x86: Improve the code generation of MulVB and MulVL
Reviewed-by: kvn, vlivanov
Parent: 2087424736
Commit: 404e8de155
@@ -7219,7 +7219,7 @@ void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len)
   emit_int16(0x40, (0xC0 | encode));
 }
 
-void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+void Assembler::evpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 2, "requires some form of EVEX");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
@@ -7254,7 +7254,7 @@ void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len)
   emit_operand(dst, src, 0);
 }
 
-void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+void Assembler::evpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 2, "requires some form of EVEX");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -2519,11 +2519,11 @@ private:
   void pmuludq(XMMRegister dst, XMMRegister src);
   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
-  void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
-  void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void evpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vpmulhuw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
   // Minimum of packed integers
@@ -1757,7 +1757,7 @@ void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegis
     }
     break;
   case Op_MulReductionVL: assert(UseAVX > 2, "required");
-    vpmullq(dst, dst, src, vector_len); break;
+    evpmullq(dst, dst, src, vector_len); break;
   default: assert(false, "wrong opcode");
   }
 }
@@ -1805,7 +1805,7 @@ void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegis
       default: assert(false, "wrong type");
     }
     break;
-  case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
+  case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
   default: assert(false, "wrong opcode");
   }
 }
@@ -191,10 +191,13 @@
   switch(vopc) {
     default:
       return 0;
+    case Op_MulVB:
+      return 7;
+    case Op_MulVL:
+      return VM_Version::supports_avx512vldq() ? 0 : 6;
     case Op_VectorCastF2X: // fall through
-    case Op_VectorCastD2X: {
+    case Op_VectorCastD2X:
       return is_floating_point_type(ety) ? 0 : (is_subword_type(ety) ? 35 : 30);
-    }
     case Op_CountTrailingZerosV:
     case Op_CountLeadingZerosV:
       return VM_Version::supports_avx512cd() && (ety == T_INT || ety == T_LONG) ? 0 : 40;
@@ -210,9 +213,8 @@
     case Op_ReverseV:
      return VM_Version::supports_gfni() ? 0 : 30;
     case Op_RoundVF: // fall through
-    case Op_RoundVD: {
+    case Op_RoundVD:
       return 30;
-    }
   }
 }
 
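(Note: these switch entries are C2's estimate of how many extra instructions a vector op expands to beyond a single instruction. The new values line up with the lowerings introduced later in this patch: the byte multiply expands to about 7 instructions, and the long multiply to about 6, unless AVX-512VL+DQ makes a single evpmullq available, in which case the extra cost is 0.)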
@@ -1734,7 +1734,6 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
     break;
   case Op_AbsVD:
   case Op_NegVD:
-  case Op_MulVL:
     if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
       return false; // 512bit vpmullq, vandpd and vxorpd are not available
     }
@@ -5640,114 +5639,66 @@ instruct vsubD_mem(vec dst, vec src, memory mem) %{
 // --------------------------------- MUL --------------------------------------
 
 // Byte vector mul
-instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp) %{
-  predicate(Matcher::vector_length(n) == 4 ||
-            Matcher::vector_length(n) == 8);
+instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
+  predicate(Matcher::vector_length_in_bytes(n) <= 8);
   match(Set dst (MulVB src1 src2));
-  effect(TEMP dst, TEMP tmp);
-  format %{"vector_mulB $dst,$src1,$src2" %}
+  effect(TEMP dst, TEMP xtmp);
+  format %{ "mulVB $dst, $src1, $src2\t! using $xtmp as TEMP" %}
   ins_encode %{
     assert(UseSSE > 3, "required");
-    __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
-    __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
-    __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
-    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
-    __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
+    __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
+    __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
+    __ psllw($dst$$XMMRegister, 8);
+    __ psrlw($dst$$XMMRegister, 8);
     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2) %{
-  predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
+instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
+  predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
   match(Set dst (MulVB src1 src2));
-  effect(TEMP dst, TEMP tmp1, TEMP tmp2);
-  format %{"vector_mulB $dst,$src1,$src2" %}
+  effect(TEMP dst, TEMP xtmp);
+  format %{ "mulVB $dst, $src1, $src2\t! using $xtmp as TEMP" %}
   ins_encode %{
     assert(UseSSE > 3, "required");
-    __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
-    __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
-    __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
-    __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
-    __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
-    __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
-    __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
-    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
-    __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
-    __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
-    __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
+    // Odd-index elements
+    __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
+    __ psrlw($dst$$XMMRegister, 8);
+    __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
+    __ psrlw($xtmp$$XMMRegister, 8);
+    __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
+    __ psllw($dst$$XMMRegister, 8);
+    // Even-index elements
+    __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
+    __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
+    __ psllw($xtmp$$XMMRegister, 8);
+    __ psrlw($xtmp$$XMMRegister, 8);
+    // Combine
+    __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp) %{
-  predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
+instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
+  predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
   match(Set dst (MulVB src1 src2));
-  effect(TEMP dst, TEMP tmp);
-  format %{"vector_mulB $dst,$src1,$src2" %}
+  effect(TEMP xtmp1, TEMP xtmp2);
+  format %{ "vmulVB $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
   ins_encode %{
-    int vlen_enc = Assembler::AVX_256bit;
-    __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
-    __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
-    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
-    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
-    __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
-    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2) %{
-  predicate(Matcher::vector_length(n) == 32);
-  match(Set dst (MulVB src1 src2));
-  effect(TEMP dst, TEMP tmp1, TEMP tmp2);
-  format %{"vector_mulB $dst,$src1,$src2" %}
-  ins_encode %{
-    assert(UseAVX > 1, "required");
-    int vlen_enc = Assembler::AVX_256bit;
-    __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
-    __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
-    __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
-    __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
-    __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
-    __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
-    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
-    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
-    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2) %{
-  predicate(Matcher::vector_length(n) == 64);
-  match(Set dst (MulVB src1 src2));
-  effect(TEMP dst, TEMP tmp1, TEMP tmp2);
-  format %{"vector_mulB $dst,$src1,$src2\n\t" %}
-  ins_encode %{
-    assert(UseAVX > 2, "required");
-    int vlen_enc = Assembler::AVX_512bit;
-    __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
-    __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
-    __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
-    __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
-    __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
-    __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
-    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
-    __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
-    __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
+    int vlen_enc = vector_length_encoding(this);
+    // Odd-index elements
+    __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
+    __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
+    __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
+    __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
+    // Even-index elements
+    __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
+    __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
+    __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
+    // Combine
+    __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
  %}
   ins_pipe( pipe_slow );
 %}
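(Note: the hunk above replaces four length-specialized byte-multiply patterns with two generic ones. Since x86 has no byte-wide vector multiply, pmullw/vpmullw on 16-bit lanes is applied twice: once for the odd-index bytes, shifted into the high half of each word, and once for the even-index bytes, masked to the low half. A minimal scalar model of one 16-bit lane follows; mul_byte_pair is illustrative and not part of the patch.)

#include <cassert>
#include <cstdint>

// Models one 16-bit lane of the new vmulB/vmulB_reg rules.
static uint16_t mul_byte_pair(uint16_t a, uint16_t b) {
  // Odd-index byte: psrlw 8, pmullw, psllw 8 - the low 8 bits of the
  // high bytes' product land in the high byte; the low byte is zero.
  uint16_t odd = (uint16_t)(((a >> 8) * (b >> 8)) << 8);
  // Even-index byte: pmullw on the full lanes, then psllw 8 + psrlw 8
  // clear the high byte, keeping the low bytes' product modulo 256.
  uint16_t even = (uint16_t)(a * b) & 0xFF;
  // Combine: por/vpor merges the two disjoint halves.
  return (uint16_t)(odd | even);
}

int main() {
  // Low bytes: 0x02 * 0x03 = 0x06; high bytes: 0x7F * 0x02 = 0xFE.
  assert(mul_byte_pair(0x7F02, 0x0203) == 0xFE06);
  return 0;
}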
@@ -5756,7 +5707,7 @@ instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2) %{
 instruct vmulS(vec dst, vec src) %{
   predicate(UseAVX == 0);
   match(Set dst (MulVS dst src));
-  format %{ "pmullw $dst,$src\t! mul packedS" %}
+  format %{ "pmullw $dst,$src\t! mul packedS" %}
   ins_encode %{
     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
   %}
@@ -5822,78 +5773,76 @@ instruct vmulI_mem(vec dst, vec src, memory mem) %{
 %}
 
 // Longs vector mul
-instruct vmulL_reg(vec dst, vec src1, vec src2) %{
-  predicate(VM_Version::supports_avx512dq());
+instruct evmulL_reg(vec dst, vec src1, vec src2) %{
+  predicate((Matcher::vector_length_in_bytes(n) == 64 &&
+             VM_Version::supports_avx512dq()) ||
+            VM_Version::supports_avx512vldq());
   match(Set dst (MulVL src1 src2));
-  format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
+  format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
   ins_encode %{
     assert(UseAVX > 2, "required");
     int vlen_enc = vector_length_encoding(this);
-    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
+    __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vmulL_mem(vec dst, vec src, memory mem) %{
-  predicate(VM_Version::supports_avx512dq() &&
-            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
+instruct evmulL_mem(vec dst, vec src, memory mem) %{
+  predicate((Matcher::vector_length_in_bytes(n) == 64 &&
+             VM_Version::supports_avx512dq()) ||
+            (Matcher::vector_length_in_bytes(n) > 8 &&
+             VM_Version::supports_avx512vldq()));
   match(Set dst (MulVL src (LoadVector mem)));
-  format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
+  format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
   ins_encode %{
     assert(UseAVX > 2, "required");
     int vlen_enc = vector_length_encoding(this);
-    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
+    __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
-  predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
-  match(Set dst (MulVL dst src2));
-  effect(TEMP dst, TEMP tmp);
-  format %{ "pshufd $tmp,$src2, 177\n\t"
-            "pmulld $tmp,$dst\n\t"
-            "phaddd $tmp,$tmp\n\t"
-            "pmovzxdq $tmp,$tmp\n\t"
-            "psllq $tmp, 32\n\t"
-            "pmuludq $dst,$src2\n\t"
-            "paddq $dst,$tmp\n\t! mul packed2L" %}
+instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
+  predicate(UseAVX == 0);
+  match(Set dst (MulVL src1 src2));
+  effect(TEMP dst, TEMP xtmp);
+  format %{ "mulVL $dst, $src1, $src2\t! using $xtmp as TEMP" %}
   ins_encode %{
     assert(VM_Version::supports_sse4_1(), "required");
-    int vlen_enc = Assembler::AVX_128bit;
-    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
-    __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
-    __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
-    __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
-    __ psllq($tmp$$XMMRegister, 32);
-    __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
-    __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
+    // Get the lo-hi products; only their lower 32 bits are of concern
+    __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
+    __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
+    __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
+    __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
+    __ psllq($dst$$XMMRegister, 32);
+    // Get the lo-lo products
+    __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
+    __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
+    __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
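(Note: the fallback computes each 64-bit product from 32-bit halves using the identity a*b mod 2^64 = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32). The pshufd/pmulld/paddd steps form the two cross products, psllq positions their sum, and pmuludq supplies the full lo*lo product. A scalar model of one lane follows; mul64_via_32 is illustrative and not part of the patch.)

#include <cassert>
#include <cstdint>

// Models one 64-bit lane of the vmulL/vmulL_reg fallback.
static uint64_t mul64_via_32(uint64_t a, uint64_t b) {
  uint32_t a_lo = (uint32_t)a, a_hi = (uint32_t)(a >> 32);
  uint32_t b_lo = (uint32_t)b, b_hi = (uint32_t)(b >> 32);
  // Cross products (pshufd swaps halves, pmulld multiplies, paddd sums);
  // anything above bit 31 would be shifted out by the << 32 anyway.
  uint64_t cross = (uint64_t)(a_lo * b_hi + a_hi * b_lo) << 32;
  // Full 64-bit lo*lo product (pmuludq), then the final paddq.
  return (uint64_t)a_lo * b_lo + cross;
}

int main() {
  uint64_t a = 0x0000000300000005ULL, b = 0x0000000200000007ULL;
  assert(mul64_via_32(a, b) == a * b);
  return 0;
}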
 
-instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
-  predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
+instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
+  predicate(UseAVX > 0 &&
+            ((Matcher::vector_length_in_bytes(n) == 64 &&
+              !VM_Version::supports_avx512dq()) ||
+             (Matcher::vector_length_in_bytes(n) < 64 &&
+              !VM_Version::supports_avx512vldq())));
   match(Set dst (MulVL src1 src2));
-  effect(TEMP tmp1, TEMP tmp);
-  format %{ "vpshufd $tmp,$src2\n\t"
-            "vpmulld $tmp,$src1,$tmp\n\t"
-            "vphaddd $tmp,$tmp,$tmp\n\t"
-            "vpmovzxdq $tmp,$tmp\n\t"
-            "vpsllq $tmp,$tmp\n\t"
-            "vpmuludq $tmp1,$src1,$src2\n\t"
-            "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
+  effect(TEMP xtmp1, TEMP xtmp2);
+  format %{ "vmulVL $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
   ins_encode %{
-    int vlen_enc = Assembler::AVX_256bit;
-    __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
-    __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
-    __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
-    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
-    __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
-    __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
-    __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
-    __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
+    int vlen_enc = vector_length_encoding(this);
+    // Get the lo-hi products; only their lower 32 bits are of concern
+    __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
+    __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
+    __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
+    __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
+    __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
+    // Get the lo-lo products
+    __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
+    __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
   %}
   ins_pipe( pipe_slow );
 %}
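(Note: with Op_MulVL removed from the match_rule_supported_vector restriction earlier in the patch, every MulVL shape now has a matching rule; which one fires depends only on vector width and CPU flags. A hypothetical helper mirroring the register-form predicates above; select_mulvl and MulVLKind are illustrative, not part of the patch.)

// Mirrors the evmulL_reg / vmulL_reg / vmulL predicates.
enum class MulVLKind { EVPMULLQ, AVX_FALLBACK, SSE_FALLBACK };

static MulVLKind select_mulvl(int vlen_bytes, bool use_avx,
                              bool avx512dq, bool avx512vldq) {
  // evmulL_reg: a 512-bit evpmullq needs AVX-512DQ; shorter vectors also
  // need AVX-512VL (supports_avx512vldq covers both flags).
  if ((vlen_bytes == 64 && avx512dq) || avx512vldq)
    return MulVLKind::EVPMULLQ;
  // vmulL_reg: the three-multiply AVX sequence; vmulL: its SSE twin.
  return use_avx ? MulVLKind::AVX_FALLBACK : MulVLKind::SSE_FALLBACK;
}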
|
Loading…
x
Reference in New Issue
Block a user