8221092: UseAVX=3 has performance degradation on Skylake (X7) processors

Fix for UseAVX=3 has performance degradation on Skylake (X7) processors

Reviewed-by: kvn
This commit is contained in:
Vivek Deshpande 2019-10-04 11:45:16 -07:00
parent 4325a8443b
commit dce5f5dbc8
6 changed files with 168 additions and 148 deletions

View File

@ -211,5 +211,15 @@ define_pd_global(bool, ThreadLocalHandshakes, false);
"Use BMI2 instructions") \
\
diagnostic(bool, UseLibmIntrinsic, true, \
"Use Libm Intrinsics")
"Use Libm Intrinsics") \
\
/* Minimum array size in bytes to use AVX512 intrinsics for copy, */ \
/* inflate and fill, which don't bail out early based on any */ \
/* condition. When this value is set to zero, operations like */ \
/* compare, vectorizedMismatch and compress can also use AVX512 */ \
/* intrinsics. The adjacent string literals below are concatenated */ \
/* verbatim, so each fragment must end with an explicit space. */ \
diagnostic(int, AVX3Threshold, 4096, \
"Minimum array size in bytes to use AVX512 intrinsics " \
"for copy, inflate and fill. When this value is set as zero " \
"compare operations can also use AVX512 intrinsics.") \
range(0, max_jint)
#endif // CPU_X86_GLOBALS_X86_HPP

View File

@ -6593,7 +6593,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
bind(COMPARE_WIDE_VECTORS_LOOP);
#ifdef _LP64
if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
cmpl(cnt2, stride2x2);
jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
testl(cnt2, stride2x2-1); // cnt2 holds the vector count
@ -6853,7 +6853,7 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
testl(len, len);
jcc(Assembler::zero, FALSE_LABEL);
if ((UseAVX > 2) && // AVX512
if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
@ -6926,7 +6926,7 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
} else {
movl(result, len); // copy
if (UseAVX == 2 && UseSSE >= 2) {
if (UseAVX >= 2 && UseSSE >= 2) {
// With AVX2, use 32-byte vector compare
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
@ -7099,14 +7099,12 @@ void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ar
lea(ary2, Address(ary2, limit, Address::times_1));
negptr(limit);
bind(COMPARE_WIDE_VECTORS);
#ifdef _LP64
if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
cmpl(limit, -64);
jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
@ -7139,7 +7137,7 @@ void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ar
}//if (VM_Version::supports_avx512vlbw())
#endif //_LP64
bind(COMPARE_WIDE_VECTORS);
vmovdqu(vec1, Address(ary1, limit, Address::times_1));
vmovdqu(vec2, Address(ary2, limit, Address::times_1));
vpxor(vec1, vec2);
@ -7365,32 +7363,33 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
assert( UseSSE >= 2, "supported cpu only" );
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
movdl(xtmp, value);
if (UseAVX > 2 && UseUnalignedLoadStores) {
if (UseAVX >= 2 && UseUnalignedLoadStores) {
Label L_check_fill_32_bytes;
if (UseAVX > 2) {
// Fill 64-byte chunks
Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
// If number of bytes to fill < AVX3Threshold, perform fill using AVX2
cmpl(count, AVX3Threshold);
jccb(Assembler::below, L_check_fill_64_bytes_avx2);
vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
subl(count, 16 << shift);
jccb(Assembler::less, L_check_fill_32_bytes);
align(16);
BIND(L_fill_64_bytes_loop_avx3);
evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
addptr(to, 64);
subl(count, 16 << shift);
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
jmpb(L_check_fill_32_bytes);
BIND(L_check_fill_64_bytes_avx2);
}
// Fill 64-byte chunks
Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
subl(count, 16 << shift);
jcc(Assembler::less, L_check_fill_32_bytes);
align(16);
BIND(L_fill_64_bytes_loop);
evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
addptr(to, 64);
subl(count, 16 << shift);
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
BIND(L_check_fill_32_bytes);
addl(count, 8 << shift);
jccb(Assembler::less, L_check_fill_8_bytes);
vmovdqu(Address(to, 0), xtmp);
addptr(to, 32);
subl(count, 8 << shift);
BIND(L_check_fill_8_bytes);
} else if (UseAVX == 2 && UseUnalignedLoadStores) {
// Fill 64-byte chunks
Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
Label L_fill_64_bytes_loop;
vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
subl(count, 16 << shift);
@ -8104,12 +8103,13 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
shlq(length);
xorq(result, result);
if ((UseAVX > 2) &&
if ((AVX3Threshold == 0) && (UseAVX > 2) &&
VM_Version::supports_avx512vlbw()) {
Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
cmpq(length, 64);
jcc(Assembler::less, VECTOR32_TAIL);
movq(tmp1, length);
andq(tmp1, 0x3F); // tail count
andq(length, ~(0x3F)); //vector count
@ -9566,7 +9566,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
// save length for return
push(len);
if ((UseAVX > 2) && // AVX512
if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
@ -9758,7 +9758,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
// }
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
XMMRegister tmp1, Register tmp2) {
Label copy_chars_loop, done, below_threshold;
Label copy_chars_loop, done, below_threshold, avx3_threshold;
// rsi: src
// rdi: dst
// rdx: len
@ -9768,7 +9768,7 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
// rdi holds start addr of destination char[]
// rdx holds length
assert_different_registers(src, dst, len, tmp2);
movl(tmp2, len);
if ((UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
@ -9780,9 +9780,11 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
testl(len, -16);
jcc(Assembler::zero, below_threshold);
testl(len, -1 * AVX3Threshold);
jcc(Assembler::zero, avx3_threshold);
// In order to use only one arithmetic operation for the main loop we use
// this pre-calculation
movl(tmp2, len);
andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
andl(len, -32); // vector count
jccb(Assembler::zero, copy_tail);
@ -9813,12 +9815,11 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
jmp(done);
bind(avx3_threshold);
}
if (UseSSE42Intrinsics) {
Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
movl(tmp2, len);
if (UseAVX > 1) {
andl(tmp2, (16 - 1));
andl(len, -16);
@ -9843,13 +9844,7 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
bind(below_threshold);
bind(copy_new_tail);
if ((UseAVX > 2) &&
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
movl(tmp2, len);
} else {
movl(len, tmp2);
}
movl(len, tmp2);
andl(tmp2, 0x00000007);
andl(len, 0xFFFFFFF8);
jccb(Assembler::zero, copy_tail);

View File

@ -1288,30 +1288,58 @@ class StubGenerator: public StubCodeGenerator {
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
__ BIND(L_copy_bytes);
__ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
__ jccb(Assembler::less, L_above_threshold);
__ jmpb(L_below_threshold);
__ bind(L_loop_avx512);
__ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
__ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ bind(L_above_threshold);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop_avx512);
__ jmpb(L_32_byte_head);
__ bind(L_loop_avx2);
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
__ bind(L_below_threshold);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop_avx2);
__ bind(L_32_byte_head);
__ subptr(qword_count, 4); // sub(8) and add(4)
__ jccb(Assembler::greater, L_end);
} else {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
__ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
__ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
__ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
__ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
__ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
__ BIND(L_loop);
if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
} else {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
__ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
__ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
__ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
__ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
__ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
}
__ BIND(L_copy_bytes);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop);
__ subptr(qword_count, 4); // sub(8) and add(4)
__ jccb(Assembler::greater, L_end);
}
__ BIND(L_copy_bytes);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop);
__ subptr(qword_count, 4); // sub(8) and add(4)
__ jccb(Assembler::greater, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
@ -1368,31 +1396,59 @@ class StubGenerator: public StubCodeGenerator {
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
__ BIND(L_copy_bytes);
__ cmpptr(qword_count, (AVX3Threshold / 8));
__ jccb(Assembler::greater, L_above_threshold);
__ jmpb(L_below_threshold);
__ BIND(L_loop_avx512);
__ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
__ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ bind(L_above_threshold);
__ subptr(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_loop_avx512);
__ jmpb(L_32_byte_head);
__ bind(L_loop_avx2);
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
} else {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
__ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
__ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
__ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
__ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
__ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
__ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
__ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
}
__ BIND(L_copy_bytes);
__ subptr(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_loop);
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
__ bind(L_below_threshold);
__ subptr(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_loop_avx2);
__ addptr(qword_count, 4); // add(8) and sub(4)
__ jccb(Assembler::less, L_end);
__ bind(L_32_byte_head);
__ addptr(qword_count, 4); // add(8) and sub(4)
__ jccb(Assembler::less, L_end);
} else {
__ BIND(L_loop);
if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
} else {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
__ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
__ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
__ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
__ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
__ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
__ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
__ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
}
__ BIND(L_copy_bytes);
__ subptr(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_loop);
__ addptr(qword_count, 4); // add(8) and sub(4)
__ jccb(Assembler::less, L_end);
}
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));

View File

@ -381,6 +381,10 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
__ cmpl(rax, 0xE0);
__ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported
__ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
__ movl(rax, Address(rsi, 0));
__ cmpl(rax, 0x50654); // If it is Skylake
__ jcc(Assembler::equal, legacy_setup);
// If UseAVX is uninitialized or is set by the user to include EVEX
if (use_evex) {
// EVEX setup: run in lowest evex mode
@ -465,6 +469,11 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
__ cmpl(rax, 0xE0);
__ jcc(Assembler::notEqual, legacy_save_restore);
__ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
__ movl(rax, Address(rsi, 0));
__ cmpl(rax, 0x50654); // If it is Skylake
__ jcc(Assembler::equal, legacy_save_restore);
// If UseAVX is uninitialized or is set by the user to include EVEX
if (use_evex) {
// EVEX check: run in lowest evex mode
@ -660,6 +669,9 @@ void VM_Version::get_processor_features() {
}
if (FLAG_IS_DEFAULT(UseAVX)) {
FLAG_SET_DEFAULT(UseAVX, use_avx_limit);
if (is_intel_family_core() && _model == CPU_MODEL_SKYLAKE && _stepping < 5) {
FLAG_SET_DEFAULT(UseAVX, 2); //Set UseAVX=2 for Skylake
}
} else if (UseAVX > use_avx_limit) {
warning("UseAVX=%d is not supported on this CPU, setting it to UseAVX=%d", (int) UseAVX, use_avx_limit);
FLAG_SET_DEFAULT(UseAVX, use_avx_limit);
@ -1059,6 +1071,13 @@ void VM_Version::get_processor_features() {
}
#endif // COMPILER2 && ASSERT
if (!FLAG_IS_DEFAULT(AVX3Threshold)) {
if (!is_power_of_2(AVX3Threshold)) {
warning("AVX3Threshold must be a power of 2");
FLAG_SET_DEFAULT(AVX3Threshold, 4096);
}
}
#ifdef _LP64
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
UseMultiplyToLenIntrinsic = true;

View File

@ -366,7 +366,7 @@ enum Extended_Family {
CPU_MODEL_HASWELL_E3 = 0x3c,
CPU_MODEL_HASWELL_E7 = 0x3f,
CPU_MODEL_BROADWELL = 0x3d,
CPU_MODEL_SKYLAKE = CPU_MODEL_HASWELL_E3
CPU_MODEL_SKYLAKE = 0x55
};
// cpuid information block. All info derived from executing cpuid with

View File

@ -3861,7 +3861,7 @@ instruct Repl16F_mem(legVecZ dst, memory mem) %{
%}
instruct Repl2F_zero(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
predicate(n->as_Vector()->length() == 2);
match(Set dst (ReplicateF zero));
format %{ "xorps $dst,$dst\t! replicate2F zero" %}
ins_encode %{
@ -3871,7 +3871,7 @@ instruct Repl2F_zero(vecD dst, immF0 zero) %{
%}
instruct Repl4F_zero(vecX dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
predicate(n->as_Vector()->length() == 4);
match(Set dst (ReplicateF zero));
format %{ "xorps $dst,$dst\t! replicate4F zero" %}
ins_encode %{
@ -3881,7 +3881,7 @@ instruct Repl4F_zero(vecX dst, immF0 zero) %{
%}
instruct Repl8F_zero(vecY dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
predicate(n->as_Vector()->length() == 8 && UseAVX > 0);
match(Set dst (ReplicateF zero));
format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %}
ins_encode %{
@ -3955,7 +3955,7 @@ instruct Repl8D_mem(legVecZ dst, memory mem) %{
// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
predicate(n->as_Vector()->length() == 2);
match(Set dst (ReplicateD zero));
format %{ "xorpd $dst,$dst\t! replicate2D zero" %}
ins_encode %{
@ -3965,7 +3965,7 @@ instruct Repl2D_zero(vecX dst, immD0 zero) %{
%}
instruct Repl4D_zero(vecY dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
predicate(n->as_Vector()->length() == 4 && UseAVX > 0);
match(Set dst (ReplicateD zero));
format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %}
ins_encode %{
@ -4890,42 +4890,6 @@ instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
ins_pipe( pipe_slow );
%}
// EVEX variant (UseAVX > 2): replicate a float zero into a 2-element vector.
// The destination is cleared by XOR-ing the register with itself.
instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate2F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
// NOTE(review): vector_len is fixed at 2 regardless of the matched vector length - confirm this is intended.
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
// EVEX variant (UseAVX > 2): replicate a float zero into a 4-element vector.
// The destination is cleared by XOR-ing the register with itself.
instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate4F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
// NOTE(review): vector_len is fixed at 2 regardless of the matched vector length - confirm this is intended.
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
// EVEX variant (UseAVX > 2): replicate a float zero into an 8-element vector.
// The destination is cleared by XOR-ing the register with itself.
instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate8F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
// NOTE(review): vector_len is fixed at 2 regardless of the matched vector length - confirm this is intended.
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
match(Set dst (ReplicateF zero));
@ -4982,30 +4946,6 @@ instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
ins_pipe( pipe_slow );
%}
// EVEX variant (UseAVX > 2): replicate a double zero into a 2-element vector.
// The destination is cleared by XOR-ing the register with itself.
instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate2D zero" %}
ins_encode %{
// Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
// NOTE(review): vector_len is fixed at 2 regardless of the matched vector length - confirm this is intended.
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
// EVEX variant (UseAVX > 2): replicate a double zero into a 4-element vector.
// The destination is cleared by XOR-ing the register with itself.
instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate4D zero" %}
ins_encode %{
// Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
// NOTE(review): vector_len is fixed at 2 regardless of the matched vector length - confirm this is intended.
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
match(Set dst (ReplicateD zero));