8221092: UseAVX=3 has performance degradation on Skylake (X7) processors
Fix for UseAVX=3 has performance degradation on Skylake (X7) processors
Reviewed-by: kvn
This commit is contained in:
parent 4325a8443b
commit dce5f5dbc8
@@ -211,5 +211,15 @@ define_pd_global(bool, ThreadLocalHandshakes, false);
            "Use BMI2 instructions") \
                                     \
  diagnostic(bool, UseLibmIntrinsic, true, \
            "Use Libm Intrinsics")
            "Use Libm Intrinsics") \
                                   \
  /* Minimum array size in bytes to use AVX512 intrinsics */ \
  /* for copy, inflate and fill which don't bail out early based on any */ \
  /* condition. When this value is set to zero compare operations like */ \
  /* compare, vectorizedMismatch, compress can also use AVX512 intrinsics.*/\
  diagnostic(int, AVX3Threshold, 4096, \
            "Minimum array size in bytes to use AVX512 intrinsics" \
            "for copy, inflate and fill. When this value is set as zero" \
            "compare operations can also use AVX512 intrinsics.") \
            range(0, max_jint)
#endif // CPU_X86_GLOBALS_X86_HPP
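A brief usage sketch for the new switch (the flag name and default are taken from the hunk above; MyApp is just a placeholder). AVX3Threshold is declared diagnostic, so it must be unlocked first:

    # Lower the threshold so the AVX-512 copy/fill/inflate paths kick in at 1 KB instead of 4 KB
    java -XX:+UnlockDiagnosticVMOptions -XX:AVX3Threshold=1024 MyApp

As the MacroAssembler changes below show, a threshold of zero additionally re-enables the AVX-512 paths for compare, vectorizedMismatch and compress, while non-default values are validated in VM_Version::get_processor_features() further down and must be a power of two.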
@@ -6593,7 +6593,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
  bind(COMPARE_WIDE_VECTORS_LOOP);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
  if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
    cmpl(cnt2, stride2x2);
    jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
    testl(cnt2, stride2x2-1); // cnt2 holds the vector count
@@ -6853,7 +6853,7 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
  testl(len, len);
  jcc(Assembler::zero, FALSE_LABEL);

  if ((UseAVX > 2) && // AVX512
  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

@@ -6926,7 +6926,7 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
  } else {
    movl(result, len); // copy

    if (UseAVX == 2 && UseSSE >= 2) {
    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

@@ -7099,14 +7099,12 @@ void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ar
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);

#ifdef _LP64
    if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

@@ -7139,7 +7137,7 @@ void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ar

    }//if (VM_Version::supports_avx512vlbw())
#endif //_LP64

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);
@@ -7365,32 +7363,33 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
  assert( UseSSE >= 2, "supported cpu only" );
  Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
  movdl(xtmp, value);
  if (UseAVX > 2 && UseUnalignedLoadStores) {
  if (UseAVX >= 2 && UseUnalignedLoadStores) {
    Label L_check_fill_32_bytes;
    if (UseAVX > 2) {
      // Fill 64-byte chunks
      Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;

      // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
      cmpl(count, AVX3Threshold);
      jccb(Assembler::below, L_check_fill_64_bytes_avx2);

      vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

      subl(count, 16 << shift);
      jccb(Assembler::less, L_check_fill_32_bytes);
      align(16);

      BIND(L_fill_64_bytes_loop_avx3);
      evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
      addptr(to, 64);
      subl(count, 16 << shift);
      jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
      jmpb(L_check_fill_32_bytes);

      BIND(L_check_fill_64_bytes_avx2);
    }
    // Fill 64-byte chunks
    Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
    vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

    subl(count, 16 << shift);
    jcc(Assembler::less, L_check_fill_32_bytes);
    align(16);

    BIND(L_fill_64_bytes_loop);
    evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
    addptr(to, 64);
    subl(count, 16 << shift);
    jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

    BIND(L_check_fill_32_bytes);
    addl(count, 8 << shift);
    jccb(Assembler::less, L_check_fill_8_bytes);
    vmovdqu(Address(to, 0), xtmp);
    addptr(to, 32);
    subl(count, 8 << shift);

    BIND(L_check_fill_8_bytes);
  } else if (UseAVX == 2 && UseUnalignedLoadStores) {
    // Fill 64-byte chunks
    Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
    Label L_fill_64_bytes_loop;
    vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);

    subl(count, 16 << shift);
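To summarize the control flow the reworked generate_fill emits, here is a small, self-contained C++ sketch (illustrative only, not HotSpot code; kAVX3Threshold and fill_sketch are stand-in names). At or above the threshold the stub stays in the 64-byte ZMM loop; below it, it fills with 32-byte stores, which sidesteps the AVX-512 frequency penalty for small fills:

    // Sketch of the size-based dispatch added to generate_fill() (not HotSpot code).
    #include <cstddef>
    #include <cstdint>

    static const size_t kAVX3Threshold = 4096;   // default of the new flag

    void fill_sketch(uint8_t* to, uint8_t value, size_t count) {
      if (count >= kAVX3Threshold) {
        // Corresponds to L_fill_64_bytes_loop_avx3: one 64-byte ZMM store per iteration.
        for (; count >= 64; count -= 64, to += 64) {
          for (int i = 0; i < 64; i++) to[i] = value;
        }
      }
      // Corresponds to the 32-byte path taken below the threshold (and for the remainder).
      for (; count >= 32; count -= 32, to += 32) {
        for (int i = 0; i < 32; i++) to[i] = value;
      }
      // Smaller tails fall through to the existing 8-byte and byte loops (elided here).
      for (; count > 0; count--) *to++ = value;
    }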
@@ -8104,12 +8103,13 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
  shlq(length);
  xorq(result, result);

  if ((UseAVX > 2) &&
  if ((AVX3Threshold == 0) && (UseAVX > 2) &&
      VM_Version::supports_avx512vlbw()) {
    Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;

    cmpq(length, 64);
    jcc(Assembler::less, VECTOR32_TAIL);

    movq(tmp1, length);
    andq(tmp1, 0x3F); // tail count
    andq(length, ~(0x3F)); //vector count
@@ -9566,7 +9566,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
  // save length for return
  push(len);

  if ((UseAVX > 2) && // AVX512
  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {
@@ -9758,7 +9758,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
// }
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        XMMRegister tmp1, Register tmp2) {
  Label copy_chars_loop, done, below_threshold;
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
  // rsi: src
  // rdi: dst
  // rdx: len
@@ -9768,7 +9768,7 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
  // rdi holds start addr of destination char[]
  // rdx holds length
  assert_different_registers(src, dst, len, tmp2);

  movl(tmp2, len);
  if ((UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {
@@ -9780,9 +9780,11 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
    testl(len, -16);
    jcc(Assembler::zero, below_threshold);

    testl(len, -1 * AVX3Threshold);
    jcc(Assembler::zero, avx3_threshold);

    // In order to use only one arithmetic operation for the main loop we use
    // this pre-calculation
    movl(tmp2, len);
    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
    andl(len, -32); // vector count
    jccb(Assembler::zero, copy_tail);
@@ -9813,12 +9815,11 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
    evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);

    jmp(done);
    bind(avx3_threshold);
  }
  if (UseSSE42Intrinsics) {
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

    movl(tmp2, len);

    if (UseAVX > 1) {
      andl(tmp2, (16 - 1));
      andl(len, -16);
@@ -9843,13 +9844,7 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len

    bind(below_threshold);
    bind(copy_new_tail);
    if ((UseAVX > 2) &&
        VM_Version::supports_avx512vlbw() &&
        VM_Version::supports_bmi2()) {
      movl(tmp2, len);
    } else {
      movl(len, tmp2);
    }
    movl(len, tmp2);
    andl(tmp2, 0x00000007);
    andl(len, 0xFFFFFFF8);
    jccb(Assembler::zero, copy_tail);
@@ -1288,30 +1288,58 @@ class StubGenerator: public StubCodeGenerator {
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      __ BIND(L_loop);
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
        __ jccb(Assembler::less, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ bind(L_loop_avx512);
        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
      } else if (UseAVX == 2) {
        __ bind(L_above_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
        __ bind(L_below_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx2);

        __ bind(L_32_byte_head);
        __ subptr(qword_count, 4); // sub(8) and add(4)
        __ jccb(Assembler::greater, L_end);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
        __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
        __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
      __ BIND(L_loop);
      if (UseAVX == 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
        __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
        __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
      }

      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 8);
      __ jcc(Assembler::lessEqual, L_loop);
      __ subptr(qword_count, 4); // sub(8) and add(4)
      __ jccb(Assembler::greater, L_end);
      }
      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 8);
      __ jcc(Assembler::lessEqual, L_loop);
      __ subptr(qword_count, 4); // sub(8) and add(4)
      __ jccb(Assembler::greater, L_end);
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
@@ -1368,31 +1396,59 @@ class StubGenerator: public StubCodeGenerator {
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      __ BIND(L_loop);
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (AVX3Threshold / 8));
        __ jccb(Assembler::greater, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ BIND(L_loop_avx512);
        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
      } else if (UseAVX == 2) {
        __ bind(L_above_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
        __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
        __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
        __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
        __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
        __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
      }
      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 8);
      __ jcc(Assembler::greaterEqual, L_loop);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        __ bind(L_below_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx2);

        __ addptr(qword_count, 4); // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
        __ bind(L_32_byte_head);
        __ addptr(qword_count, 4); // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
      } else {
        __ BIND(L_loop);
        if (UseAVX == 2) {
          __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
          __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        } else {
          __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
          __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
          __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
          __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
          __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
          __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
          __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
          __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
        }

        __ BIND(L_copy_bytes);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop);

        __ addptr(qword_count, 4); // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
      }
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
@@ -381,6 +381,10 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
    __ cmpl(rax, 0xE0);
    __ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported

    __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
    __ movl(rax, Address(rsi, 0));
    __ cmpl(rax, 0x50654); // If it is Skylake
    __ jcc(Assembler::equal, legacy_setup);
    // If UseAVX is unitialized or is set by the user to include EVEX
    if (use_evex) {
      // EVEX setup: run in lowest evex mode
@@ -465,6 +469,11 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
    __ cmpl(rax, 0xE0);
    __ jcc(Assembler::notEqual, legacy_save_restore);

    __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
    __ movl(rax, Address(rsi, 0));
    __ cmpl(rax, 0x50654); // If it is Skylake
    __ jcc(Assembler::equal, legacy_save_restore);

    // If UseAVX is unitialized or is set by the user to include EVEX
    if (use_evex) {
      // EVEX check: run in lowest evex mode
@@ -660,6 +669,9 @@ void VM_Version::get_processor_features() {
  }
  if (FLAG_IS_DEFAULT(UseAVX)) {
    FLAG_SET_DEFAULT(UseAVX, use_avx_limit);
    if (is_intel_family_core() && _model == CPU_MODEL_SKYLAKE && _stepping < 5) {
      FLAG_SET_DEFAULT(UseAVX, 2);  //Set UseAVX=2 for Skylake
    }
  } else if (UseAVX > use_avx_limit) {
    warning("UseAVX=%d is not supported on this CPU, setting it to UseAVX=%d", (int) UseAVX, use_avx_limit);
    FLAG_SET_DEFAULT(UseAVX, use_avx_limit);
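In short: when UseAVX is left at its default, the code above now caps it at 2 on Intel Core-family CPUs that identify as Skylake server parts (model 0x55, stepping below 5), even though the hardware advertises AVX-512. Users who still want the 512-bit code paths on such machines can request them explicitly, e.g. `java -XX:UseAVX=3 MyApp` (MyApp is a placeholder), while an explicit value above what the CPU supports is clamped to use_avx_limit with the warning shown above.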
@@ -1059,6 +1071,13 @@ void VM_Version::get_processor_features() {
  }
#endif // COMPILER2 && ASSERT

  if (!FLAG_IS_DEFAULT(AVX3Threshold)) {
    if (!is_power_of_2(AVX3Threshold)) {
      warning("AVX3Threshold must be a power of 2");
      FLAG_SET_DEFAULT(AVX3Threshold, 4096);
    }
  }

#ifdef _LP64
  if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
    UseMultiplyToLenIntrinsic = true;
@@ -366,7 +366,7 @@ enum Extended_Family {
    CPU_MODEL_HASWELL_E3    = 0x3c,
    CPU_MODEL_HASWELL_E7    = 0x3f,
    CPU_MODEL_BROADWELL     = 0x3d,
    CPU_MODEL_SKYLAKE       = CPU_MODEL_HASWELL_E3
    CPU_MODEL_SKYLAKE       = 0x55
  };

  // cpuid information block. All info derived from executing cpuid with
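For context, a small standalone sketch (not part of the patch; it assumes the standard CPUID leaf-1 EAX field layout) showing how the raw signature 0x50654 checked in the VM_Version stub above decodes to the new CPU_MODEL_SKYLAKE value 0x55 plus a stepping, which is what the `_model == CPU_MODEL_SKYLAKE && _stepping < 5` test in get_processor_features() keys off:

    // Decode a CPUID(1).EAX signature into display family/model/stepping (illustrative).
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t eax = 0x50654;                         // signature compared against in the stub
      uint32_t stepping  = eax & 0xF;                 // 0x4
      uint32_t model     = (eax >> 4) & 0xF;          // 0x5
      uint32_t family    = (eax >> 8) & 0xF;          // 0x6
      uint32_t ext_model = (eax >> 16) & 0xF;         // 0x5
      // For family 0x6 the display model combines the extended and base model nibbles.
      uint32_t disp_model = (ext_model << 4) | model; // 0x55 == CPU_MODEL_SKYLAKE
      printf("family=0x%x model=0x%x stepping=%u\n", family, disp_model, stepping);
      return 0;
    }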
@@ -3861,7 +3861,7 @@ instruct Repl16F_mem(legVecZ dst, memory mem) %{
%}

instruct Repl2F_zero(vecD dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateF zero));
  format %{ "xorps $dst,$dst\t! replicate2F zero" %}
  ins_encode %{
@@ -3871,7 +3871,7 @@ instruct Repl2F_zero(vecD dst, immF0 zero) %{
%}

instruct Repl4F_zero(vecX dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateF zero));
  format %{ "xorps $dst,$dst\t! replicate4F zero" %}
  ins_encode %{
@@ -3881,7 +3881,7 @@ instruct Repl4F_zero(vecX dst, immF0 zero) %{
%}

instruct Repl8F_zero(vecY dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
  predicate(n->as_Vector()->length() == 8 && UseAVX > 0);
  match(Set dst (ReplicateF zero));
  format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %}
  ins_encode %{
@@ -3955,7 +3955,7 @@ instruct Repl8D_mem(legVecZ dst, memory mem) %{

// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateD zero));
  format %{ "xorpd $dst,$dst\t! replicate2D zero" %}
  ins_encode %{
@@ -3965,7 +3965,7 @@ instruct Repl2D_zero(vecX dst, immD0 zero) %{
%}

instruct Repl4D_zero(vecY dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
  predicate(n->as_Vector()->length() == 4 && UseAVX > 0);
  match(Set dst (ReplicateD zero));
  format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %}
  ins_encode %{
@@ -4890,42 +4890,6 @@ instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
  ins_pipe( pipe_slow );
%}

instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor $dst k0,$dst,$dst\t! replicate2F zero" %}
  ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor $dst k0,$dst,$dst\t! replicate4F zero" %}
  ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor $dst k0,$dst,$dst\t! replicate8F zero" %}
  ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
@@ -4982,30 +4946,6 @@ instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
  ins_pipe( pipe_slow );
%}

instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor $dst k0,$dst,$dst\t! replicate2D zero" %}
  ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor $dst k0,$dst,$dst\t! replicate4D zero" %}
  ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD zero));