8275047: Optimize existing fill stubs for AVX-512 target

Reviewed-by: kvn, redestad
Jatin Bhateja 2021-10-26 12:34:56 +00:00
parent 63e0f344e9
commit 4be88d5482
6 changed files with 262 additions and 43 deletions

src/hotspot/cpu/x86/macroAssembler_x86.cpp

@@ -5026,7 +5026,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
   BIND(L_loop);
   if (MaxVectorSize >= 32) {
-    fill64_avx(base, 0, xtmp, use64byteVector);
+    fill64(base, 0, xtmp, use64byteVector);
   } else {
     movdqu(Address(base, 0), xtmp);
     movdqu(Address(base, 16), xtmp);
@@ -5043,7 +5043,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
   if (use64byteVector) {
     addptr(cnt, 8);
     jccb(Assembler::equal, L_end);
-    fill64_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp, true);
+    fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
     jmp(L_end);
   } else {
     addptr(cnt, 4);
@@ -5062,7 +5062,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
   addptr(cnt, 4);
   jccb(Assembler::lessEqual, L_end);
   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
-    fill32_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp);
+    fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
   } else {
     decrement(cnt);
@@ -5086,7 +5086,7 @@ void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegiste
   // 64 byte initialization loop.
   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
   for (int i = 0; i < vector64_count; i++) {
-    fill64_avx(base, i * 64, xtmp, use64byteVector);
+    fill64(base, i * 64, xtmp, use64byteVector);
   }
   // Clear remaining 64 byte tail.
@@ -5207,6 +5207,15 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
   Label L_exit;
   Label L_fill_2_bytes, L_fill_4_bytes;
 
+#if defined(COMPILER2) && defined(_LP64)
+  if(MaxVectorSize >=32 &&
+     VM_Version::supports_avx512vlbw() &&
+     VM_Version::supports_bmi2()) {
+    generate_fill_avx3(t, to, value, count, rtmp, xtmp);
+    return;
+  }
+#endif
+
   int shift = -1;
   switch (t) {
     case T_BYTE:
@@ -5427,6 +5436,30 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
   BIND(L_exit);
 }
 
+void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
+  switch(type) {
+    case T_BYTE:
+    case T_BOOLEAN:
+      evpbroadcastb(dst, src, vector_len);
+      break;
+    case T_SHORT:
+    case T_CHAR:
+      evpbroadcastw(dst, src, vector_len);
+      break;
+    case T_INT:
+    case T_FLOAT:
+      evpbroadcastd(dst, src, vector_len);
+      break;
+    case T_LONG:
+    case T_DOUBLE:
+      evpbroadcastq(dst, src, vector_len);
+      break;
+    default:
+      fatal("Unhandled type : %s", type2name(type));
+      break;
+  }
+}
+
 // encode char[] to byte[] in ISO_8859_1 or ASCII
 //@IntrinsicCandidate
 //private static int implEncodeISOArray(byte[] sa, int sp,
@@ -8236,59 +8269,234 @@ void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMR
 #if COMPILER2_OR_JVMCI
 
+void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
+                                 Register length, Register temp, int vec_enc) {
+  // Computing mask for predicated vector store.
+  movptr(temp, -1);
+  bzhiq(temp, temp, length);
+  kmov(mask, temp);
+  evmovdqu(bt, mask, dst, xmm, vec_enc);
+}
+
 // Set memory operation for length "less than" 64 bytes.
-void MacroAssembler::fill64_masked_avx(uint shift, Register dst, int disp,
+void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
                        XMMRegister xmm, KRegister mask, Register length,
                        Register temp, bool use64byteVector) {
   assert(MaxVectorSize >= 32, "vector length should be >= 32");
-  assert(shift != 0, "shift value should be 1 (short),2(int) or 3(long)");
   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
   if (!use64byteVector) {
-    fill32_avx(dst, disp, xmm);
+    fill32(dst, disp, xmm);
     subptr(length, 32 >> shift);
-    fill32_masked_avx(shift, dst, disp + 32, xmm, mask, length, temp);
+    fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
   } else {
     assert(MaxVectorSize == 64, "vector length != 64");
-    movl(temp, 1);
-    shlxl(temp, temp, length);
-    subptr(temp, 1);
-    kmovwl(mask, temp);
-    evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_512bit);
+    fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
   }
 }
 
-void MacroAssembler::fill32_masked_avx(uint shift, Register dst, int disp,
+void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
                        XMMRegister xmm, KRegister mask, Register length,
                        Register temp) {
   assert(MaxVectorSize >= 32, "vector length should be >= 32");
-  assert(shift != 0, "shift value should be 1 (short), 2(int) or 3(long)");
   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
-  movl(temp, 1);
-  shlxl(temp, temp, length);
-  subptr(temp, 1);
-  kmovwl(mask, temp);
-  evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_256bit);
+  fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
 }
 
-void MacroAssembler::fill32_avx(Register dst, int disp, XMMRegister xmm) {
+void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
   assert(MaxVectorSize >= 32, "vector length should be >= 32");
   vmovdqu(Address(dst, disp), xmm);
 }
 
-void MacroAssembler::fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
+void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
   assert(MaxVectorSize >= 32, "vector length should be >= 32");
   BasicType type[] = {T_BYTE, T_SHORT, T_INT, T_LONG};
   if (!use64byteVector) {
-    fill32_avx(dst, disp, xmm);
-    fill32_avx(dst, disp + 32, xmm);
+    fill32(dst, disp, xmm);
+    fill32(dst, disp + 32, xmm);
   } else {
     evmovdquq(Address(dst, disp), xmm, Assembler::AVX_512bit);
   }
 }
+
+#ifdef _LP64
+void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
+                                        Register count, Register rtmp, XMMRegister xtmp) {
+  Label L_exit;
+  Label L_fill_start;
+  Label L_fill_64_bytes;
+  Label L_fill_96_bytes;
+  Label L_fill_128_bytes;
+  Label L_fill_128_bytes_loop;
+  Label L_fill_128_loop_header;
+  Label L_fill_128_bytes_loop_header;
+  Label L_fill_128_bytes_loop_pre_header;
+  Label L_fill_zmm_sequence;
+
+  int shift = -1;
+  switch(type) {
+    case T_BYTE:  shift = 0;
+      break;
+    case T_SHORT: shift = 1;
+      break;
+    case T_INT:   shift = 2;
+      break;
+    /* Uncomment when LONG fill stubs are supported.
+    case T_LONG:  shift = 3;
+      break;
+    */
+    default:
+      fatal("Unhandled type: %s\n", type2name(type));
+  }
+
+  if (AVX3Threshold != 0 || MaxVectorSize == 32) {
+
+    if (MaxVectorSize == 64) {
+      cmpq(count, AVX3Threshold >> shift);
+      jcc(Assembler::greater, L_fill_zmm_sequence);
+    }
+
+    evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
+
+    bind(L_fill_start);
+
+    cmpq(count, 32 >> shift);
+    jccb(Assembler::greater, L_fill_64_bytes);
+    fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
+    jmp(L_exit);
+
+    bind(L_fill_64_bytes);
+    cmpq(count, 64 >> shift);
+    jccb(Assembler::greater, L_fill_96_bytes);
+    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
+    jmp(L_exit);
+
+    bind(L_fill_96_bytes);
+    cmpq(count, 96 >> shift);
+    jccb(Assembler::greater, L_fill_128_bytes);
+    fill64(to, 0, xtmp);
+    subq(count, 64 >> shift);
+    fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
+    jmp(L_exit);
+
+    bind(L_fill_128_bytes);
+    cmpq(count, 128 >> shift);
+    jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
+    fill64(to, 0, xtmp);
+    fill32(to, 64, xtmp);
+    subq(count, 96 >> shift);
+    fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
+    jmp(L_exit);
+
+    bind(L_fill_128_bytes_loop_pre_header);
+    {
+      mov(rtmp, to);
+      andq(rtmp, 31);
+      jccb(Assembler::zero, L_fill_128_bytes_loop_header);
+      negq(rtmp);
+      addq(rtmp, 32);
+      mov64(r8, -1L);
+      bzhiq(r8, r8, rtmp);
+      kmovql(k2, r8);
+      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_256bit);
+      addq(to, rtmp);
+      shrq(rtmp, shift);
+      subq(count, rtmp);
+    }
+
+    cmpq(count, 128 >> shift);
+    jcc(Assembler::less, L_fill_start);
+
+    bind(L_fill_128_bytes_loop_header);
+    subq(count, 128 >> shift);
+
+    align32();
+    bind(L_fill_128_bytes_loop);
+    fill64(to, 0, xtmp);
+    fill64(to, 64, xtmp);
+    addq(to, 128);
+    subq(count, 128 >> shift);
+    jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
+
+    addq(count, 128 >> shift);
+    jcc(Assembler::zero, L_exit);
+    jmp(L_fill_start);
+  }
+
+  if (MaxVectorSize == 64) {
+    // Sequence using 64 byte ZMM register.
+    Label L_fill_128_bytes_zmm;
+    Label L_fill_192_bytes_zmm;
+    Label L_fill_192_bytes_loop_zmm;
+    Label L_fill_192_bytes_loop_header_zmm;
+    Label L_fill_192_bytes_loop_pre_header_zmm;
+    Label L_fill_start_zmm_sequence;
+
+    bind(L_fill_zmm_sequence);
+    evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
+
+    bind(L_fill_start_zmm_sequence);
+
+    cmpq(count, 64 >> shift);
+    jccb(Assembler::greater, L_fill_128_bytes_zmm);
+    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
+    jmp(L_exit);
+
+    bind(L_fill_128_bytes_zmm);
+    cmpq(count, 128 >> shift);
+    jccb(Assembler::greater, L_fill_192_bytes_zmm);
+    fill64(to, 0, xtmp, true);
+    subq(count, 64 >> shift);
+    fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
+    jmp(L_exit);
+
+    bind(L_fill_192_bytes_zmm);
+    cmpq(count, 192 >> shift);
+    jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
+    fill64(to, 0, xtmp, true);
+    fill64(to, 64, xtmp, true);
+    subq(count, 128 >> shift);
+    fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
+    jmp(L_exit);
+
+    bind(L_fill_192_bytes_loop_pre_header_zmm);
+    {
+      movq(rtmp, to);
+      andq(rtmp, 63);
+      jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
+      negq(rtmp);
+      addq(rtmp, 64);
+      mov64(r8, -1L);
+      bzhiq(r8, r8, rtmp);
+      kmovql(k2, r8);
+      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_512bit);
+      addq(to, rtmp);
+      shrq(rtmp, shift);
+      subq(count, rtmp);
+    }
+
+    cmpq(count, 192 >> shift);
+    jcc(Assembler::less, L_fill_start_zmm_sequence);
+
+    bind(L_fill_192_bytes_loop_header_zmm);
+    subq(count, 192 >> shift);
+
+    align32();
+    bind(L_fill_192_bytes_loop_zmm);
+    fill64(to, 0, xtmp, true);
+    fill64(to, 64, xtmp, true);
+    fill64(to, 128, xtmp, true);
+    addq(to, 192);
+    subq(count, 192 >> shift);
+    jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
+
+    addq(count, 192 >> shift);
+    jcc(Assembler::zero, L_exit);
+    jmp(L_fill_start_zmm_sequence);
+  }
+  bind(L_exit);
+}
+#endif
 #endif //COMPILER2_OR_JVMCI
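
Note: the masked tail store added above (MacroAssembler::fill_masked) is the heart of the new stubs. BZHI truncates an all-ones word to `length` low bits, that word becomes an AVX-512 opmask, and a predicated store writes only the first `length` lanes, so the tail of a fill never writes past the requested count. The following is a minimal standalone sketch of the same idea in C++ AVX-512 intrinsics; the function name and parameters are illustrative only and are not part of this commit. It assumes AVX512BW, AVX512VL, and BMI2 (e.g. compile with -mavx512bw -mavx512vl -mbmi2):

// Sketch of the BZHI + opmask tail-store technique used by fill_masked.
#include <immintrin.h>
#include <stdint.h>

// Fill the first len bytes (len < 32) of dst with value, using one
// predicated 256-bit store that never touches dst[len..31].
static void fill_tail_32(uint8_t* dst, uint8_t value, uint64_t len) {
  __m256i v = _mm256_set1_epi8((char)value);          // broadcast, like evpbroadcastb
  uint64_t bits = _bzhi_u64(~0ULL, (unsigned)len);    // all-ones truncated to len bits
  __mmask32 k = (__mmask32)bits;                      // one mask bit per byte lane
  _mm256_mask_storeu_epi8(dst, k, v);                 // masked-out lanes are not written
}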

src/hotspot/cpu/x86/macroAssembler_x86.hpp

@@ -1305,6 +1305,7 @@ public:
   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
                int comparison, bool is_signed, int vector_len, Register scratch_reg);
+  void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
 
   // Emit comparison instruction for the specified comparison predicate.
   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg);
@@ -1838,17 +1839,20 @@ public:
   void byte_array_inflate(Register src, Register dst, Register len,
                           XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
 
-  void fill64_masked_avx(uint shift, Register dst, int disp,
+  void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
+                   Register length, Register temp, int vec_enc);
+
+  void fill64_masked(uint shift, Register dst, int disp,
                          XMMRegister xmm, KRegister mask, Register length,
                          Register temp, bool use64byteVector = false);
 
-  void fill32_masked_avx(uint shift, Register dst, int disp,
+  void fill32_masked(uint shift, Register dst, int disp,
                          XMMRegister xmm, KRegister mask, Register length,
                          Register temp);
 
-  void fill32_avx(Register dst, int disp, XMMRegister xmm);
+  void fill32(Register dst, int disp, XMMRegister xmm);
 
-  void fill64_avx(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);
+  void fill64(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);
 
 #ifdef _LP64
   void convert_f2i(Register dst, XMMRegister src);

@@ -1885,6 +1889,10 @@ public:
   void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                   bool conjoint, int shift = Address::times_1, int offset = 0,
                   bool use64byteVector = false);
+
+  void generate_fill_avx3(BasicType type, Register to, Register value,
+                          Register count, Register rtmp, XMMRegister xtmp);
+
 #endif // COMPILER2_OR_JVMCI
 #endif // _LP64

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

@@ -2113,13 +2113,14 @@ class StubGenerator: public StubCodeGenerator {
     BLOCK_COMMENT("Entry:");
 
-    const Register to    = c_rarg0;  // source array address
+    const Register to    = c_rarg0;  // destination array address
     const Register value = c_rarg1;  // value
     const Register count = c_rarg2;  // elements count
+    __ mov(r11, count);
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
-    __ generate_fill(t, aligned, to, value, count, rax, xmm0);
+    __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
 
     __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame

src/hotspot/cpu/x86/stubRoutines_x86.hpp

@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_
 enum platform_dependent_constants {
   code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small)
-  code_size2 = 35300 LP64_ONLY(+32000)  // simply increase if too small (assembler will crash if too small)
+  code_size2 = 35300 LP64_ONLY(+35000)  // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {

src/hotspot/cpu/x86/vm_version_x86.cpp

@@ -1469,6 +1469,14 @@ void VM_Version::get_processor_features() {
 #endif
   }
 
+#ifdef COMPILER2
+  if (FLAG_IS_DEFAULT(OptimizeFill)) {
+    if (MaxVectorSize < 32 || !VM_Version::supports_avx512vlbw()) {
+      OptimizeFill = false;
+    }
+  }
+#endif
+
 #ifdef _LP64
   if (UseSSE42Intrinsics) {
     if (FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
@@ -1585,12 +1593,6 @@ void VM_Version::get_processor_features() {
     // Modern processors allow misaligned memory operations for vectors.
     AlignVector = !UseUnalignedLoadStores;
   }
-  if (FLAG_IS_DEFAULT(OptimizeFill)) {
-    // 8247307: On x86, the auto-vectorized loop array fill code shows
-    // better performance than the array fill stubs. We should reenable
-    // this after the x86 stubs get improved.
-    OptimizeFill = false;
-  }
 #endif // COMPILER2
 
   if (FLAG_IS_DEFAULT(AllocatePrefetchInstr)) {

test/micro/org/openjdk/bench/java/util/ArraysFill.java

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,12 +36,12 @@ import org.openjdk.jmh.annotations.Warmup;
 import java.util.Arrays;
 import java.util.concurrent.TimeUnit;
 
-@BenchmarkMode(Mode.AverageTime)
-@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
 @State(Scope.Thread)
 public class ArraysFill {
 
-    @Param({"10", "266", "2048"})
+    @Param({"10", "16", "31", "59", "89", "126", "250", "266", "511", "1021", "2047", "2048", "4095", "8195"})
     public int size;
 
     public byte[] testByteArray;