8275047: Optimize existing fill stubs for AVX-512 target
Reviewed-by: kvn, redestad
This commit is contained in:
parent
63e0f344e9
commit
4be88d5482
@ -5026,7 +5026,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
|
||||
|
||||
BIND(L_loop);
|
||||
if (MaxVectorSize >= 32) {
|
||||
fill64_avx(base, 0, xtmp, use64byteVector);
|
||||
fill64(base, 0, xtmp, use64byteVector);
|
||||
} else {
|
||||
movdqu(Address(base, 0), xtmp);
|
||||
movdqu(Address(base, 16), xtmp);
|
||||
@ -5043,7 +5043,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
|
||||
if (use64byteVector) {
|
||||
addptr(cnt, 8);
|
||||
jccb(Assembler::equal, L_end);
|
||||
fill64_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp, true);
|
||||
fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
|
||||
jmp(L_end);
|
||||
} else {
|
||||
addptr(cnt, 4);
|
||||
@ -5062,7 +5062,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
|
||||
addptr(cnt, 4);
|
||||
jccb(Assembler::lessEqual, L_end);
|
||||
if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
|
||||
fill32_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp);
|
||||
fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
|
||||
} else {
|
||||
decrement(cnt);
|
||||
|
||||
@ -5086,7 +5086,7 @@ void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegiste
|
||||
// 64 byte initialization loop.
|
||||
vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
|
||||
for (int i = 0; i < vector64_count; i++) {
|
||||
fill64_avx(base, i * 64, xtmp, use64byteVector);
|
||||
fill64(base, i * 64, xtmp, use64byteVector);
|
||||
}
|
||||
|
||||
// Clear remaining 64 byte tail.
|
||||
@ -5207,6 +5207,15 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
||||
Label L_exit;
|
||||
Label L_fill_2_bytes, L_fill_4_bytes;
|
||||
|
||||
#if defined(COMPILER2) && defined(_LP64)
|
||||
if(MaxVectorSize >=32 &&
|
||||
VM_Version::supports_avx512vlbw() &&
|
||||
VM_Version::supports_bmi2()) {
|
||||
generate_fill_avx3(t, to, value, count, rtmp, xtmp);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
int shift = -1;
|
||||
switch (t) {
|
||||
case T_BYTE:
|
||||
@ -5427,6 +5436,30 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
||||
BIND(L_exit);
|
||||
}
|
||||
|
||||
void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
|
||||
switch(type) {
|
||||
case T_BYTE:
|
||||
case T_BOOLEAN:
|
||||
evpbroadcastb(dst, src, vector_len);
|
||||
break;
|
||||
case T_SHORT:
|
||||
case T_CHAR:
|
||||
evpbroadcastw(dst, src, vector_len);
|
||||
break;
|
||||
case T_INT:
|
||||
case T_FLOAT:
|
||||
evpbroadcastd(dst, src, vector_len);
|
||||
break;
|
||||
case T_LONG:
|
||||
case T_DOUBLE:
|
||||
evpbroadcastq(dst, src, vector_len);
|
||||
break;
|
||||
default:
|
||||
fatal("Unhandled type : %s", type2name(type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// encode char[] to byte[] in ISO_8859_1 or ASCII
|
||||
//@IntrinsicCandidate
|
||||
//private static int implEncodeISOArray(byte[] sa, int sp,
|
||||
@ -8236,59 +8269,234 @@ void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMR
|
||||
|
||||
#if COMPILER2_OR_JVMCI
|
||||
|
||||
void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
|
||||
Register length, Register temp, int vec_enc) {
|
||||
// Computing mask for predicated vector store.
|
||||
movptr(temp, -1);
|
||||
bzhiq(temp, temp, length);
|
||||
kmov(mask, temp);
|
||||
evmovdqu(bt, mask, dst, xmm, vec_enc);
|
||||
}
|
||||
|
||||
// Set memory operation for length "less than" 64 bytes.
|
||||
void MacroAssembler::fill64_masked_avx(uint shift, Register dst, int disp,
|
||||
void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
|
||||
XMMRegister xmm, KRegister mask, Register length,
|
||||
Register temp, bool use64byteVector) {
|
||||
assert(MaxVectorSize >= 32, "vector length should be >= 32");
|
||||
assert(shift != 0, "shift value should be 1 (short),2(int) or 3(long)");
|
||||
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
|
||||
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
|
||||
if (!use64byteVector) {
|
||||
fill32_avx(dst, disp, xmm);
|
||||
fill32(dst, disp, xmm);
|
||||
subptr(length, 32 >> shift);
|
||||
fill32_masked_avx(shift, dst, disp + 32, xmm, mask, length, temp);
|
||||
fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
|
||||
} else {
|
||||
assert(MaxVectorSize == 64, "vector length != 64");
|
||||
movl(temp, 1);
|
||||
shlxl(temp, temp, length);
|
||||
subptr(temp, 1);
|
||||
kmovwl(mask, temp);
|
||||
evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_512bit);
|
||||
fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void MacroAssembler::fill32_masked_avx(uint shift, Register dst, int disp,
|
||||
void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
|
||||
XMMRegister xmm, KRegister mask, Register length,
|
||||
Register temp) {
|
||||
assert(MaxVectorSize >= 32, "vector length should be >= 32");
|
||||
assert(shift != 0, "shift value should be 1 (short), 2(int) or 3(long)");
|
||||
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
|
||||
movl(temp, 1);
|
||||
shlxl(temp, temp, length);
|
||||
subptr(temp, 1);
|
||||
kmovwl(mask, temp);
|
||||
evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_256bit);
|
||||
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
|
||||
fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
|
||||
}
|
||||
|
||||
|
||||
void MacroAssembler::fill32_avx(Register dst, int disp, XMMRegister xmm) {
|
||||
void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
|
||||
assert(MaxVectorSize >= 32, "vector length should be >= 32");
|
||||
vmovdqu(Address(dst, disp), xmm);
|
||||
}
|
||||
|
||||
void MacroAssembler::fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
|
||||
void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
|
||||
assert(MaxVectorSize >= 32, "vector length should be >= 32");
|
||||
BasicType type[] = {T_BYTE, T_SHORT, T_INT, T_LONG};
|
||||
if (!use64byteVector) {
|
||||
fill32_avx(dst, disp, xmm);
|
||||
fill32_avx(dst, disp + 32, xmm);
|
||||
fill32(dst, disp, xmm);
|
||||
fill32(dst, disp + 32, xmm);
|
||||
} else {
|
||||
evmovdquq(Address(dst, disp), xmm, Assembler::AVX_512bit);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _LP64
|
||||
void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
|
||||
Register count, Register rtmp, XMMRegister xtmp) {
|
||||
Label L_exit;
|
||||
Label L_fill_start;
|
||||
Label L_fill_64_bytes;
|
||||
Label L_fill_96_bytes;
|
||||
Label L_fill_128_bytes;
|
||||
Label L_fill_128_bytes_loop;
|
||||
Label L_fill_128_loop_header;
|
||||
Label L_fill_128_bytes_loop_header;
|
||||
Label L_fill_128_bytes_loop_pre_header;
|
||||
Label L_fill_zmm_sequence;
|
||||
|
||||
int shift = -1;
|
||||
switch(type) {
|
||||
case T_BYTE: shift = 0;
|
||||
break;
|
||||
case T_SHORT: shift = 1;
|
||||
break;
|
||||
case T_INT: shift = 2;
|
||||
break;
|
||||
/* Uncomment when LONG fill stubs are supported.
|
||||
case T_LONG: shift = 3;
|
||||
break;
|
||||
*/
|
||||
default:
|
||||
fatal("Unhandled type: %s\n", type2name(type));
|
||||
}
|
||||
|
||||
if (AVX3Threshold != 0 || MaxVectorSize == 32) {
|
||||
|
||||
if (MaxVectorSize == 64) {
|
||||
cmpq(count, AVX3Threshold >> shift);
|
||||
jcc(Assembler::greater, L_fill_zmm_sequence);
|
||||
}
|
||||
|
||||
evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
|
||||
|
||||
bind(L_fill_start);
|
||||
|
||||
cmpq(count, 32 >> shift);
|
||||
jccb(Assembler::greater, L_fill_64_bytes);
|
||||
fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
|
||||
jmp(L_exit);
|
||||
|
||||
bind(L_fill_64_bytes);
|
||||
cmpq(count, 64 >> shift);
|
||||
jccb(Assembler::greater, L_fill_96_bytes);
|
||||
fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
|
||||
jmp(L_exit);
|
||||
|
||||
bind(L_fill_96_bytes);
|
||||
cmpq(count, 96 >> shift);
|
||||
jccb(Assembler::greater, L_fill_128_bytes);
|
||||
fill64(to, 0, xtmp);
|
||||
subq(count, 64 >> shift);
|
||||
fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
|
||||
jmp(L_exit);
|
||||
|
||||
bind(L_fill_128_bytes);
|
||||
cmpq(count, 128 >> shift);
|
||||
jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
|
||||
fill64(to, 0, xtmp);
|
||||
fill32(to, 64, xtmp);
|
||||
subq(count, 96 >> shift);
|
||||
fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
|
||||
jmp(L_exit);
|
||||
|
||||
bind(L_fill_128_bytes_loop_pre_header);
|
||||
{
|
||||
mov(rtmp, to);
|
||||
andq(rtmp, 31);
|
||||
jccb(Assembler::zero, L_fill_128_bytes_loop_header);
|
||||
negq(rtmp);
|
||||
addq(rtmp, 32);
|
||||
mov64(r8, -1L);
|
||||
bzhiq(r8, r8, rtmp);
|
||||
kmovql(k2, r8);
|
||||
evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_256bit);
|
||||
addq(to, rtmp);
|
||||
shrq(rtmp, shift);
|
||||
subq(count, rtmp);
|
||||
}
|
||||
|
||||
cmpq(count, 128 >> shift);
|
||||
jcc(Assembler::less, L_fill_start);
|
||||
|
||||
bind(L_fill_128_bytes_loop_header);
|
||||
subq(count, 128 >> shift);
|
||||
|
||||
align32();
|
||||
bind(L_fill_128_bytes_loop);
|
||||
fill64(to, 0, xtmp);
|
||||
fill64(to, 64, xtmp);
|
||||
addq(to, 128);
|
||||
subq(count, 128 >> shift);
|
||||
jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
|
||||
|
||||
addq(count, 128 >> shift);
|
||||
jcc(Assembler::zero, L_exit);
|
||||
jmp(L_fill_start);
|
||||
}
|
||||
|
||||
if (MaxVectorSize == 64) {
|
||||
// Sequence using 64 byte ZMM register.
|
||||
Label L_fill_128_bytes_zmm;
|
||||
Label L_fill_192_bytes_zmm;
|
||||
Label L_fill_192_bytes_loop_zmm;
|
||||
Label L_fill_192_bytes_loop_header_zmm;
|
||||
Label L_fill_192_bytes_loop_pre_header_zmm;
|
||||
Label L_fill_start_zmm_sequence;
|
||||
|
||||
bind(L_fill_zmm_sequence);
|
||||
evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
|
||||
|
||||
bind(L_fill_start_zmm_sequence);
|
||||
cmpq(count, 64 >> shift);
|
||||
jccb(Assembler::greater, L_fill_128_bytes_zmm);
|
||||
fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
|
||||
jmp(L_exit);
|
||||
|
||||
bind(L_fill_128_bytes_zmm);
|
||||
cmpq(count, 128 >> shift);
|
||||
jccb(Assembler::greater, L_fill_192_bytes_zmm);
|
||||
fill64(to, 0, xtmp, true);
|
||||
subq(count, 64 >> shift);
|
||||
fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
|
||||
jmp(L_exit);
|
||||
|
||||
bind(L_fill_192_bytes_zmm);
|
||||
cmpq(count, 192 >> shift);
|
||||
jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
|
||||
fill64(to, 0, xtmp, true);
|
||||
fill64(to, 64, xtmp, true);
|
||||
subq(count, 128 >> shift);
|
||||
fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
|
||||
jmp(L_exit);
|
||||
|
||||
bind(L_fill_192_bytes_loop_pre_header_zmm);
|
||||
{
|
||||
movq(rtmp, to);
|
||||
andq(rtmp, 63);
|
||||
jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
|
||||
negq(rtmp);
|
||||
addq(rtmp, 64);
|
||||
mov64(r8, -1L);
|
||||
bzhiq(r8, r8, rtmp);
|
||||
kmovql(k2, r8);
|
||||
evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_512bit);
|
||||
addq(to, rtmp);
|
||||
shrq(rtmp, shift);
|
||||
subq(count, rtmp);
|
||||
}
|
||||
|
||||
cmpq(count, 192 >> shift);
|
||||
jcc(Assembler::less, L_fill_start_zmm_sequence);
|
||||
|
||||
bind(L_fill_192_bytes_loop_header_zmm);
|
||||
subq(count, 192 >> shift);
|
||||
|
||||
align32();
|
||||
bind(L_fill_192_bytes_loop_zmm);
|
||||
fill64(to, 0, xtmp, true);
|
||||
fill64(to, 64, xtmp, true);
|
||||
fill64(to, 128, xtmp, true);
|
||||
addq(to, 192);
|
||||
subq(count, 192 >> shift);
|
||||
jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
|
||||
|
||||
addq(count, 192 >> shift);
|
||||
jcc(Assembler::zero, L_exit);
|
||||
jmp(L_fill_start_zmm_sequence);
|
||||
}
|
||||
bind(L_exit);
|
||||
}
|
||||
#endif
|
||||
#endif //COMPILER2_OR_JVMCI
|
||||
|
||||
|
||||
|
@ -1305,6 +1305,7 @@ public:
|
||||
void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
|
||||
int comparison, bool is_signed, int vector_len, Register scratch_reg);
|
||||
|
||||
void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
|
||||
|
||||
// Emit comparison instruction for the specified comparison predicate.
|
||||
void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg);
|
||||
@ -1838,17 +1839,20 @@ public:
|
||||
void byte_array_inflate(Register src, Register dst, Register len,
|
||||
XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
|
||||
|
||||
void fill64_masked_avx(uint shift, Register dst, int disp,
|
||||
void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
|
||||
Register length, Register temp, int vec_enc);
|
||||
|
||||
void fill64_masked(uint shift, Register dst, int disp,
|
||||
XMMRegister xmm, KRegister mask, Register length,
|
||||
Register temp, bool use64byteVector = false);
|
||||
|
||||
void fill32_masked_avx(uint shift, Register dst, int disp,
|
||||
void fill32_masked(uint shift, Register dst, int disp,
|
||||
XMMRegister xmm, KRegister mask, Register length,
|
||||
Register temp);
|
||||
|
||||
void fill32_avx(Register dst, int disp, XMMRegister xmm);
|
||||
void fill32(Register dst, int disp, XMMRegister xmm);
|
||||
|
||||
void fill64_avx(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);
|
||||
void fill64(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);
|
||||
|
||||
#ifdef _LP64
|
||||
void convert_f2i(Register dst, XMMRegister src);
|
||||
@ -1885,6 +1889,10 @@ public:
|
||||
void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
|
||||
bool conjoint, int shift = Address::times_1, int offset = 0,
|
||||
bool use64byteVector = false);
|
||||
|
||||
void generate_fill_avx3(BasicType type, Register to, Register value,
|
||||
Register count, Register rtmp, XMMRegister xtmp);
|
||||
|
||||
#endif // COMPILER2_OR_JVMCI
|
||||
|
||||
#endif // _LP64
|
||||
|
@ -2113,13 +2113,14 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
BLOCK_COMMENT("Entry:");
|
||||
|
||||
const Register to = c_rarg0; // source array address
|
||||
const Register to = c_rarg0; // destination array address
|
||||
const Register value = c_rarg1; // value
|
||||
const Register count = c_rarg2; // elements count
|
||||
__ mov(r11, count);
|
||||
|
||||
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
||||
|
||||
__ generate_fill(t, aligned, to, value, count, rax, xmm0);
|
||||
__ generate_fill(t, aligned, to, value, r11, rax, xmm0);
|
||||
|
||||
__ vzeroupper();
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
|
@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_
|
||||
|
||||
enum platform_dependent_constants {
|
||||
code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small)
|
||||
code_size2 = 35300 LP64_ONLY(+32000) // simply increase if too small (assembler will crash if too small)
|
||||
code_size2 = 35300 LP64_ONLY(+35000) // simply increase if too small (assembler will crash if too small)
|
||||
};
|
||||
|
||||
class x86 {
|
||||
|
@ -1469,6 +1469,14 @@ void VM_Version::get_processor_features() {
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef COMPILER2
|
||||
if (FLAG_IS_DEFAULT(OptimizeFill)) {
|
||||
if (MaxVectorSize < 32 || !VM_Version::supports_avx512vlbw()) {
|
||||
OptimizeFill = false;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _LP64
|
||||
if (UseSSE42Intrinsics) {
|
||||
if (FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
|
||||
@ -1585,12 +1593,6 @@ void VM_Version::get_processor_features() {
|
||||
// Modern processors allow misaligned memory operations for vectors.
|
||||
AlignVector = !UseUnalignedLoadStores;
|
||||
}
|
||||
if (FLAG_IS_DEFAULT(OptimizeFill)) {
|
||||
// 8247307: On x86, the auto-vectorized loop array fill code shows
|
||||
// better performance than the array fill stubs. We should reenable
|
||||
// this after the x86 stubs get improved.
|
||||
OptimizeFill = false;
|
||||
}
|
||||
#endif // COMPILER2
|
||||
|
||||
if (FLAG_IS_DEFAULT(AllocatePrefetchInstr)) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -36,12 +36,12 @@ import org.openjdk.jmh.annotations.Warmup;
|
||||
import java.util.Arrays;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Thread)
|
||||
public class ArraysFill {
|
||||
|
||||
@Param({"10", "266", "2048"})
|
||||
@Param({"10", "16", "31", "59", "89", "126", "250", "266", "511", "1021", "2047", "2048", "4095", "8195"})
|
||||
public int size;
|
||||
|
||||
public byte[] testByteArray;
|
||||
|
Loading…
x
Reference in New Issue
Block a user