8295282: Use Zicboz/cbo.zero to zero-out memory on RISC-V

Reviewed-by: yadongwang, vkempik, fyang
This commit is contained in:
Ludovic Henry 2022-10-25 20:11:48 +00:00 committed by Vladimir Kempik
parent d393e051e6
commit e0c29307f7
9 changed files with 199 additions and 16 deletions

View File

@ -2712,6 +2712,44 @@ public:
#undef INSN #undef INSN
// Cache Management Operations
//
// Emitters for cbo_inval, cbo_clean, cbo_flush and cbo_zero. Each one takes
// the register Rs1 and assembles a single 32-bit instruction word:
//   bits 31..20  operation selector (the funct argument of INSN)
//   from bit 15  Rs1 (placed by patch_reg)
//   bits 14..12  0b010
//   bits  6..0   0b0001111
// All bits that are not patched stay zero.
#define INSN(NAME, funct) \
void NAME(Register Rs1) { \
  unsigned encoding = 0; \
  patch((address)&encoding, 31, 20, funct); \
  patch_reg((address)&encoding, 15, Rs1); \
  patch((address)&encoding, 14, 12, 0b010); \
  patch((address)&encoding, 6, 0, 0b0001111); \
  emit(encoding); \
}
INSN(cbo_inval, 0b000000000000);
INSN(cbo_clean, 0b000000000001);
INSN(cbo_flush, 0b000000000010);
INSN(cbo_zero,  0b000000000100);
#undef INSN
// Prefetch hint emitters: prefetch_i, prefetch_r and prefetch_w.
//
// Each emitter takes a base register Rs1 and a byte offset and assembles a
// single 32-bit instruction word:
//   bits 31..25  offset[11:5]
//   bits 24..20  prefetch kind selector (the funct argument of INSN)
//   from bit 15  Rs1 (placed by patch_reg)
//   bits 14..12  0b110
//   bits  6..0   0b0010011
//
// Only offset[11:5] is encoded, so the offset must be a multiple of 32 and
// must fit in a signed 12-bit immediate. Both conditions are enforced with
// guarantee(); without the range check an out-of-range offset would be
// silently truncated by the 0x7f mask and a different offset would be emitted.
#define INSN(NAME, funct) \
void NAME(Register Rs1, int32_t offset) { \
  guarantee((offset & 0x1f) == 0, "offset lowest 5 bits must be zero"); \
  guarantee(offset >= -2048 && offset < 2048, "offset must fit in a signed 12-bit immediate"); \
  int32_t upperOffset = offset >> 5; \
  unsigned insn = 0; \
  patch((address)&insn, 6, 0, 0b0010011); \
  patch((address)&insn, 14, 12, 0b110); \
  patch_reg((address)&insn, 15, Rs1); \
  patch((address)&insn, 24, 20, funct); \
  /* keep only the 7 bits that go into the instruction (drops sign extension) */ \
  upperOffset &= 0x7f; \
  patch((address)&insn, 31, 25, upperOffset); \
  emit(insn); \
}
INSN(prefetch_i, 0b00000);
INSN(prefetch_r, 0b00001);
INSN(prefetch_w, 0b00011);
#undef INSN
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
#define INSN(NAME, REGISTER) \ #define INSN(NAME, REGISTER) \

View File

@ -49,4 +49,6 @@ const bool CCallingConventionRequiresIntsAsLongs = false;
#define USE_POINTERS_TO_REGISTER_IMPL_ARRAY #define USE_POINTERS_TO_REGISTER_IMPL_ARRAY
// Default value of the CacheLineSize flag, in bytes.
#define DEFAULT_CACHE_LINE_SIZE 64
#endif // CPU_RISCV_GLOBALDEFINITIONS_RISCV_HPP #endif // CPU_RISCV_GLOBALDEFINITIONS_RISCV_HPP

View File

@ -82,19 +82,30 @@ define_pd_global(intx, InlineSmallCode, 1000);
\ \
product(bool, NearCpool, true, \ product(bool, NearCpool, true, \
"constant pool is close to instructions") \ "constant pool is close to instructions") \
product(bool, UseBlockZeroing, false, \
"Use Zicboz for block zeroing") \
product(intx, BlockZeroingLowLimit, 256, \ product(intx, BlockZeroingLowLimit, 256, \
"Minimum size in bytes when block zeroing will be used") \ "Minimum size in bytes when block zeroing will be used") \
range(1, max_jint) \ range(1, max_jint) \
product(intx, CacheLineSize, DEFAULT_CACHE_LINE_SIZE, \
"Size in bytes of a CPU cache line") \
range(wordSize, max_jint) \
product(bool, TraceTraps, false, "Trace all traps the signal handler") \ product(bool, TraceTraps, false, "Trace all traps the signal handler") \
/* For now we're going to be safe and add the I/O bits to userspace fences. */ \ /* For now we're going to be safe and add the I/O bits to userspace fences. */ \
product(bool, UseConservativeFence, true, \ product(bool, UseConservativeFence, true, \
"Extend i for r and o for w in the pred/succ flags of fence") \ "Extend i for r and o for w in the pred/succ flags of fence") \
product(bool, AvoidUnalignedAccesses, true, \ product(bool, AvoidUnalignedAccesses, true, \
"Avoid generating unaligned memory accesses") \ "Avoid generating unaligned memory accesses") \
product(bool, UseRVA20U64, false, EXPERIMENTAL, "Use RVA20U64 profile") \
product(bool, UseRVA22U64, false, EXPERIMENTAL, "Use RVA22U64 profile") \
product(bool, UseRVV, false, EXPERIMENTAL, "Use RVV instructions") \ product(bool, UseRVV, false, EXPERIMENTAL, "Use RVV instructions") \
product(bool, UseRVC, false, EXPERIMENTAL, "Use RVC instructions") \ product(bool, UseRVC, false, EXPERIMENTAL, "Use RVC instructions") \
product(bool, UseZba, false, EXPERIMENTAL, "Use Zba instructions") \ product(bool, UseZba, false, EXPERIMENTAL, "Use Zba instructions") \
product(bool, UseZbb, false, EXPERIMENTAL, "Use Zbb instructions") \ product(bool, UseZbb, false, EXPERIMENTAL, "Use Zbb instructions") \
product(bool, UseZic64b, false, EXPERIMENTAL, "Use Zic64b instructions") \
product(bool, UseZicbom, false, EXPERIMENTAL, "Use Zicbom instructions") \
product(bool, UseZicbop, false, EXPERIMENTAL, "Use Zicbop instructions") \
product(bool, UseZicboz, false, EXPERIMENTAL, "Use Zicboz instructions") \
product(bool, UseRVVForBigIntegerShiftIntrinsics, true, \ product(bool, UseRVVForBigIntegerShiftIntrinsics, true, \
"Use RVV instructions for left/right shift of BigInteger") "Use RVV instructions for left/right shift of BigInteger")

View File

@ -792,7 +792,7 @@ void MacroAssembler::la(Register Rd, const Address &adr) {
void MacroAssembler::la(Register Rd, Label &label) { void MacroAssembler::la(Register Rd, Label &label) {
IncompressibleRegion ir(this); // the label address may be patched back. IncompressibleRegion ir(this); // the label address may be patched back.
la(Rd, target(label)); wrap_label(Rd, label, &MacroAssembler::la);
} }
void MacroAssembler::li32(Register Rd, int32_t imm) { void MacroAssembler::li32(Register Rd, int32_t imm) {
@ -3980,9 +3980,9 @@ address MacroAssembler::zero_words(Register ptr, Register cnt)
andi(t0, cnt, i); andi(t0, cnt, i);
beqz(t0, l); beqz(t0, l);
for (int j = 0; j < i; j++) { for (int j = 0; j < i; j++) {
sd(zr, Address(ptr, 0)); sd(zr, Address(ptr, j * wordSize));
addi(ptr, ptr, 8);
} }
addi(ptr, ptr, i * wordSize);
bind(l); bind(l);
} }
{ {
@ -4001,7 +4001,7 @@ address MacroAssembler::zero_words(Register ptr, Register cnt)
// base: Address of a buffer to be zeroed, 8 bytes aligned. // base: Address of a buffer to be zeroed, 8 bytes aligned.
// cnt: Immediate count in HeapWords. // cnt: Immediate count in HeapWords.
void MacroAssembler::zero_words(Register base, u_int64_t cnt) void MacroAssembler::zero_words(Register base, uint64_t cnt)
{ {
assert_different_registers(base, t0, t1); assert_different_registers(base, t0, t1);
@ -4092,6 +4092,43 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value)
bind(fini); bind(fini);
} }
// Zero blocks of memory by using CBO.ZERO.
//
// Aligns the base address first sufficiently for CBO.ZERO, then uses
// CBO.ZERO repeatedly for every full block. cnt is the size to be
// zeroed in HeapWords. Returns the count of words left to be zeroed
// in cnt.
//
// base: in/out, address of the memory to zero; on return it points just
//       past the last byte zeroed here.
// cnt:  in/out, size in HeapWords; on return the number of words that
//       still have to be zeroed by the caller.
// tmp1, tmp2: scratch registers, clobbered.
//
// NOTE: This is intended to be used in the zero_blocks() stub. If
// you want to use it elsewhere, note that the CBO.ZERO loop below always
// executes at least once, so after the alignment stores there must still
// be a full cache line left to zero. The zero_blocks() stub ensures this
// by only taking this path when the size is at least 2 * CacheLineSize bytes.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
Label initial_table_end, loop;
// Align base with cache line size.
neg(tmp1, base);
andi(tmp1, tmp1, CacheLineSize - 1);
// tmp1: the number of bytes to be filled to align the base with cache line size.
// Advance base to the aligned address right away; the stores in the table
// below reach back to the unaligned head through negative offsets.
add(base, base, tmp1);
// Account for the head in cnt: bytes >> 3 gives the number of 8-byte words.
srai(tmp2, tmp1, 3);
sub(cnt, cnt, tmp2);
// Computed jump into the store table: go back tmp1 / 2 bytes of code from
// initial_table_end, so that only the stores covering the head execute.
// NOTE(review): this presumes every sd in the table occupies 4 bytes of
// code (one 4-byte instruction per 8 bytes zeroed) - confirm these stores
// can never be emitted in a shorter encoding.
srli(tmp2, tmp1, 1);
la(tmp1, initial_table_end);
sub(tmp2, tmp1, tmp2);
jr(tmp2);
// Store table: one sd per word, covering [base - CacheLineSize + wordSize, base).
for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
sd(zr, Address(base, i));
}
bind(initial_table_end);
// tmp1: number of words in one cache line, used as decrement and loop bound.
mv(tmp1, CacheLineSize / wordSize);
bind(loop);
// Zero one cache line at base, then advance; repeat while at least one
// full cache line (tmp1 words) remains in cnt.
cbo_zero(base);
sub(cnt, cnt, tmp1);
add(base, base, CacheLineSize);
bge(cnt, tmp1, loop);
}
#define FCVT_SAFE(FLOATCVT, FLOATEQ) \ #define FCVT_SAFE(FLOATCVT, FLOATEQ) \
void MacroAssembler:: FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) { \ void MacroAssembler:: FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) { \
Label L_Okay; \ Label L_Okay; \

View File

@ -1093,10 +1093,11 @@ public:
void ctzc_bit(Register Rd, Register Rs, bool isLL = false, Register tmp1 = t0, Register tmp2 = t1); void ctzc_bit(Register Rd, Register Rs, bool isLL = false, Register tmp1 = t0, Register tmp2 = t1);
void zero_words(Register base, u_int64_t cnt); void zero_words(Register base, uint64_t cnt);
address zero_words(Register ptr, Register cnt); address zero_words(Register ptr, Register cnt);
void fill_words(Register base, Register cnt, Register value); void fill_words(Register base, Register cnt, Register value);
void zero_memory(Register addr, Register len, Register tmp); void zero_memory(Register addr, Register len, Register tmp);
void zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2);
// shift left by shamt and add // shift left by shamt and add
void shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt); void shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt);

View File

@ -727,6 +727,10 @@ reg_class r30_reg(
R30, R30_H R30, R30_H
); );
// Class containing only R31 and R31_H; referenced by the iRegP_R31 operand.
reg_class r31_reg(
R31, R31_H
);
// Class for zero register // Class for zero register
reg_class zr_reg( reg_class zr_reg(
R0, R0_H R0, R0_H
@ -3347,6 +3351,28 @@ operand iRegP_R28()
interface(REG_INTER); interface(REG_INTER);
%} %}
// Pointer 64 bit Register R30 only
// Allocation is constrained to register class r30_reg. clearArray_reg_reg
// uses this operand for its tmp1, which has to match the temp register
// used in StubGenerator::generate_zero_blocks().
operand iRegP_R30()
%{
constraint(ALLOC_IN_RC(r30_reg));
match(RegP);
match(iRegPNoSp);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}
// Pointer 64 bit Register R31 only
// Allocation is constrained to register class r31_reg. clearArray_reg_reg
// uses this operand for its tmp2, which has to match the temp register
// used in StubGenerator::generate_zero_blocks().
operand iRegP_R31()
%{
constraint(ALLOC_IN_RC(r31_reg));
match(RegP);
match(iRegPNoSp);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}
// Pointer Register Operands // Pointer Register Operands
// Narrow Pointer Register // Narrow Pointer Register
operand iRegN() operand iRegN()
@ -10228,11 +10254,13 @@ instruct stringL_indexof_char(iRegP_R11 str1, iRegI_R12 cnt1, iRegI_R13 ch,
%} %}
// clearing of an array // clearing of an array
instruct clearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, Universe dummy) instruct clearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, iRegP_R30 tmp1,
iRegP_R31 tmp2, Universe dummy)
%{ %{
predicate(!UseRVV); // temp registers must match the one used in StubGenerator::generate_zero_blocks()
predicate(UseBlockZeroing || !UseRVV);
match(Set dummy (ClearArray cnt base)); match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base); effect(USE_KILL cnt, USE_KILL base, TEMP tmp1, TEMP tmp2);
ins_cost(4 * DEFAULT_COST); ins_cost(4 * DEFAULT_COST);
format %{ "ClearArray $cnt, $base\t#@clearArray_reg_reg" %} format %{ "ClearArray $cnt, $base\t#@clearArray_reg_reg" %}

View File

@ -2042,7 +2042,7 @@ instruct vstringL_indexof_char(iRegP_R11 str1, iRegI_R12 cnt1, iRegI_R13 ch,
instruct vclearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, Universe dummy, instruct vclearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, Universe dummy,
vReg_V1 vReg1, vReg_V2 vReg2, vReg_V3 vReg3) vReg_V1 vReg1, vReg_V2 vReg2, vReg_V3 vReg3)
%{ %{
predicate(UseRVV); predicate(!UseBlockZeroing && UseRVV);
match(Set dummy (ClearArray cnt base)); match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, TEMP vReg1, TEMP vReg2, TEMP vReg3); effect(USE_KILL cnt, USE_KILL base, TEMP vReg1, TEMP vReg2, TEMP vReg3);

View File

@ -671,26 +671,36 @@ class StubGenerator: public StubCodeGenerator {
address generate_zero_blocks() { address generate_zero_blocks() {
Label done; Label done;
const Register base = x28, cnt = x29; const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
__ align(CodeEntryAlignment); __ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "zero_blocks"); StubCodeMark mark(this, "StubRoutines", "zero_blocks");
address start = __ pc(); address start = __ pc();
if (UseBlockZeroing) {
// Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero
// after alignment.
Label small;
int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
__ mv(tmp1, low_limit);
__ blt(cnt, tmp1, small);
__ zero_dcache_blocks(base, cnt, tmp1, tmp2);
__ bind(small);
}
{ {
// Clear the remaining blocks. // Clear the remaining blocks.
Label loop; Label loop;
__ sub(cnt, cnt, MacroAssembler::zero_words_block_size); __ mv(tmp1, MacroAssembler::zero_words_block_size);
__ bltz(cnt, done); __ blt(cnt, tmp1, done);
__ bind(loop); __ bind(loop);
for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) { for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
__ sd(zr, Address(base, 0)); __ sd(zr, Address(base, i * wordSize));
__ add(base, base, 8);
} }
__ add(base, base, MacroAssembler::zero_words_block_size * wordSize);
__ sub(cnt, cnt, MacroAssembler::zero_words_block_size); __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
__ bgez(cnt, loop); __ bge(cnt, tmp1, loop);
__ bind(done); __ bind(done);
__ add(cnt, cnt, MacroAssembler::zero_words_block_size);
} }
__ ret(); __ ret();

View File

@ -36,6 +36,50 @@ uint32_t VM_Version::_initial_vector_length = 0;
void VM_Version::initialize() { void VM_Version::initialize() {
get_os_cpu_info(); get_os_cpu_info();
// https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc#rva20-profiles
if (UseRVA20U64) {
if (FLAG_IS_DEFAULT(UseRVC)) {
FLAG_SET_DEFAULT(UseRVC, true);
}
}
// https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc#rva22-profiles
if (UseRVA22U64) {
if (FLAG_IS_DEFAULT(UseRVC)) {
FLAG_SET_DEFAULT(UseRVC, true);
}
if (FLAG_IS_DEFAULT(UseZba)) {
FLAG_SET_DEFAULT(UseZba, true);
}
if (FLAG_IS_DEFAULT(UseZbb)) {
FLAG_SET_DEFAULT(UseZbb, true);
}
if (FLAG_IS_DEFAULT(UseZic64b)) {
FLAG_SET_DEFAULT(UseZic64b, true);
}
if (FLAG_IS_DEFAULT(UseZicbom)) {
FLAG_SET_DEFAULT(UseZicbom, true);
}
if (FLAG_IS_DEFAULT(UseZicbop)) {
FLAG_SET_DEFAULT(UseZicbop, true);
}
if (FLAG_IS_DEFAULT(UseZicboz)) {
FLAG_SET_DEFAULT(UseZicboz, true);
}
}
if (UseZic64b) {
if (CacheLineSize != 64) {
assert(!FLAG_IS_DEFAULT(CacheLineSize), "default cache line size should be 64 bytes");
warning("CacheLineSize is assumed to be 64 bytes because Zic64b is enabled");
FLAG_SET_DEFAULT(CacheLineSize, 64);
}
} else {
if (!FLAG_IS_DEFAULT(CacheLineSize) && !is_power_of_2(CacheLineSize)) {
warning("CacheLineSize must be a power of 2");
FLAG_SET_DEFAULT(CacheLineSize, DEFAULT_CACHE_LINE_SIZE);
}
}
if (FLAG_IS_DEFAULT(UseFMA)) { if (FLAG_IS_DEFAULT(UseFMA)) {
FLAG_SET_DEFAULT(UseFMA, true); FLAG_SET_DEFAULT(UseFMA, true);
} }
@ -127,6 +171,18 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UsePopCountInstruction, false); FLAG_SET_DEFAULT(UsePopCountInstruction, false);
} }
if (UseZicboz) {
if (FLAG_IS_DEFAULT(UseBlockZeroing)) {
FLAG_SET_DEFAULT(UseBlockZeroing, true);
}
if (FLAG_IS_DEFAULT(BlockZeroingLowLimit)) {
FLAG_SET_DEFAULT(BlockZeroingLowLimit, 2 * CacheLineSize);
}
} else if (UseBlockZeroing) {
warning("Block zeroing is not available");
FLAG_SET_DEFAULT(UseBlockZeroing, false);
}
char buf[512]; char buf[512];
buf[0] = '\0'; buf[0] = '\0';
if (_uarch != NULL && strcmp(_uarch, "") != 0) snprintf(buf, sizeof(buf), "%s,", _uarch); if (_uarch != NULL && strcmp(_uarch, "") != 0) snprintf(buf, sizeof(buf), "%s,", _uarch);