From e0c29307f7b35149aacae0bb935aa9fe524cff72 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Tue, 25 Oct 2022 20:11:48 +0000 Subject: [PATCH] 8295282: Use Zicboz/cbo.zero to zero-out memory on RISC-V Reviewed-by: yadongwang, vkempik, fyang --- src/hotspot/cpu/riscv/assembler_riscv.hpp | 38 +++++++++++++ .../cpu/riscv/globalDefinitions_riscv.hpp | 2 + src/hotspot/cpu/riscv/globals_riscv.hpp | 11 ++++ .../cpu/riscv/macroAssembler_riscv.cpp | 45 +++++++++++++-- .../cpu/riscv/macroAssembler_riscv.hpp | 3 +- src/hotspot/cpu/riscv/riscv.ad | 34 ++++++++++- src/hotspot/cpu/riscv/riscv_v.ad | 2 +- src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 24 +++++--- src/hotspot/cpu/riscv/vm_version_riscv.cpp | 56 +++++++++++++++++++ 9 files changed, 199 insertions(+), 16 deletions(-) diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp index 4e70a41374a..58833366151 100644 --- a/src/hotspot/cpu/riscv/assembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp @@ -2712,6 +2712,44 @@ public: #undef INSN +// Cache Management Operations +#define INSN(NAME, funct) \ + void NAME(Register Rs1) { \ + unsigned insn = 0; \ + patch((address)&insn, 6, 0, 0b0001111); \ + patch((address)&insn, 14, 12, 0b010); \ + patch_reg((address)&insn, 15, Rs1); \ + patch((address)&insn, 31, 20, funct); \ + emit(insn); \ + } + + INSN(cbo_inval, 0b0000000000000); + INSN(cbo_clean, 0b0000000000001); + INSN(cbo_flush, 0b0000000000010); + INSN(cbo_zero, 0b0000000000100); + +#undef INSN + +#define INSN(NAME, funct) \ + void NAME(Register Rs1, int32_t offset) { \ + guarantee((offset & 0x1f) == 0, "offset lowest 5 bits must be zero"); \ + int32_t upperOffset = offset >> 5; \ + unsigned insn = 0; \ + patch((address)&insn, 6, 0, 0b0010011); \ + patch((address)&insn, 14, 12, 0b110); \ + patch_reg((address)&insn, 15, Rs1); \ + patch((address)&insn, 24, 20, funct); \ + upperOffset &= 0x7f; \ + patch((address)&insn, 31, 25, upperOffset); \ + emit(insn); \ + } + + 
INSN(prefetch_i, 0b0000000000000); + INSN(prefetch_r, 0b0000000000001); + INSN(prefetch_w, 0b0000000000011); + +#undef INSN + // --------------------------------------------------------------------------------------- #define INSN(NAME, REGISTER) \ diff --git a/src/hotspot/cpu/riscv/globalDefinitions_riscv.hpp b/src/hotspot/cpu/riscv/globalDefinitions_riscv.hpp index 2936837d951..c324fd52884 100644 --- a/src/hotspot/cpu/riscv/globalDefinitions_riscv.hpp +++ b/src/hotspot/cpu/riscv/globalDefinitions_riscv.hpp @@ -49,4 +49,6 @@ const bool CCallingConventionRequiresIntsAsLongs = false; #define USE_POINTERS_TO_REGISTER_IMPL_ARRAY +#define DEFAULT_CACHE_LINE_SIZE 64 + #endif // CPU_RISCV_GLOBALDEFINITIONS_RISCV_HPP diff --git a/src/hotspot/cpu/riscv/globals_riscv.hpp b/src/hotspot/cpu/riscv/globals_riscv.hpp index 3b73bb42236..ad397f437bf 100644 --- a/src/hotspot/cpu/riscv/globals_riscv.hpp +++ b/src/hotspot/cpu/riscv/globals_riscv.hpp @@ -82,19 +82,30 @@ define_pd_global(intx, InlineSmallCode, 1000); \ product(bool, NearCpool, true, \ "constant pool is close to instructions") \ + product(bool, UseBlockZeroing, false, \ + "Use Zicboz for block zeroing") \ product(intx, BlockZeroingLowLimit, 256, \ "Minimum size in bytes when block zeroing will be used") \ range(1, max_jint) \ + product(intx, CacheLineSize, DEFAULT_CACHE_LINE_SIZE, \ + "Size in bytes of a CPU cache line") \ + range(wordSize, max_jint) \ product(bool, TraceTraps, false, "Trace all traps the signal handler") \ /* For now we're going to be safe and add the I/O bits to userspace fences. 
*/ \ product(bool, UseConservativeFence, true, \ "Extend i for r and o for w in the pred/succ flags of fence") \ product(bool, AvoidUnalignedAccesses, true, \ "Avoid generating unaligned memory accesses") \ + product(bool, UseRVA20U64, false, EXPERIMENTAL, "Use RVA20U64 profile") \ + product(bool, UseRVA22U64, false, EXPERIMENTAL, "Use RVA22U64 profile") \ product(bool, UseRVV, false, EXPERIMENTAL, "Use RVV instructions") \ product(bool, UseRVC, false, EXPERIMENTAL, "Use RVC instructions") \ product(bool, UseZba, false, EXPERIMENTAL, "Use Zba instructions") \ product(bool, UseZbb, false, EXPERIMENTAL, "Use Zbb instructions") \ + product(bool, UseZic64b, false, EXPERIMENTAL, "Use Zic64b instructions") \ + product(bool, UseZicbom, false, EXPERIMENTAL, "Use Zicbom instructions") \ + product(bool, UseZicbop, false, EXPERIMENTAL, "Use Zicbop instructions") \ + product(bool, UseZicboz, false, EXPERIMENTAL, "Use Zicboz instructions") \ product(bool, UseRVVForBigIntegerShiftIntrinsics, true, \ "Use RVV instructions for left/right shift of BigInteger") diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp index ac70842bc2a..9ac5692c013 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp @@ -792,7 +792,7 @@ void MacroAssembler::la(Register Rd, const Address &adr) { void MacroAssembler::la(Register Rd, Label &label) { IncompressibleRegion ir(this); // the label address may be patched back. 
- la(Rd, target(label)); + wrap_label(Rd, label, &MacroAssembler::la); } void MacroAssembler::li32(Register Rd, int32_t imm) { @@ -3980,9 +3980,9 @@ address MacroAssembler::zero_words(Register ptr, Register cnt) andi(t0, cnt, i); beqz(t0, l); for (int j = 0; j < i; j++) { - sd(zr, Address(ptr, 0)); - addi(ptr, ptr, 8); + sd(zr, Address(ptr, j * wordSize)); } + addi(ptr, ptr, i * wordSize); bind(l); } { @@ -4001,7 +4001,7 @@ address MacroAssembler::zero_words(Register ptr, Register cnt) // base: Address of a buffer to be zeroed, 8 bytes aligned. // cnt: Immediate count in HeapWords. -void MacroAssembler::zero_words(Register base, u_int64_t cnt) +void MacroAssembler::zero_words(Register base, uint64_t cnt) { assert_different_registers(base, t0, t1); @@ -4092,6 +4092,43 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value) bind(fini); } +// Zero blocks of memory by using CBO.ZERO. +// +// Aligns the base address first sufficiently for CBO.ZERO, then uses +// CBO.ZERO repeatedly for every full block. cnt is the size to be +// zeroed in HeapWords. Returns the count of words left to be zeroed +// in cnt. +// +// NOTE: This is intended to be used in the zero_blocks() stub. If +// you want to use it elsewhere, cnt must span >= 2 * CacheLineSize bytes. +void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) { + Label initial_table_end, loop; + + // Align base with cache line size. + neg(tmp1, base); + andi(tmp1, tmp1, CacheLineSize - 1); + + // tmp1: the number of bytes to be filled to align the base with cache line size.
+ add(base, base, tmp1); + srai(tmp2, tmp1, 3); + sub(cnt, cnt, tmp2); + srli(tmp2, tmp1, 1); + la(tmp1, initial_table_end); + sub(tmp2, tmp1, tmp2); + jr(tmp2); + for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) { + sd(zr, Address(base, i)); + } + bind(initial_table_end); + + mv(tmp1, CacheLineSize / wordSize); + bind(loop); + cbo_zero(base); + sub(cnt, cnt, tmp1); + add(base, base, CacheLineSize); + bge(cnt, tmp1, loop); +} + #define FCVT_SAFE(FLOATCVT, FLOATEQ) \ void MacroAssembler:: FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) { \ Label L_Okay; \ diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp index 8e062604bee..b45098149f1 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp @@ -1093,10 +1093,11 @@ public: void ctzc_bit(Register Rd, Register Rs, bool isLL = false, Register tmp1 = t0, Register tmp2 = t1); - void zero_words(Register base, u_int64_t cnt); + void zero_words(Register base, uint64_t cnt); address zero_words(Register ptr, Register cnt); void fill_words(Register base, Register cnt, Register value); void zero_memory(Register addr, Register len, Register tmp); + void zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2); // shift left by shamt and add void shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt); diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad index a359eb051cc..75612ef7508 100644 --- a/src/hotspot/cpu/riscv/riscv.ad +++ b/src/hotspot/cpu/riscv/riscv.ad @@ -727,6 +727,10 @@ reg_class r30_reg( R30, R30_H ); +reg_class r31_reg( + R31, R31_H +); + // Class for zero registesr reg_class zr_reg( R0, R0_H @@ -3347,6 +3351,28 @@ operand iRegP_R28() interface(REG_INTER); %} +// Pointer 64 bit Register R30 only +operand iRegP_R30() +%{ + constraint(ALLOC_IN_RC(r30_reg)); + match(RegP); + match(iRegPNoSp); + op_cost(0); + format 
%{ %} + interface(REG_INTER); +%} + +// Pointer 64 bit Register R31 only +operand iRegP_R31() +%{ + constraint(ALLOC_IN_RC(r31_reg)); + match(RegP); + match(iRegPNoSp); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + // Pointer Register Operands // Narrow Pointer Register operand iRegN() @@ -10228,11 +10254,13 @@ instruct stringL_indexof_char(iRegP_R11 str1, iRegI_R12 cnt1, iRegI_R13 ch, %} // clearing of an array -instruct clearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, Universe dummy) +instruct clearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, iRegP_R30 tmp1, + iRegP_R31 tmp2, Universe dummy) %{ - predicate(!UseRVV); + // temp registers must match the ones used in StubGenerator::generate_zero_blocks() + predicate(UseBlockZeroing || !UseRVV); match(Set dummy (ClearArray cnt base)); - effect(USE_KILL cnt, USE_KILL base); + effect(USE_KILL cnt, USE_KILL base, TEMP tmp1, TEMP tmp2); ins_cost(4 * DEFAULT_COST); format %{ "ClearArray $cnt, $base\t#@clearArray_reg_reg" %} diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad index 0ef36fdb292..65eca8664b1 100644 --- a/src/hotspot/cpu/riscv/riscv_v.ad +++ b/src/hotspot/cpu/riscv/riscv_v.ad @@ -2042,7 +2042,7 @@ instruct vstringL_indexof_char(iRegP_R11 str1, iRegI_R12 cnt1, iRegI_R13 ch, instruct vclearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, Universe dummy, vReg_V1 vReg1, vReg_V2 vReg2, vReg_V3 vReg3) %{ - predicate(UseRVV); + predicate(!UseBlockZeroing && UseRVV); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, TEMP vReg1, TEMP vReg2, TEMP vReg3); diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index 563ab7c8fc1..18249733f1e 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -671,26 +671,36 @@ class StubGenerator: public StubCodeGenerator { address generate_zero_blocks() { Label done; - const Register base = x28, cnt = x29; +
const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31; __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "zero_blocks"); address start = __ pc(); + if (UseBlockZeroing) { + // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero + // after alignment. + Label small; + int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize; + __ mv(tmp1, low_limit); + __ blt(cnt, tmp1, small); + __ zero_dcache_blocks(base, cnt, tmp1, tmp2); + __ bind(small); + } + { // Clear the remaining blocks. Label loop; - __ sub(cnt, cnt, MacroAssembler::zero_words_block_size); - __ bltz(cnt, done); + __ mv(tmp1, MacroAssembler::zero_words_block_size); + __ blt(cnt, tmp1, done); __ bind(loop); for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) { - __ sd(zr, Address(base, 0)); - __ add(base, base, 8); + __ sd(zr, Address(base, i * wordSize)); } + __ add(base, base, MacroAssembler::zero_words_block_size * wordSize); __ sub(cnt, cnt, MacroAssembler::zero_words_block_size); - __ bgez(cnt, loop); + __ bge(cnt, tmp1, loop); __ bind(done); - __ add(cnt, cnt, MacroAssembler::zero_words_block_size); } __ ret(); diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp index 02133dbb595..544853fe040 100644 --- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp +++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp @@ -36,6 +36,50 @@ uint32_t VM_Version::_initial_vector_length = 0; void VM_Version::initialize() { get_os_cpu_info(); + // https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc#rva20-profiles + if (UseRVA20U64) { + if (FLAG_IS_DEFAULT(UseRVC)) { + FLAG_SET_DEFAULT(UseRVC, true); + } + } + // https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc#rva22-profiles + if (UseRVA22U64) { + if (FLAG_IS_DEFAULT(UseRVC)) { + FLAG_SET_DEFAULT(UseRVC, true); + } + if (FLAG_IS_DEFAULT(UseZba)) { + FLAG_SET_DEFAULT(UseZba, true); + } + if (FLAG_IS_DEFAULT(UseZbb)) { + 
FLAG_SET_DEFAULT(UseZbb, true); + } + if (FLAG_IS_DEFAULT(UseZic64b)) { + FLAG_SET_DEFAULT(UseZic64b, true); + } + if (FLAG_IS_DEFAULT(UseZicbom)) { + FLAG_SET_DEFAULT(UseZicbom, true); + } + if (FLAG_IS_DEFAULT(UseZicbop)) { + FLAG_SET_DEFAULT(UseZicbop, true); + } + if (FLAG_IS_DEFAULT(UseZicboz)) { + FLAG_SET_DEFAULT(UseZicboz, true); + } + } + + if (UseZic64b) { + if (CacheLineSize != 64) { + assert(!FLAG_IS_DEFAULT(CacheLineSize), "default cache line size should be 64 bytes"); + warning("CacheLineSize is assumed to be 64 bytes because Zic64b is enabled"); + FLAG_SET_DEFAULT(CacheLineSize, 64); + } + } else { + if (!FLAG_IS_DEFAULT(CacheLineSize) && !is_power_of_2(CacheLineSize)) { + warning("CacheLineSize must be a power of 2"); + FLAG_SET_DEFAULT(CacheLineSize, DEFAULT_CACHE_LINE_SIZE); + } + } + if (FLAG_IS_DEFAULT(UseFMA)) { FLAG_SET_DEFAULT(UseFMA, true); } @@ -127,6 +171,18 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UsePopCountInstruction, false); } + if (UseZicboz) { + if (FLAG_IS_DEFAULT(UseBlockZeroing)) { + FLAG_SET_DEFAULT(UseBlockZeroing, true); + } + if (FLAG_IS_DEFAULT(BlockZeroingLowLimit)) { + FLAG_SET_DEFAULT(BlockZeroingLowLimit, 2 * CacheLineSize); + } + } else if (UseBlockZeroing) { + warning("Block zeroing is not available"); + FLAG_SET_DEFAULT(UseBlockZeroing, false); + } + char buf[512]; buf[0] = '\0'; if (_uarch != NULL && strcmp(_uarch, "") != 0) snprintf(buf, sizeof(buf), "%s,", _uarch);