8339771: RISC-V: Reduce icache flushes

Reviewed-by: fyang, mli, luhenry
Robbin Ehn 2024-09-25 08:11:00 +00:00
parent b1f8d2ea76
commit 97a3933f1b
10 changed files with 115 additions and 4 deletions


@@ -705,6 +705,16 @@ public:
emit(insn);
}
void fencei() {
unsigned insn = 0;
patch((address)&insn, 6, 0, 0b0001111); // opcode (MISC-MEM)
patch((address)&insn, 11, 7, 0b00000); // rd
patch((address)&insn, 14, 12, 0b001); // funct3
patch((address)&insn, 19, 15, 0b00000); // rs1
patch((address)&insn, 31, 20, 0b000000000000); // imm[11:0], reserved zero
emit(insn);
}
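For reference, the fields patched above compose to 0x0000100f, the canonical FENCE.I encoding (imm[11:0] = 0, rs1 = x0, funct3 = 001, rd = x0, opcode = MISC-MEM). A minimal standalone sketch, with patch_bits as a hypothetical stand-in for Assembler::patch:

  #include <cassert>
  #include <cstdint>

  // Write 'value' into bits [hi:lo] of 'insn', mirroring what Assembler::patch does.
  static void patch_bits(uint32_t& insn, int hi, int lo, uint32_t value) {
    const uint32_t mask = ((1u << (hi - lo + 1)) - 1u) << lo;
    insn = (insn & ~mask) | ((value << lo) & mask);
  }

  int main() {
    uint32_t insn = 0;
    patch_bits(insn, 6, 0, 0b0001111);        // opcode (MISC-MEM)
    patch_bits(insn, 11, 7, 0b00000);         // rd = x0
    patch_bits(insn, 14, 12, 0b001);          // funct3 = FENCE.I
    patch_bits(insn, 19, 15, 0b00000);        // rs1 = x0
    patch_bits(insn, 31, 20, 0b000000000000); // imm[11:0], reserved zero
    assert(insn == 0x0000100f);               // the canonical fence.i word
    return 0;
  }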
#define INSN(NAME, op, funct3, funct7) \
void NAME() { \
unsigned insn = 0; \


@@ -636,8 +636,20 @@ void ZBarrierSetAssembler::patch_barrier_relocation(address addr, int format) {
ShouldNotReachHere();
}
// If we are using UseCtxFencei, no ICache invalidation is needed here.
// Instead, every hart will perform a fence.i, either executed by a Java thread
// (the patching epoch will take it to the slow path)
// or by the kernel when a Java thread is migrated to a hart.
// The instruction stream changes must only happen before the disarm of
// the nmethod barrier, and the disarm has a leading full two-way fence.
// If this is performed during a safepoint, all Java threads will emit a fence.i
// before transitioning to 'Java', e.g. when leaving native or the safepoint wait barrier.
if (!UseCtxFencei) {
// ICache invalidation is a serialization point.
// The above patching of instructions happens before the invalidation.
// Hence it has a leading full two-way fence (wr, wr).
ICache::invalidate_range(addr, bytes);
}
}
#ifdef COMPILER2


@@ -122,6 +122,8 @@ define_pd_global(intx, InlineSmallCode, 1000);
product(bool, UseRVVForBigIntegerShiftIntrinsics, true, \
"Use RVV instructions for left/right shift of BigInteger") \
product(bool, UseTrampolines, false, EXPERIMENTAL, \
"Far calls use jal to trampolines.") \
product(bool, UseCtxFencei, false, EXPERIMENTAL, \
"Use PR_RISCV_CTX_SW_FENCEI_ON to avoid explicit icache flush")
#endif // CPU_RISCV_GLOBALS_RISCV_HPP
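Being an EXPERIMENTAL flag, UseCtxFencei has to be unlocked explicitly on the command line, e.g.:

  java -XX:+UnlockExperimentalVMOptions -XX:+UseCtxFencei ...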


@@ -3159,6 +3159,13 @@ void MacroAssembler::membar(uint32_t order_constraint) {
}
}
void MacroAssembler::cmodx_fence() {
BLOCK_COMMENT("cmodx fence");
if (VM_Version::supports_fencei_barrier()) {
Assembler::fencei();
}
}
// Form an address from base + offset in Rd. Rd may or may not
// actually be used: you must use the Address that is returned. It
// is up to you to ensure that the shift provided matches the size


@@ -431,6 +431,8 @@ class MacroAssembler: public Assembler {
}
}
void cmodx_fence();
void pause() {
Assembler::fence(w, 0);
}


@@ -55,7 +55,21 @@ void Relocation::pd_set_data_value(address x, bool verify_only) {
bytes = MacroAssembler::pd_patch_instruction_size(addr(), x);
break;
}
// If we are using UseCtxFencei, no ICache invalidation is needed here.
// Instead, every hart will perform a fence.i, either executed by a Java thread
// (the patching epoch will take it to the slow path)
// or by the kernel when a Java thread is migrated to a hart.
// The instruction stream changes must only happen before the disarm of
// the nmethod barrier, and the disarm has a leading full two-way fence.
// If this is performed during a safepoint, all Java threads will emit a fence.i
// before transitioning to 'Java', e.g. when leaving native or the safepoint wait barrier.
if (!UseCtxFencei) {
// ICache invalidation is a serialization point.
// The above patching of instructions happens before the invalidation.
// Hence it has a leading full two-way fence (wr, wr).
ICache::invalidate_range(addr(), bytes);
}
}
address Relocation::pd_call_destination(address orig_addr) {


@@ -2428,6 +2428,14 @@ class StubGenerator: public StubCodeGenerator {
__ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
__ lwu(t1, t1);
__ sw(t1, thread_epoch_addr);
// There are two ways this can work:
// - The writer did a system-wide icache shootdown after the instruction stream update.
//   Hence we do nothing here.
// - The writer trusts us to make sure our icache is in sync before entering.
//   Hence we use the cmodx fence (fence.i, may change).
if (UseCtxFencei) {
__ cmodx_fence();
}
__ membar(__ LoadLoad);
}


@@ -285,6 +285,7 @@ class VM_Version : public Abstract_VM_Version {
// RISCV64 supports fast class initialization checks
static bool supports_fast_class_init_checks() { return true; }
static bool supports_fencei_barrier() { return ext_Zifencei.enabled(); }
};
#endif // CPU_RISCV_VM_VERSION_RISCV_HPP


@@ -54,6 +54,24 @@ inline void OrderAccess::fence() {
}
inline void OrderAccess::cross_modify_fence_impl() {
// From the RISC-V ISA manual, Chapter 3 “Zifencei” Instruction-Fetch Fence, Version 2.0:
// "RISC-V does not guarantee that stores to instruction memory will be made
// visible to instruction fetches on a RISC-V hart until that hart executes a
// FENCE.I instruction. A FENCE.I instruction ensures that a subsequent
// instruction fetch on a RISC-V hart will see any previous data stores
// already visible to the same RISC-V hart. FENCE.I does not ensure that other
// RISC-V harts' instruction fetches will observe the local hart's stores in a
// multiprocessor system."
//
// Hence, to be able to use fence.i directly, we need a kernel that supports
// PR_RISCV_CTX_SW_FENCEI_ON. Then, if we are context-switched to another hart,
// we are ensured that instruction fetches will see any previous data stores.
//
// The alternative is using a full system IPI (system-wide icache sync); then
// this barrier is not strictly needed. As this is emitted in a runtime slow path,
// we just always emit it, typically after a safepoint.
guarantee(VM_Version::supports_fencei_barrier(), "Linux kernel requires fence.i");
__asm__ volatile("fence.i" : : : "memory");
}
#endif // OS_CPU_LINUX_RISCV_ORDERACCESS_LINUX_RISCV_HPP
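A minimal user-space sketch of the code-modification pattern this fence supports, assuming a riscv64 Linux host (the instruction words are hand-encoded RV64I): store new instructions, execute fence.i, then run them on the same hart:

  #include <cstdint>
  #include <cstring>
  #include <sys/mman.h>

  int main() {
    // li a0, 42 (addi a0, x0, 42) followed by ret (jalr x0, 0(ra))
    const uint32_t code[] = {0x02a00513, 0x00008067};
    void* mem = mmap(nullptr, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mem == MAP_FAILED) return 1;
    std::memcpy(mem, code, sizeof(code));       // data stores to instruction memory
    __asm__ volatile("fence.i" : : : "memory"); // make the stores visible to this hart's fetches
    int result = reinterpret_cast<int (*)()>(mem)();
    return result == 42 ? 0 : 2;
  }

Under UseCtxFencei, the kernel's context-switch fence.i plus the fence.i each Java thread emits here in cross_modify_fence_impl replace the global icache shootdown.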


@@ -35,6 +35,7 @@
#include <asm/hwcap.h>
#include <ctype.h>
#include <sys/auxv.h>
#include <sys/prctl.h>
#ifndef HWCAP_ISA_I
#define HWCAP_ISA_I nth_bit('I' - 'A')
@@ -82,6 +83,23 @@
__v; \
})
// The prctl option PR_RISCV_SET_ICACHE_FLUSH_CTX is available from Linux 6.9.
#ifndef PR_RISCV_SET_ICACHE_FLUSH_CTX
#define PR_RISCV_SET_ICACHE_FLUSH_CTX 71
#endif
#ifndef PR_RISCV_CTX_SW_FENCEI_ON
#define PR_RISCV_CTX_SW_FENCEI_ON 0
#endif
#ifndef PR_RISCV_CTX_SW_FENCEI_OFF
#define PR_RISCV_CTX_SW_FENCEI_OFF 1
#endif
#ifndef PR_RISCV_SCOPE_PER_PROCESS
#define PR_RISCV_SCOPE_PER_PROCESS 0
#endif
#ifndef PR_RISCV_SCOPE_PER_THREAD
#define PR_RISCV_SCOPE_PER_THREAD 1
#endif
uint32_t VM_Version::cpu_vector_length() {
assert(ext_V.enabled(), "should not call this");
return (uint32_t)read_csr(CSR_VLENB);
@@ -102,6 +120,7 @@ void VM_Version::setup_cpu_available_features() {
if (!RiscvHwprobe::probe_features()) {
os_aux_features();
}
char* uarch = os_uarch_additional_features();
vendor_features();
@@ -155,6 +174,24 @@
i++;
}
// The Linux kernel requires Zifencei.
if (!ext_Zifencei.enabled()) {
log_info(os, cpu)("Zifencei not found, required by Linux, enabling.");
ext_Zifencei.enable_feature();
}
if (UseCtxFencei) {
// Note that we could set this up only for the affected threads
// via PR_RISCV_SCOPE_PER_THREAD, i.e. on VM attach/detach.
int ret = prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX, PR_RISCV_CTX_SW_FENCEI_ON, PR_RISCV_SCOPE_PER_PROCESS);
if (ret == 0) {
log_debug(os, cpu)("UseCtxFencei (PR_RISCV_CTX_SW_FENCEI_ON) enabled.");
} else {
FLAG_SET_ERGO(UseCtxFencei, false);
log_info(os, cpu)("UseCtxFencei (PR_RISCV_CTX_SW_FENCEI_ON) disabled, unsupported by kernel.");
}
}
_features_string = os::strdup(buf);
}
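For reference, the kernel probe above can be exercised from a standalone program; a minimal sketch, assuming a riscv64 Linux host (the fallback constants mirror the #defines added earlier in this file):

  #include <cstdio>
  #include <sys/prctl.h>

  #ifndef PR_RISCV_SET_ICACHE_FLUSH_CTX
  #define PR_RISCV_SET_ICACHE_FLUSH_CTX 71 // added in Linux 6.9
  #endif
  #ifndef PR_RISCV_CTX_SW_FENCEI_ON
  #define PR_RISCV_CTX_SW_FENCEI_ON 0
  #endif
  #ifndef PR_RISCV_SCOPE_PER_PROCESS
  #define PR_RISCV_SCOPE_PER_PROCESS 0
  #endif

  int main() {
    // On success the kernel guarantees an instruction-fetch fence when threads
    // of this process are context-switched between harts, so userspace can skip
    // the global icache shootdown; on older kernels the call simply fails.
    if (prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX,
              PR_RISCV_CTX_SW_FENCEI_ON, PR_RISCV_SCOPE_PER_PROCESS) == 0) {
      std::puts("PR_RISCV_CTX_SW_FENCEI_ON enabled for this process");
    } else {
      std::puts("unsupported kernel: keep using explicit icache flushes");
    }
    return 0;
  }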