8290700: Optimize AArch64 nmethod entry barriers

Reviewed-by: kvn, dlong
This commit is contained in:
Erik Österlund 2022-07-25 07:08:46 +00:00
parent 852e71d9f0
commit 228e8e94fe
14 changed files with 157 additions and 58 deletions

View File

@ -1920,7 +1920,24 @@ void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
if (C->stub_function() == NULL) {
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(&_masm);
if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
// Dummy labels for just measuring the code size
Label dummy_slow_path;
Label dummy_continuation;
Label dummy_guard;
Label* slow_path = &dummy_slow_path;
Label* continuation = &dummy_continuation;
Label* guard = &dummy_guard;
if (!Compile::current()->output()->in_scratch_emit_size()) {
// Use real labels from actual stub when not emitting code for the purpose of measuring its size
C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
slow_path = &stub->slow_path();
continuation = &stub->continuation();
guard = &stub->guard();
}
// In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
bs->nmethod_entry_barrier(&_masm, slow_path, continuation, guard);
}
}
if (VerifyStackAtCalls) {

View File

@ -298,7 +298,7 @@ void C1_MacroAssembler::build_frame(int framesize, int bang_size_in_bytes) {
// Insert nmethod entry barrier into frame.
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(this);
bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */, NULL /* guard */);
}
void C1_MacroAssembler::remove_frame(int framesize) {

View File

@ -28,6 +28,7 @@
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
@ -43,6 +44,21 @@
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
// Emits the out-of-line part of a C2 nmethod entry barrier for `stub`:
// the slow-path call into the method entry barrier stub routine, a branch
// back to the stub's continuation label, and the guard word itself.
void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
// Slow path entry point, targeted by the conditional branch in the prologue.
bind(stub->slow_path());
// Call StubRoutines::aarch64::method_entry_barrier() through rscratch1.
movptr(rscratch1, (uintptr_t) StubRoutines::aarch64::method_entry_barrier());
blr(rscratch1);
// Resume execution at the continuation label once the slow path returns.
b(stub->continuation());
// The guard value lives after the branch; it is tagged with an entry_guard
// relocation so that its address can be found via the relocation info.
bind(stub->guard());
relocate(entry_guard_Relocation::spec());
emit_int32(0); // nmethod guard value
}
// Size reserved for one stub emitted by emit_entry_barrier_stub().
int C2_MacroAssembler::entry_barrier_stub_size() {
  // Six 4-byte slots. Presumably movptr expands to three instructions,
  // followed by blr, b and the 32-bit guard word -- TODO confirm against
  // MacroAssembler::movptr.
  const int slot_bytes = 4;
  const int slot_count = 6;
  return slot_bytes * slot_count;
}
// Search for str1 in str2 and return index or -1
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
Register cnt2, Register cnt1,

View File

@ -28,8 +28,8 @@
// C2_MacroAssembler contains high-level macros for C2
public:
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }
void emit_entry_barrier_stub(C2EntryBarrierStub* stub);
static int entry_barrier_stub_size();
void string_compare(Register str1, Register str2,
Register cnt1, Register cnt2, Register result,

View File

@ -246,18 +246,38 @@ void BarrierSetAssembler::clear_patching_epoch() {
_patching_epoch = 0;
}
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation, Label* guard) {
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
if (bs_nm == NULL) {
return;
}
Label skip_barrier, guard;
Label local_guard;
Label skip_barrier;
NMethodPatchingType patching_type = nmethod_patching_type();
__ ldrw(rscratch1, guard);
if (slow_path == NULL) {
guard = &local_guard;
}
if (nmethod_code_patching()) {
// If the slow path is out of line in a stub, we flip the condition
Assembler::Condition condition = slow_path == NULL ? Assembler::EQ : Assembler::NE;
Label& barrier_target = slow_path == NULL ? skip_barrier : *slow_path;
__ ldrw(rscratch1, *guard);
if (patching_type == NMethodPatchingType::stw_instruction_and_data_patch) {
// With STW patching, no data or instructions are updated concurrently,
// which means there is no real need for fencing against either data
// or instruction modifications happening concurrently. The
// instruction patching is handled with isb fences on the way back
// from the safepoint to Java. So here we can do a plain conditional
// branch with no fencing.
Address thread_disarmed_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()));
__ ldrw(rscratch2, thread_disarmed_addr);
__ cmp(rscratch1, rscratch2);
} else if (patching_type == NMethodPatchingType::conc_instruction_and_data_patch) {
// If we patch code we need both a code patching and a loadload
// fence. It's not super cheap, so we use a global epoch mechanism
// to hide them in a slow path.
@ -278,24 +298,28 @@ void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
Address thread_disarmed_and_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()));
__ ldr(rscratch2, thread_disarmed_and_epoch_addr);
__ cmp(rscratch1, rscratch2);
__ br(Assembler::EQ, skip_barrier);
} else {
assert(patching_type == NMethodPatchingType::conc_data_patch, "must be");
// Subsequent loads of oops must occur after load of guard value.
// BarrierSetNMethod::disarm sets guard with release semantics.
__ membar(__ LoadLoad);
Address thread_disarmed_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()));
__ ldrw(rscratch2, thread_disarmed_addr);
__ cmpw(rscratch1, rscratch2);
__ br(Assembler::EQ, skip_barrier);
}
__ br(condition, barrier_target);
__ movptr(rscratch1, (uintptr_t) StubRoutines::aarch64::method_entry_barrier());
__ blr(rscratch1);
__ b(skip_barrier);
if (slow_path == NULL) {
__ movptr(rscratch1, (uintptr_t) StubRoutines::aarch64::method_entry_barrier());
__ blr(rscratch1);
__ b(skip_barrier);
__ bind(guard);
__ bind(local_guard);
__ emit_int32(0); // nmethod guard value. Skipped over in common case.
__ emit_int32(0); // nmethod guard value. Skipped over in common case.
} else {
__ bind(*continuation);
}
__ bind(skip_barrier);
}

View File

@ -31,6 +31,12 @@
#include "memory/allocation.hpp"
#include "oops/access.hpp"
// Selects how nmethod entry barriers are patched, which in turn determines
// the fencing that the emitted entry barrier code needs.
enum class NMethodPatchingType {
// No data or instructions are updated concurrently; the entry barrier is a
// plain compare and conditional branch without fencing.
stw_instruction_and_data_patch,
// Code is patched concurrently; the code patching and loadload fences this
// requires are hidden in a slow path using a global epoch mechanism.
conc_instruction_and_data_patch,
// The entry barrier issues a LoadLoad membar after loading the guard value
// so that subsequent oop loads are ordered after it.
conc_data_patch
};
class BarrierSetAssembler: public CHeapObj<mtGC> {
private:
void incr_allocated_bytes(MacroAssembler* masm,
@ -68,9 +74,9 @@ public:
);
virtual void barrier_stubs_init() {}
virtual bool nmethod_code_patching() { return true; }
virtual NMethodPatchingType nmethod_patching_type() { return NMethodPatchingType::stw_instruction_and_data_patch; }
virtual void nmethod_entry_barrier(MacroAssembler* masm);
virtual void nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation, Label* guard);
virtual void c2i_entry_barrier(MacroAssembler* masm);
static address patching_epoch_addr();

View File

@ -37,29 +37,62 @@
#include "utilities/align.hpp"
#include "utilities/debug.hpp"
// Number of 4-byte slots taken by the slow path that is emitted inline with
// the entry barrier of `nm`.
static int slow_path_size(nmethod* nm) {
  if (nm->is_compiled_by_c2()) {
    // C2 moves the slow path out of line, so nothing is emitted inline.
    return 0;
  }
  return 6;
}
// Returns the (negative) offset of the entry barrier of `nm`, measured from
// the point where the frame is completed.
// The value depends on the code emitted between the end of the verified entry,
// where the entry barrier resides, and the completion of the frame. If that
// code changes, NativeNMethodBarrier::verify() will complain right away
// because it no longer finds the expected native instruction at this offset;
// the constants below then need updating.
// Note that this offset is invariant of PreserveFramePointer.
static int entry_barrier_offset(nmethod* nm) {
  BarrierSetAssembler* const assembler = BarrierSet::barrier_set()->barrier_set_assembler();
  // Count of 4-byte slots emitted inline before any inline slow path.
  int inline_slots;
  switch (assembler->nmethod_patching_type()) {
    case NMethodPatchingType::stw_instruction_and_data_patch:
      inline_slots = 4;
      break;
    case NMethodPatchingType::conc_instruction_and_data_patch:
      inline_slots = 10;
      break;
    case NMethodPatchingType::conc_data_patch:
      inline_slots = 5;
      break;
    default:
      ShouldNotReachHere();
      return 0;
  }
  return -4 * (inline_slots + slow_path_size(nm));
}
class NativeNMethodBarrier: public NativeInstruction {
address instruction_address() const { return addr_at(0); }
int guard_offset() {
BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
if (bs_asm->nmethod_code_patching()) {
return 4 * 15;
} else {
return 4 * 10;
}
// Offset of the inline guard word from the start of the entry barrier.
int local_guard_offset(nmethod* nm) {
  // entry_barrier_offset() is negative, so negating it gives the distance
  // from the barrier start to the point where the frame is complete.
  const int distance_to_frame_complete = -entry_barrier_offset(nm);
  // The guard occupies the last 4-byte slot of that range.
  return distance_to_frame_complete - 4;
}
int *guard_addr() {
return reinterpret_cast<int*>(instruction_address() + guard_offset());
// Returns the address of the guard word that belongs to the entry barrier
// of `nm`.
int *guard_addr(nmethod* nm) {
  if (!nm->is_compiled_by_c2()) {
    // The guard is emitted inline, at a fixed offset from the barrier.
    return reinterpret_cast<int*>(instruction_address() + local_guard_offset(nm));
  }

  // With C2 compiled code the guard sits out of line in a stub; locate it
  // through the entry_guard relocation recorded for it.
  for (RelocIterator iter(nm); iter.next(); ) {
    if (iter.type() != relocInfo::entry_guard_type) {
      continue;
    }
    return reinterpret_cast<int*>(iter.entry_guard_reloc()->addr());
  }

  // A C2 nmethod without an entry_guard relocation is unexpected.
  ShouldNotReachHere();
  return reinterpret_cast<int*>(instruction_address() + local_guard_offset(nm));
}
public:
int get_value() {
return Atomic::load_acquire(guard_addr());
int get_value(nmethod* nm) {
return Atomic::load_acquire(guard_addr(nm));
}
void set_value(int value) {
Atomic::release_store(guard_addr(), value);
void set_value(nmethod* nm, int value) {
Atomic::release_store(guard_addr(nm), value);
}
void verify() const;
@ -120,24 +153,8 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
new_frame->pc = SharedRuntime::get_handle_wrong_method_stub();
}
// This is the offset of the entry barrier from where the frame is completed.
// If any code changes between the end of the verified entry where the entry
// barrier resides, and the completion of the frame, then
// NativeNMethodBarrier::verify() will immediately complain when it does
// not find the expected native instruction at this offset, which needs updating.
// Note that this offset is invariant of PreserveFramePointer.
static int entry_barrier_offset() {
BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
// The constants are multiplied by 4; presumably they count 4-byte
// instruction slots -- TODO confirm.
if (bs_asm->nmethod_code_patching()) {
return -4 * 16;
} else {
return -4 * 11;
}
}
static NativeNMethodBarrier* native_nmethod_barrier(nmethod* nm) {
address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset();
address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset(nm);
NativeNMethodBarrier* barrier = reinterpret_cast<NativeNMethodBarrier*>(barrier_address);
debug_only(barrier->verify());
return barrier;
@ -160,7 +177,7 @@ void BarrierSetNMethod::disarm(nmethod* nm) {
// Disarms the nmethod guard emitted by BarrierSetAssembler::nmethod_entry_barrier.
// Symmetric "LDR; DMB ISHLD" is in the nmethod barrier.
NativeNMethodBarrier* barrier = native_nmethod_barrier(nm);
barrier->set_value(disarmed_value());
barrier->set_value(nm, disarmed_value());
}
void BarrierSetNMethod::arm(nmethod* nm, int arm_value) {
@ -180,7 +197,7 @@ void BarrierSetNMethod::arm(nmethod* nm, int arm_value) {
}
NativeNMethodBarrier* barrier = native_nmethod_barrier(nm);
barrier->set_value(arm_value);
barrier->set_value(nm, arm_value);
}
bool BarrierSetNMethod::is_armed(nmethod* nm) {
@ -189,5 +206,5 @@ bool BarrierSetNMethod::is_armed(nmethod* nm) {
}
NativeNMethodBarrier* barrier = native_nmethod_barrier(nm);
return barrier->get_value() != disarmed_value();
return barrier->get_value(nm) != disarmed_value();
}

View File

@ -62,7 +62,7 @@ public:
void iu_barrier(MacroAssembler* masm, Register dst, Register tmp);
virtual bool nmethod_code_patching() { return false; }
virtual NMethodPatchingType nmethod_patching_type() { return NMethodPatchingType::conc_data_patch; }
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, ShenandoahPreBarrierStub* stub);

View File

@ -76,7 +76,7 @@ public:
Register tmp,
Label& slowpath);
virtual bool nmethod_code_patching() { return false; }
virtual NMethodPatchingType nmethod_patching_type() { return NMethodPatchingType::conc_data_patch; }
#ifdef COMPILER1
void generate_c1_load_barrier_test(LIR_Assembler* ce,

View File

@ -4475,7 +4475,7 @@ void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
// ordered with respect to oop accesses.
// Using immediate literals would necessitate ISBs.
BarrierSet* bs = BarrierSet::barrier_set();
if ((bs->barrier_set_nmethod() != NULL && !bs->barrier_set_assembler()->nmethod_code_patching()) || !immediate) {
if ((bs->barrier_set_nmethod() != NULL && bs->barrier_set_assembler()->nmethod_patching_type() == NMethodPatchingType::conc_data_patch) || !immediate) {
address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
ldr_constant(dst, Address(dummy, rspec));
} else

View File

@ -1424,7 +1424,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
__ sub(sp, sp, stack_size - 2*wordSize);
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(masm);
bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */, NULL /* guard */);
// Frame is now completed as far as size and linkage.
int frame_complete = ((intptr_t)__ pc()) - start;

View File

@ -5145,7 +5145,7 @@ class StubGenerator: public StubCodeGenerator {
return entry;
}
address generate_method_entry_barrier() {
address generate_method_entry_barrier() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
@ -5155,10 +5155,10 @@ class StubGenerator: public StubCodeGenerator {
BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
if (bs_asm->nmethod_code_patching()) {
if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
// We can get here despite the nmethod being good, if we have not
// yet applied our cross modification fence.
// yet applied our cross modification fence (or data fence).
Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()) + 4);
__ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
__ ldrw(rscratch2, rscratch2);

View File

@ -269,6 +269,7 @@ class relocInfo {
runtime_call_w_cp_type = 14, // Runtime call which may load its target from the constant pool
data_prefix_tag = 15, // tag for a prefix (carries data arguments)
post_call_nop_type = 16, // A tag for post call nop relocations
entry_guard_type = 17, // A tag for an nmethod entry barrier guard value
type_mask = 31 // A mask which selects only the above values
};
@ -309,6 +310,7 @@ class relocInfo {
visitor(section_word) \
visitor(trampoline_stub) \
visitor(post_call_nop) \
visitor(entry_guard) \
public:
@ -883,6 +885,19 @@ public:
}
};
// Relocation that tags the location of an nmethod entry barrier guard value,
// so that the guard can be found by iterating over an nmethod's relocations.
// It declares no data members of its own.
class entry_guard_Relocation : public Relocation {
friend class RelocIterator;
public:
// Registers this relocation under relocInfo::entry_guard_type.
entry_guard_Relocation() : Relocation(relocInfo::entry_guard_type) { }
// Returns a holder with a newly constructed entry_guard_Relocation in it.
static RelocationHolder spec() {
RelocationHolder rh = newHolder();
new(rh) entry_guard_Relocation();
return rh;
}
};
// A CallRelocation always points at a call instruction.
// It is PC-relative on most machines.
class CallRelocation : public Relocation {

View File

@ -118,14 +118,18 @@ public:
// Holds the labels that connect the inline part of a C2 nmethod entry barrier
// with its out-of-line stub.
class C2EntryBarrierStub: public ResourceObj {
Label _slow_path;
Label _continuation;
Label _guard; // Used on AArch64
public:
C2EntryBarrierStub() :
_slow_path(),
_continuation() {}
_continuation(),
_guard() {}
// Label of the out-of-line slow path.
Label& slow_path() { return _slow_path; }
// Label at which execution resumes after the slow path.
Label& continuation() { return _continuation; }
// Label of the guard value.
Label& guard() { return _guard; }
};
class C2EntryBarrierStubTable {