8290700: Optimize AArch64 nmethod entry barriers

Reviewed-by: kvn, dlong
Erik Österlund 2022-07-25 07:08:46 +00:00
parent 852e71d9f0
commit 228e8e94fe
14 changed files with 157 additions and 58 deletions

@@ -1920,7 +1920,24 @@ void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   if (C->stub_function() == NULL) {
     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
-    bs->nmethod_entry_barrier(&_masm);
+    if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
+      // Dummy labels for just measuring the code size
+      Label dummy_slow_path;
+      Label dummy_continuation;
+      Label dummy_guard;
+      Label* slow_path = &dummy_slow_path;
+      Label* continuation = &dummy_continuation;
+      Label* guard = &dummy_guard;
+      if (!Compile::current()->output()->in_scratch_emit_size()) {
+        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
+        C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
+        slow_path = &stub->slow_path();
+        continuation = &stub->continuation();
+        guard = &stub->guard();
+      }
+      // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
+      bs->nmethod_entry_barrier(&_masm, slow_path, continuation, guard);
+    }
   }
 
   if (VerifyStackAtCalls) {
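Note: C2 emits every Mach node twice, once into a scratch buffer purely to measure its size and once for real. The dummy labels above keep the sizing pass from registering a C2EntryBarrierStub of its own; only the real pass adds an entry to the stub table. A minimal standalone sketch of that measure-then-emit pattern follows; the types (Emitter, BarrierStub) are illustrative stand-ins, not HotSpot classes.

    // Illustrative only: dummy labels during the sizing pass, real stub labels otherwise.
    #include <cstdio>
    #include <vector>

    struct Label { int pos = -1; };

    struct BarrierStub { Label slow_path, continuation, guard; };

    struct Emitter {
      bool in_scratch_emit_size;           // true during the sizing pass
      std::vector<BarrierStub*>* stubs;    // stub table shared with the real pass

      int emit_entry_barrier() {
        Label dummy_slow_path, dummy_continuation, dummy_guard;
        Label* slow_path = &dummy_slow_path;
        Label* continuation = &dummy_continuation;
        Label* guard = &dummy_guard;
        if (!in_scratch_emit_size) {
          // Only the real pass registers a stub to be emitted after the method body.
          BarrierStub* stub = new BarrierStub();
          stubs->push_back(stub);
          slow_path = &stub->slow_path;
          continuation = &stub->continuation;
          guard = &stub->guard;
        }
        (void)slow_path; (void)continuation; (void)guard;
        return 4 * 4;   // pretend the inline fast path is four 4-byte instructions
      }
    };

    int main() {
      std::vector<BarrierStub*> stubs;
      Emitter sizing{true, &stubs};
      Emitter real{false, &stubs};
      printf("sized %d bytes, emitted %d bytes, stubs: %zu\n",
             sizing.emit_entry_barrier(), real.emit_entry_barrier(), stubs.size());
      return 0;
    }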

@@ -298,7 +298,7 @@ void C1_MacroAssembler::build_frame(int framesize, int bang_size_in_bytes) {
   // Insert nmethod entry barrier into frame.
   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
-  bs->nmethod_entry_barrier(this);
+  bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */, NULL /* guard */);
 }
 
 void C1_MacroAssembler::remove_frame(int framesize) {

@@ -28,6 +28,7 @@
 #include "opto/c2_MacroAssembler.hpp"
 #include "opto/intrinsicnode.hpp"
 #include "opto/matcher.hpp"
+#include "opto/output.hpp"
 #include "opto/subnode.hpp"
 #include "runtime/stubRoutines.hpp"
@@ -43,6 +44,21 @@
 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 
+void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
+  bind(stub->slow_path());
+  movptr(rscratch1, (uintptr_t) StubRoutines::aarch64::method_entry_barrier());
+  blr(rscratch1);
+  b(stub->continuation());
+
+  bind(stub->guard());
+  relocate(entry_guard_Relocation::spec());
+  emit_int32(0); // nmethod guard value
+}
+
+int C2_MacroAssembler::entry_barrier_stub_size() {
+  return 4 * 6;
+}
+
 // Search for str1 in str2 and return index or -1
 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                        Register cnt2, Register cnt1,
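Note on entry_barrier_stub_size() returning 4 * 6: a plausible worst-case accounting (my reading, not spelled out in the commit) is three instructions for movptr of a 48-bit stub address (mov plus two movk), one blr, one b back to the continuation, and the 4-byte guard word, i.e. six 4-byte slots. A tiny sketch of that budget, with the breakdown labeled as an assumption:

    // Assumed breakdown of the 24-byte out-of-line stub budget; not taken from the commit itself.
    #include <cassert>

    constexpr int kInsnBytes      = 4;  // every AArch64 instruction is 4 bytes
    constexpr int kMovptrInsns    = 3;  // mov + movk + movk for a 48-bit address (worst case)
    constexpr int kBlrInsns       = 1;  // call the shared method_entry_barrier stub
    constexpr int kBranchInsns    = 1;  // b back to the continuation
    constexpr int kGuardWordSlots = 1;  // the 4-byte guard value bound at stub->guard()

    int main() {
      int stub_bytes = kInsnBytes * (kMovptrInsns + kBlrInsns + kBranchInsns + kGuardWordSlots);
      assert(stub_bytes == 4 * 6);      // matches C2_MacroAssembler::entry_barrier_stub_size()
      return 0;
    }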

@@ -28,8 +28,8 @@
 // C2_MacroAssembler contains high-level macros for C2
 
 public:
-  void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
-  static int entry_barrier_stub_size() { return 0; }
+  void emit_entry_barrier_stub(C2EntryBarrierStub* stub);
+  static int entry_barrier_stub_size();
 
   void string_compare(Register str1, Register str2,
                       Register cnt1, Register cnt2, Register result,

@@ -246,18 +246,38 @@ void BarrierSetAssembler::clear_patching_epoch() {
   _patching_epoch = 0;
 }
 
-void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
+void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation, Label* guard) {
   BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
   if (bs_nm == NULL) {
     return;
   }
 
-  Label skip_barrier, guard;
+  Label local_guard;
+  Label skip_barrier;
+  NMethodPatchingType patching_type = nmethod_patching_type();
 
-  __ ldrw(rscratch1, guard);
+  if (slow_path == NULL) {
+    guard = &local_guard;
+  }
 
-  if (nmethod_code_patching()) {
+  // If the slow path is out of line in a stub, we flip the condition
+  Assembler::Condition condition = slow_path == NULL ? Assembler::EQ : Assembler::NE;
+  Label& barrier_target = slow_path == NULL ? skip_barrier : *slow_path;
+
+  __ ldrw(rscratch1, *guard);
+  if (patching_type == NMethodPatchingType::stw_instruction_and_data_patch) {
+    // With STW patching, no data or instructions are updated concurrently,
+    // which means there isn't really any need for any fencing for neither
+    // data nor instruction modifications happening concurrently. The
+    // instruction patching is handled with isb fences on the way back
+    // from the safepoint to Java. So here we can do a plain conditional
+    // branch with no fencing.
+    Address thread_disarmed_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()));
+    __ ldrw(rscratch2, thread_disarmed_addr);
+    __ cmp(rscratch1, rscratch2);
+  } else if (patching_type == NMethodPatchingType::conc_instruction_and_data_patch) {
     // If we patch code we need both a code patching and a loadload
     // fence. It's not super cheap, so we use a global epoch mechanism
     // to hide them in a slow path.
@@ -278,24 +298,28 @@ void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
     Address thread_disarmed_and_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()));
     __ ldr(rscratch2, thread_disarmed_and_epoch_addr);
     __ cmp(rscratch1, rscratch2);
-    __ br(Assembler::EQ, skip_barrier);
   } else {
+    assert(patching_type == NMethodPatchingType::conc_data_patch, "must be");
     // Subsequent loads of oops must occur after load of guard value.
     // BarrierSetNMethod::disarm sets guard with release semantics.
     __ membar(__ LoadLoad);
     Address thread_disarmed_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()));
     __ ldrw(rscratch2, thread_disarmed_addr);
     __ cmpw(rscratch1, rscratch2);
-    __ br(Assembler::EQ, skip_barrier);
   }
+  __ br(condition, barrier_target);
 
-  __ movptr(rscratch1, (uintptr_t) StubRoutines::aarch64::method_entry_barrier());
-  __ blr(rscratch1);
-  __ b(skip_barrier);
+  if (slow_path == NULL) {
+    __ movptr(rscratch1, (uintptr_t) StubRoutines::aarch64::method_entry_barrier());
+    __ blr(rscratch1);
+    __ b(skip_barrier);
 
-  __ bind(guard);
+    __ bind(local_guard);
 
-  __ emit_int32(0);   // nmethod guard value. Skipped over in common case.
+    __ emit_int32(0); // nmethod guard value. Skipped over in common case.
+  } else {
+    __ bind(*continuation);
+  }
 
   __ bind(skip_barrier);
 }
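Note: the fast path is the same in all cases: load the guard word, load the per-thread disarmed value, compare, branch. What changes is where the slow path lives. Without a C2 stub (slow_path == NULL) the slow path is inline, so the branch skips it when the values match (EQ); with a stub the branch goes out of line when they differ (NE) and the hot path falls straight through. A standalone model of that flipped condition, with invented names:

    // Minimal model of the fast-path check and the flipped branch condition; not HotSpot code.
    #include <cstdio>

    enum class Branch { EQ_to_skip, NE_to_stub };

    // The slow path must run when the nmethod is still armed for this thread.
    bool needs_slow_path(int guard_word, int thread_disarmed_value) {
      return guard_word != thread_disarmed_value;
    }

    Branch pick_branch(bool has_out_of_line_stub) {
      // Inline slow path: branch over it when disarmed (EQ).
      // Out-of-line stub: branch to it when armed (NE), otherwise fall through.
      return has_out_of_line_stub ? Branch::NE_to_stub : Branch::EQ_to_skip;
    }

    int main() {
      printf("armed -> slow path: %d\n", needs_slow_path(1, 0));
      printf("C2 branch: %s\n", pick_branch(true) == Branch::NE_to_stub ? "NE to stub" : "EQ to skip");
      return 0;
    }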

@@ -31,6 +31,12 @@
 #include "memory/allocation.hpp"
 #include "oops/access.hpp"
 
+enum class NMethodPatchingType {
+  stw_instruction_and_data_patch,
+  conc_instruction_and_data_patch,
+  conc_data_patch
+};
+
 class BarrierSetAssembler: public CHeapObj<mtGC> {
 private:
   void incr_allocated_bytes(MacroAssembler* masm,
@@ -68,9 +74,9 @@ public:
   );
 
   virtual void barrier_stubs_init() {}
-  virtual bool nmethod_code_patching() { return true; }
-  virtual void nmethod_entry_barrier(MacroAssembler* masm);
+  virtual NMethodPatchingType nmethod_patching_type() { return NMethodPatchingType::stw_instruction_and_data_patch; }
+  virtual void nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation, Label* guard);
   virtual void c2i_entry_barrier(MacroAssembler* masm);
 
   static address patching_epoch_addr();
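Note: the NMethodPatchingType enum replaces the old nmethod_code_patching() boolean and spells out what the entry barrier has to order. The default stw_instruction_and_data_patch needs no fence on the fast path (instruction patching is covered by isb on the way back from the safepoint); conc_instruction_and_data_patch uses the patching-epoch scheme with fences hidden in the slow path; conc_data_patch, which the Shenandoah and ZGC hunks below select, only needs a LoadLoad fence before subsequent oop loads. A small illustrative mapping based on that reading:

    // Illustrative mapping from patching type to fast-path fencing; mirrors the hunks above.
    #include <cstdio>

    enum class NMethodPatchingType {
      stw_instruction_and_data_patch,   // BarrierSetAssembler default
      conc_instruction_and_data_patch,  // uses the patching-epoch scheme
      conc_data_patch                   // selected by Shenandoah and ZGC in this commit
    };

    const char* fast_path_fencing(NMethodPatchingType t) {
      switch (t) {
        case NMethodPatchingType::stw_instruction_and_data_patch:
          return "none (isb on the way back from the safepoint)";
        case NMethodPatchingType::conc_instruction_and_data_patch:
          return "64-bit guard+epoch load; fences hidden in the slow path";
        case NMethodPatchingType::conc_data_patch:
          return "LoadLoad fence before subsequent oop loads";
      }
      return "unreachable";
    }

    int main() {
      printf("%s\n", fast_path_fencing(NMethodPatchingType::conc_data_patch));
      return 0;
    }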

@@ -37,29 +37,62 @@
 #include "utilities/align.hpp"
 #include "utilities/debug.hpp"
 
+static int slow_path_size(nmethod* nm) {
+  // The slow path code is out of line with C2
+  return nm->is_compiled_by_c2() ? 0 : 6;
+}
+
+// This is the offset of the entry barrier from where the frame is completed.
+// If any code changes between the end of the verified entry where the entry
+// barrier resides, and the completion of the frame, then
+// NativeNMethodCmpBarrier::verify() will immediately complain when it does
+// not find the expected native instruction at this offset, which needs updating.
+// Note that this offset is invariant of PreserveFramePointer.
+static int entry_barrier_offset(nmethod* nm) {
+  BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
+  switch (bs_asm->nmethod_patching_type()) {
+  case NMethodPatchingType::stw_instruction_and_data_patch:
+    return -4 * (4 + slow_path_size(nm));
+  case NMethodPatchingType::conc_instruction_and_data_patch:
+    return -4 * (10 + slow_path_size(nm));
+  case NMethodPatchingType::conc_data_patch:
+    return -4 * (5 + slow_path_size(nm));
+  }
+  ShouldNotReachHere();
+  return 0;
+}
+
 class NativeNMethodBarrier: public NativeInstruction {
   address instruction_address() const { return addr_at(0); }
 
-  int guard_offset() {
-    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
-    if (bs_asm->nmethod_code_patching()) {
-      return 4 * 15;
-    } else {
-      return 4 * 10;
-    }
+  int local_guard_offset(nmethod* nm) {
+    // It's the last instruction
+    return (-entry_barrier_offset(nm)) - 4;
   }
 
-  int *guard_addr() {
-    return reinterpret_cast<int*>(instruction_address() + guard_offset());
+  int *guard_addr(nmethod* nm) {
+    if (nm->is_compiled_by_c2()) {
+      // With c2 compiled code, the guard is out-of-line in a stub
+      // We find it using the RelocIterator.
+      RelocIterator iter(nm);
+      while (iter.next()) {
+        if (iter.type() == relocInfo::entry_guard_type) {
+          entry_guard_Relocation* const reloc = iter.entry_guard_reloc();
+          return reinterpret_cast<int*>(reloc->addr());
+        }
+      }
+      ShouldNotReachHere();
+    }
+    return reinterpret_cast<int*>(instruction_address() + local_guard_offset(nm));
   }
 
 public:
-  int get_value() {
-    return Atomic::load_acquire(guard_addr());
+  int get_value(nmethod* nm) {
+    return Atomic::load_acquire(guard_addr(nm));
   }
 
-  void set_value(int value) {
-    Atomic::release_store(guard_addr(), value);
+  void set_value(nmethod* nm, int value) {
+    Atomic::release_store(guard_addr(nm), value);
   }
 
   void verify() const;
@@ -120,24 +153,8 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
   new_frame->pc = SharedRuntime::get_handle_wrong_method_stub();
 }
 
-// This is the offset of the entry barrier from where the frame is completed.
-// If any code changes between the end of the verified entry where the entry
-// barrier resides, and the completion of the frame, then
-// NativeNMethodCmpBarrier::verify() will immediately complain when it does
-// not find the expected native instruction at this offset, which needs updating.
-// Note that this offset is invariant of PreserveFramePointer.
-static int entry_barrier_offset() {
-  BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
-  if (bs_asm->nmethod_code_patching()) {
-    return -4 * 16;
-  } else {
-    return -4 * 11;
-  }
-}
-
 static NativeNMethodBarrier* native_nmethod_barrier(nmethod* nm) {
-  address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset();
+  address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset(nm);
   NativeNMethodBarrier* barrier = reinterpret_cast<NativeNMethodBarrier*>(barrier_address);
   debug_only(barrier->verify());
   return barrier;
@@ -160,7 +177,7 @@ void BarrierSetNMethod::disarm(nmethod* nm) {
   // Disarms the nmethod guard emitted by BarrierSetAssembler::nmethod_entry_barrier.
   // Symmetric "LDR; DMB ISHLD" is in the nmethod barrier.
   NativeNMethodBarrier* barrier = native_nmethod_barrier(nm);
-  barrier->set_value(disarmed_value());
+  barrier->set_value(nm, disarmed_value());
 }
 
 void BarrierSetNMethod::arm(nmethod* nm, int arm_value) {
@@ -180,7 +197,7 @@ void BarrierSetNMethod::arm(nmethod* nm, int arm_value) {
   }
 
   NativeNMethodBarrier* barrier = native_nmethod_barrier(nm);
-  barrier->set_value(arm_value);
+  barrier->set_value(nm, arm_value);
 }
 
 bool BarrierSetNMethod::is_armed(nmethod* nm) {
@@ -189,5 +206,5 @@ bool BarrierSetNMethod::is_armed(nmethod* nm) {
   }
 
   NativeNMethodBarrier* barrier = native_nmethod_barrier(nm);
-  return barrier->get_value() != disarmed_value();
+  return barrier->get_value(nm) != disarmed_value();
 }
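Note: entry_barrier_offset() now depends on the nmethod because C1 and native wrappers still carry the six-instruction slow path inline while C2 code does not, so the barrier sits correspondingly closer to the frame-complete point in C2 nmethods; the guard word likewise moves into the C2 stub and is found through the entry_guard relocation rather than a fixed offset. A standalone sketch of just the offset arithmetic from the hunk above (is_c2 stands in for nmethod::is_compiled_by_c2()):

    // Sketch of the per-nmethod offset arithmetic; values copied from the hunk above.
    #include <cstdio>

    enum class PatchingType { stw_insn_and_data, conc_insn_and_data, conc_data };

    int slow_path_size(bool is_c2) {
      return is_c2 ? 0 : 6;   // with C2 the slow path is out of line in a stub
    }

    int entry_barrier_offset(PatchingType t, bool is_c2) {
      switch (t) {
        case PatchingType::stw_insn_and_data:  return -4 * (4  + slow_path_size(is_c2));
        case PatchingType::conc_insn_and_data: return -4 * (10 + slow_path_size(is_c2));
        case PatchingType::conc_data:          return -4 * (5  + slow_path_size(is_c2));
      }
      return 0;
    }

    int main() {
      printf("stw patching: C2 %d bytes, C1 %d bytes before frame-complete\n",
             entry_barrier_offset(PatchingType::stw_insn_and_data, true),
             entry_barrier_offset(PatchingType::stw_insn_and_data, false));
      return 0;
    }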

@@ -62,7 +62,7 @@ public:
   void iu_barrier(MacroAssembler* masm, Register dst, Register tmp);
 
-  virtual bool nmethod_code_patching() { return false; }
+  virtual NMethodPatchingType nmethod_patching_type() { return NMethodPatchingType::conc_data_patch; }
 
 #ifdef COMPILER1
   void gen_pre_barrier_stub(LIR_Assembler* ce, ShenandoahPreBarrierStub* stub);

@@ -76,7 +76,7 @@ public:
                                       Register tmp,
                                       Label& slowpath);
 
-  virtual bool nmethod_code_patching() { return false; }
+  virtual NMethodPatchingType nmethod_patching_type() { return NMethodPatchingType::conc_data_patch; }
 
 #ifdef COMPILER1
   void generate_c1_load_barrier_test(LIR_Assembler* ce,

@@ -4475,7 +4475,7 @@ void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
   // ordered with respected to oop accesses.
   // Using immediate literals would necessitate ISBs.
   BarrierSet* bs = BarrierSet::barrier_set();
-  if ((bs->barrier_set_nmethod() != NULL && !bs->barrier_set_assembler()->nmethod_code_patching()) || !immediate) {
+  if ((bs->barrier_set_nmethod() != NULL && bs->barrier_set_assembler()->nmethod_patching_type() == NMethodPatchingType::conc_data_patch) || !immediate) {
     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
     ldr_constant(dst, Address(dummy, rspec));
   } else

@@ -1424,7 +1424,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
   __ sub(sp, sp, stack_size - 2*wordSize);
 
   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
-  bs->nmethod_entry_barrier(masm);
+  bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */, NULL /* guard */);
 
   // Frame is now completed as far as size and linkage.
   int frame_complete = ((intptr_t)__ pc()) - start;

@@ -5145,7 +5145,7 @@ class StubGenerator: public StubCodeGenerator {
     return entry;
   }
 
   address generate_method_entry_barrier() {
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
 
@@ -5155,10 +5155,10 @@ class StubGenerator: public StubCodeGenerator {
     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
-    if (bs_asm->nmethod_code_patching()) {
+    if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
       // We can get here despite the nmethod being good, if we have not
-      // yet applied our cross modification fence.
+      // yet applied our cross modification fence (or data fence).
       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()) + 4);
       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
       __ ldrw(rscratch2, rscratch2);

@@ -269,6 +269,7 @@ class relocInfo {
     runtime_call_w_cp_type = 14, // Runtime call which may load its target from the constant pool
     data_prefix_tag        = 15, // tag for a prefix (carries data arguments)
     post_call_nop_type     = 16, // A tag for post call nop relocations
+    entry_guard_type       = 17, // A tag for an nmethod entry barrier guard value
     type_mask              = 31  // A mask which selects only the above values
   };
 
@@ -309,6 +310,7 @@ class relocInfo {
     visitor(section_word) \
     visitor(trampoline_stub) \
     visitor(post_call_nop) \
+    visitor(entry_guard) \
 
 public:
 
@@ -883,6 +885,19 @@ public:
   }
 };
 
+class entry_guard_Relocation : public Relocation {
+  friend class RelocIterator;
+
+ public:
+  entry_guard_Relocation() : Relocation(relocInfo::entry_guard_type) { }
+
+  static RelocationHolder spec() {
+    RelocationHolder rh = newHolder();
+    new(rh) entry_guard_Relocation();
+    return rh;
+  }
+};
+
 // A CallRelocation always points at a call instruction.
 // It is PC-relative on most machines.
 class CallRelocation : public Relocation {
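Note: the new entry_guard relocation exists so the runtime can still find the guard word after it moves into the C2 stub; arm(), disarm() and is_armed() walk the nmethod's relocations and take the address recorded for the entry_guard entry. A toy model of that lookup over a flat record list; RelocRecord and RelocType are invented for illustration and are not the HotSpot RelocIterator API:

    // Toy model of locating the guard word through a relocation record; not HotSpot code.
    #include <cstdio>
    #include <vector>

    enum class RelocType { oop, call, entry_guard, post_call_nop };

    struct RelocRecord {
      RelocType type;
      int*      addr;   // address the relocation annotates
    };

    int* find_entry_guard(const std::vector<RelocRecord>& relocs) {
      for (const RelocRecord& r : relocs) {
        if (r.type == RelocType::entry_guard) {
          return r.addr;   // the 4-byte guard word emitted in the C2 stub
        }
      }
      return nullptr;      // C1/native nmethods keep the guard at a fixed offset instead
    }

    int main() {
      int guard_word = 0;
      std::vector<RelocRecord> relocs = { {RelocType::call, nullptr},
                                          {RelocType::entry_guard, &guard_word} };
      printf("guard found: %s\n", find_entry_guard(relocs) == &guard_word ? "yes" : "no");
      return 0;
    }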

@@ -118,14 +118,18 @@ public:
 class C2EntryBarrierStub: public ResourceObj {
   Label _slow_path;
   Label _continuation;
+  Label _guard; // Used on AArch64
 
 public:
   C2EntryBarrierStub() :
     _slow_path(),
-    _continuation() {}
+    _continuation(),
+    _guard() {}
 
   Label& slow_path() { return _slow_path; }
   Label& continuation() { return _continuation; }
+  Label& guard() { return _guard; }
 };
 
 class C2EntryBarrierStubTable {