8290688: Optimize x86_64 nmethod entry barriers

Reviewed-by: kvn, rrich
Author: Erik Österlund  2022-07-22 14:42:42 +00:00
parent 54854d9300
commit b28f9dab80
16 changed files with 161 additions and 15 deletions

View File

@ -28,6 +28,8 @@
 // C2_MacroAssembler contains high-level macros for C2
 public:
+  void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
+  static int entry_barrier_stub_size() { return 0; }
   void string_compare(Register str1, Register str2,
                       Register cnt1, Register cnt2, Register result,

View File

@ -28,6 +28,9 @@
 // C2_MacroAssembler contains high-level macros for C2
 public:
+  void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
+  static int entry_barrier_stub_size() { return 0; }
+
   // Compare char[] arrays aligned to 4 bytes.
   void char_arrays_equals(Register ary1, Register ary2,
                           Register limit, Register result,

View File

@ -28,6 +28,9 @@
 // C2_MacroAssembler contains high-level macros for C2
 public:
+  void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
+  static int entry_barrier_stub_size() { return 0; }
+
   // Intrinsics for CompactStrings
   // Compress char[] to byte[] by compressing 16 bytes at once.
   void string_compress_16(Register src, Register dst, Register cnt,

View File

@ -36,6 +36,8 @@
                       VectorRegister vrs,
                       bool is_latin, Label& DONE);
 public:
+  void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
+  static int entry_barrier_stub_size() { return 0; }
   void string_compare(Register str1, Register str2,
                       Register cnt1, Register cnt2, Register result,

View File

@ -29,6 +29,9 @@
 // C2_MacroAssembler contains high-level macros for C2
 public:
+  void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
+  static int entry_barrier_stub_size() { return 0; }
+
   //-------------------------------------------
   // Special String Intrinsics Implementation.
   //-------------------------------------------

View File

@ -325,7 +325,8 @@ void C1_MacroAssembler::build_frame(int frame_size_in_bytes, int bang_size_in_bytes)
   decrement(rsp, frame_size_in_bytes); // does not emit code for frame_size == 0
   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
-  bs->nmethod_entry_barrier(this);
+  // C1 code is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
+  bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
 }
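Note: the NULL/NULL arguments select the inline slow path. The two call shapes this patch introduces, side by side (a sketch; masm and stub stand for whatever assembler and C2EntryBarrierStub are in scope at the call site):

    // inline slow path: cmp + short branch + call, emitted whole in the prologue
    bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);

    // out-of-line slow path: cmp + jne to a stub placed after the method body (C2 on x86_64, below)
    bs->nmethod_entry_barrier(masm, &stub->slow_path(), &stub->continuation());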

View File

@ -30,6 +30,7 @@
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
@ -128,10 +129,38 @@ void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
   if (!is_stub) {
     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
-    bs->nmethod_entry_barrier(this);
+#ifdef _LP64
+    if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
+      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
+      Label dummy_slow_path;
+      Label dummy_continuation;
+      Label* slow_path = &dummy_slow_path;
+      Label* continuation = &dummy_continuation;
+      if (!Compile::current()->output()->in_scratch_emit_size()) {
+        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
+        C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
+        slow_path = &stub->slow_path();
+        continuation = &stub->continuation();
+      }
+      bs->nmethod_entry_barrier(this, slow_path, continuation);
+    }
+#else
+    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
+    bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
+#endif
   }
 }
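Note: the dummy labels exist because C2 first emits each instruction into a scratch buffer purely to measure its size (in_scratch_emit_size() is true there) and only afterwards into the real buffer. The sizing pass must not touch the real stub table, both because the scratch buffer is thrown away and because add_entry_barrier() asserts that only one stub is ever registered per compile. Condensed, the guard is:

    Label dummy;                  // throwaway target, used only while measuring code size
    Label* slow_path = &dummy;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // only the real emission pass may allocate the stub and use its labels
      slow_path = &Compile::current()->output()->entry_barrier_table()->add_entry_barrier()->slow_path();
    }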
+
+void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
+  bind(stub->slow_path());
+  call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
+  jmp(stub->continuation(), false /* maybe_short */);
+}
+
+int C2_MacroAssembler::entry_barrier_stub_size() {
+  return 10;
+}
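Note: the constant 10 can be derived by hand, assuming the standard x86_64 near encodings: call rel32 is 5 bytes (E8 + disp32) and jmp rel32 another 5 (E9 + disp32); passing false for maybe_short forces the near jmp form, which is what makes the stub size deterministic. Combined with verified_entry above, the emitted shape is (label names from this patch):

    //   cmp dword ptr [r15 + disarmed_offset], 0  ; prologue fast path
    //   jne slow_path                             ; taken only while the nmethod is armed
    // continuation:
    //   ... method body ...
    // slow_path:                                  ; out-of-line, after the method body
    //   call StubRoutines::x86::method_entry_barrier  ; 5 bytes
    //   jmp continuation                              ; 5 bytes -> entry_barrier_stub_size() == 10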
 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
   switch (vlen_in_bytes) {
     case 4: // fall-through

View File

@ -31,6 +31,9 @@ public:
   // C2 compiled method's prolog code.
   void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub);
+  void emit_entry_barrier_stub(C2EntryBarrierStub* stub);
+  static int entry_barrier_stub_size();
+
   Assembler::AvxVectorLen vector_length_encoding(int vlen_in_bytes);
   // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.

View File

@ -309,22 +309,34 @@ void BarrierSetAssembler::incr_allocated_bytes(MacroAssembler* masm, Register th
 }

 #ifdef _LP64
-void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
+void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation) {
   BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
   if (bs_nm == NULL) {
     return;
   }
-  Label continuation;
   Register thread = r15_thread;
   Address disarmed_addr(thread, in_bytes(bs_nm->thread_disarmed_offset()));
-  __ align(8);
+  // The immediate is the last 4 bytes, so if we align the start of the cmp
+  // instruction to 4 bytes, we know that the second half of it is also 4
+  // byte aligned, which means that the immediate will not cross a cache line
+  __ align(4);
+  uintptr_t before_cmp = (uintptr_t)__ pc();
   __ cmpl(disarmed_addr, 0);
-  __ jcc(Assembler::equal, continuation);
-  __ call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
-  __ bind(continuation);
+  uintptr_t after_cmp = (uintptr_t)__ pc();
+  guarantee(after_cmp - before_cmp == 8, "Wrong assumed instruction length");
+
+  if (slow_path != NULL) {
+    __ jcc(Assembler::notEqual, *slow_path);
+    __ bind(*continuation);
+  } else {
+    Label done;
+    __ jccb(Assembler::equal, done);
+    __ call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
+    __ bind(done);
+  }
 }
 #else
-void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
+void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Label*, Label*) {
   BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
   if (bs_nm == NULL) {
     return;
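Note: the hard-coded 8 in the guarantee follows from the encoding. With the disarm word at a disp8 offset from r15 (which the guarantee effectively enforces), cmpl(disarmed_addr, 0) assembles to REX.B 81 /7 disp8 imm32: 8 bytes, with the 32-bit immediate in the last 4. That is also why align(8) could be relaxed to align(4): a 4-byte-aligned 4-byte immediate can never straddle a cache line, so arming and disarming remain a single aligned 32-bit store that other threads observe atomically. A byte-layout sketch (the disp8 and immediate values are illustrative only):

    0x00:  41 81 7F 30    ; cmp dword ptr [r15+0x30], imm32  (REX.B + 81 /7 + disp8)
    0x04:  NN NN NN NN    ; imm32, 4-byte aligned -> patchable with one aligned store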

View File

@ -68,7 +68,7 @@ public:
   virtual void barrier_stubs_init() {}

-  virtual void nmethod_entry_barrier(MacroAssembler* masm);
+  virtual void nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation);
   virtual void c2i_entry_barrier(MacroAssembler* masm);
 };

View File

@ -32,6 +32,7 @@
#include "runtime/sharedRuntime.hpp"
#include "utilities/align.hpp"
#include "utilities/debug.hpp"
#include "utilities/macros.hpp"
class NativeNMethodCmpBarrier: public NativeInstruction {
public:
@ -62,7 +63,7 @@ public:
 #ifdef _LP64
 void NativeNMethodCmpBarrier::verify() const {
-  if (((uintptr_t) instruction_address()) & 0x7) {
+  if (((uintptr_t) instruction_address()) & 0x3) {
     fatal("Not properly aligned");
   }
@ -156,10 +157,20 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
 // NativeNMethodCmpBarrier::verify() will immediately complain when it does
 // not find the expected native instruction at this offset, which needs updating.
 // Note that this offset is invariant of PreserveFramePointer.
-static const int entry_barrier_offset = LP64_ONLY(-19) NOT_LP64(-18);
+static int entry_barrier_offset(nmethod* nm) {
+#ifdef _LP64
+  if (nm->is_compiled_by_c2()) {
+    return -14;
+  } else {
+    return -15;
+  }
+#else
+  return -18;
+#endif
+}
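Note: the new constants can be cross-checked against the sequences emitted above, counting bytes from the start of the cmp back from the frame-complete point (a jcc to a not-yet-bound label uses the 6-byte rel32 form):

    // C2, out-of-line stub:    cmp (8) + jne rel32 (6)             = 14  -> -14
    // C1/native, inline:       cmp (8) + jccb rel8 (2) + call (5)  = 15  -> -15
    // previous x86_64 layout:  cmp (8) + jcc rel32 (6) + call (5)  = 19  -> the old -19

The split matters because the barrier is located purely by this offset arithmetic in native_nmethod_barrier() below, hence the branch on nm->is_compiled_by_c2().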
 static NativeNMethodCmpBarrier* native_nmethod_barrier(nmethod* nm) {
-  address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset;
+  address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset(nm);
   NativeNMethodCmpBarrier* barrier = reinterpret_cast<NativeNMethodCmpBarrier*>(barrier_address);
   debug_only(barrier->verify());
   return barrier;

View File

@ -1518,7 +1518,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
-  bs->nmethod_entry_barrier(masm);
+  bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);

   // Frame is now completed as far as size and linkage.
   int frame_complete = ((intptr_t)__ pc()) - start;

View File

@ -1744,7 +1744,8 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
   __ subptr(rsp, stack_size - 2*wordSize);

   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
-  bs->nmethod_entry_barrier(masm);
+  // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
+  bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);

   // Frame is now completed as far as size and linkage.
   int frame_complete = ((intptr_t)__ pc()) - start;

View File

@ -29,6 +29,8 @@
#include "asm/macroAssembler.inline.hpp"
#include "utilities/macros.hpp"
class C2EntryBarrierStub;
class C2_MacroAssembler: public MacroAssembler {
public:
// creation

View File

@ -39,6 +39,7 @@
#include "opto/ad.hpp"
#include "opto/block.hpp"
#include "opto/c2compiler.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/callnode.hpp"
#include "opto/cfgnode.hpp"
#include "opto/locknode.hpp"
@ -284,12 +285,51 @@ int C2SafepointPollStubTable::estimate_stub_size() const {
   return result;
 }

+// Nmethod entry barrier stubs
+C2EntryBarrierStub* C2EntryBarrierStubTable::add_entry_barrier() {
+  assert(_stub == NULL, "There can only be one entry barrier stub");
+  _stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
+  return _stub;
+}
+
+void C2EntryBarrierStubTable::emit(CodeBuffer& cb) {
+  if (_stub == NULL) {
+    // No stub - nothing to do
+    return;
+  }
+
+  C2_MacroAssembler masm(&cb);
+
+  // Make sure there is enough space in the code buffer
+  if (cb.insts()->maybe_expand_to_ensure_remaining(PhaseOutput::MAX_inst_size) && cb.blob() == NULL) {
+    ciEnv::current()->record_failure("CodeCache is full");
+    return;
+  }
+
+  intptr_t before = masm.offset();
+  masm.emit_entry_barrier_stub(_stub);
+  intptr_t after = masm.offset();
+  int actual_size = (int)(after - before);
+  int expected_size = masm.entry_barrier_stub_size();
+  assert(actual_size == expected_size, "Estimated size is wrong, expected %d, was %d", expected_size, actual_size);
+}
+
+int C2EntryBarrierStubTable::estimate_stub_size() const {
+  if (BarrierSet::barrier_set()->barrier_set_nmethod() == NULL) {
+    // No nmethod entry barrier?
+    return 0;
+  }
+
+  return C2_MacroAssembler::entry_barrier_stub_size();
+}
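Note the contract between these two functions: estimate_stub_size() feeds the code-buffer sizing in init_buffer() (next hunk), and emit() asserts that the stub occupied exactly the estimated number of bytes, so any drift between entry_barrier_stub_size() and emit_entry_barrier_stub() fails fast in debug builds rather than silently undersizing the buffer. In sketch form:

    stub_req += entry_barrier_table()->estimate_stub_size();  // sizing pass: 10 on x86_64, 0 without a barrier set
    entry_barrier_table()->emit(*cb);                         // emission pass: assert(actual_size == expected_size)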
 PhaseOutput::PhaseOutput()
   : Phase(Phase::Output),
     _code_buffer("Compile::Fill_buffer"),
     _first_block_size(0),
     _handler_table(),
     _inc_table(),
     _safepoint_poll_table(),
+    _entry_barrier_table(),
     _oop_map_set(NULL),
     _scratch_buffer_blob(NULL),
     _scratch_locs_memory(NULL),
@ -1302,6 +1342,7 @@ CodeBuffer* PhaseOutput::init_buffer() {
   BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2();
   stub_req += bs->estimate_stub_size();
   stub_req += safepoint_poll_table()->estimate_stub_size();
+  stub_req += entry_barrier_table()->estimate_stub_size();

   // nmethod and CodeBuffer count stubs & constants as part of method's code.
   // class HandlerImpl is platform-specific and defined in the *.ad files.
@ -1812,6 +1853,10 @@ void PhaseOutput::fill_buffer(CodeBuffer* cb, uint* blk_starts) {
   safepoint_poll_table()->emit(*cb);
   if (C->failing()) return;

+  // Fill in stubs for calling the runtime from nmethod entries.
+  entry_barrier_table()->emit(*cb);
+  if (C->failing()) return;
+
 #ifndef PRODUCT
   // Information on the size of the method, without the extraneous code
   Scheduling::increment_method_size(cb->insts_size());

View File

@ -40,6 +40,7 @@ class Arena;
 class Bundle;
 class Block;
 class Block_Array;
+class C2_MacroAssembler;
 class ciMethod;
 class Compile;
 class MachNode;
@ -113,6 +114,30 @@ public:
   void emit(CodeBuffer& cb);
 };

+// We move non-hot code of the nmethod entry barrier to an out-of-line stub
+class C2EntryBarrierStub: public ResourceObj {
+  Label _slow_path;
+  Label _continuation;
+
+ public:
+  C2EntryBarrierStub() :
+    _slow_path(),
+    _continuation() {}
+
+  Label& slow_path() { return _slow_path; }
+  Label& continuation() { return _continuation; }
+};
+
+class C2EntryBarrierStubTable {
+  C2EntryBarrierStub* _stub;
+
+ public:
+  C2EntryBarrierStubTable() : _stub(NULL) {}
+
+  C2EntryBarrierStub* add_entry_barrier();
+  int estimate_stub_size() const;
+  void emit(CodeBuffer& cb);
+};
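Note: taken together, the single stub has a two-phase lifecycle. A condensed sketch using only names introduced in this patch (output, masm, bs and cb stand for the objects in scope at each site):

    // 1. prologue emission (C2_MacroAssembler::verified_entry):
    C2EntryBarrierStub* stub = output->entry_barrier_table()->add_entry_barrier();
    bs->nmethod_entry_barrier(masm, &stub->slow_path(), &stub->continuation());

    // 2. end of PhaseOutput::fill_buffer:
    output->entry_barrier_table()->emit(*cb);  // binds slow_path, emits call + jmp back to continuation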
 class PhaseOutput : public Phase {
  private:
   // Instruction bits passed off to the VM
@ -122,6 +147,7 @@ private:
   ExceptionHandlerTable _handler_table;    // Table of native-code exception handlers
   ImplicitExceptionTable _inc_table;       // Table of implicit null checks in native code
   C2SafepointPollStubTable _safepoint_poll_table; // Table for safepoint polls
+  C2EntryBarrierStubTable _entry_barrier_table;   // Table for entry barrier stubs
   OopMapSet* _oop_map_set;                 // Table of oop maps (one for each safepoint location)
   BufferBlob* _scratch_buffer_blob;        // For temporary code buffers.
   relocInfo* _scratch_locs_memory;         // For temporary code buffers.
@ -172,6 +198,9 @@ public:
   // Safepoint poll table
   C2SafepointPollStubTable* safepoint_poll_table() { return &_safepoint_poll_table; }

+  // Entry barrier table
+  C2EntryBarrierStubTable* entry_barrier_table() { return &_entry_barrier_table; }
+
   // Code emission iterator
   Block* block() { return _block; }
   int index() { return _index; }