8331117: [PPC64] secondary_super_cache does not scale well

Reviewed-by: rrich, amitkumar
This commit is contained in:
Martin Doerr 2024-06-17 09:30:48 +00:00
parent 113a2c028d
commit 0d1080d194
6 changed files with 421 additions and 0 deletions

@ -2130,6 +2130,295 @@ void MacroAssembler::check_klass_subtype(Register sub_klass,
bind(L_failure); // Fallthru if not successful.
}
// Emits a linear search over 'count' pointer-sized words starting at [addr],
// looking for 'value'. 'count' must be > 0 (checked in debug builds only).
//
// State after the emitted code has run:
//   found:     CR0 is 'eq', scratch == 0, addr points at the matching word
//   not found: scratch != 0 (xor result of the last word compared),
//              addr has been advanced past the last word
// CTR is used as the loop counter and is therefore overwritten.
void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) {
#ifdef ASSERT
  {
    // Debug-only guard: a non-positive count stops the VM.
    Label L_count_ok;
    cmpdi(CCR0, count, 0);
    bgt(CCR0, L_count_ok);
    stop("count must be positive");
    bind(L_count_ok);
  }
#endif

  Label L_next_word;
  Label L_finished;

  // Number of words to inspect goes into CTR.
  mtctr(count);

  bind(L_next_word);
  // scratch = *addr ^ value; the record form sets CR0 'eq' on a match.
  ld(scratch, 0, addr);
  xor_(scratch, scratch, value);
  beq(CCR0, L_finished);
  // No match: step to the next word and loop while CTR is not exhausted.
  addi(addr, addr, wordSize);
  bdnz(L_next_word);

  bind(L_finished);
}
// Ensure that the inline code and the stub are using the same registers.
//
// The macro expands to an assert over the names r_super_klass, r_array_base,
// r_array_length, r_array_index, r_sub_klass, r_bitmap and result, so all of
// them must be declared in the scope where it is used. A role that a
// particular function does not use may be declared as noreg (accepted for
// r_array_index, r_sub_klass, r_bitmap and result).
// The fixed assignment is needed because the inline lookup code, the lookup
// stubs and the slow-path stub hand values to each other in registers.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \
do { \
assert(r_super_klass == R4_ARG2 && \
r_array_base == R3_ARG1 && \
r_array_length == R7_ARG5 && \
(r_array_index == R6_ARG4 || r_array_index == noreg) && \
(r_sub_klass == R5_ARG3 || r_sub_klass == noreg) && \
(r_bitmap == R11_scratch1 || r_bitmap == noreg) && \
(result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \
} while(0)
// Emits the hashed secondary-supers lookup for a super klass whose hash slot
// (super_klass_slot) is known at code generation time.
//
// This function itself returns nothing. The emitted code leaves its answer in
// 'result': zero if r_super_klass was found among the secondary supers of
// r_sub_klass, non-zero otherwise.
//
// All registers must be exactly the ones checked by
// LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS, because hash collisions are
// resolved by calling the shared slow-path stub, which takes its arguments in
// fixed registers.
//
// The emitted code overwrites temp1..temp4, result, R0 (temp for the stub
// address computation), CR0 and, when the slow path is taken, CTR.
// NOTE(review): the slow-path call is a bctrl (branch and link), so LR is
// presumably overwritten on that path as well - confirm every caller copes.
void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register temp1,
Register temp2,
Register temp3,
Register temp4,
Register result,
u1 super_klass_slot) {
assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
Label L_done;
BLOCK_COMMENT("lookup_secondary_supers_table {");
// Give the temps the role names that the register check macro expects.
const Register
r_array_base = temp1,
r_array_length = temp2,
r_array_index = temp3,
r_bitmap = temp4;
LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
ld(r_bitmap, in_bytes(Klass::bitmap_offset()), r_sub_klass);
// First check the bitmap to see if super_klass might be present. If
// the bit is zero, we are certain that super_klass is not one of
// the secondary supers.
u1 bit = super_klass_slot;
// Shifting left by (MASK - bit) moves the bitmap bit of this slot into the
// most significant bit and drops all higher-numbered bitmap bits.
int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
// if (shift_count == 0) this is used for comparing with 0:
// (the record form sets CR0 from the shifted value in either case)
sldi_(r_array_index, r_bitmap, shift_count);
li(result, 1); // failure
// We test the MSB of r_array_index, i.e. its sign bit
bge(CCR0, L_done);
// We will consult the secondary-super array.
ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
// The value i in r_array_index is >= 1, so even though r_array_base
// points to the length, we don't need to adjust it to point to the
// data.
assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
// Get the first array index that can contain super_klass.
if (bit != 0) {
// Count the set bitmap bits up to and including this slot's bit. The
// slot's own bit is known to be set here, hence the count is index + 1.
popcntd(r_array_index, r_array_index);
// NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
ldx(result, r_array_base, r_array_index);
} else {
// Actually use index 0, but r_array_base and r_array_index are off by 1 word
// such that the sum is precise.
ld(result, BytesPerWord, r_array_base);
li(r_array_index, BytesPerWord); // for slow path (scaled)
}
// result = entry ^ super_klass; the record form sets CR0 'eq' on a match.
xor_(result, result, r_super_klass);
beq(CCR0, L_done); // Found a match (result == 0)
// Is there another entry to check? Consult the bitmap.
// r_array_length is not live yet and only serves as a temp for the bit test.
testbitdi(CCR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
beq(CCR0, L_done); // (result != 0)
// Linear probe. Rotate the bitmap so that the next bit to test is
// in Bit 2 for the look-ahead check in the slow path.
// (Rotating left by 64 - bit moves bitmap bit 'bit' to position 0; nothing
// to do if bit == 0.)
if (bit != 0) {
rldicl(r_bitmap, r_bitmap, 64 - bit, 0);
}
// Calls into the stub generated by lookup_secondary_supers_table_slow_path.
// Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
// Kills: r_array_length.
// Returns: result.
address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub();
// r_array_length is still free here, so it can carry the stub address.
Register r_stub_addr = r_array_length;
add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0);
mtctr(r_stub_addr);
bctrl();
bind(L_done);
BLOCK_COMMENT("} lookup_secondary_supers_table");
// Optional self check: compare the hashed result against a linear scan.
if (VerifySecondarySupers) {
verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
temp1, temp2, temp3);
}
}
// Emits the out-of-line slow path of the hashed secondary-supers lookup.
// The code generated by lookup_secondary_supers_table calls it (through the
// slow-path stub) when there is a collision in the hashed lookup, i.e. the
// first probed slot did not match and the following bitmap bit is set.
//
// In:    r_super_klass - klass to search for
//        r_array_base  - secondary supers array, still pointing at its header
//        r_array_index - scaled (byte) index as left by the caller
//        r_bitmap      - bitmap, rotated by the caller for the look-ahead check
// Out:   result == 0 if r_super_klass was found, result != 0 otherwise
// Kills: temp1 (array length / index limit), R0 and CR0. r_array_base,
//        r_array_index and r_bitmap are modified. The linear scan used for
//        the degenerate case additionally uses CTR.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
Register r_array_base,
Register r_array_index,
Register r_bitmap,
Register result,
Register temp1) {
assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
// Role names required by the register check macro; the sub klass is not
// needed on this path.
const Register
r_array_length = temp1,
r_sub_klass = noreg;
LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
Label L_done;
// Load the array length.
lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
// And adjust the array base to point to the data.
// NB! Effectively increments current slot index by 1.
assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
// Linear probe
Label L_huge;
// The bitmap is full to bursting.
// Implicit invariant: BITMAP_FULL implies (length > 0)
assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "");
// An all-ones bitmap selects the plain linear scan at L_huge.
cmpdi(CCR0, r_bitmap, -1);
beq(CCR0, L_huge);
// NB! Our caller has checked bits 0 and 1 in the bitmap. The
// current slot (at secondary_supers[r_array_index]) has not yet
// been inspected, and r_array_index may be out of bounds if we
// wrapped around the end of the array.
{ // This is conventional linear probing, but instead of terminating
// when a null entry is found in the table, we maintain a bitmap
// in which a 0 indicates missing entries.
// The check above guarantees there are 0s in the bitmap, so the loop
// eventually terminates.
#ifdef ASSERT
{
// We should only reach here after having found a bit in the bitmap.
// Invariant: array_length == popcount(bitmap)
Label ok;
cmpdi(CCR0, r_array_length, 0);
bgt(CCR0, ok);
stop("array_length must be positive");
bind(ok);
}
#endif
// Compute limit in r_array_length
// (the scaled byte offset of the last element: (length - 1) * wordSize)
addi(r_array_length, r_array_length, -1);
sldi(r_array_length, r_array_length, LogBytesPerWord);
Label L_loop;
bind(L_loop);
// Check for wraparound.
// NOTE(review): isel_0 presumably resets r_array_index to 0 when it is
// greater than the limit - confirm against the isel_0 definition.
cmpd(CCR0, r_array_index, r_array_length);
isel_0(r_array_index, CCR0, Assembler::greater);
ldx(result, r_array_base, r_array_index);
xor_(result, result, r_super_klass);
beq(CCR0, L_done); // success (result == 0)
// look-ahead check (Bit 2); result is non-zero
testbitdi(CCR0, R0, r_bitmap, 2);
beq(CCR0, L_done); // fail (result != 0)
// Advance: rotate the bitmap by one position and step to the next slot.
rldicl(r_bitmap, r_bitmap, 64 - 1, 0);
addi(r_array_index, r_array_index, BytesPerWord);
b(L_loop);
}
{ // Degenerate case: more than 64 secondary supers.
// FIXME: We could do something smarter here, maybe a vectorized
// comparison or a binary search, but is that worth any added
// complexity?
bind(L_huge);
// 'result' doubles as the scan's scratch register: 0 iff found.
repne_scan(r_array_base, r_super_klass, r_array_length, result);
}
bind(L_done);
}
// Make sure that the hashed lookup and a linear scan agree.
//
// Emits code that repeats the search for r_super_klass with a plain linear
// scan over the secondary supers of r_sub_klass and compares the outcome with
// 'result' (the answer of the hashed lookup). On a mismatch the emitted code
// calls Klass::on_secondary_supers_verification_failure and is not expected
// to continue afterwards.
//
// Side effects of the emitted code: 'result' is normalized to 0 (found) or
// 1 (not found); temp1..temp3, R0, CR0 and CTR are overwritten.
void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register result,
Register temp1,
Register temp2,
Register temp3) {
assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3);
// Role names required by the register check macro.
const Register
r_array_base = temp1,
r_array_length = temp2,
r_array_index = temp3,
r_bitmap = noreg; // unused
LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
BLOCK_COMMENT("verify_secondary_supers_table {");
Label passed, failure;
// We will consult the secondary-super array.
ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
// Load the array length.
lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
// And adjust the array base to point to the data.
addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
// convert !=0 to 1
// ((x | -x) has its sign bit set exactly when x != 0)
neg(R0, result);
orr(result, result, R0);
srdi(result, result, 63);
const Register linear_result = r_array_index; // reuse
// Start out with "not found"; an empty array skips the scan and keeps it.
li(linear_result, 1);
cmpdi(CCR0, r_array_length, 0);
ble(CCR0, failure);
repne_scan(r_array_base, r_super_klass, r_array_length, linear_result);
// Join point of the scan and the empty-array case ("failure" here means
// that the linear search did not find the klass, not a verification error).
bind(failure);
// convert !=0 to 1
neg(R0, linear_result);
orr(linear_result, linear_result, R0);
srdi(linear_result, linear_result, 63);
cmpd(CCR0, result, linear_result);
beq(CCR0, passed);
// Mismatch: set up the arguments for the failure report. The asserts make
// sure that no value is overwritten before it has been moved or used.
assert_different_registers(R3_ARG1, r_sub_klass, linear_result, result);
mr_if_needed(R3_ARG1, r_super_klass);
assert_different_registers(R4_ARG2, linear_result, result);
mr_if_needed(R4_ARG2, r_sub_klass);
assert_different_registers(R5_ARG3, result);
// The two outcomes are passed negated (0 stays 0, 1 becomes -1).
neg(R5_ARG3, linear_result);
neg(R6_ARG4, result);
const char* msg = "mismatch";
load_const_optimized(R7_ARG5, (intptr_t)msg, R0);
call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
should_not_reach_here();
bind(passed);
BLOCK_COMMENT("} verify_secondary_supers_table");
}
void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");

@ -604,6 +604,33 @@ class MacroAssembler: public Assembler {
Register temp2_reg,
Label& L_success);
// Emits a linear scan of 'count' (> 0) pointer-sized words at [addr] for
// 'value'. If found: CR0 is 'eq' and scratch == 0. Uses CTR as loop counter
// and advances addr while scanning.
void repne_scan(Register addr, Register value, Register count, Register scratch);
// Secondary supers lookup for a constant super_klass, i.e. its hash slot
// (super_klass_slot) is known when the code is generated.
// The result is in Register result, not the condition codes:
// result == 0 if r_super_klass was found, result != 0 otherwise.
// The registers passed in must match the fixed assignment that the
// implementation asserts.
void lookup_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register temp1,
Register temp2,
Register temp3,
Register temp4,
Register result,
u1 super_klass_slot);
// Debugging aid: emits code that cross-checks 'result' of the hashed lookup
// against a linear scan of the secondary supers and reports a mismatch to
// Klass::on_secondary_supers_verification_failure.
// The emitted code normalizes 'result' to 0 / 1 and overwrites the temps.
void verify_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register result,
Register temp1,
Register temp2,
Register temp3);
// Emits the out-of-line part of the hashed secondary supers lookup (linear
// probing after a hash collision, or a plain linear scan if the bitmap is
// full). Returns result == 0 if r_super_klass was found, != 0 otherwise.
// temp1 is overwritten; r_array_base, r_array_index and r_bitmap are modified.
void lookup_secondary_supers_table_slow_path(Register r_super_klass,
Register r_array_base,
Register r_array_index,
Register r_bitmap,
Register result,
Register temp1);
void clinit_barrier(Register klass,
Register thread,
Label* L_fast_path = nullptr,

@ -641,6 +641,8 @@ reg_class rarg1_bits64_reg(R3_H, R3);
reg_class rarg2_bits64_reg(R4_H, R4);
reg_class rarg3_bits64_reg(R5_H, R5);
reg_class rarg4_bits64_reg(R6_H, R6);
// Single-register classes for R7 and R8, used to pin the rarg5RegP and
// rarg6RegP operands to exactly these registers.
reg_class rarg5_bits64_reg(R7_H, R7);
reg_class rarg6_bits64_reg(R8_H, R8);
// Thread register, 'written' by tlsLoadP, see there.
reg_class thread_bits64_reg(R16_H, R16);
@ -4354,6 +4356,8 @@ operand iRegPsrc() %{
match(rarg2RegP);
match(rarg3RegP);
match(rarg4RegP);
match(rarg5RegP);
match(rarg6RegP);
match(threadRegP);
format %{ %}
interface(REG_INTER);
@ -4409,6 +4413,20 @@ operand rarg4RegP() %{
interface(REG_INTER);
%}
// Pointer operand that may only be allocated in register class
// rarg5_bits64_reg, for instructions that need a value in that fixed register.
operand rarg5RegP() %{
constraint(ALLOC_IN_RC(rarg5_bits64_reg));
match(iRegPdst);
format %{ %}
interface(REG_INTER);
%}
// Pointer operand that may only be allocated in register class
// rarg6_bits64_reg, for instructions that need a value in that fixed register.
operand rarg6RegP() %{
constraint(ALLOC_IN_RC(rarg6_bits64_reg));
match(iRegPdst);
format %{ %}
interface(REG_INTER);
%}
operand iRegNsrc() %{
constraint(ALLOC_IN_RC(bits32_reg_ro));
match(RegN);
@ -12024,6 +12042,35 @@ instruct partialSubtypeCheck(iRegPdst result, iRegP_N2P subklass, iRegP_N2P supe
ins_pipe(pipe_class_default);
%}
// Partial subtype check against a compile-time constant super klass, using
// the hashed secondary supers table (only selected if UseSecondarySupersTable).
// All operands are pinned to fixed registers because the emitted code hands
// its values to stubs in registers; the assignment has to stay in sync with
// the register check in MacroAssembler::lookup_secondary_supers_table.
// $result is zero if $sub is a subtype of the super klass, non-zero otherwise.
instruct partialSubtypeCheckConstSuper(rarg3RegP sub, rarg2RegP super_reg, immP super_con, rarg6RegP result,
rarg1RegP tempR1, rarg5RegP tempR2, rarg4RegP tempR3, rscratch1RegP tempR4,
flagsRegCR0 cr0, regCTR ctr)
%{
match(Set result (PartialSubtypeCheck sub (Binary super_reg super_con)));
predicate(UseSecondarySupersTable);
effect(KILL cr0, KILL ctr, TEMP tempR1, TEMP tempR2, TEMP tempR3, TEMP tempR4);
ins_cost(DEFAULT_COST*8); // smaller than the other version
format %{ "partialSubtypeCheck $result, $sub, $super_reg" %}
ins_encode %{
// The hash slot is a property of the constant super klass, so it is
// computed once here, while the code is being generated.
u1 super_klass_slot = ((Klass*)$super_con$$constant)->hash_slot();
if (InlineSecondarySupersTest) {
// Emit the lookup inline.
__ lookup_secondary_supers_table($sub$$Register, $super_reg$$Register,
$tempR1$$Register, $tempR2$$Register, $tempR3$$Register, $tempR4$$Register,
$result$$Register, super_klass_slot);
} else {
// Call the pre-generated lookup stub for this hash slot instead.
// tempR1 carries the stub address into CTR.
address stub = StubRoutines::lookup_secondary_supers_table_stub(super_klass_slot);
Register r_stub_addr = $tempR1$$Register;
__ add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0);
__ mtctr(r_stub_addr);
__ bctrl();
}
%}
ins_pipe(pipe_class_memory);
%}
// inlined locking and unlocking
instruct cmpFastLock(flagsRegCR0 crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2) %{

@ -4531,6 +4531,46 @@ class StubGenerator: public StubCodeGenerator {
#endif // VM_LITTLE_ENDIAN
// Generates the out-of-line lookup stub for one hash slot of the secondary
// supers table (super_klass_index). The stub body is the same code that
// MacroAssembler::lookup_secondary_supers_table would emit inline, followed
// by a return. Arguments and result live in the fixed registers named below.
// NOTE(review): lookup_secondary_supers_table can emit a bctrl to the
// slow-path stub, which would overwrite LR before the blr below - confirm
// that the return address is still intact on that path.
address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
  StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
  address entry = __ pc();

  // Fixed register assignment, listed in the order of the call below.
  const Register r_sub_klass    = R5_ARG3;
  const Register r_super_klass  = R4_ARG2;
  const Register r_array_base   = R3_ARG1;
  const Register r_array_length = R7_ARG5;
  const Register r_array_index  = R6_ARG4;
  const Register r_bitmap       = R11_scratch1;
  const Register result         = R8_ARG6;

  __ lookup_secondary_supers_table(r_sub_klass,
                                   r_super_klass,
                                   r_array_base,
                                   r_array_length,
                                   r_array_index,
                                   r_bitmap,
                                   result,
                                   super_klass_index);
  __ blr();

  return entry;
}
// Slow path implementation for UseSecondarySupersTable.
// Generates the shared stub that wraps
// MacroAssembler::lookup_secondary_supers_table_slow_path and returns to its
// caller. Arguments, the temp and the result use the fixed registers below.
address generate_lookup_secondary_supers_table_slow_path_stub() {
  StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
  address entry = __ pc();

  // Fixed register assignment, listed in the order of the call below.
  const Register r_super_klass = R4_ARG2;
  const Register r_array_base  = R3_ARG1;
  const Register r_array_index = R6_ARG4;
  const Register r_bitmap      = R11_scratch1;
  const Register result        = R8_ARG6;
  const Register temp1         = R7_ARG5;

  __ lookup_secondary_supers_table_slow_path(r_super_klass,
                                             r_array_base,
                                             r_array_index,
                                             r_bitmap,
                                             result,
                                             temp1);
  __ blr();

  return entry;
}
address generate_cont_thaw(const char* label, Continuation::thaw_kind kind) {
if (!Continuations::enabled()) return nullptr;
@ -4807,6 +4847,16 @@ class StubGenerator: public StubCodeGenerator {
// arraycopy stubs used by compilers
generate_arraycopy_stubs();
// Secondary supers table support. The shared slow-path stub is generated
// whenever the table is in use. One lookup stub per hash slot is generated
// in addition only if the lookup is not inlined into compiled code.
if (UseSecondarySupersTable) {
StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
if (!InlineSecondarySupersTest) {
for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
StubRoutines::_lookup_secondary_supers_table_stubs[slot]
= generate_lookup_secondary_supers_table_stub(slot);
}
}
}
StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
}

@ -340,6 +340,13 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseSHA, false);
}
// The secondary supers table is only supported on Power7 or later, so it is
// switched off on older processors. The warning is only printed if the flag
// was set explicitly rather than enabled by default.
// NOTE(review): presumably the limit comes from instructions used by the
// lookup code (e.g. popcntd) - confirm.
if (UseSecondarySupersTable && PowerArchitecturePPC64 < 7) {
if (!FLAG_IS_DEFAULT(UseSecondarySupersTable)) {
warning("UseSecondarySupersTable requires Power7 or later.");
}
FLAG_SET_DEFAULT(UseSecondarySupersTable, false);
}
#ifdef COMPILER2
if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
UseSquareToLenIntrinsic = true;

@ -95,6 +95,7 @@ public:
static bool supports_fast_class_init_checks() { return true; }
constexpr static bool supports_stack_watermark_barrier() { return true; }
constexpr static bool supports_recursive_lightweight_locking() { return true; }
// Compile-time answer: this platform provides the secondary supers table lookup.
constexpr static bool supports_secondary_supers_table() { return true; }
static bool is_determine_features_test_running() { return _is_determine_features_test_running; }
// CPU instruction support