8332587: RISC-V: secondary_super_cache does not scale well

Reviewed-by: mli, fyang
This commit is contained in:
Gui Cao 2024-06-20 13:45:31 +00:00 committed by Hamlin Li
parent 5cad0b4df7
commit 001d686019
5 changed files with 400 additions and 2 deletions

View File

@ -3611,6 +3611,278 @@ void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
bind(L_fallthrough);
}
// Emits code that leaves the number of set bits of src in dst.
// When UsePopCountInstruction is set this is a single CPOP (Zbb extension);
// otherwise a clear-lowest-set-bit loop is emitted, which clobbers tmp1
// and tmp2.
void MacroAssembler::population_count(Register dst, Register src,
                                      Register tmp1, Register tmp2) {
  if (UsePopCountInstruction) {
    cpop(dst, src);
    return;
  }

  // The scratch registers must not alias either operand; dst == src is
  // allowed because src is copied to tmp1 before dst is written.
  assert_different_registers(src, tmp1, tmp2);
  assert_different_registers(dst, tmp1, tmp2);

  Label L_next_bit, L_finished;

  // Software fallback, equivalent to:
  //   tmp1 = src;
  //   dst = 0;
  //   while (tmp1 != 0) {
  //     dst++;
  //     tmp1 &= (tmp1 - 1);   // drop the lowest set bit
  //   }
  mv(tmp1, src);
  mv(dst, zr);
  beqz(tmp1, L_finished);

  bind(L_next_bit);
  addi(dst, dst, 1);
  addi(tmp2, tmp1, -1);
  andr(tmp1, tmp1, tmp2);
  bnez(tmp1, L_next_bit);

  bind(L_finished);
}
// Ensure that the inline code and the stub are using the same registers
// as we need to call the stub from inline code when there is a collision
// in the hashed lookup in the secondary supers array.
//
// r_super_klass, r_array_base and r_array_length must always be x10, x11
// and x12. The remaining four may be passed as noreg by a caller that does
// not use them. Every line of the macro body must end in a line
// continuation, otherwise the definition is cut short.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
                                                r_array_index, r_sub_klass, result, r_bitmap) \
do {                                                                                          \
  assert(r_super_klass  == x10                             &&                                 \
         r_array_base   == x11                             &&                                 \
         r_array_length == x12                             &&                                 \
         (r_array_index == x13  || r_array_index == noreg) &&                                 \
         (r_sub_klass   == x14  || r_sub_klass   == noreg) &&                                 \
         (result        == x15  || result        == noreg) &&                                 \
         (r_bitmap      == x16  || r_bitmap      == noreg), "registers must match riscv.ad"); \
} while(0)
// Emits the inline (fast-path) part of the hashed secondary-supers lookup
// for a super klass whose bitmap slot is known at code-generation time.
//
// r_sub_klass      - klass whose bitmap and secondary supers array are read
// r_super_klass    - klass being searched for
// result           - receives zero on a match, non-zero otherwise
// tmp1..tmp4       - clobbered; named array base, array length, array index
//                    and bitmap below (must be x11, x12, x13 and x16, see
//                    LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS)
// super_klass_slot - bit index of r_super_klass in the bitmap
// stub_is_near     - true: reach the slow-path stub with jump_link via t0;
//                    false: reach it with a trampoline call
//
// t0 is used as an additional scratch register.
// Return true: we succeeded in generating this code
bool MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register result,
Register tmp1,
Register tmp2,
Register tmp3,
Register tmp4,
u1 super_klass_slot,
bool stub_is_near) {
assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0);
Label L_fallthrough;
BLOCK_COMMENT("lookup_secondary_supers_table {");
const Register
r_array_base = tmp1, // x11
r_array_length = tmp2, // x12
r_array_index = tmp3, // x13
r_bitmap = tmp4; // x16
LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
r_array_index, r_sub_klass, result, r_bitmap);
u1 bit = super_klass_slot;
// Initialize result value to 1 which means mismatch.
mv(result, 1);
ld(r_bitmap, Address(r_sub_klass, Klass::bitmap_offset()));
// First check the bitmap to see if super_klass might be present. If
// the bit is zero, we are certain that super_klass is not one of
// the secondary supers.
test_bit(t0, r_bitmap, bit);
beqz(t0, L_fallthrough);
// Get the first array index that can contain super_klass into r_array_index.
// For bit != 0 the shift discards the bitmap bits above `bit`, so the
// population count covers bits 0..bit only. For bit == 0 that count is
// known to be 1, because bit 0 was just tested to be set.
// NOTE(review): relies on SECONDARY_SUPERS_TABLE_MASK being the index of
// the top bit of the bitmap register - confirm against Klass.
// tmp1/tmp2 are free to serve as population_count scratch here: the array
// base is only loaded afterwards and the length is not loaded in this
// function at all.
if (bit != 0) {
slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
population_count(r_array_index, r_array_index, tmp1, tmp2);
} else {
mv(r_array_index, (u1)1);
}
// We will consult the secondary-super array.
ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
// The value i in r_array_index is >= 1, so even though r_array_base
// points to the length, we don't need to adjust it to point to the data.
assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
// Load the candidate entry and compare it; the xor leaves zero in result
// exactly when the entry equals r_super_klass.
shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
ld(result, Address(result));
xorr(result, result, r_super_klass);
beqz(result, L_fallthrough); // Found a match
// Is there another entry to check? Consult the bitmap.
// If the following bit (with wraparound) is clear we are done, and result
// still holds the non-zero xor value, i.e. mismatch.
test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
beqz(t0, L_fallthrough);
// Linear probe.
// Rotate the bitmap so that the bit for this slot ends up at position 0,
// which is the layout the slow-path stub works with.
if (bit != 0) {
ror_imm(r_bitmap, r_bitmap, bit);
}
// The slot we just inspected is at secondary_supers[r_array_index - 1].
// The next slot to be inspected, by the stub we're about to call,
// is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
// have been checked.
Address stub = RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
if (stub_is_near) {
jump_link(stub, t0);
} else {
address call = trampoline_call(stub);
if (call == nullptr) {
return false; // trampoline allocation failed
}
}
BLOCK_COMMENT("} lookup_secondary_supers_table");
bind(L_fallthrough);
// Optional self-check: compare the hashed result with a linear scan.
if (VerifySecondarySupers) {
verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
result, tmp1, tmp2, tmp3); // x15, x11, x12, x13
}
return true;
}
// Emits the out-of-line continuation of the hashed secondary-supers lookup.
// The generated code is reached through the
// lookup_secondary_supers_table_slow_path stub, which the code emitted by
// lookup_secondary_supers_table above calls when there is a collision in
// the hashed lookup in the secondary supers array.
//
// r_super_klass - klass being searched for
// r_array_base  - secondary supers array, pointing at its length word on
//                 entry; advanced to the first element here
// r_array_index - index of the next slot to inspect once r_array_base has
//                 been advanced; may equal the array length on entry
// r_bitmap      - bitmap rotated by the caller so that bits 0 and 1 are the
//                 ones it has already checked
// result        - set to 0 on a match and to 1 on a mismatch
// tmp1          - clobbered; holds the array length
//
// t0 is used as an additional scratch register.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                                             Register r_array_base,
                                                             Register r_array_index,
                                                             Register r_bitmap,
                                                             Register result,
                                                             Register tmp1) {
  assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp1, result, t0);

  const Register
    r_array_length = tmp1,
    r_sub_klass    = noreg; // unused

  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
                                          r_array_index, r_sub_klass, result, r_bitmap);

  Label L_matched, L_fallthrough, L_bitmap_full;

  // Initialize result value to 1 which means mismatch.
  mv(result, 1);

  // Load the array length.
  lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  // NB! Effectively increments current slot index by 1.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
  addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());

  // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
  // (an all-ones bitmap plus one is zero).
  assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
  addi(t0, r_bitmap, (u1)1);
  beqz(t0, L_bitmap_full);

  // NB! Our caller has checked bits 0 and 1 in the bitmap. The
  // current slot (at secondary_supers[r_array_index]) has not yet
  // been inspected, and r_array_index may be out of bounds if we
  // wrapped around the end of the array.

  { // This is conventional linear probing, but instead of terminating
    // when a null entry is found in the table, we maintain a bitmap
    // in which a 0 indicates missing entries.
    // The check above guarantees there are 0s in the bitmap, so the loop
    // eventually terminates.
    Label L_loop;
    bind(L_loop);

    // Check for wraparound. Valid indices are 0 .. length - 1, so the index
    // must be reset as soon as it reaches the length; keep it only while
    // it is strictly below the length.
    Label skip;
    blt(r_array_index, r_array_length, skip);
    mv(r_array_index, zr);
    bind(skip);

    shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
    ld(t0, Address(t0));
    beq(t0, r_super_klass, L_matched);

    test_bit(t0, r_bitmap, 2);  // look-ahead check (Bit 2); result is non-zero
    beqz(t0, L_fallthrough);

    ror_imm(r_bitmap, r_bitmap, 1);
    addi(r_array_index, r_array_index, 1);
    j(L_loop);
  }

  { // Degenerate case: more than 64 secondary supers.
    // FIXME: We could do something smarter here, maybe a vectorized
    // comparison or a binary search, but is that worth any added
    // complexity?
    bind(L_bitmap_full);
    repne_scan(r_array_base, r_super_klass, r_array_length, t0);
    bne(r_super_klass, t0, L_fallthrough);
  }

  bind(L_matched);
  mv(result, zr);

  bind(L_fallthrough);
}
// Make sure that the hashed lookup and a linear scan agree.
//
// r_sub_klass   - klass whose secondary supers array is scanned
// r_super_klass - klass being searched for
// result        - outcome of the hashed lookup (zero means match); it is
//                 normalized in place to 0/1
// tmp1, tmp2    - clobbered; hold the array base and the array length
// tmp3          - clobbered; holds the linear-scan outcome (0 match, 1 miss)
//
// t0 is used as an additional scratch register. On disagreement the
// generated code calls Klass::on_secondary_supers_verification_failure and
// is not expected to continue past that call.
void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register result,
Register tmp1,
Register tmp2,
Register tmp3) {
assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0);
const Register
r_array_base = tmp1, // X11
r_array_length = tmp2, // X12
r_array_index = noreg, // unused
r_bitmap = noreg; // unused
LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
r_array_index, r_sub_klass, result, r_bitmap);
BLOCK_COMMENT("verify_secondary_supers_table {");
// We will consult the secondary-super array.
ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
// Load the array length.
lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
// And adjust the array base to point to the data.
addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
// Linear scan; afterwards t0 is compared with r_super_klass to decide
// whether the scan found it.
repne_scan(r_array_base, r_super_klass, r_array_length, t0);
Label failed;
// tmp3 = 1 (miss) unless the scan found r_super_klass, then tmp3 = 0.
mv(tmp3, 1);
bne(r_super_klass, t0, failed);
mv(tmp3, zr);
bind(failed);
snez(result, result); // normalize result to 0/1 for comparison
Label passed;
beq(tmp3, result, passed);
{
// Set up x10..x14 for the failure report. The order of these moves
// matters when sources overlap the destinations: the caller in
// lookup_secondary_supers_table passes tmp3 as x13 (per its comments), so
// tmp3 has to be copied to x12 before x13 is overwritten with result.
mv(x10, r_super_klass);
mv(x11, r_sub_klass);
mv(x12, tmp3);
mv(x13, result);
mv(x14, (address)("mismatch"));
rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
should_not_reach_here();
}
bind(passed);
BLOCK_COMMENT("} verify_secondary_supers_table");
}
// Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
void MacroAssembler::tlab_allocate(Register obj,
Register var_size_in_bytes,

View File

@ -322,6 +322,34 @@ class MacroAssembler: public Assembler {
Label* L_success,
Label* L_failure);
// Emits code that stores the number of set bits of src in dst. Uses cpop
// when UsePopCountInstruction is set; otherwise emits a loop that clobbers
// tmp1 and tmp2 (which must differ from both dst and src).
void population_count(Register dst, Register src, Register tmp1, Register tmp2);
// As above, but with a constant super_klass.
// The result is in Register result, not the condition codes.
// result is zero on a match and non-zero otherwise. super_klass_slot is the
// bitmap bit index of the super klass. tmp1..tmp4 and t0 are clobbered.
// Returns false if the call to the slow-path stub could not be emitted
// (trampoline allocation failed); with stub_is_near == true no trampoline
// is used.
bool lookup_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register result,
Register tmp1,
Register tmp2,
Register tmp3,
Register tmp4,
u1 super_klass_slot,
bool stub_is_near = false);
// Emits a check that the hashed lookup outcome in result agrees with a
// linear scan of r_sub_klass's secondary supers for r_super_klass.
// result is normalized to 0/1; tmp1, tmp2, tmp3 and t0 are clobbered.
void verify_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register result,
Register tmp1,
Register tmp2,
Register tmp3);
// Emits the out-of-line continuation of lookup_secondary_supers_table, used
// after a collision in the hashed lookup. Sets result to 0 on a match and
// to 1 otherwise; r_array_base, r_array_index, r_bitmap, tmp1 and t0 are
// modified.
void lookup_secondary_supers_table_slow_path(Register r_super_klass,
Register r_array_base,
Register r_array_index,
Register r_bitmap,
Register result,
Register tmp1);
void check_klass_subtype(Register sub_klass,
Register super_klass,
Register tmp_reg,

View File

@ -3313,6 +3313,16 @@ operand iRegP_R15()
interface(REG_INTER);
%}
// Pointer 64 bit Register R16 only
operand iRegP_R16()
%{
constraint(ALLOC_IN_RC(r16_reg));
match(RegP);
match(iRegPNoSp);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}
// Pointer 64 bit Register R28 only
operand iRegP_R28()
%{
@ -10075,7 +10085,7 @@ instruct partialSubtypeCheck(iRegP_R15 result, iRegP_R14 sub, iRegP_R10 super, i
match(Set result (PartialSubtypeCheck sub super));
effect(KILL tmp, KILL cr);
ins_cost(2 * STORE_COST + 3 * LOAD_COST + 4 * ALU_COST + BRANCH_COST * 4);
ins_cost(11 * DEFAULT_COST);
format %{ "partialSubtypeCheck $result, $sub, $super\t#@partialSubtypeCheck" %}
ins_encode(riscv_enc_partial_subtype_check(sub, super, tmp, result));
@ -10085,13 +10095,43 @@ instruct partialSubtypeCheck(iRegP_R15 result, iRegP_R14 sub, iRegP_R10 super, i
ins_pipe(pipe_class_memory);
%}
// Partial subtype check against a constant super klass, selected when
// UseSecondarySupersTable is set. The operands are pinned to x10..x16
// because MacroAssembler::lookup_secondary_supers_table asserts exactly this
// register assignment (super x10, temps x11/x12/x13/x16, sub x14,
// result x15).
instruct partialSubtypeCheckConstSuper(iRegP_R14 sub, iRegP_R10 super_reg, immP super_con, iRegP_R15 result,
iRegP_R11 tmpR11, iRegP_R12 tmpR12, iRegP_R13 tmpR13, iRegP_R16 tmpR16)
%{
predicate(UseSecondarySupersTable);
match(Set result (PartialSubtypeCheck sub (Binary super_reg super_con)));
effect(TEMP tmpR11, TEMP tmpR12, TEMP tmpR13, TEMP tmpR16);
ins_cost(7 * DEFAULT_COST); // needs to be less than competing nodes
format %{ "partialSubtypeCheck $result, $sub, $super_reg, $super_con" %}
ins_encode %{
bool success = false;
// The slot of the constant super klass picks the bitmap bit to test
// (inline case) or the per-slot stub to call (out-of-line case).
u1 super_klass_slot = ((Klass*)$super_con$$constant)->hash_slot();
if (InlineSecondarySupersTest) {
success = __ lookup_secondary_supers_table($sub$$Register, $super_reg$$Register, $result$$Register,
$tmpR11$$Register, $tmpR12$$Register, $tmpR13$$Register,
$tmpR16$$Register, super_klass_slot);
} else {
address call = __ trampoline_call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_stub(super_klass_slot)));
success = (call != nullptr);
}
// Either path can fail to emit its call; report that as a compilation
// failure.
if (!success) {
ciEnv::current()->record_failure("CodeCache is full");
return;
}
%}
ins_pipe(pipe_class_memory);
%}
instruct partialSubtypeCheckVsZero(iRegP_R15 result, iRegP_R14 sub, iRegP_R10 super, iRegP_R12 tmp,
immP0 zero, rFlagsReg cr)
%{
match(Set cr (CmpP (PartialSubtypeCheck sub super) zero));
effect(KILL tmp, KILL result);
ins_cost(2 * STORE_COST + 3 * LOAD_COST + 4 * ALU_COST + BRANCH_COST * 4);
ins_cost(11 * DEFAULT_COST);
format %{ "partialSubtypeCheck $result, $sub, $super == 0\t#@partialSubtypeCheckVsZero" %}
ins_encode(riscv_enc_partial_subtype_check(sub, super, tmp, result));

View File

@ -2808,6 +2808,50 @@ class StubGenerator: public StubCodeGenerator {
}
#ifdef COMPILER2
// Generates the out-of-line version of the hashed secondary-supers lookup
// for one bitmap slot. The registers are the fixed ones that
// MacroAssembler::lookup_secondary_supers_table asserts on.
//
// super_klass_index - bitmap slot this stub is specialized for
// Returns the entry address of the generated stub.
address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
  StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");

  address start = __ pc();
  const Register
    r_super_klass  = x10,
    r_array_base   = x11,
    r_array_length = x12,
    r_array_index  = x13,
    r_sub_klass    = x14,
    result         = x15,
    r_bitmap       = x16;

  // A frame is set up because the lookup code may itself call the
  // slow-path stub.
  __ enter();
  // The bool result is not checked: with stub_is_near == true no trampoline
  // is allocated, so the only failure path of the lookup is not taken.
  __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                   r_array_base, r_array_length, r_array_index,
                                   r_bitmap, super_klass_index, /*stub_is_near*/true);
  __ leave();
  __ ret();

  return start;
}
// Slow path implementation for UseSecondarySupersTable.
// Wraps MacroAssembler::lookup_secondary_supers_table_slow_path in a stub
// that ends with a plain return; the register assignment below is the fixed
// one that the macro assembler routine asserts on.
// Returns the entry address of the generated stub.
address generate_lookup_secondary_supers_table_slow_path_stub() {
  StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");

  const address entry = __ pc();

  const Register super_klass = x10;  // argument
  const Register array_base  = x11;  // argument
  const Register scratch     = x12;  // tmp
  const Register array_index = x13;  // argument
  const Register result      = x15;  // argument
  const Register bitmap      = x16;  // argument

  __ lookup_secondary_supers_table_slow_path(super_klass, array_base, array_index,
                                             bitmap, result, scratch);
  __ ret();

  return entry;
}
address generate_mulAdd()
{
__ align(CodeEntryAlignment);
@ -5566,6 +5610,18 @@ static const int64_t right_3_bits = right_n_bits(3);
StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
}
#ifdef COMPILER2
// Secondary supers table stubs. The slow-path stub is always generated when
// the table is in use; the per-slot lookup stubs are only generated when the
// lookup is not inlined (InlineSecondarySupersTest is off).
if (UseSecondarySupersTable) {
StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
if (!InlineSecondarySupersTest) {
// One stub per bitmap slot, each specialized for its slot index.
for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
StubRoutines::_lookup_secondary_supers_table_stubs[slot]
= generate_lookup_secondary_supers_table_stub(slot);
}
}
}
#endif // COMPILER2
StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
StubRoutines::riscv::set_completed();

View File

@ -277,6 +277,8 @@ class VM_Version : public Abstract_VM_Version {
constexpr static bool supports_recursive_lightweight_locking() { return true; }
// Always true on this port.
// NOTE(review): presumably consulted by shared code when deciding whether
// UseSecondarySupersTable may be enabled - confirm against the caller.
constexpr static bool supports_secondary_supers_table() { return true; }
static bool supports_on_spin_wait() { return UseZihintpause; }
// RISCV64 supports fast class initialization checks