8331126: [s390x] secondary_super_cache does not scale well

Reviewed-by: lucy, aph, mdoerr
This commit is contained in:
Amit Kumar 2024-07-30 09:32:27 +00:00
parent 156f0b4332
commit 7ac531181c
6 changed files with 429 additions and 3 deletions

@ -1,6 +1,7 @@
/*
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2024 SAP SE. All rights reserved.
* Copyright 2024 IBM Corporation. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -3152,6 +3153,301 @@ void MacroAssembler::check_klass_subtype(Register sub_klass,
BLOCK_COMMENT("} check_klass_subtype");
}
// Emits a linear search over r_count pointer-sized words starting at [r_addr],
// looking for r_value. r_count must be > 0 (only checked in ASSERT builds).
//   match:    r_result == 0, CC 'equal' (left behind by the matching compare)
//   no match: r_result == 1
// The emitted loop advances r_addr word by word and counts r_count down.
void MacroAssembler::repne_scan(Register r_addr, Register r_value, Register r_count, Register r_result) {
  NearLabel L_next_word;
  NearLabel L_finished;

  BLOCK_COMMENT("repne_scan {");

#ifdef ASSERT
  z_chi(r_count, 0);
  asm_assert(bcondHigh, "count must be positive", 11);
#endif

  // Be optimistic: preload the "found" result (0) without touching the CC.
  clear_reg(r_result, true /* whole_reg */, false /* set_cc */);

  bind(L_next_word);
  z_cg(r_value, Address(r_addr));
  z_bre(L_finished);               // hit: leave with r_result == 0
  z_la(r_addr, wordSize, r_addr);  // step to the next word
  z_brct(r_count, L_next_word);    // branch on count, leaves the CC alone

  // Fell out of the loop: every word was inspected and none matched.
  z_lghi(r_result, 1);

  bind(L_finished);
  BLOCK_COMMENT("} repne_scan");
}
// Ensure that the inline code and the stub are using the same registers.
// Expands to an assert over register aliases that must already be in scope at
// the expansion site: r_super_klass, r_array_base, r_array_length,
// r_array_index, r_sub_klass, r_bitmap and r_result. The last four may be
// noreg where the expanding function does not use them.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \
do { \
assert(r_super_klass == Z_ARG1 && \
r_array_base == Z_ARG5 && \
r_array_length == Z_ARG4 && \
(r_array_index == Z_ARG3 || r_array_index == noreg) && \
(r_sub_klass == Z_ARG2 || r_sub_klass == noreg) && \
(r_bitmap == Z_R10 || r_bitmap == noreg) && \
(r_result == Z_R11 || r_result == noreg), "registers must match s390.ad"); \
} while(0)
// Emits the hashed secondary-supers lookup for a super klass whose hash slot
// (super_klass_slot) is known at code-generation time.
//   r_sub_klass   - klass whose bitmap and secondary supers array are read
//   r_super_klass - klass that is searched for
//   r_temp1..4    - used as r_array_base, r_array_length, r_array_index, r_bitmap
//   r_result      - 0 if r_super_klass was found, 1 otherwise
// The registers must be the fixed ones checked by
// LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS. Collisions are handed over to
// StubRoutines::lookup_secondary_supers_table_slow_path_stub(). With
// VerifySecondarySupers set, the result is additionally cross-checked by
// verify_secondary_supers_table.
// Note: this method also kills Z_R1_scratch register on machines older than z15
void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register r_temp1,
Register r_temp2,
Register r_temp3,
Register r_temp4,
Register r_result,
u1 super_klass_slot) {
NearLabel L_done, L_failure;
BLOCK_COMMENT("lookup_secondary_supers_table {");
// Name the temps after their roles; the macro below checks these aliases.
const Register
r_array_base = r_temp1,
r_array_length = r_temp2,
r_array_index = r_temp3,
r_bitmap = r_temp4;
LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
z_lg(r_bitmap, Address(r_sub_klass, Klass::bitmap_offset()));
// First check the bitmap to see if super_klass might be present. If
// the bit is zero, we are certain that super_klass is not one of
// the secondary supers.
u1 bit = super_klass_slot;
int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
z_sllg(r_array_index, r_bitmap, shift_count); // take the bit to 63rd location
// Initialize r_result with 0 (indicating success). If searching fails, r_result will be loaded
// with 1 (failure) at the end of this method.
clear_reg(r_result, true /* whole_reg */, false /* set_cc */); // r_result = 0
// We test the MSB of r_array_index, i.e., its sign bit
testbit(r_array_index, 63);
z_bfalse(L_failure); // if not set, then jump!!!
// We will consult the secondary-super array.
z_lg(r_array_base, Address(r_sub_klass, Klass::secondary_supers_offset()));
// The value i in r_array_index is >= 1, so even though r_array_base
// points to the length, we don't need to adjust it to point to the
// data.
assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
// Get the first array index that can contain super_klass.
if (bit != 0) {
// The shifted bitmap holds bit 'bit' and all bits below it; their population
// count is the 1-based position of the candidate entry.
pop_count_long(r_array_index, r_array_index, Z_R1_scratch); // kills Z_R1_scratch on machines older than z15
// NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
z_sllg(r_array_index, r_array_index, LogBytesPerWord); // scale
} else {
// Actually use index 0, but r_array_base and r_array_index are off by 1 word
// such that the sum is precise.
z_lghi(r_array_index, BytesPerWord); // for slow path (scaled)
}
z_cg(r_super_klass, Address(r_array_base, r_array_index));
branch_optimized(bcondEqual, L_done); // found a match; success
// Is there another entry to check? Consult the bitmap.
testbit(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
z_bfalse(L_failure);
// Linear probe. Rotate the bitmap so that the next bit to test is
// in Bit 2 for the look-ahead check in the slow path.
// (For bit == 0 the bitmap is already in that position.)
if (bit != 0) {
z_rllg(r_bitmap, r_bitmap, 64-bit); // rotate right
}
// Calls into the stub generated by lookup_secondary_supers_table_slow_path.
// Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
// Kills: r_array_length.
// Returns: r_result
call_stub(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
z_bru(L_done); // pass whatever result we got from a slow path
bind(L_failure);
// TODO: use load immediate on condition and z_bru above will not be required
z_lghi(r_result, 1);
bind(L_done);
BLOCK_COMMENT("} lookup_secondary_supers_table");
// Optional self-check: compare the hashed result with a plain linear scan.
if (VerifySecondarySupers) {
verify_secondary_supers_table(r_sub_klass, r_super_klass, r_result,
r_temp1, r_temp2, r_temp3);
}
}
// Slow path of the hashed secondary-supers lookup. The code emitted here forms
// the body of the stub that lookup_secondary_supers_table calls when there is
// a collision in the hashed lookup in the secondary supers array, i.e. the
// first probed slot did not match but the bitmap shows the next slot is in use.
//   r_super_klass - klass that is searched for
//   r_array_base  - secondary supers array; still points at its length word on entry
//   r_array_index - scaled index of the slot to inspect next (see NB! below)
//   r_bitmap      - bitmap as rotated by the caller
//   r_result      - must be 0 on entry (asserted in ASSERT builds);
//                   0 on success, 1 on failure on exit
//   r_temp1       - used as r_array_length
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
Register r_array_base,
Register r_array_index,
Register r_bitmap,
Register r_result,
Register r_temp1) {
assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, r_result, r_temp1);
// r_sub_klass is not needed here; noreg satisfies the register check below.
const Register
r_array_length = r_temp1,
r_sub_klass = noreg;
LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
BLOCK_COMMENT("lookup_secondary_supers_table_slow_path {");
NearLabel L_done, L_failure;
// Load the array length.
z_llgf(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
// And adjust the array base to point to the data.
// NB!
// Effectively increments the current slot index by 1.
assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
add2reg(r_array_base, Array<Klass*>::base_offset_in_bytes());
// Linear probe
NearLabel L_huge;
// The bitmap is full to bursting.
z_cghi(r_bitmap, Klass::SECONDARY_SUPERS_BITMAP_FULL);
z_bre(L_huge);
// NB! Our caller has checked bits 0 and 1 in the bitmap. The
// current slot (at secondary_supers[r_array_index]) has not yet
// been inspected, and r_array_index may be out of bounds if we
// wrapped around the end of the array.
{ // This is conventional linear probing, but instead of terminating
// when a null entry is found in the table, we maintain a bitmap
// in which a 0 indicates missing entries.
// The check above guarantees there are 0s in the bitmap, so the loop
// eventually terminates.
#ifdef ASSERT
// r_result is set to 0 by lookup_secondary_supers_table.
// clear_reg(r_result, true /* whole_reg */, false /* set_cc */);
z_cghi(r_result, 0);
asm_assert(bcondEqual, "r_result required to be 0, used by z_locgr", 44);
// We should only reach here after having found a bit in the bitmap.
z_ltgr(r_array_length, r_array_length);
asm_assert(bcondHigh, "array_length > 0, should hold", 22);
#endif // ASSERT
// Compute limit in r_array_length
// (scaled offset of the last element: (length - 1) * BytesPerWord)
add2reg(r_array_length, -1);
z_sllg(r_array_length, r_array_length, LogBytesPerWord);
NearLabel L_loop;
bind(L_loop);
// Check for wraparound.
// An index beyond the limit is reset to 0 by copying r_result into it.
z_cgr(r_array_index, r_array_length);
z_locgr(r_array_index, r_result, bcondHigh); // r_result is containing 0
z_cg(r_super_klass, Address(r_array_base, r_array_index));
z_bre(L_done); // success
// look-ahead check: if Bit 2 is 0, we're done
testbit(r_bitmap, 2);
z_bfalse(L_failure);
z_rllg(r_bitmap, r_bitmap, 64-1); // rotate right
add2reg(r_array_index, BytesPerWord);
z_bru(L_loop);
}
{ // Degenerate case: more than 64 secondary supers.
// FIXME: We could do something smarter here, maybe a vectorized
// comparison or a binary search, but is that worth any added
// complexity?
// r_array_length still holds the unscaled element count on this path.
bind(L_huge);
repne_scan(r_array_base, r_super_klass, r_array_length, r_result);
z_bru(L_done); // forward the result we got from repne_scan
}
bind(L_failure);
z_lghi(r_result, 1);
bind(L_done);
BLOCK_COMMENT("} lookup_secondary_supers_table_slow_path");
}
// Make sure that the hashed lookup and a linear scan agree.
// Emits a linear scan (repne_scan) over the secondary supers of r_sub_klass and
// compares its outcome with r_result, the outcome of the hashed lookup. On a
// mismatch the emitted code calls Klass::on_secondary_supers_verification_failure
// and is not expected to continue.
// Clobbers r_temp1, r_temp2, r_temp3 and Z_R0_scratch; on the mismatch path
// Z_ARG1..Z_ARG5 are loaded with the arguments of the failure call.
void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register r_result /* expected */,
Register r_temp1,
Register r_temp2,
Register r_temp3) {
assert_different_registers(r_sub_klass, r_super_klass, r_result, r_temp1, r_temp2, r_temp3);
const Register
r_array_base = r_temp1,
r_array_length = r_temp2,
r_array_index = r_temp3,
r_bitmap = noreg; // unused
const Register r_one = Z_R0_scratch;
z_lghi(r_one, 1); // for locgr down there, to a load result for failure
LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
BLOCK_COMMENT("verify_secondary_supers_table {");
Label L_passed, L_failure;
// We will consult the secondary-super array.
z_lg(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
// Load the array length.
z_llgf(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
// And adjust the array base to point to the data.
z_aghi(r_array_base, Array<Klass*>::base_offset_in_bytes());
const Register r_linear_result = r_array_index; // reuse
// An empty array cannot contain r_super_klass: record failure (1) and skip
// the scan, which requires a positive count.
z_chi(r_array_length, 0);
z_locgr(r_linear_result, r_one, bcondNotHigh); // load failure if array_length <= 0
z_brc(bcondNotHigh, L_failure);
repne_scan(r_array_base, r_super_klass, r_array_length, r_linear_result);
// Both paths join here; despite its name, L_failure is also reached by
// falling through after a scan.
bind(L_failure);
z_cr(r_result, r_linear_result);
z_bre(L_passed);
// Mismatch: set up the arguments for the failure report. The asserts make
// sure no value is overwritten before it has been copied.
assert_different_registers(Z_ARG1, r_sub_klass, r_linear_result, r_result);
lgr_if_needed(Z_ARG1, r_super_klass);
assert_different_registers(Z_ARG2, r_linear_result, r_result);
lgr_if_needed(Z_ARG2, r_sub_klass);
assert_different_registers(Z_ARG3, r_result);
z_lgr(Z_ARG3, r_linear_result);
z_lgr(Z_ARG4, r_result);
const char* msg = "mismatch";
load_const_optimized(Z_ARG5, (address)msg);
call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
should_not_reach_here();
bind(L_passed);
BLOCK_COMMENT("} verify_secondary_supers_table");
}
void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");

@ -708,6 +708,31 @@ class MacroAssembler: public Assembler {
Label* L_success,
Label* L_failure);
// Scans r_count (> 0) pointer-sized words at [r_addr] for r_value.
// r_result is set to 0 if the value was found and to 1 otherwise.
void repne_scan(Register r_addr, Register r_value, Register r_count, Register r_result);
// Hashed secondary-supers lookup for a super klass whose hash slot
// (super_klass_slot) is known when the code is generated.
// r_temp1..r_temp4 are clobbered; r_result is set to 0 on success and to 1 on
// failure.
void lookup_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register r_temp1,
Register r_temp2,
Register r_temp3,
Register r_temp4,
Register r_result,
u1 super_klass_slot);
// Continuation of lookup_secondary_supers_table for hash collisions: linear
// probing guided by r_bitmap, or a plain linear scan when the bitmap is full.
// r_temp1 is used for the array length; r_result is 0 on success, 1 on failure.
void lookup_secondary_supers_table_slow_path(Register r_super_klass,
Register r_array_base,
Register r_array_index,
Register r_bitmap,
Register r_result,
Register r_temp1);
// Cross-checks r_result (the outcome of the hashed lookup) against a linear
// scan of the secondary supers of r_sub_klass; reports a mismatch through
// Klass::on_secondary_supers_verification_failure. Clobbers r_temp1..r_temp3.
void verify_secondary_supers_table(Register r_sub_klass,
Register r_super_klass,
Register r_result /* expected */,
Register r_temp1,
Register r_temp2,
Register r_temp3);
// Simplified, combined version, good for typical uses.
// Falls through on failure.
void check_klass_subtype(Register sub_klass,

@ -356,6 +356,8 @@ reg_class z_rarg3_ptr_reg(Z_R4_H,Z_R4);
reg_class z_rarg4_ptr_reg(Z_R5_H,Z_R5);
reg_class z_rarg5_ptr_reg(Z_R6_H,Z_R6);
reg_class z_thread_ptr_reg(Z_R8_H,Z_R8);
// Single-register pointer classes for Z_R10 and Z_R11. They back the operands
// r10TempRegP and r11TempRegP, which pin a value to exactly that register.
reg_class z_r10_ptr_reg(Z_R10_H, Z_R10);
reg_class z_r11_ptr_reg(Z_R11_H, Z_R11);
reg_class z_ptr_reg(
/*Z_R0_H,Z_R0*/ // R0
@ -2985,6 +2987,8 @@ operand iRegP() %{
match(rarg5RegP);
match(revenRegP);
match(roddRegP);
match(r10TempRegP);
match(r11TempRegP);
format %{ %}
interface(REG_INTER);
%}
@ -2997,6 +3001,20 @@ operand threadRegP() %{
interface(REG_INTER);
%}
// Pointer operand restricted to register class z_r10_ptr_reg (Z_R10).
// partialSubtypeCheckConstSuper uses it for temp4, the bitmap register.
operand r10TempRegP() %{
constraint(ALLOC_IN_RC(z_r10_ptr_reg));
match(iRegP);
format %{ %}
interface(REG_INTER);
%}
// Pointer operand restricted to register class z_r11_ptr_reg (Z_R11).
// partialSubtypeCheckConstSuper uses it for the result.
operand r11TempRegP() %{
constraint(ALLOC_IN_RC(z_r11_ptr_reg));
match(iRegP);
format %{ %}
interface(REG_INTER);
%}
operand noArg_iRegP() %{
constraint(ALLOC_IN_RC(z_no_arg_ptr_reg));
match(iRegP);
@ -9560,6 +9578,32 @@ instruct partialSubtypeCheck(rarg1RegP index, rarg2RegP sub, rarg3RegP super, fl
ins_pipe(pipe_class_dummy);
%}
// Partial subtype check against a constant super klass, using the hashed
// secondary supers table. Only selected when UseSecondarySupersTable is set.
// The operand register classes pin sub, super, the temps and the result to the
// fixed registers that MacroAssembler::lookup_secondary_supers_table and the
// out-of-line stubs assert on ("registers must match s390.ad").
instruct partialSubtypeCheckConstSuper(rarg2RegP sub, rarg1RegP super, immP super_con,
r11TempRegP result, rarg5RegP temp1, rarg4RegP temp2,
rarg3RegP temp3, r10TempRegP temp4, flagsReg pcc) %{
match(Set result (PartialSubtypeCheck sub (Binary super super_con)));
predicate(UseSecondarySupersTable);
effect(KILL pcc, TEMP temp1, TEMP temp2, TEMP temp3, TEMP temp4);
ins_cost(7 * DEFAULT_COST); // needs to be less than competing nodes
format %{ "partialSubtypeCheck $result, $sub, $super, $super_con" %}
ins_encode %{
// The hash slot of the constant super klass is known at code-generation time.
u1 super_klass_slot = ((Klass*)$super_con$$constant)->hash_slot();
if (InlineSecondarySupersTest) {
// Emit the lookup inline.
__ lookup_secondary_supers_table($sub$$Register, $super$$Register,
$temp1$$Register, $temp2$$Register, $temp3$$Register,
$temp4$$Register, $result$$Register, super_klass_slot);
} else {
// Call the pre-generated stub for this slot instead. Z_ARG4 is temp2 and
// therefore free to hold the stub address; Z_R14 receives the return address.
AddressLiteral stub_address(StubRoutines::lookup_secondary_supers_table_stub(super_klass_slot));
__ load_const_optimized(Z_ARG4, stub_address);
__ z_basr(Z_R14, Z_ARG4);
}
%}
ins_pipe(pipe_class_dummy);
%}
instruct partialSubtypeCheck_vs_zero(flagsReg pcc, rarg2RegP sub, rarg3RegP super, immP0 zero,
rarg1RegP index, rarg4RegP scratch1, rarg5RegP scratch2) %{
match(Set pcc (CmpI (PartialSubtypeCheck sub super) zero));

@ -1,6 +1,6 @@
/*
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2023 SAP SE. All rights reserved.
* Copyright (c) 2016, 2024 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -705,6 +705,50 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Generates the out-of-line variant of the hashed secondary-supers lookup for
// one hash slot (super_klass_index) and returns its entry address. The stub
// uses the fixed register assignment that
// MacroAssembler::lookup_secondary_supers_table asserts on, and returns to its
// caller through Z_R14.
// NOTE(review): the emitted lookup contains a call_stub() to the slow-path
// stub; confirm that this nested call leaves the return address in Z_R14
// intact (or that it is saved and restored) before the final z_br(Z_R14).
address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
  StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");

  // Inputs.
  const Register r_super_klass  = Z_ARG1;
  const Register r_sub_klass    = Z_ARG2;
  // Temporaries.
  const Register r_array_index  = Z_ARG3;
  const Register r_array_length = Z_ARG4;
  const Register r_array_base   = Z_ARG5;
  const Register r_bitmap       = Z_R10;
  // Output.
  const Register r_result       = Z_R11;

  address stub_entry = __ pc();

  __ lookup_secondary_supers_table(r_sub_klass, r_super_klass,
                                   r_array_base, r_array_length, r_array_index,
                                   r_bitmap, r_result, super_klass_index);
  __ z_br(Z_R14);

  return stub_entry;
}
// Slow path implementation for UseSecondarySupersTable.
// Wraps MacroAssembler::lookup_secondary_supers_table_slow_path in a stub that
// returns through Z_R14, and returns the stub's entry address. The registers
// are the fixed ones shared with the inline code.
address generate_lookup_secondary_supers_table_slow_path_stub() {
  StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
  address stub_entry = __ pc();

  const Register r_super_klass = Z_ARG1;
  const Register r_array_index = Z_ARG3;
  const Register r_temp1       = Z_ARG4;  // holds the array length in the slow path
  const Register r_array_base  = Z_ARG5;
  const Register r_bitmap      = Z_R10;
  const Register r_result      = Z_R11;

  __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base,
                                             r_array_index, r_bitmap, r_result, r_temp1);
  __ z_br(Z_R14);

  return stub_entry;
}
#if !defined(PRODUCT)
// Wrapper which calls oopDesc::is_oop_or_null()
// Only called by MacroAssembler::verify_oop
@ -3247,6 +3291,14 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
}
if (UseSecondarySupersTable) {
StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
if (!InlineSecondarySupersTest) {
for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
StubRoutines::_lookup_secondary_supers_table_stubs[slot] = generate_lookup_secondary_supers_table_stub(slot);
}
}
}
#endif
#endif // COMPILER2_OR_JVMCI
}

@ -1,6 +1,6 @@
/*
* Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2023 SAP SE. All rights reserved.
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2024 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -272,6 +272,13 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseSHA, false);
}
if (UseSecondarySupersTable && VM_Version::get_model_index() < 5 /* z196/z11 */) {
if (!FLAG_IS_DEFAULT(UseSecondarySupersTable)) {
warning("UseSecondarySupersTable requires z196 or later.");
}
FLAG_SET_DEFAULT(UseSecondarySupersTable, false);
}
#ifdef COMPILER2
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, true);

@ -413,6 +413,8 @@ class VM_Version: public Abstract_VM_Version {
// s390 supports fast class initialization checks
static bool supports_fast_class_init_checks() { return true; }
// s390 provides an implementation of the hashed secondary supers table.
// This is a compile-time answer only; VM_Version::initialize() still switches
// UseSecondarySupersTable off at runtime on models older than z196.
constexpr static bool supports_secondary_supers_table() { return true; }
constexpr static bool supports_recursive_lightweight_locking() { return true; }
// CPU feature query functions