8170991: PPC64: Bad code for initialization of short arrays
Implement special ClearArray nodes to improve initialization. Reviewed-by: goetz
This commit is contained in:
parent
0a908be59d
commit
8242125c4f
@ -238,72 +238,15 @@ void C1_MacroAssembler::initialize_body(Register obj, Register tmp1, Register tm
|
||||
int obj_size_in_bytes, int hdr_size_in_bytes) {
|
||||
const int index = (obj_size_in_bytes - hdr_size_in_bytes) / HeapWordSize;
|
||||
|
||||
const int cl_size = VM_Version::L1_data_cache_line_size(),
|
||||
cl_dwords = cl_size>>3,
|
||||
cl_dw_addr_bits = exact_log2(cl_dwords);
|
||||
|
||||
const Register tmp = R0,
|
||||
base_ptr = tmp1,
|
||||
cnt_dwords = tmp2;
|
||||
|
||||
if (index <= 6) {
|
||||
// Use explicit NULL stores.
|
||||
if (index > 0) { li(tmp, 0); }
|
||||
for (int i = 0; i < index; ++i) { std(tmp, hdr_size_in_bytes + i * HeapWordSize, obj); }
|
||||
|
||||
} else if (index < (2<<cl_dw_addr_bits)-1) {
|
||||
// simple loop
|
||||
Label loop;
|
||||
|
||||
li(cnt_dwords, index);
|
||||
addi(base_ptr, obj, hdr_size_in_bytes); // Compute address of first element.
|
||||
li(tmp, 0);
|
||||
mtctr(cnt_dwords); // Load counter.
|
||||
bind(loop);
|
||||
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
|
||||
addi(base_ptr, base_ptr, 8);
|
||||
bdnz(loop);
|
||||
|
||||
// 2x unrolled loop is shorter with more than 9 HeapWords.
|
||||
if (index <= 9) {
|
||||
clear_memory_unrolled(obj, index, R0, hdr_size_in_bytes);
|
||||
} else {
|
||||
// like clear_memory_doubleword
|
||||
Label startloop, fast, fastloop, restloop, done;
|
||||
const Register base_ptr = tmp1,
|
||||
cnt_dwords = tmp2;
|
||||
|
||||
addi(base_ptr, obj, hdr_size_in_bytes); // Compute address of first element.
|
||||
load_const_optimized(cnt_dwords, index);
|
||||
rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
|
||||
beq(CCR0, fast); // Already 128byte aligned.
|
||||
|
||||
subfic(tmp, tmp, cl_dwords);
|
||||
mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
|
||||
subf(cnt_dwords, tmp, cnt_dwords); // rest.
|
||||
li(tmp, 0);
|
||||
|
||||
bind(startloop); // Clear at the beginning to reach 128byte boundary.
|
||||
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
|
||||
addi(base_ptr, base_ptr, 8);
|
||||
bdnz(startloop);
|
||||
|
||||
bind(fast); // Clear 128byte blocks.
|
||||
srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
|
||||
andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
|
||||
mtctr(tmp); // Load counter.
|
||||
|
||||
bind(fastloop);
|
||||
dcbz(base_ptr); // Clear 128byte aligned block.
|
||||
addi(base_ptr, base_ptr, cl_size);
|
||||
bdnz(fastloop);
|
||||
|
||||
cmpdi(CCR0, cnt_dwords, 0); // size 0?
|
||||
beq(CCR0, done); // rest == 0
|
||||
li(tmp, 0);
|
||||
mtctr(cnt_dwords); // Load counter.
|
||||
|
||||
bind(restloop); // Clear rest.
|
||||
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
|
||||
addi(base_ptr, base_ptr, 8);
|
||||
bdnz(restloop);
|
||||
|
||||
bind(done);
|
||||
addi(base_ptr, obj, hdr_size_in_bytes); // Compute address of first element.
|
||||
clear_memory_doubleword(base_ptr, cnt_dwords, R0, index);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -77,7 +77,8 @@ define_pd_global(uintx, TypeProfileLevel, 111);
|
||||
|
||||
define_pd_global(bool, CompactStrings, true);
|
||||
|
||||
define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
|
||||
// 2x unrolled loop is shorter with more than 9 HeapWords.
|
||||
define_pd_global(intx, InitArrayShortSize, 9*BytesPerLong);
|
||||
|
||||
// Platform dependent flag handling: flags only defined on this platform.
|
||||
#define ARCH_FLAGS(develop, \
|
||||
|
@ -3332,53 +3332,90 @@ void MacroAssembler::load_mirror_from_const_method(Register mirror, Register con
|
||||
}
|
||||
|
||||
// Clear Array
|
||||
// For very short arrays. tmp == R0 is allowed.
// Fully unrolled: one std per doubleword, no loop and no use of CTR.
// Zeroes cnt_dwords doublewords at base_ptr + offset, base_ptr + offset + 8, ...
// Nothing is emitted when cnt_dwords <= 0. base_ptr only serves as the store
// base and is not written here; tmp is overwritten with 0.
|
||||
void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
|
||||
// Zero the source register only if at least one store follows.
if (cnt_dwords > 0) { li(tmp, 0); }
|
||||
// One doubleword store per element, displacement grows by 8 bytes each time.
for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
|
||||
}
|
||||
|
||||
// Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
// Counts below 8 are delegated to clear_memory_unrolled (which leaves base_ptr
// untouched). Larger counts use a CTR-driven loop that stores two doublewords
// per iteration (cnt_dwords >> 1 iterations), followed by one extra store when
// cnt_dwords is odd.
// NOTE(review): the iteration count is materialized with li, so it has to fit
// li's immediate field. The callers visible in this change pass either an
// immLmax30 constant or a count below clear_memory_doubleword's min_cnt.
|
||||
void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
|
||||
// Short case: straight-line stores, no loop.
if (cnt_dwords < 8) {
|
||||
clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
|
||||
return;
|
||||
}
|
||||
|
||||
Label loop;
|
||||
// loopcnt: number of 16-byte iterations; remainder: 1 if a single doubleword is left over.
const long loopcnt = cnt_dwords >> 1,
|
||||
remainder = cnt_dwords & 1;
|
||||
|
||||
// tmp first carries the loop count into CTR, then is reused as the zero source.
li(tmp, loopcnt);
|
||||
mtctr(tmp);
|
||||
li(tmp, 0);
|
||||
bind(loop);
|
||||
// 2x unrolled body: clear 16 bytes, then advance base_ptr.
std(tmp, 0, base_ptr);
|
||||
std(tmp, 8, base_ptr);
|
||||
addi(base_ptr, base_ptr, 16);
|
||||
bdnz(loop);
|
||||
// Odd count: base_ptr now points at the last doubleword still to be cleared.
if (remainder) { std(tmp, 0, base_ptr); }
|
||||
}
|
||||
|
||||
// Kills both input registers. tmp == R0 is allowed.
|
||||
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
|
||||
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
|
||||
// Procedure for large arrays (uses data cache block zero instruction).
|
||||
Label startloop, fast, fastloop, small_rest, restloop, done;
|
||||
const int cl_size = VM_Version::L1_data_cache_line_size(),
|
||||
cl_dwords = cl_size>>3,
|
||||
cl_dwords = cl_size >> 3,
|
||||
cl_dw_addr_bits = exact_log2(cl_dwords),
|
||||
dcbz_min = 1; // Min count of dcbz executions, needs to be >0.
|
||||
dcbz_min = 1, // Min count of dcbz executions, needs to be >0.
|
||||
min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
|
||||
|
||||
//2:
|
||||
cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
|
||||
blt(CCR1, small_rest); // Too small.
|
||||
rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
|
||||
beq(CCR0, fast); // Already 128byte aligned.
|
||||
if (const_cnt >= 0) {
|
||||
// Constant case.
|
||||
if (const_cnt < min_cnt) {
|
||||
clear_memory_constlen(base_ptr, const_cnt, tmp);
|
||||
return;
|
||||
}
|
||||
load_const_optimized(cnt_dwords, const_cnt, tmp);
|
||||
} else {
|
||||
// cnt_dwords already loaded in register. Need to check size.
|
||||
cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
|
||||
blt(CCR1, small_rest);
|
||||
}
|
||||
rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
|
||||
beq(CCR0, fast); // Already 128byte aligned.
|
||||
|
||||
subfic(tmp, tmp, cl_dwords);
|
||||
mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
|
||||
subf(cnt_dwords, tmp, cnt_dwords); // rest.
|
||||
li(tmp, 0);
|
||||
//10:
|
||||
|
||||
bind(startloop); // Clear at the beginning to reach 128byte boundary.
|
||||
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
|
||||
addi(base_ptr, base_ptr, 8);
|
||||
bdnz(startloop);
|
||||
//13:
|
||||
|
||||
bind(fast); // Clear 128byte blocks.
|
||||
srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
|
||||
andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
|
||||
mtctr(tmp); // Load counter.
|
||||
//16:
|
||||
|
||||
bind(fastloop);
|
||||
dcbz(base_ptr); // Clear 128byte aligned block.
|
||||
addi(base_ptr, base_ptr, cl_size);
|
||||
bdnz(fastloop);
|
||||
if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
|
||||
//20:
|
||||
|
||||
bind(small_rest);
|
||||
cmpdi(CCR0, cnt_dwords, 0); // size 0?
|
||||
beq(CCR0, done); // rest == 0
|
||||
li(tmp, 0);
|
||||
mtctr(cnt_dwords); // Load counter.
|
||||
//24:
|
||||
|
||||
bind(restloop); // Clear rest.
|
||||
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
|
||||
addi(base_ptr, base_ptr, 8);
|
||||
bdnz(restloop);
|
||||
//27:
|
||||
|
||||
bind(done);
|
||||
}
|
||||
|
||||
|
@ -755,7 +755,9 @@ class MacroAssembler: public Assembler {
|
||||
is_trap_range_check_g(x) || is_trap_range_check_ge(x);
|
||||
}
|
||||
|
||||
void clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp = R0);
|
||||
void clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp = R0, int offset = 0);
|
||||
void clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp = R0);
|
||||
void clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp = R0, long const_cnt = -1);
|
||||
|
||||
#ifdef COMPILER2
|
||||
// Intrinsics for CompactStrings
|
||||
|
@ -965,10 +965,7 @@ static int cc_to_biint(int cc, int flags_reg) {
|
||||
// is the number of bytes (not instructions) which will be inserted before
|
||||
// the instruction. The padding must match the size of a NOP instruction.
|
||||
|
||||
// Computes how many padding bytes would place this node at byte offset 8
// (2*4) within a 32-byte block, i.e. (8 - current_offset) modulo 32.
// If that takes more than 12 bytes (3*4, presumably three 4-byte nops),
// no padding is requested and 0 is returned.
int inlineCallClearArrayNode::compute_padding(int current_offset) const {
|
||||
int desired_padding = (2*4-current_offset)&31; // see MacroAssembler::clear_memory_doubleword
|
||||
return (desired_padding <= 3*4) ? desired_padding : 0;
|
||||
}
|
||||
// Currently not used on this platform.
|
||||
|
||||
//=============================================================================
|
||||
|
||||
@ -4066,6 +4063,14 @@ operand immL() %{
|
||||
interface(CONST_INTER);
|
||||
%}
|
||||
|
||||
// Long Immediate: value of at most 30.
// Used as the count operand of inlineCallClearArrayShort.
operand immLmax30() %{
|
||||
// Upper bound only: the predicate checks no lower limit, so negative
// constants satisfy it as well.
predicate((n->get_long() <= 30));
|
||||
match(ConL);
|
||||
op_cost(0);
|
||||
format %{ %}
|
||||
interface(CONST_INTER);
|
||||
%}
|
||||
|
||||
// Long Immediate: 16-bit
|
||||
operand immL16() %{
|
||||
predicate(Assembler::is_simm(n->get_long(), 16));
|
||||
@ -11735,18 +11740,44 @@ instruct array_size(iRegLdst dst, iRegPsrc end, iRegPsrc start) %{
|
||||
ins_pipe(pipe_class_default);
|
||||
%}
|
||||
|
||||
// Clear-array with dynamic array-size.
|
||||
instruct inlineCallClearArray(rarg1RegL cnt, rarg2RegP base, Universe dummy, regCTR ctr) %{
|
||||
// Clear-array with constant short array length. The versions below can use dcbz with cnt > 30.
|
||||
instruct inlineCallClearArrayShort(immLmax30 cnt, rarg2RegP base, Universe dummy, regCTR ctr) %{
|
||||
match(Set dummy (ClearArray cnt base));
|
||||
effect(USE_KILL cnt, USE_KILL base, KILL ctr);
|
||||
ins_cost(MEMORY_REF_COST);
|
||||
|
||||
ins_alignment(4); // 'compute_padding()' gets called, up to this number-1 nops will get inserted.
|
||||
effect(USE_KILL base, KILL ctr);
|
||||
ins_cost(2 * MEMORY_REF_COST);
|
||||
|
||||
format %{ "ClearArray $cnt, $base" %}
|
||||
ins_encode %{
|
||||
// TODO: PPC port $archOpcode(ppc64Opcode_compound);
|
||||
__ clear_memory_doubleword($base$$Register, $cnt$$Register); // kills cnt, base, R0
|
||||
__ clear_memory_constlen($base$$Register, $cnt$$constant, R0); // kills base, R0
|
||||
%}
|
||||
ins_pipe(pipe_class_default);
|
||||
%}
|
||||
|
||||
// Clear-array with constant large array length.
// Matches ClearArray with any long constant count (immL). Its cost of
// 3 * MEMORY_REF_COST is higher than that of inlineCallClearArrayShort, which
// handles constants <= 30, so this rule is presumably selected only for the
// remaining constants (verify against matcher behavior).
// The count is passed to clear_memory_doubleword as const_cnt; $tmp is the
// scratch register that routine loads the count into when it takes its
// large-array path.
|
||||
instruct inlineCallClearArrayLarge(immL cnt, rarg2RegP base, Universe dummy, iRegLdst tmp, regCTR ctr) %{
|
||||
match(Set dummy (ClearArray cnt base));
|
||||
// base is consumed and clobbered, tmp is a scratch register, CTR is used as loop counter.
effect(USE_KILL base, TEMP tmp, KILL ctr);
|
||||
ins_cost(3 * MEMORY_REF_COST);
|
||||
|
||||
format %{ "ClearArray $cnt, $base \t// KILL $tmp" %}
|
||||
ins_encode %{
|
||||
// TODO: PPC port $archOpcode(ppc64Opcode_compound);
|
||||
__ clear_memory_doubleword($base$$Register, $tmp$$Register, R0, $cnt$$constant); // kills base, R0
|
||||
%}
|
||||
ins_pipe(pipe_class_default);
|
||||
%}
|
||||
|
||||
// Clear-array with dynamic array length.
// The count arrives in a register (rarg1RegL) rather than as a constant, so
// clear_memory_doubleword is called without const_cnt (it defaults to -1) and
// checks the size at run time. With 4 * MEMORY_REF_COST this is the most
// expensive of the ClearArray rules in this file.
|
||||
instruct inlineCallClearArray(rarg1RegL cnt, rarg2RegP base, Universe dummy, regCTR ctr) %{
|
||||
match(Set dummy (ClearArray cnt base));
|
||||
// Both input registers are consumed and clobbered; CTR is used as loop counter.
effect(USE_KILL cnt, USE_KILL base, KILL ctr);
|
||||
ins_cost(4 * MEMORY_REF_COST);
|
||||
|
||||
format %{ "ClearArray $cnt, $base" %}
|
||||
ins_encode %{
|
||||
// TODO: PPC port $archOpcode(ppc64Opcode_compound);
|
||||
__ clear_memory_doubleword($base$$Register, $cnt$$Register, R0); // kills cnt, base, R0
|
||||
%}
|
||||
ins_pipe(pipe_class_default);
|
||||
%}
|
||||
|
Loading…
x
Reference in New Issue
Block a user