8270947: AArch64: C1: use zero_words to initialize all objects

Reviewed-by: ngasson, adinn
This commit is contained in:
Andrew Haley 2021-07-30 18:02:11 +00:00
parent cd7e30ef84
commit 6c68ce2d39
8 changed files with 28831 additions and 165 deletions

View File

@ -14983,12 +14983,12 @@ instruct clearArray_reg_reg(iRegL_R11 cnt, iRegP_R10 base, Universe dummy, rFlag
ins_pipe(pipe_class_memory);
%}
instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, Universe dummy, rFlagsReg cr)
instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, iRegL_R11 temp, Universe dummy, rFlagsReg cr)
%{
predicate((uint64_t)n->in(2)->get_long()
< (uint64_t)(BlockZeroingLowLimit >> LogBytesPerWord));
match(Set dummy (ClearArray cnt base));
effect(USE_KILL base);
effect(TEMP temp, USE_KILL base, KILL cr);
ins_cost(4 * INSN_COST);
format %{ "ClearArray $cnt, $base" %}

View File

@ -1127,8 +1127,8 @@ void LIRGenerator::do_NewInstance(NewInstance* x) {
CodeEmitInfo* info = state_for(x, x->state());
LIR_Opr reg = result_register_for(x->type());
new_instance(reg, x->klass(), x->is_unresolved(),
FrameMap::r2_oop_opr,
FrameMap::r5_oop_opr,
FrameMap::r10_oop_opr,
FrameMap::r11_oop_opr,
FrameMap::r4_oop_opr,
LIR_OprFact::illegalOpr,
FrameMap::r3_metadata_opr, info);
@ -1143,8 +1143,8 @@ void LIRGenerator::do_NewTypeArray(NewTypeArray* x) {
length.load_item_force(FrameMap::r19_opr);
LIR_Opr reg = result_register_for(x->type());
LIR_Opr tmp1 = FrameMap::r2_oop_opr;
LIR_Opr tmp2 = FrameMap::r4_oop_opr;
LIR_Opr tmp1 = FrameMap::r10_oop_opr;
LIR_Opr tmp2 = FrameMap::r11_oop_opr;
LIR_Opr tmp3 = FrameMap::r5_oop_opr;
LIR_Opr tmp4 = reg;
LIR_Opr klass_reg = FrameMap::r3_metadata_opr;
@ -1172,8 +1172,8 @@ void LIRGenerator::do_NewObjectArray(NewObjectArray* x) {
CodeEmitInfo* info = state_for(x, x->state());
LIR_Opr reg = result_register_for(x->type());
LIR_Opr tmp1 = FrameMap::r2_oop_opr;
LIR_Opr tmp2 = FrameMap::r4_oop_opr;
LIR_Opr tmp1 = FrameMap::r10_oop_opr;
LIR_Opr tmp2 = FrameMap::r11_oop_opr;
LIR_Opr tmp3 = FrameMap::r5_oop_opr;
LIR_Opr tmp4 = reg;
LIR_Opr klass_reg = FrameMap::r3_metadata_opr;

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, Red Hat Inc. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -180,20 +180,24 @@ void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register
}
// preserves obj, destroys len_in_bytes
void C1_MacroAssembler::initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1) {
//
// Scratch registers: t1 = r10, t2 = r11
//
void C1_MacroAssembler::initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1, Register t2) {
assert(hdr_size_in_bytes >= 0, "header size must be positive or 0");
assert(t1 == r10 && t2 == r11, "must be");
Label done;
// len_in_bytes is positive and ptr sized
subs(len_in_bytes, len_in_bytes, hdr_size_in_bytes);
br(Assembler::EQ, done);
// Preserve obj
if (hdr_size_in_bytes)
add(obj, obj, hdr_size_in_bytes);
zero_memory(obj, len_in_bytes, t1);
if (hdr_size_in_bytes)
sub(obj, obj, hdr_size_in_bytes);
// zero_words() takes ptr in r10 and count in words in r11
mov(rscratch1, len_in_bytes);
lea(t1, Address(obj, hdr_size_in_bytes));
lsr(t2, rscratch1, LogBytesPerWord);
zero_words(t1, t2);
bind(done);
}
@ -208,6 +212,7 @@ void C1_MacroAssembler::allocate_object(Register obj, Register t1, Register t2,
initialize_object(obj, klass, noreg, object_size * HeapWordSize, t1, t2, UseTLAB);
}
// Scratch registers: t1 = r10, t2 = r11
void C1_MacroAssembler::initialize_object(Register obj, Register klass, Register var_size_in_bytes, int con_size_in_bytes, Register t1, Register t2, bool is_tlab_allocated) {
assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0,
"con_size_in_bytes is not multiple of alignment");
@ -218,45 +223,13 @@ void C1_MacroAssembler::initialize_object(Register obj, Register klass, Register
if (!(UseTLAB && ZeroTLAB && is_tlab_allocated)) {
// clear rest of allocated space
const Register index = t2;
const int threshold = 16 * BytesPerWord; // approximate break even point for code size (see comments below)
if (var_size_in_bytes != noreg) {
mov(index, var_size_in_bytes);
initialize_body(obj, index, hdr_size_in_bytes, t1);
} else if (con_size_in_bytes <= threshold) {
// use explicit null stores
int i = hdr_size_in_bytes;
if (i < con_size_in_bytes && (con_size_in_bytes % (2 * BytesPerWord))) {
str(zr, Address(obj, i));
i += BytesPerWord;
}
for (; i < con_size_in_bytes; i += 2 * BytesPerWord)
stp(zr, zr, Address(obj, i));
initialize_body(obj, index, hdr_size_in_bytes, t1, t2);
} else if (con_size_in_bytes > hdr_size_in_bytes) {
block_comment("zero memory");
// use loop to null out the fields
int words = (con_size_in_bytes - hdr_size_in_bytes) / BytesPerWord;
mov(index, words / 8);
const int unroll = 8; // Number of str(zr) instructions we'll unroll
int remainder = words % unroll;
lea(rscratch1, Address(obj, hdr_size_in_bytes + remainder * BytesPerWord));
Label entry_point, loop;
b(entry_point);
bind(loop);
sub(index, index, 1);
for (int i = -unroll; i < 0; i++) {
if (-i == remainder)
bind(entry_point);
str(zr, Address(rscratch1, i * wordSize));
}
if (remainder == 0)
bind(entry_point);
add(rscratch1, rscratch1, unroll * wordSize);
cbnz(index, loop);
con_size_in_bytes -= hdr_size_in_bytes;
lea(t1, Address(obj, hdr_size_in_bytes));
zero_words(t1, con_size_in_bytes / BytesPerWord);
}
}
@ -291,8 +264,7 @@ void C1_MacroAssembler::allocate_array(Register obj, Register len, Register t1,
initialize_header(obj, klass, len, t1, t2);
// clear rest of allocated space
const Register len_zero = len;
initialize_body(obj, arr_size, header_size * BytesPerWord, len_zero);
initialize_body(obj, arr_size, header_size * BytesPerWord, t1, t2);
membar(StoreStore);

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
* Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -48,7 +48,7 @@ using MacroAssembler::null_check;
);
void initialize_header(Register obj, Register klass, Register len, Register t1, Register t2);
void initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1);
void initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1, Register t2);
void float_cmp(bool is_float, int unordered_result,
FloatRegister f0, FloatRegister f1,

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, Red Hat Inc. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -656,9 +656,9 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
if ((id == fast_new_instance_id || id == fast_new_instance_init_check_id) &&
!UseTLAB && Universe::heap()->supports_inline_contig_alloc()) {
Label slow_path;
Register obj_size = r2;
Register t1 = r19;
Register t2 = r4;
Register obj_size = r19;
Register t1 = r10;
Register t2 = r11;
assert_different_registers(klass, obj, obj_size, t1, t2);
__ stp(r19, zr, Address(__ pre(sp, -2 * wordSize)));
@ -769,9 +769,9 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
// allocations.
// Otherwise, just go to the slow path.
if (!UseTLAB && Universe::heap()->supports_inline_contig_alloc()) {
Register arr_size = r4;
Register t1 = r2;
Register t2 = r5;
Register arr_size = r5;
Register t1 = r10;
Register t2 = r11;
Label slow_path;
assert_different_registers(length, klass, obj, arr_size, t1, t2);
@ -801,7 +801,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
__ andr(t1, t1, Klass::_lh_header_size_mask);
__ sub(arr_size, arr_size, t1); // body length
__ add(t1, t1, obj); // body start
__ initialize_body(t1, arr_size, 0, t2);
__ initialize_body(t1, arr_size, 0, t1, t2);
__ membar(Assembler::StoreStore);
__ verify_oop(obj);

View File

@ -4088,68 +4088,6 @@ void MacroAssembler::eden_allocate(Register obj,
bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}
// Emit code that zeroes memory starting at addr; len is the length in bytes.
//
// addr: start of the region to clear; read only, never written here.
// len:  byte count on entry; converted to a word count and consumed.
// t1:   temp; used as the moving "end of current chunk" pointer.
//
// Overwrites len, t1, rscratch1 and rscratch2 (all five registers must be
// distinct, see the assert below); addr is preserved.
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
assert_different_registers(addr, len, t1, rscratch1, rscratch2);
#ifdef ASSERT
// Debug builds only: emit a run-time check that len is word aligned.
{ Label L;
tst(len, BytesPerWord - 1);
br(Assembler::EQ, L);
stop("len is not a multiple of BytesPerWord");
bind(L);
}
#endif
#ifndef PRODUCT
block_comment("zero memory");
#endif
Label loop;
Label entry;
// Algorithm (cnt is the length in words, p is a word pointer):
//
//  scratch1 = cnt & 7;
//  cnt -= scratch1;
//  p += scratch1;
//  switch (scratch1) {
//    do {
//      cnt -= 8;
//        p[-8] = 0;
//      case 7:
//        p[-7] = 0;
//      case 6:
//        p[-6] = 0;
//        // ...
//      case 1:
//        p[-1] = 0;
//      case 0:
//        p += 8;
//    } while (cnt);
//  }
const int unroll = 8; // Number of str(zr) instructions we'll unroll
lsr(len, len, LogBytesPerWord); // bytes -> words
andr(rscratch1, len, unroll - 1); // rscratch1 = cnt % unroll
sub(len, len, rscratch1); // cnt -= cnt % unroll, leaving a multiple of unroll
// t1 always points to the end of the region we're about to zero
add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
// Computed branch into the unrolled stores: start rscratch1 instructions
// before entry so exactly the cnt % unroll leftover words are cleared on the
// first pass. The shift by 2 relies on each str being one 4-byte instruction.
adr(rscratch2, entry);
sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
br(rscratch2);
bind(loop);
sub(len, len, unroll);
// Stores go to negative offsets from t1, i.e. the unroll words just below it.
for (int i = -unroll; i < 0; i++)
Assembler::str(zr, Address(t1, i * wordSize));
bind(entry);
// Advance to the end of the next chunk; keep looping while whole chunks remain.
add(t1, t1, unroll * wordSize);
cbnz(len, loop);
}
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
if (UseTLAB && VerifyOops) {
@ -4668,10 +4606,11 @@ void MacroAssembler::string_equals(Register a1, Register a2,
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;
// zero_words() is used by C2 ClearArray patterns. It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub. It is expanded many times
// in compiled code, so it is important to keep it short.
// zero_words() is used by C2 ClearArray patterns and by
// C1_MacroAssembler. It is as small as possible, handling small word
// counts locally and delegating anything larger to the zero_blocks
// stub. It is expanded many times in compiled code, so it is
// important to keep it short.
// ptr: Address of a buffer to be zeroed.
// cnt: Count in HeapWords.
@ -4680,32 +4619,46 @@ const int MacroAssembler::zero_words_block_size = 8;
address MacroAssembler::zero_words(Register ptr, Register cnt)
{
assert(is_power_of_2(zero_words_block_size), "adjust this");
assert(ptr == r10 && cnt == r11, "mismatch in register usage");
BLOCK_COMMENT("zero_words {");
cmp(cnt, (u1)zero_words_block_size);
assert(ptr == r10 && cnt == r11, "mismatch in register usage");
RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
subs(rscratch1, cnt, zero_words_block_size);
Label around;
br(LO, around);
{
RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
if (StubRoutines::aarch64::complete()) {
// Make sure this is a C2 compilation. C1 allocates space only for
// trampoline stubs generated by Call LIR ops, and in any case it
// makes sense for a C1 compilation task to proceed as quickly as
// possible.
CompileTask* task;
if (StubRoutines::aarch64::complete()
&& Thread::current()->is_Compiler_thread()
&& (task = ciEnv::current()->task())
&& is_c2_compile(task->comp_level())) {
address tpc = trampoline_call(zero_blocks);
if (tpc == NULL) {
DEBUG_ONLY(reset_labels(around));
postcond(pc() == badAddress);
assert(false, "failed to allocate space for trampoline");
return NULL;
}
} else {
bl(zero_blocks);
far_call(zero_blocks);
}
}
bind(around);
// We have a few words left to do. zero_blocks has adjusted r10 and r11
// for us.
for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
Label l;
tbz(cnt, exact_log2(i), l);
for (int j = 0; j < i; j += 2) {
stp(zr, zr, post(ptr, 16));
stp(zr, zr, post(ptr, 2 * BytesPerWord));
}
bind(l);
}
@ -4715,46 +4668,56 @@ address MacroAssembler::zero_words(Register ptr, Register cnt)
str(zr, Address(ptr));
bind(l);
}
BLOCK_COMMENT("} zero_words");
postcond(pc() != badAddress);
return pc();
}
// base: Address of a buffer to be zeroed, 8-byte aligned.
// cnt: Immediate count in HeapWords.
#define SmallArraySize (18 * BytesPerLong)
//
// r10, r11, rscratch1, and rscratch2 are clobbered.
void MacroAssembler::zero_words(Register base, uint64_t cnt)
{
BLOCK_COMMENT("zero_words {");
int i = cnt & 1; // store any odd word to start
if (i) str(zr, Address(base));
if (cnt <= SmallArraySize / BytesPerLong) {
guarantee(zero_words_block_size < BlockZeroingLowLimit,
"increase BlockZeroingLowLimit");
if (cnt <= (uint64_t)BlockZeroingLowLimit / BytesPerWord) {
#ifndef PRODUCT
{
char buf[64];
snprintf(buf, sizeof buf, "zero_words (count = %" PRIu64 ") {", cnt);
BLOCK_COMMENT(buf);
}
#endif
if (cnt >= 16) {
uint64_t loops = cnt/16;
if (loops > 1) {
mov(rscratch2, loops - 1);
}
{
Label loop;
bind(loop);
for (int i = 0; i < 16; i += 2) {
stp(zr, zr, Address(base, i * BytesPerWord));
}
add(base, base, 16 * BytesPerWord);
if (loops > 1) {
subs(rscratch2, rscratch2, 1);
br(GE, loop);
}
}
}
cnt %= 16;
int i = cnt & 1; // store any odd word to start
if (i) str(zr, Address(base));
for (; i < (int)cnt; i += 2) {
stp(zr, zr, Address(base, i * wordSize));
}
BLOCK_COMMENT("} zero_words");
} else {
const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
int remainder = cnt % (2 * unroll);
for (; i < remainder; i += 2) {
stp(zr, zr, Address(base, i * wordSize));
}
Label loop;
Register cnt_reg = rscratch1;
Register loop_base = rscratch2;
cnt = cnt - remainder;
mov(cnt_reg, cnt);
// adjust base and prebias by -2 * wordSize so we can pre-increment
add(loop_base, base, (remainder - 2) * wordSize);
bind(loop);
sub(cnt_reg, cnt_reg, 2 * unroll);
for (i = 1; i < unroll; i++) {
stp(zr, zr, Address(loop_base, 2 * i * wordSize));
}
stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
cbnz(cnt_reg, loop);
mov(r10, base); mov(r11, cnt);
zero_words(r10, r11);
}
BLOCK_COMMENT("} zero_words");
}
// Zero blocks of memory by using DC ZVA.

View File

@ -888,7 +888,6 @@ public:
Register t2, // temp register
Label& slow_case // continuation point if fast allocation fails
);
void zero_memory(Register addr, Register len, Register t1);
void verify_tlab();
// interface method calling

File diff suppressed because it is too large Load Diff