8179444: AArch64: Put zero_words on a diet
Reviewed-by: roland
This commit is contained in:
parent
99e8874a91
commit
1ce2a36252
@ -14021,10 +14021,12 @@ instruct clearArray_reg_reg(iRegL_R11 cnt, iRegP_R10 base, Universe dummy, rFlag
|
||||
ins_pipe(pipe_class_memory);
|
||||
%}
|
||||
|
||||
instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, iRegL_R11 tmp, Universe dummy, rFlagsReg cr)
|
||||
instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, Universe dummy, rFlagsReg cr)
|
||||
%{
|
||||
predicate((u_int64_t)n->in(2)->get_long()
|
||||
< (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord));
|
||||
match(Set dummy (ClearArray cnt base));
|
||||
effect(USE_KILL base, TEMP tmp);
|
||||
effect(USE_KILL base);
|
||||
|
||||
ins_cost(4 * INSN_COST);
|
||||
format %{ "ClearArray $cnt, $base" %}
|
||||
|
@ -698,6 +698,7 @@ void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, in
|
||||
// trampolines won't be emitted.
|
||||
|
||||
address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
|
||||
assert(JavaThread::current()->is_Compiler_thread(), "just checking");
|
||||
assert(entry.rspec().type() == relocInfo::runtime_call_type
|
||||
|| entry.rspec().type() == relocInfo::opt_virtual_call_type
|
||||
|| entry.rspec().type() == relocInfo::static_call_type
|
||||
@ -4944,34 +4945,67 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
|
||||
}
|
||||
|
||||
|
||||
// base: Address of a buffer to be zeroed, 8 bytes aligned.
|
||||
// cnt: Count in HeapWords.
|
||||
// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
|
||||
void MacroAssembler::zero_words(Register base, Register cnt)
|
||||
// The size of the blocks erased by the zero_blocks stub. We must
|
||||
// handle anything smaller than this ourselves in zero_words().
|
||||
const int MacroAssembler::zero_words_block_size = 8;
|
||||
|
||||
// zero_words() is used by C2 ClearArray patterns. It is as small as
|
||||
// possible, handling small word counts locally and delegating
|
||||
// anything larger to the zero_blocks stub. It is expanded many times
|
||||
// in compiled code, so it is important to keep it short.
|
||||
|
||||
// ptr: Address of a buffer to be zeroed.
|
||||
// cnt: Count in HeapWords.
|
||||
//
|
||||
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
|
||||
void MacroAssembler::zero_words(Register ptr, Register cnt)
|
||||
{
|
||||
if (UseBlockZeroing) {
|
||||
block_zero(base, cnt);
|
||||
} else {
|
||||
fill_words(base, cnt, zr);
|
||||
assert(is_power_of_2(zero_words_block_size), "adjust this");
|
||||
assert(ptr == r10 && cnt == r11, "mismatch in register usage");
|
||||
|
||||
BLOCK_COMMENT("zero_words {");
|
||||
cmp(cnt, zero_words_block_size);
|
||||
Label around, done, done16;
|
||||
br(LO, around);
|
||||
{
|
||||
RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
|
||||
assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
|
||||
if (StubRoutines::aarch64::complete()) {
|
||||
trampoline_call(zero_blocks);
|
||||
} else {
|
||||
bl(zero_blocks);
|
||||
}
|
||||
}
|
||||
bind(around);
|
||||
for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
|
||||
Label l;
|
||||
tbz(cnt, exact_log2(i), l);
|
||||
for (int j = 0; j < i; j += 2) {
|
||||
stp(zr, zr, post(ptr, 16));
|
||||
}
|
||||
bind(l);
|
||||
}
|
||||
{
|
||||
Label l;
|
||||
tbz(cnt, 0, l);
|
||||
str(zr, Address(ptr));
|
||||
bind(l);
|
||||
}
|
||||
BLOCK_COMMENT("} zero_words");
|
||||
}
|
||||
|
||||
// r10 = base: Address of a buffer to be zeroed, 8 bytes aligned.
|
||||
// base: Address of a buffer to be zeroed, 8 bytes aligned.
|
||||
// cnt: Immediate count in HeapWords.
|
||||
// r11 = tmp: For use as cnt if we need to call out
|
||||
#define ShortArraySize (18 * BytesPerLong)
|
||||
#define SmallArraySize (18 * BytesPerLong)
|
||||
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
|
||||
{
|
||||
Register tmp = r11;
|
||||
BLOCK_COMMENT("zero_words {");
|
||||
int i = cnt & 1; // store any odd word to start
|
||||
if (i) str(zr, Address(base));
|
||||
|
||||
if (cnt <= ShortArraySize / BytesPerLong) {
|
||||
if (cnt <= SmallArraySize / BytesPerLong) {
|
||||
for (; i < (int)cnt; i += 2)
|
||||
stp(zr, zr, Address(base, i * wordSize));
|
||||
} else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
|
||||
mov(tmp, cnt);
|
||||
block_zero(base, tmp, true);
|
||||
} else {
|
||||
const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
|
||||
int remainder = cnt % (2 * unroll);
|
||||
@ -4992,6 +5026,51 @@ void MacroAssembler::zero_words(Register base, u_int64_t cnt)
|
||||
stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
|
||||
cbnz(cnt_reg, loop);
|
||||
}
|
||||
BLOCK_COMMENT("} zero_words");
|
||||
}
|
||||
|
||||
// Zero blocks of memory by using DC ZVA.
|
||||
//
|
||||
// Aligns the base address first sufficiently for DC ZVA, then uses
|
||||
// DC ZVA repeatedly for every full block. cnt is the size to be
|
||||
// zeroed in HeapWords. Returns the count of words left to be zeroed
|
||||
// in cnt.
|
||||
//
|
||||
// NOTE: This is intended to be used in the zero_blocks() stub. If
|
||||
// you want to use it elsewhere, note that cnt (in HeapWords) must cover at least 2*zva_length bytes.
|
||||
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
|
||||
// rscratch1 and rscratch2 are used as temporaries below.
Register tmp = rscratch1;
|
||||
Register tmp2 = rscratch2;
|
||||
// zva_length is used as a byte count throughout (added to base, and
// shifted right by 3 whenever it is compared with the word count).
int zva_length = VM_Version::zva_length();
|
||||
Label initial_table_end, loop_zva;
|
||||
Label fini;
|
||||
|
||||
// Base must be 16 byte aligned. If not just return and let caller handle it
|
||||
// On this early exit neither base nor cnt has been modified yet.
tst(base, 0x0f);
|
||||
br(Assembler::NE, fini);
|
||||
// Align base with ZVA length.
|
||||
// tmp = (-base) & (zva_length - 1): distance in bytes from base up to the
// next zva_length boundary (0 if base is already on one).
neg(tmp, base);
|
||||
andr(tmp, tmp, zva_length - 1);
|
||||
|
||||
// tmp: the number of bytes to be filled to align the base with ZVA length.
|
||||
// Advance base to the boundary first; the gap that was skipped is zeroed
// afterwards by the stp table below, using negative offsets from base.
add(base, base, tmp);
|
||||
// tmp is in bytes, cnt is in words, hence the shift by 3.
sub(cnt, cnt, tmp, Assembler::ASR, 3);
|
||||
// Computed branch into the stp table: every stp clears 16 bytes and
// occupies 4 bytes of code, so starting tmp / 4 bytes of code before
// initial_table_end executes exactly tmp / 16 stores.
adr(tmp2, initial_table_end);
|
||||
sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
|
||||
br(tmp2);
|
||||
|
||||
// Table of (zva_length / 16 - 1) stores; the last one emitted covers the
// 16 bytes immediately below the aligned base.
for (int i = -zva_length + 16; i < 0; i += 16)
|
||||
stp(zr, zr, Address(base, i));
|
||||
bind(initial_table_end);
|
||||
|
||||
// Bias cnt down by one block so that the subs/GE pair in the loop tests
// whether yet another whole block remains after the one just cleared.
sub(cnt, cnt, zva_length >> 3);
|
||||
bind(loop_zva);
|
||||
// Note there is no check before the first DC ZVA: at least one whole
// block is always cleared once this point is reached.
dc(Assembler::ZVA, base);
|
||||
subs(cnt, cnt, zva_length >> 3);
|
||||
add(base, base, zva_length);
|
||||
br(Assembler::GE, loop_zva);
|
||||
// Undo the bias applied before the loop.
add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
|
||||
bind(fini);
|
||||
}
|
||||
|
||||
// base: Address of a buffer to be filled, 8 bytes aligned.
|
||||
@ -5052,69 +5131,6 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value)
|
||||
bind(fini);
|
||||
}
|
||||
|
||||
// Use DC ZVA to do fast zeroing.
|
||||
// base: Address of a buffer to be zeroed, 8 bytes aligned.
|
||||
// cnt: Count in HeapWords.
|
||||
// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
|
||||
void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
|
||||
{
|
||||
Label small;
|
||||
// NOTE(review): store_pair and loop_store_pair are declared but never
// bound or referenced in this function.
Label store_pair, loop_store_pair, done;
|
||||
Label base_aligned;
|
||||
|
||||
assert_different_registers(base, cnt, rscratch1);
|
||||
// The registers are fixed rather than free parameters; presumably this is
// because the stub called below is generated for r10/r11 — confirm.
guarantee(base == r10 && cnt == r11, "fix register usage");
|
||||
|
||||
Register tmp = rscratch1;
|
||||
Register tmp2 = rscratch2;
|
||||
int zva_length = VM_Version::zva_length();
|
||||
|
||||
// Ensure ZVA length can be divided by 16. This is required by
|
||||
// the subsequent operations.
|
||||
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
|
||||
|
||||
// Only a count that is not known to be large can be zero; in that case
// there is nothing to do.
if (!is_large) cbz(cnt, done);
|
||||
// base is documented as 8-byte aligned. If bit 3 is set, clear one word
// so that base becomes 16-byte aligned for the stp sequences below.
tbz(base, 3, base_aligned);
|
||||
str(zr, Address(post(base, 8)));
|
||||
sub(cnt, cnt, 1);
|
||||
bind(base_aligned);
|
||||
|
||||
// Ensure count >= zva_length * 2 so that it still deserves a zva after
|
||||
// alignment.
|
||||
// The run-time check is emitted unless the caller vouches for a large
// count (is_large) and BlockZeroingLowLimit is itself at least two ZVA
// blocks.
if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
|
||||
// low_limit is shifted right by 3 before being compared with the word
// count in cnt; only the resulting flags are used, tmp is overwritten
// again below.
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
|
||||
subs(tmp, cnt, low_limit >> 3);
|
||||
br(Assembler::LT, small);
|
||||
}
|
||||
|
||||
// Bulk path. Execution continues at 'small' after the call, which clears
// whatever count is still left in cnt.
far_call(StubRoutines::aarch64::get_zero_longs());
|
||||
|
||||
bind(small);
|
||||
|
||||
const int unroll = 8; // Number of stp instructions we'll unroll
|
||||
Label small_loop, small_table_end;
|
||||
|
||||
// tmp = cnt & 14: the even number of words (0..14) handled by a partial
// first pass through the unrolled stores.
andr(tmp, cnt, (unroll-1) * 2);
|
||||
sub(cnt, cnt, tmp);
|
||||
// Step base past those tmp words (tmp * 8 bytes); the stores below reach
// back to them with negative offsets.
add(base, base, tmp, Assembler::LSL, 3);
|
||||
// Computed branch: tmp / 2 stp instructions are needed, each 4 bytes of
// code, so enter the table tmp * 2 bytes before small_table_end.
adr(tmp2, small_table_end);
|
||||
sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
|
||||
br(tmp2);
|
||||
|
||||
// A full pass clears unroll * 16 bytes, i.e. unroll * 2 words.
bind(small_loop);
|
||||
add(base, base, unroll * 16);
|
||||
for (int i = -unroll; i < 0; i++)
|
||||
stp(zr, zr, Address(base, i * 16));
|
||||
bind(small_table_end);
|
||||
// Loop back for another full pass while at least unroll * 2 words remain.
subs(cnt, cnt, unroll * 2);
|
||||
br(Assembler::GE, small_loop);
|
||||
|
||||
// Only even amounts were subtracted from cnt after 'small', so bit 0
// still tells whether one last odd word has to be cleared.
tbz(cnt, 0, done);
|
||||
str(zr, Address(post(base, 8)));
|
||||
|
||||
bind(done);
|
||||
}
|
||||
|
||||
// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
|
||||
// java/lang/StringUTF16.compress.
|
||||
void MacroAssembler::encode_iso_array(Register src, Register dst,
|
||||
|
@ -1213,8 +1213,10 @@ public:
|
||||
|
||||
void fill_words(Register base, Register cnt, Register value);
|
||||
void zero_words(Register base, u_int64_t cnt);
|
||||
void zero_words(Register base, Register cnt);
|
||||
void block_zero(Register base, Register cnt, bool is_large = false);
|
||||
void zero_words(Register ptr, Register cnt);
|
||||
void zero_dcache_blocks(Register base, Register cnt);
|
||||
|
||||
static const int zero_words_block_size;
|
||||
|
||||
void byte_array_inflate(Register src, Register dst, Register len,
|
||||
FloatRegister vtmp1, FloatRegister vtmp2,
|
||||
|
@ -719,48 +719,74 @@ class StubGenerator: public StubCodeGenerator {
|
||||
}
|
||||
}
|
||||
|
||||
address generate_zero_longs(Register base, Register cnt) {
|
||||
Register tmp = rscratch1;
|
||||
Register tmp2 = rscratch2;
|
||||
int zva_length = VM_Version::zva_length();
|
||||
Label initial_table_end, loop_zva;
|
||||
Label fini;
|
||||
// The inner part of zero_words(). This is the bulk operation,
|
||||
// zeroing words in blocks, possibly using DC ZVA to do it. The
|
||||
// caller is responsible for zeroing the last few words.
|
||||
//
|
||||
// Inputs:
|
||||
// r10: the HeapWord-aligned base address of an array to zero.
|
||||
// r11: the count in HeapWords, r11 > 0.
|
||||
//
|
||||
// Returns r10 and r11, adjusted for the caller to clear.
|
||||
// r10: the base address of the tail of words left to clear.
|
||||
// r11: the number of words in the tail.
|
||||
// r11 < MacroAssembler::zero_words_block_size.
|
||||
|
||||
address generate_zero_blocks() {
|
||||
Label store_pair, loop_store_pair, done;
|
||||
Label base_aligned;
|
||||
|
||||
Register base = r10, cnt = r11;
|
||||
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "zero_longs");
|
||||
StubCodeMark mark(this, "StubRoutines", "zero_blocks");
|
||||
address start = __ pc();
|
||||
|
||||
// Base must be 16 byte aligned. If not just return and let caller handle it
|
||||
__ tst(base, 0x0f);
|
||||
__ br(Assembler::NE, fini);
|
||||
// Align base with ZVA length.
|
||||
__ neg(tmp, base);
|
||||
__ andr(tmp, tmp, zva_length - 1);
|
||||
if (UseBlockZeroing) {
|
||||
int zva_length = VM_Version::zva_length();
|
||||
|
||||
// tmp: the number of bytes to be filled to align the base with ZVA length.
|
||||
__ add(base, base, tmp);
|
||||
__ sub(cnt, cnt, tmp, Assembler::ASR, 3);
|
||||
__ adr(tmp2, initial_table_end);
|
||||
__ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
|
||||
__ br(tmp2);
|
||||
// Ensure ZVA length can be divided by 16. This is required by
|
||||
// the subsequent operations.
|
||||
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
|
||||
|
||||
for (int i = -zva_length + 16; i < 0; i += 16)
|
||||
__ stp(zr, zr, Address(base, i));
|
||||
__ bind(initial_table_end);
|
||||
__ tbz(base, 3, base_aligned);
|
||||
__ str(zr, Address(__ post(base, 8)));
|
||||
__ sub(cnt, cnt, 1);
|
||||
__ bind(base_aligned);
|
||||
|
||||
// Ensure count >= zva_length * 2 so that it still deserves a zva after
|
||||
// alignment.
|
||||
Label small;
|
||||
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
|
||||
__ cmp(cnt, low_limit >> 3);
|
||||
__ br(Assembler::LT, small);
|
||||
__ zero_dcache_blocks(base, cnt);
|
||||
__ bind(small);
|
||||
}
|
||||
|
||||
{
|
||||
// Number of stp instructions we'll unroll
|
||||
const int unroll =
|
||||
MacroAssembler::zero_words_block_size / 2;
|
||||
// Clear the remaining blocks.
|
||||
Label loop;
|
||||
__ subs(cnt, cnt, unroll * 2);
|
||||
__ br(Assembler::LT, done);
|
||||
__ bind(loop);
|
||||
for (int i = 0; i < unroll; i++)
|
||||
__ stp(zr, zr, __ post(base, 16));
|
||||
__ subs(cnt, cnt, unroll * 2);
|
||||
__ br(Assembler::GE, loop);
|
||||
__ bind(done);
|
||||
__ add(cnt, cnt, unroll * 2);
|
||||
}
|
||||
|
||||
__ sub(cnt, cnt, zva_length >> 3);
|
||||
__ bind(loop_zva);
|
||||
__ dc(Assembler::ZVA, base);
|
||||
__ subs(cnt, cnt, zva_length >> 3);
|
||||
__ add(base, base, zva_length);
|
||||
__ br(Assembler::GE, loop_zva);
|
||||
__ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
|
||||
__ bind(fini);
|
||||
__ ret(lr);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
typedef enum {
|
||||
copy_forwards = 1,
|
||||
copy_backwards = -1
|
||||
@ -2346,20 +2372,16 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
|
||||
if (UseBlockZeroing) {
|
||||
Label non_block_zeroing, rest;
|
||||
Register tmp = rscratch1;
|
||||
// count >= BlockZeroingLowLimit && value == 0
|
||||
__ subs(tmp, cnt_words, BlockZeroingLowLimit >> 3);
|
||||
__ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
|
||||
__ br(Assembler::NE, non_block_zeroing);
|
||||
// If the fill value is zero we can use the fast zero_words().
|
||||
__ cbnz(value, non_block_zeroing);
|
||||
__ mov(bz_base, to);
|
||||
__ block_zero(bz_base, cnt_words, true);
|
||||
__ mov(to, bz_base);
|
||||
__ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
|
||||
__ zero_words(bz_base, cnt_words);
|
||||
__ b(rest);
|
||||
__ bind(non_block_zeroing);
|
||||
__ fill_words(to, cnt_words, value);
|
||||
__ bind(rest);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
__ fill_words(to, cnt_words, value);
|
||||
}
|
||||
|
||||
@ -2420,7 +2442,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
|
||||
generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
|
||||
|
||||
StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
|
||||
StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
|
||||
|
||||
//*** jbyte
|
||||
// Always need aligned and unaligned versions
|
||||
@ -4769,6 +4791,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
&StubRoutines::_safefetchN_fault_pc,
|
||||
&StubRoutines::_safefetchN_continuation_pc);
|
||||
#endif
|
||||
StubRoutines::aarch64::set_completed();
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -43,7 +43,8 @@ address StubRoutines::aarch64::_float_sign_mask = NULL;
|
||||
address StubRoutines::aarch64::_float_sign_flip = NULL;
|
||||
address StubRoutines::aarch64::_double_sign_mask = NULL;
|
||||
address StubRoutines::aarch64::_double_sign_flip = NULL;
|
||||
address StubRoutines::aarch64::_zero_longs = NULL;
|
||||
address StubRoutines::aarch64::_zero_blocks = NULL;
|
||||
bool StubRoutines::aarch64::_completed = false;
|
||||
|
||||
/**
|
||||
* crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
|
||||
|
@ -61,7 +61,8 @@ class aarch64 {
|
||||
static address _double_sign_mask;
|
||||
static address _double_sign_flip;
|
||||
|
||||
static address _zero_longs;
|
||||
static address _zero_blocks;
|
||||
static bool _completed;
|
||||
|
||||
public:
|
||||
|
||||
@ -115,12 +116,19 @@ class aarch64 {
|
||||
return _double_sign_flip;
|
||||
}
|
||||
|
||||
static address get_zero_longs()
|
||||
{
|
||||
return _zero_longs;
|
||||
static address zero_blocks() {
|
||||
return _zero_blocks;
|
||||
}
|
||||
|
||||
private:
|
||||
static bool complete() {
|
||||
return _completed;
|
||||
}
|
||||
|
||||
static void set_completed() {
|
||||
_completed = true;
|
||||
}
|
||||
|
||||
private:
|
||||
static juint _crc_table[];
|
||||
|
||||
};
|
||||
|
Loading…
x
Reference in New Issue
Block a user