8150082: aarch64: optimise small array copy
Reviewed-by: aph
This commit is contained in:
parent
f4b4c5d7b0
commit
af0f23a617
@ -729,7 +729,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
//
|
||||
// count is a count of words.
|
||||
//
|
||||
// Precondition: count >= 2
|
||||
// Precondition: count >= 8
|
||||
//
|
||||
// Postconditions:
|
||||
//
|
||||
@ -750,7 +750,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
|
||||
assert_different_registers(s, d, count, rscratch1);
|
||||
|
||||
Label again, large, small;
|
||||
Label again, drain;
|
||||
const char *stub_name;
|
||||
if (direction == copy_forwards)
|
||||
stub_name = "foward_copy_longs";
|
||||
@ -759,51 +759,21 @@ class StubGenerator: public StubCodeGenerator {
|
||||
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||
__ align(CodeEntryAlignment);
|
||||
__ bind(start);
|
||||
__ cmp(count, 8);
|
||||
__ br(Assembler::LO, small);
|
||||
if (direction == copy_forwards) {
|
||||
__ sub(s, s, 2 * wordSize);
|
||||
__ sub(d, d, 2 * wordSize);
|
||||
}
|
||||
__ subs(count, count, 16);
|
||||
__ br(Assembler::GE, large);
|
||||
|
||||
// 8 <= count < 16 words. Copy 8.
|
||||
__ ldp(t0, t1, Address(s, 2 * unit));
|
||||
__ ldp(t2, t3, Address(s, 4 * unit));
|
||||
__ ldp(t4, t5, Address(s, 6 * unit));
|
||||
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
|
||||
|
||||
__ stp(t0, t1, Address(d, 2 * unit));
|
||||
__ stp(t2, t3, Address(d, 4 * unit));
|
||||
__ stp(t4, t5, Address(d, 6 * unit));
|
||||
__ stp(t6, t7, Address(__ pre(d, 8 * unit)));
|
||||
|
||||
if (direction == copy_forwards) {
|
||||
__ add(s, s, 2 * wordSize);
|
||||
__ add(d, d, 2 * wordSize);
|
||||
}
|
||||
|
||||
#ifdef ASSERT
|
||||
// Make sure we are never given < 8 words
|
||||
{
|
||||
Label L1, L2;
|
||||
__ bind(small);
|
||||
__ tbz(count, exact_log2(4), L1);
|
||||
__ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
|
||||
__ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
|
||||
__ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
|
||||
__ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
|
||||
__ bind(L1);
|
||||
|
||||
__ tbz(count, 1, L2);
|
||||
__ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
|
||||
__ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
|
||||
__ bind(L2);
|
||||
Label L;
|
||||
__ cmp(count, 8);
|
||||
__ br(Assembler::GE, L);
|
||||
__ stop("genrate_copy_longs called with < 8 words");
|
||||
__ bind(L);
|
||||
}
|
||||
|
||||
__ ret(lr);
|
||||
|
||||
__ align(CodeEntryAlignment);
|
||||
__ bind(large);
|
||||
#endif
|
||||
|
||||
// Fill 8 registers
|
||||
__ ldp(t0, t1, Address(s, 2 * unit));
|
||||
@ -811,6 +781,9 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ ldp(t4, t5, Address(s, 6 * unit));
|
||||
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
|
||||
|
||||
__ subs(count, count, 16);
|
||||
__ br(Assembler::LO, drain);
|
||||
|
||||
int prefetch = PrefetchCopyIntervalInBytes;
|
||||
bool use_stride = false;
|
||||
if (direction == copy_backwards) {
|
||||
@ -837,6 +810,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ br(Assembler::HS, again);
|
||||
|
||||
// Drain
|
||||
__ bind(drain);
|
||||
__ stp(t0, t1, Address(d, 2 * unit));
|
||||
__ stp(t2, t3, Address(d, 4 * unit));
|
||||
__ stp(t4, t5, Address(d, 6 * unit));
|
||||
@ -931,16 +905,119 @@ class StubGenerator: public StubCodeGenerator {
|
||||
int granularity = uabs(step);
|
||||
const Register t0 = r3, t1 = r4;
|
||||
|
||||
// <= 80 bytes do inline. Direction doesn't matter because we always
|
||||
// load all the data before writing anything
|
||||
Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
|
||||
const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
|
||||
const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
|
||||
const Register send = r17, dend = r18;
|
||||
|
||||
if (PrefetchCopyIntervalInBytes > 0)
|
||||
__ prfm(Address(s, 0), PLDL1KEEP);
|
||||
|
||||
__ cmp(count, 80/granularity);
|
||||
__ br(Assembler::HI, copy_big);
|
||||
|
||||
__ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
|
||||
__ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
|
||||
|
||||
__ cmp(count, 16/granularity);
|
||||
__ br(Assembler::LS, copy16);
|
||||
|
||||
__ cmp(count, 64/granularity);
|
||||
__ br(Assembler::HI, copy80);
|
||||
|
||||
__ cmp(count, 32/granularity);
|
||||
__ br(Assembler::LS, copy32);
|
||||
|
||||
// 33..64 bytes
|
||||
__ ldp(t0, t1, Address(s, 0));
|
||||
__ ldp(t2, t3, Address(s, 16));
|
||||
__ ldp(t4, t5, Address(send, -32));
|
||||
__ ldp(t6, t7, Address(send, -16));
|
||||
|
||||
__ stp(t0, t1, Address(d, 0));
|
||||
__ stp(t2, t3, Address(d, 16));
|
||||
__ stp(t4, t5, Address(dend, -32));
|
||||
__ stp(t6, t7, Address(dend, -16));
|
||||
__ b(finish);
|
||||
|
||||
// 17..32 bytes
|
||||
__ bind(copy32);
|
||||
__ ldp(t0, t1, Address(s, 0));
|
||||
__ ldp(t2, t3, Address(send, -16));
|
||||
__ stp(t0, t1, Address(d, 0));
|
||||
__ stp(t2, t3, Address(dend, -16));
|
||||
__ b(finish);
|
||||
|
||||
// 65..80 bytes
|
||||
__ bind(copy80);
|
||||
__ ldp(t0, t1, Address(s, 0));
|
||||
__ ldp(t2, t3, Address(s, 16));
|
||||
__ ldp(t4, t5, Address(s, 32));
|
||||
__ ldp(t6, t7, Address(s, 48));
|
||||
__ ldp(t8, t9, Address(send, -16));
|
||||
|
||||
__ stp(t0, t1, Address(d, 0));
|
||||
__ stp(t2, t3, Address(d, 16));
|
||||
__ stp(t4, t5, Address(d, 32));
|
||||
__ stp(t6, t7, Address(d, 48));
|
||||
__ stp(t8, t9, Address(dend, -16));
|
||||
__ b(finish);
|
||||
|
||||
// 0..16 bytes
|
||||
__ bind(copy16);
|
||||
__ cmp(count, 8/granularity);
|
||||
__ br(Assembler::LO, copy8);
|
||||
|
||||
// 8..16 bytes
|
||||
__ ldr(t0, Address(s, 0));
|
||||
__ ldr(t1, Address(send, -8));
|
||||
__ str(t0, Address(d, 0));
|
||||
__ str(t1, Address(dend, -8));
|
||||
__ b(finish);
|
||||
|
||||
if (granularity < 8) {
|
||||
// 4..7 bytes
|
||||
__ bind(copy8);
|
||||
__ tbz(count, 2 - exact_log2(granularity), copy4);
|
||||
__ ldrw(t0, Address(s, 0));
|
||||
__ ldrw(t1, Address(send, -4));
|
||||
__ strw(t0, Address(d, 0));
|
||||
__ strw(t1, Address(dend, -4));
|
||||
__ b(finish);
|
||||
if (granularity < 4) {
|
||||
// 0..3 bytes
|
||||
__ bind(copy4);
|
||||
__ cbz(count, finish); // get rid of 0 case
|
||||
if (granularity == 2) {
|
||||
__ ldrh(t0, Address(s, 0));
|
||||
__ strh(t0, Address(d, 0));
|
||||
} else { // granularity == 1
|
||||
// Now 1..3 bytes. Handle the 1 and 2 byte case by copying
|
||||
// the first and last byte.
|
||||
// Handle the 3 byte case by loading and storing base + count/2
|
||||
// (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
|
||||
// This does mean that in the 1 byte case we load/store the same
|
||||
// byte 3 times.
|
||||
__ lsr(count, count, 1);
|
||||
__ ldrb(t0, Address(s, 0));
|
||||
__ ldrb(t1, Address(send, -1));
|
||||
__ ldrb(t2, Address(s, count));
|
||||
__ strb(t0, Address(d, 0));
|
||||
__ strb(t1, Address(dend, -1));
|
||||
__ strb(t2, Address(d, count));
|
||||
}
|
||||
__ b(finish);
|
||||
}
|
||||
}
|
||||
|
||||
__ bind(copy_big);
|
||||
if (is_backwards) {
|
||||
__ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
|
||||
__ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
|
||||
}
|
||||
|
||||
Label tail;
|
||||
|
||||
__ cmp(count, 16/granularity);
|
||||
__ br(Assembler::LO, tail);
|
||||
|
||||
// Now we've got the small case out of the way we can align the
|
||||
// source address on a 2-word boundary.
|
||||
|
||||
@ -986,8 +1063,6 @@ class StubGenerator: public StubCodeGenerator {
|
||||
#endif
|
||||
}
|
||||
|
||||
__ cmp(count, 16/granularity);
|
||||
__ br(Assembler::LT, tail);
|
||||
__ bind(aligned);
|
||||
|
||||
// s is now 2-word-aligned.
|
||||
@ -1001,9 +1076,11 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ bl(copy_b);
|
||||
|
||||
// And the tail.
|
||||
|
||||
__ bind(tail);
|
||||
copy_memory_small(s, d, count, tmp, step);
|
||||
|
||||
if (granularity >= 8) __ bind(copy8);
|
||||
if (granularity >= 4) __ bind(copy4);
|
||||
__ bind(finish);
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user