8150082: aarch64: optimise small array copy

Reviewed-by: aph
This commit is contained in:
Ed Nevill 2016-02-20 15:11:42 +00:00
parent f4b4c5d7b0
commit af0f23a617

View File

@ -729,7 +729,7 @@ class StubGenerator: public StubCodeGenerator {
//
// count is a count of words.
//
// Precondition: count >= 2
// Precondition: count >= 8
//
// Postconditions:
//
@ -750,7 +750,7 @@ class StubGenerator: public StubCodeGenerator {
assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
assert_different_registers(s, d, count, rscratch1);
Label again, large, small;
Label again, drain;
const char *stub_name;
if (direction == copy_forwards)
stub_name = "foward_copy_longs";
@ -759,51 +759,21 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", stub_name);
__ align(CodeEntryAlignment);
__ bind(start);
__ cmp(count, 8);
__ br(Assembler::LO, small);
if (direction == copy_forwards) {
__ sub(s, s, 2 * wordSize);
__ sub(d, d, 2 * wordSize);
}
__ subs(count, count, 16);
__ br(Assembler::GE, large);
// 8 <= count < 16 words. Copy 8.
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
__ stp(t0, t1, Address(d, 2 * unit));
__ stp(t2, t3, Address(d, 4 * unit));
__ stp(t4, t5, Address(d, 6 * unit));
__ stp(t6, t7, Address(__ pre(d, 8 * unit)));
if (direction == copy_forwards) {
__ add(s, s, 2 * wordSize);
__ add(d, d, 2 * wordSize);
}
#ifdef ASSERT
// Make sure we are never given < 8 words
{
Label L1, L2;
__ bind(small);
__ tbz(count, exact_log2(4), L1);
__ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
__ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
__ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
__ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
__ bind(L1);
__ tbz(count, 1, L2);
__ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
__ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
__ bind(L2);
Label L;
__ cmp(count, 8);
__ br(Assembler::GE, L);
__ stop("genrate_copy_longs called with < 8 words");
__ bind(L);
}
__ ret(lr);
__ align(CodeEntryAlignment);
__ bind(large);
#endif
// Fill 8 registers
__ ldp(t0, t1, Address(s, 2 * unit));
@ -811,6 +781,9 @@ class StubGenerator: public StubCodeGenerator {
__ ldp(t4, t5, Address(s, 6 * unit));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
__ subs(count, count, 16);
__ br(Assembler::LO, drain);
int prefetch = PrefetchCopyIntervalInBytes;
bool use_stride = false;
if (direction == copy_backwards) {
@ -837,6 +810,7 @@ class StubGenerator: public StubCodeGenerator {
__ br(Assembler::HS, again);
// Drain
__ bind(drain);
__ stp(t0, t1, Address(d, 2 * unit));
__ stp(t2, t3, Address(d, 4 * unit));
__ stp(t4, t5, Address(d, 6 * unit));
@ -931,16 +905,119 @@ class StubGenerator: public StubCodeGenerator {
int granularity = uabs(step);
const Register t0 = r3, t1 = r4;
// <= 80 bytes do inline. Direction doesn't matter because we always
// load all the data before writing anything
Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
const Register send = r17, dend = r18;
if (PrefetchCopyIntervalInBytes > 0)
__ prfm(Address(s, 0), PLDL1KEEP);
__ cmp(count, 80/granularity);
__ br(Assembler::HI, copy_big);
__ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
__ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
__ cmp(count, 16/granularity);
__ br(Assembler::LS, copy16);
__ cmp(count, 64/granularity);
__ br(Assembler::HI, copy80);
__ cmp(count, 32/granularity);
__ br(Assembler::LS, copy32);
// 33..64 bytes
__ ldp(t0, t1, Address(s, 0));
__ ldp(t2, t3, Address(s, 16));
__ ldp(t4, t5, Address(send, -32));
__ ldp(t6, t7, Address(send, -16));
__ stp(t0, t1, Address(d, 0));
__ stp(t2, t3, Address(d, 16));
__ stp(t4, t5, Address(dend, -32));
__ stp(t6, t7, Address(dend, -16));
__ b(finish);
// 17..32 bytes
__ bind(copy32);
__ ldp(t0, t1, Address(s, 0));
__ ldp(t2, t3, Address(send, -16));
__ stp(t0, t1, Address(d, 0));
__ stp(t2, t3, Address(dend, -16));
__ b(finish);
// 65..80 bytes
__ bind(copy80);
__ ldp(t0, t1, Address(s, 0));
__ ldp(t2, t3, Address(s, 16));
__ ldp(t4, t5, Address(s, 32));
__ ldp(t6, t7, Address(s, 48));
__ ldp(t8, t9, Address(send, -16));
__ stp(t0, t1, Address(d, 0));
__ stp(t2, t3, Address(d, 16));
__ stp(t4, t5, Address(d, 32));
__ stp(t6, t7, Address(d, 48));
__ stp(t8, t9, Address(dend, -16));
__ b(finish);
// 0..16 bytes
__ bind(copy16);
__ cmp(count, 8/granularity);
__ br(Assembler::LO, copy8);
// 8..16 bytes
__ ldr(t0, Address(s, 0));
__ ldr(t1, Address(send, -8));
__ str(t0, Address(d, 0));
__ str(t1, Address(dend, -8));
__ b(finish);
if (granularity < 8) {
// 4..7 bytes
__ bind(copy8);
__ tbz(count, 2 - exact_log2(granularity), copy4);
__ ldrw(t0, Address(s, 0));
__ ldrw(t1, Address(send, -4));
__ strw(t0, Address(d, 0));
__ strw(t1, Address(dend, -4));
__ b(finish);
if (granularity < 4) {
// 0..3 bytes
__ bind(copy4);
__ cbz(count, finish); // get rid of 0 case
if (granularity == 2) {
__ ldrh(t0, Address(s, 0));
__ strh(t0, Address(d, 0));
} else { // granularity == 1
// Now 1..3 bytes. Handle the 1 and 2 byte case by copying
// the first and last byte.
// Handle the 3 byte case by loading and storing base + count/2
// (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
// This does mean that in the 1 byte case we load/store the same
// byte 3 times.
__ lsr(count, count, 1);
__ ldrb(t0, Address(s, 0));
__ ldrb(t1, Address(send, -1));
__ ldrb(t2, Address(s, count));
__ strb(t0, Address(d, 0));
__ strb(t1, Address(dend, -1));
__ strb(t2, Address(d, count));
}
__ b(finish);
}
}
__ bind(copy_big);
if (is_backwards) {
__ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
__ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
}
Label tail;
__ cmp(count, 16/granularity);
__ br(Assembler::LO, tail);
// Now we've got the small case out of the way we can align the
// source address on a 2-word boundary.
@ -986,8 +1063,6 @@ class StubGenerator: public StubCodeGenerator {
#endif
}
__ cmp(count, 16/granularity);
__ br(Assembler::LT, tail);
__ bind(aligned);
// s is now 2-word-aligned.
@ -1001,9 +1076,11 @@ class StubGenerator: public StubCodeGenerator {
__ bl(copy_b);
// And the tail.
__ bind(tail);
copy_memory_small(s, d, count, tmp, step);
if (granularity >= 8) __ bind(copy8);
if (granularity >= 4) __ bind(copy4);
__ bind(finish);
}