8187472: AARCH64: array_equals intrinsic doesn't use prefetch for large arrays
Reviewed-by: dsamersoff
This commit is contained in:
parent
bf8a34b7a9
commit
0fdec9c25e
@ -16167,9 +16167,8 @@ instruct string_equalsL(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
|
||||
format %{ "String Equals $str1,$str2,$cnt -> $result" %}
|
||||
ins_encode %{
|
||||
// Count is in 8-bit bytes; non-Compact chars are 16 bits.
|
||||
__ arrays_equals($str1$$Register, $str2$$Register,
|
||||
$result$$Register, $cnt$$Register,
|
||||
1, /*is_string*/true);
|
||||
__ string_equals($str1$$Register, $str2$$Register,
|
||||
$result$$Register, $cnt$$Register, 1);
|
||||
%}
|
||||
ins_pipe(pipe_class_memory);
|
||||
%}
|
||||
@ -16184,42 +16183,42 @@ instruct string_equalsU(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
|
||||
format %{ "String Equals $str1,$str2,$cnt -> $result" %}
|
||||
ins_encode %{
|
||||
// Count is in 8-bit bytes; non-Compact chars are 16 bits.
|
||||
__ asrw($cnt$$Register, $cnt$$Register, 1);
|
||||
__ arrays_equals($str1$$Register, $str2$$Register,
|
||||
$result$$Register, $cnt$$Register,
|
||||
2, /*is_string*/true);
|
||||
__ string_equals($str1$$Register, $str2$$Register,
|
||||
$result$$Register, $cnt$$Register, 2);
|
||||
%}
|
||||
ins_pipe(pipe_class_memory);
|
||||
%}
|
||||
|
||||
instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
|
||||
iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3,
|
||||
iRegP_R10 tmp, rFlagsReg cr)
|
||||
%{
|
||||
predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL);
|
||||
match(Set result (AryEq ary1 ary2));
|
||||
effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, KILL cr);
|
||||
effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr);
|
||||
|
||||
format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %}
|
||||
ins_encode %{
|
||||
__ arrays_equals($ary1$$Register, $ary2$$Register,
|
||||
$result$$Register, $tmp$$Register,
|
||||
1, /*is_string*/false);
|
||||
$tmp1$$Register, $tmp2$$Register, $tmp3$$Register,
|
||||
$result$$Register, $tmp$$Register, 1);
|
||||
%}
|
||||
ins_pipe(pipe_class_memory);
|
||||
%}
|
||||
|
||||
instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
|
||||
iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3,
|
||||
iRegP_R10 tmp, rFlagsReg cr)
|
||||
%{
|
||||
predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU);
|
||||
match(Set result (AryEq ary1 ary2));
|
||||
effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, KILL cr);
|
||||
effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr);
|
||||
|
||||
format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %}
|
||||
ins_encode %{
|
||||
__ arrays_equals($ary1$$Register, $ary2$$Register,
|
||||
$result$$Register, $tmp$$Register,
|
||||
2, /*is_string*/false);
|
||||
$tmp1$$Register, $tmp2$$Register, $tmp3$$Register,
|
||||
$result$$Register, $tmp$$Register, 2);
|
||||
%}
|
||||
ins_pipe(pipe_class_memory);
|
||||
%}
|
||||
|
@ -147,6 +147,10 @@ define_pd_global(intx, InlineSmallCode, 1000);
|
||||
"Use CRC32 instructions for CRC32 computation") \
|
||||
product(bool, UseSIMDForMemoryOps, false, \
|
||||
"Use SIMD instructions in generated memory move code") \
|
||||
product(bool, UseSIMDForArrayEquals, true, \
|
||||
"Use SIMD instructions in generated array equals code") \
|
||||
product(bool, UseSimpleArrayEquals, false, \
|
||||
"Use simpliest and shortest implementation for array equals") \
|
||||
product(bool, AvoidUnalignedAccesses, false, \
|
||||
"Avoid generating unaligned memory accesses") \
|
||||
product(bool, UseLSE, false, \
|
||||
|
@ -5182,28 +5182,11 @@ void MacroAssembler::has_negatives(Register ary1, Register len, Register result)
|
||||
BIND(DONE);
|
||||
}
|
||||
|
||||
// Compare Strings or char/byte arrays.
|
||||
|
||||
// is_string is true iff this is a string comparison.
|
||||
|
||||
// For Strings we're passed the address of the first characters in a1
|
||||
// and a2 and the length in cnt1.
|
||||
|
||||
// For byte and char arrays we're passed the arrays themselves and we
|
||||
// have to extract length fields and do null checks here.
|
||||
|
||||
// elem_size is the element size in bytes: either 1 or 2.
|
||||
|
||||
// There are two implementations. For arrays >= 8 bytes, all
|
||||
// comparisons (including the final one, which may overlap) are
|
||||
// performed 8 bytes at a time. For arrays < 8 bytes, we compare a
|
||||
// halfword, then a short, and then a byte.
|
||||
|
||||
void MacroAssembler::arrays_equals(Register a1, Register a2,
|
||||
Register result, Register cnt1,
|
||||
int elem_size, bool is_string)
|
||||
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
|
||||
Register tmp4, Register tmp5, Register result,
|
||||
Register cnt1, int elem_size)
|
||||
{
|
||||
Label SAME, DONE, SHORT, NEXT_WORD, ONE;
|
||||
Label DONE;
|
||||
Register tmp1 = rscratch1;
|
||||
Register tmp2 = rscratch2;
|
||||
Register cnt2 = tmp2; // cnt2 only used in array length compare
|
||||
@ -5212,6 +5195,7 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
|
||||
int length_offset = arrayOopDesc::length_offset_in_bytes();
|
||||
int base_offset
|
||||
= arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
|
||||
int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
|
||||
|
||||
assert(elem_size == 1 || elem_size == 2, "must be char or byte");
|
||||
assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
|
||||
@ -5220,35 +5204,33 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
|
||||
{
|
||||
const char kind = (elem_size == 2) ? 'U' : 'L';
|
||||
char comment[64];
|
||||
snprintf(comment, sizeof comment, "%s%c%s {",
|
||||
is_string ? "string_equals" : "array_equals",
|
||||
kind, "{");
|
||||
snprintf(comment, sizeof comment, "array_equals%c{", kind);
|
||||
BLOCK_COMMENT(comment);
|
||||
}
|
||||
#endif
|
||||
|
||||
mov(result, false);
|
||||
|
||||
if (!is_string) {
|
||||
// if (a==a2)
|
||||
if (UseSimpleArrayEquals) {
|
||||
Label NEXT_WORD, SHORT, SAME, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
|
||||
// if (a1==a2)
|
||||
// return true;
|
||||
eor(rscratch1, a1, a2);
|
||||
cbz(rscratch1, SAME);
|
||||
// if (a==null || a2==null)
|
||||
// return false;
|
||||
cbz(a1, DONE);
|
||||
cbz(a2, DONE);
|
||||
// a1 & a2 == 0 means (some-pointer is null) or
|
||||
// (very-rare-or-even-probably-impossible-pointer-values)
|
||||
// so, we can save one branch in most cases
|
||||
eor(rscratch1, a1, a2);
|
||||
tst(a1, a2);
|
||||
mov(result, false);
|
||||
cbz(rscratch1, SAME);
|
||||
br(EQ, A_MIGHT_BE_NULL);
|
||||
// if (a1.length != a2.length)
|
||||
// return false;
|
||||
bind(A_IS_NOT_NULL);
|
||||
ldrw(cnt1, Address(a1, length_offset));
|
||||
ldrw(cnt2, Address(a2, length_offset));
|
||||
eorw(tmp1, cnt1, cnt2);
|
||||
cbnzw(tmp1, DONE);
|
||||
|
||||
eorw(tmp5, cnt1, cnt2);
|
||||
cbnzw(tmp5, DONE);
|
||||
lea(a1, Address(a1, base_offset));
|
||||
lea(a2, Address(a2, base_offset));
|
||||
}
|
||||
|
||||
// Check for short strings, i.e. smaller than wordSize.
|
||||
subs(cnt1, cnt1, elem_per_word);
|
||||
br(Assembler::LT, SHORT);
|
||||
@ -5257,8 +5239,8 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
|
||||
ldr(tmp1, Address(post(a1, wordSize)));
|
||||
ldr(tmp2, Address(post(a2, wordSize)));
|
||||
subs(cnt1, cnt1, elem_per_word);
|
||||
eor(tmp1, tmp1, tmp2);
|
||||
cbnz(tmp1, DONE);
|
||||
eor(tmp5, tmp1, tmp2);
|
||||
cbnz(tmp5, DONE);
|
||||
} br(GT, NEXT_WORD);
|
||||
// Last longword. In the case where length == 4 we compare the
|
||||
// same longword twice, but that's still faster than another
|
||||
@ -5267,16 +5249,202 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
|
||||
// length == 4.
|
||||
if (log_elem_size > 0)
|
||||
lsl(cnt1, cnt1, log_elem_size);
|
||||
ldr(tmp1, Address(a1, cnt1));
|
||||
ldr(tmp2, Address(a2, cnt1));
|
||||
ldr(tmp3, Address(a1, cnt1));
|
||||
ldr(tmp4, Address(a2, cnt1));
|
||||
eor(tmp5, tmp3, tmp4);
|
||||
cbnz(tmp5, DONE);
|
||||
b(SAME);
|
||||
bind(A_MIGHT_BE_NULL);
|
||||
// in case both a1 and a2 are not-null, proceed with loads
|
||||
cbz(a1, DONE);
|
||||
cbz(a2, DONE);
|
||||
b(A_IS_NOT_NULL);
|
||||
bind(SHORT);
|
||||
|
||||
tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
|
||||
{
|
||||
ldrw(tmp1, Address(post(a1, 4)));
|
||||
ldrw(tmp2, Address(post(a2, 4)));
|
||||
eorw(tmp5, tmp1, tmp2);
|
||||
cbnzw(tmp5, DONE);
|
||||
}
|
||||
bind(TAIL03);
|
||||
tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
|
||||
{
|
||||
ldrh(tmp3, Address(post(a1, 2)));
|
||||
ldrh(tmp4, Address(post(a2, 2)));
|
||||
eorw(tmp5, tmp3, tmp4);
|
||||
cbnzw(tmp5, DONE);
|
||||
}
|
||||
bind(TAIL01);
|
||||
if (elem_size == 1) { // Only needed when comparing byte arrays.
|
||||
tbz(cnt1, 0, SAME); // 0-1 bytes left.
|
||||
{
|
||||
ldrb(tmp1, a1);
|
||||
ldrb(tmp2, a2);
|
||||
eorw(tmp5, tmp1, tmp2);
|
||||
cbnzw(tmp5, DONE);
|
||||
}
|
||||
}
|
||||
bind(SAME);
|
||||
mov(result, true);
|
||||
} else {
|
||||
Label NEXT_DWORD, A_IS_NULL, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
|
||||
CSET_EQ, LAST_CHECK, LEN_IS_ZERO, SAME;
|
||||
cbz(a1, A_IS_NULL);
|
||||
ldrw(cnt1, Address(a1, length_offset));
|
||||
cbz(a2, A_IS_NULL);
|
||||
ldrw(cnt2, Address(a2, length_offset));
|
||||
mov(result, false);
|
||||
// on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
|
||||
// faster to perform another branch before comparing a1 and a2
|
||||
cmp(cnt1, elem_per_word);
|
||||
br(LE, SHORT); // short or same
|
||||
cmp(a1, a2);
|
||||
br(EQ, SAME);
|
||||
ldr(tmp3, Address(pre(a1, base_offset)));
|
||||
cmp(cnt1, stubBytesThreshold);
|
||||
br(GE, STUB);
|
||||
ldr(tmp4, Address(pre(a2, base_offset)));
|
||||
sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
|
||||
cmp(cnt2, cnt1);
|
||||
br(NE, DONE);
|
||||
|
||||
// Main 16 byte comparison loop with 2 exits
|
||||
bind(NEXT_DWORD); {
|
||||
ldr(tmp1, Address(pre(a1, wordSize)));
|
||||
ldr(tmp2, Address(pre(a2, wordSize)));
|
||||
subs(cnt1, cnt1, 2 * elem_per_word);
|
||||
br(LE, TAIL);
|
||||
eor(tmp4, tmp3, tmp4);
|
||||
cbnz(tmp4, DONE);
|
||||
ldr(tmp3, Address(pre(a1, wordSize)));
|
||||
ldr(tmp4, Address(pre(a2, wordSize)));
|
||||
cmp(cnt1, elem_per_word);
|
||||
br(LE, TAIL2);
|
||||
cmp(tmp1, tmp2);
|
||||
} br(EQ, NEXT_DWORD);
|
||||
b(DONE);
|
||||
|
||||
bind(TAIL);
|
||||
eor(tmp4, tmp3, tmp4);
|
||||
eor(tmp2, tmp1, tmp2);
|
||||
lslv(tmp2, tmp2, tmp5);
|
||||
orr(tmp5, tmp4, tmp2);
|
||||
cmp(tmp5, zr);
|
||||
b(CSET_EQ);
|
||||
|
||||
bind(TAIL2);
|
||||
eor(tmp2, tmp1, tmp2);
|
||||
cbnz(tmp2, DONE);
|
||||
b(LAST_CHECK);
|
||||
|
||||
bind(STUB);
|
||||
ldr(tmp4, Address(pre(a2, base_offset)));
|
||||
cmp(cnt2, cnt1);
|
||||
br(NE, DONE);
|
||||
if (elem_size == 2) { // convert to byte counter
|
||||
lsl(cnt1, cnt1, 1);
|
||||
}
|
||||
eor(tmp5, tmp3, tmp4);
|
||||
cbnz(tmp5, DONE);
|
||||
RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
|
||||
assert(stub.target() != NULL, "array_equals_long stub has not been generated");
|
||||
trampoline_call(stub);
|
||||
b(DONE);
|
||||
|
||||
bind(SAME);
|
||||
mov(result, true);
|
||||
b(DONE);
|
||||
bind(A_IS_NULL);
|
||||
// a1 or a2 is null. if a1 == a2 (i.e. both are null) then return true, else return false
|
||||
cmp(a1, a2);
|
||||
b(CSET_EQ);
|
||||
bind(EARLY_OUT);
|
||||
// (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
|
||||
// so, if a2 == null => return false(0), else return true, so we can return a2
|
||||
mov(result, a2);
|
||||
b(DONE);
|
||||
bind(LEN_IS_ZERO);
|
||||
cmp(cnt2, zr);
|
||||
b(CSET_EQ);
|
||||
bind(SHORT);
|
||||
cbz(cnt1, LEN_IS_ZERO);
|
||||
sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
|
||||
ldr(tmp3, Address(a1, base_offset));
|
||||
ldr(tmp4, Address(a2, base_offset));
|
||||
bind(LAST_CHECK);
|
||||
eor(tmp4, tmp3, tmp4);
|
||||
lslv(tmp5, tmp4, tmp5);
|
||||
cmp(tmp5, zr);
|
||||
bind(CSET_EQ);
|
||||
cset(result, EQ);
|
||||
}
|
||||
|
||||
// That's it.
|
||||
bind(DONE);
|
||||
|
||||
BLOCK_COMMENT("} array_equals");
|
||||
}
|
||||
|
||||
// Compare Strings
|
||||
|
||||
// For Strings we're passed the address of the first characters in a1
|
||||
// and a2 and the length in cnt1.
|
||||
// elem_size is the element size in bytes: either 1 or 2.
|
||||
// There are two implementations. For arrays >= 8 bytes, all
|
||||
// comparisons (including the final one, which may overlap) are
|
||||
// performed 8 bytes at a time. For strings < 8 bytes, we compare a
|
||||
// word, then a halfword, and then a byte.
|
||||
|
||||
void MacroAssembler::string_equals(Register a1, Register a2,
|
||||
Register result, Register cnt1, int elem_size)
|
||||
{
|
||||
Label SAME, DONE, SHORT, NEXT_WORD;
|
||||
Register tmp1 = rscratch1;
|
||||
Register tmp2 = rscratch2;
|
||||
Register cnt2 = tmp2; // cnt2 only used in array length compare
|
||||
|
||||
assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
|
||||
assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
|
||||
|
||||
#ifndef PRODUCT
|
||||
{
|
||||
const char kind = (elem_size == 2) ? 'U' : 'L';
|
||||
char comment[64];
|
||||
snprintf(comment, sizeof comment, "{string_equals%c", kind);
|
||||
BLOCK_COMMENT(comment);
|
||||
}
|
||||
#endif
|
||||
|
||||
mov(result, false);
|
||||
|
||||
// Check for short strings, i.e. smaller than wordSize.
|
||||
subs(cnt1, cnt1, wordSize);
|
||||
br(Assembler::LT, SHORT);
|
||||
// Main 8 byte comparison loop.
|
||||
bind(NEXT_WORD); {
|
||||
ldr(tmp1, Address(post(a1, wordSize)));
|
||||
ldr(tmp2, Address(post(a2, wordSize)));
|
||||
subs(cnt1, cnt1, wordSize);
|
||||
eor(tmp1, tmp1, tmp2);
|
||||
cbnz(tmp1, DONE);
|
||||
} br(GT, NEXT_WORD);
|
||||
// Last longword. In the case where length == 4 we compare the
|
||||
// same longword twice, but that's still faster than another
|
||||
// conditional branch.
|
||||
// cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
|
||||
// length == 4.
|
||||
ldr(tmp1, Address(a1, cnt1));
|
||||
ldr(tmp2, Address(a2, cnt1));
|
||||
eor(tmp2, tmp1, tmp2);
|
||||
cbnz(tmp2, DONE);
|
||||
b(SAME);
|
||||
|
||||
bind(SHORT);
|
||||
Label TAIL03, TAIL01;
|
||||
|
||||
tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
|
||||
tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
|
||||
{
|
||||
ldrw(tmp1, Address(post(a1, 4)));
|
||||
ldrw(tmp2, Address(post(a2, 4)));
|
||||
@ -5284,7 +5452,7 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
|
||||
cbnzw(tmp1, DONE);
|
||||
}
|
||||
bind(TAIL03);
|
||||
tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
|
||||
tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
|
||||
{
|
||||
ldrh(tmp1, Address(post(a1, 2)));
|
||||
ldrh(tmp2, Address(post(a2, 2)));
|
||||
@ -5292,7 +5460,7 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
|
||||
cbnzw(tmp1, DONE);
|
||||
}
|
||||
bind(TAIL01);
|
||||
if (elem_size == 1) { // Only needed when comparing byte arrays.
|
||||
if (elem_size == 1) { // Only needed when comparing 1-byte elements
|
||||
tbz(cnt1, 0, SAME); // 0-1 bytes left.
|
||||
{
|
||||
ldrb(tmp1, a1);
|
||||
@ -5307,7 +5475,7 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
|
||||
|
||||
// That's it.
|
||||
bind(DONE);
|
||||
BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
|
||||
BLOCK_COMMENT("} string_equals");
|
||||
}
|
||||
|
||||
|
||||
|
@ -1225,9 +1225,11 @@ public:
|
||||
|
||||
void has_negatives(Register ary1, Register len, Register result);
|
||||
|
||||
void arrays_equals(Register a1, Register a2,
|
||||
Register result, Register cnt1,
|
||||
int elem_size, bool is_string);
|
||||
void arrays_equals(Register a1, Register a2, Register result, Register cnt1,
|
||||
Register tmp1, Register tmp2, Register tmp3, int elem_size);
|
||||
|
||||
void string_equals(Register a1, Register a2, Register result, Register cnt1,
|
||||
int elem_size);
|
||||
|
||||
void fill_words(Register base, Register cnt, Register value);
|
||||
void zero_words(Register base, u_int64_t cnt);
|
||||
|
@ -3813,6 +3813,182 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ ret(lr);
|
||||
return entry;
|
||||
}
|
||||
|
||||
// Emits the scalar (ldp-based) main comparison loop for the
// large_array_equals stub.
//
// Every iteration issues four ldp loads per array (8 * wordSize bytes from
// a1 and from a2), XORs the corresponding words and branches to NOT_EQUAL
// as soon as a difference is found.
//
// The loop is software-pipelined: one register pair per array is loaded
// before LOOP, and each eor/orr/cbnz group checks data fetched by an earlier
// ldp. Consequently one pair is still unchecked when the loop exits; it is
// compared in the "post-loop" part, which also subtracts its 2 * wordSize
// bytes from cnt1.
//
// loopThreshold - the loop repeats while cnt1, after being reduced by
//                 8 * wordSize, is still >= loopThreshold
// usePrefetch   - when true, prfm hints are emitted for both arrays at
//                 offset SoftwarePrefetchHintDistance from the current pointers
// NOT_EQUAL     - label branched to on the first mismatch
//
// Besides rscratch1/rscratch2 and r3-r5 this code overwrites r11-r13
// (tmp6-tmp8). 'result' is declared below but not referenced here.
void generate_large_array_equals_loop_nonsimd(int loopThreshold,
|
||||
bool usePrefetch, Label &NOT_EQUAL) {
|
||||
Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
|
||||
tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
|
||||
tmp7 = r12, tmp8 = r13;
|
||||
Label LOOP;
|
||||
|
||||
// Prime the pipeline: load the first pair of words from each array.
// These 2 * wordSize bytes are only subtracted from cnt1 in the post-loop.
__ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
|
||||
__ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
|
||||
__ bind(LOOP);
|
||||
if (usePrefetch) {
|
||||
__ prfm(Address(a1, SoftwarePrefetchHintDistance));
|
||||
__ prfm(Address(a2, SoftwarePrefetchHintDistance));
|
||||
}
|
||||
// Stage 1: load the next pair into tmp5..tmp8 while checking tmp1..tmp4.
__ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
|
||||
__ eor(tmp1, tmp1, tmp2);
|
||||
__ eor(tmp3, tmp3, tmp4);
|
||||
__ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
|
||||
__ orr(tmp1, tmp1, tmp3);
|
||||
__ cbnz(tmp1, NOT_EQUAL);
|
||||
// Stage 2: reload tmp1..tmp4 while checking tmp5..tmp8.
__ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
|
||||
__ eor(tmp5, tmp5, tmp6);
|
||||
__ eor(tmp7, tmp7, tmp8);
|
||||
__ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
|
||||
__ orr(tmp5, tmp5, tmp7);
|
||||
__ cbnz(tmp5, NOT_EQUAL);
|
||||
// Stage 3: same as stage 1.
__ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
|
||||
__ eor(tmp1, tmp1, tmp2);
|
||||
__ eor(tmp3, tmp3, tmp4);
|
||||
__ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
|
||||
__ orr(tmp1, tmp1, tmp3);
|
||||
__ cbnz(tmp1, NOT_EQUAL);
|
||||
// Stage 4: same as stage 2, interleaved with the loop bookkeeping.
__ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
|
||||
__ eor(tmp5, tmp5, tmp6);
|
||||
// Account for the 8 * wordSize bytes loaded per array in this iteration.
__ sub(cnt1, cnt1, 8 * wordSize);
|
||||
__ eor(tmp7, tmp7, tmp8);
|
||||
__ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
|
||||
// Sets the condition flags tested by br(GE) below; the orr and cbnz in
// between are not flag-setting instructions.
__ cmp(cnt1, loopThreshold);
|
||||
__ orr(tmp5, tmp5, tmp7);
|
||||
__ cbnz(tmp5, NOT_EQUAL);
|
||||
__ br(__ GE, LOOP);
|
||||
// post-loop: check the pair left pending in tmp1..tmp4 by the last
// iteration and subtract its 2 * wordSize bytes from cnt1.
|
||||
__ eor(tmp1, tmp1, tmp2);
|
||||
__ eor(tmp3, tmp3, tmp4);
|
||||
__ orr(tmp1, tmp1, tmp3);
|
||||
__ sub(cnt1, cnt1, 2 * wordSize);
|
||||
__ cbnz(tmp1, NOT_EQUAL);
|
||||
}
|
||||
|
||||
// Emits the SIMD main comparison loop for the large_array_equals stub.
//
// Every iteration loads 4 * 2 * wordSize bytes from a1 into v0-v3 and the
// same amount from a2 into v4-v7 (post-incrementing both pointers), XORs the
// vectors pairwise, ORs the four results together into v0 and branches to
// NOT_EQUAL if any bit of v0 is set.
//
// loopThreshold - the loop repeats while cnt1, after being reduced by
//                 8 * wordSize, is still >= loopThreshold
// usePrefetch   - when true, prfm hints are emitted for both arrays at
//                 offset SoftwarePrefetchHintDistance from the current pointers
// NOT_EQUAL     - label branched to on the first mismatch
//
// Overwrites v0-v7, rscratch1 and rscratch2. 'result' is declared below but
// not referenced here.
void generate_large_array_equals_loop_simd(int loopThreshold,
|
||||
bool usePrefetch, Label &NOT_EQUAL) {
|
||||
Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
|
||||
tmp2 = rscratch2;
|
||||
Label LOOP;
|
||||
|
||||
__ bind(LOOP);
|
||||
if (usePrefetch) {
|
||||
__ prfm(Address(a1, SoftwarePrefetchHintDistance));
|
||||
__ prfm(Address(a2, SoftwarePrefetchHintDistance));
|
||||
}
|
||||
__ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
|
||||
// Account for the bytes consumed by this iteration.
__ sub(cnt1, cnt1, 8 * wordSize);
|
||||
__ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
|
||||
// Sets the condition flags tested by br(GE) at the bottom of the loop;
// none of the instructions in between is a flag-setting one.
__ cmp(cnt1, loopThreshold);
|
||||
// v0..v3 = a1 data XOR a2 data; any non-zero bit marks a difference.
__ eor(v0, __ T16B, v0, v4);
|
||||
__ eor(v1, __ T16B, v1, v5);
|
||||
__ eor(v2, __ T16B, v2, v6);
|
||||
__ eor(v3, __ T16B, v3, v7);
|
||||
// OR-reduce the four difference vectors into v0.
__ orr(v0, __ T16B, v0, v1);
|
||||
__ orr(v1, __ T16B, v2, v3);
|
||||
__ orr(v0, __ T16B, v0, v1);
|
||||
// Move both 64-bit lanes of v0 to general registers and combine them so
// that a single cbnz can test the whole vector.
__ umov(tmp1, v0, __ D, 0);
|
||||
__ umov(tmp2, v0, __ D, 1);
|
||||
__ orr(tmp1, tmp1, tmp2);
|
||||
__ cbnz(tmp1, NOT_EQUAL);
|
||||
__ br(__ GE, LOOP);
|
||||
}
|
||||
|
||||
// Generates the out-of-line stub that compares two large arrays and returns
// the stub's entry address.
//
// Register contract on entry:
// a1 = r1 - array1 address
|
||||
// a2 = r2 - array2 address
|
||||
// result = r0 - return value. Already contains "false"
|
||||
// cnt1 = r10 - number of bytes to compare. The first wordSize bytes have
//              already been compared by the caller; the stub subtracts them
//              from cnt1 and steps a1/a2 over them before doing anything else.
|
||||
// r3-r5 are reserved temporary registers
|
||||
//
// In the non-SIMD configuration r11-r13 are additionally used; they are
// pushed before the main loop and popped again on the common exit path.
address generate_large_array_equals() {
|
||||
StubCodeMark mark(this, "StubRoutines", "large_array_equals");
|
||||
Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
|
||||
tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
|
||||
tmp7 = r12, tmp8 = r13;
|
||||
Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
|
||||
SMALL_LOOP, POST_LOOP;
|
||||
// The scalar loop keeps one ldp pair (16 bytes) in flight that is only
// subtracted from cnt1 after the loop, so its thresholds are 16 bytes higher.
const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
|
||||
// calculate if at least 32 prefetched bytes are used
|
||||
int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
|
||||
// One main-loop iteration consumes 64 bytes per array.
int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
|
||||
// NOTE(review): if SoftwarePrefetchHintDistance + 32 can be smaller than
// nonPrefetchLoopThreshold, the prefetching loop could begin another 64-byte
// iteration with fewer than 64 bytes left - confirm that flag validation
// rules out such small distances.
RegSet spilled_regs = RegSet::range(tmp6, tmp8);
|
||||
assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
|
||||
tmp5, tmp6, tmp7, tmp8);
|
||||
|
||||
__ align(CodeEntryAlignment);
|
||||
address entry = __ pc();
|
||||
__ enter();
|
||||
__ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
|
||||
// also advance pointers to use post-increment instead of pre-increment
|
||||
__ add(a1, a1, wordSize);
|
||||
__ add(a2, a2, wordSize);
|
||||
if (AvoidUnalignedAccesses) {
|
||||
// both implementations (SIMD/nonSIMD) are using relatively large load
|
||||
// instructions (ld1/ldp), which have a huge penalty (up to x2 exec time)
|
||||
// on some CPUs in case of address is not at least 16-byte aligned.
|
||||
// Arrays are 8-byte aligned currently, so, we can make additional 8-byte
|
||||
// load if needed at least for 1st address and make it 16-byte aligned.
|
||||
Label ALIGNED16;
|
||||
// Bit 3 of a1 clear: a1 is already a multiple of 16, nothing to do.
__ tbz(a1, 3, ALIGNED16);
|
||||
__ ldr(tmp1, Address(__ post(a1, wordSize)));
|
||||
__ ldr(tmp2, Address(__ post(a2, wordSize)));
|
||||
__ sub(cnt1, cnt1, wordSize);
|
||||
__ eor(tmp1, tmp1, tmp2);
|
||||
// Nothing has been pushed yet, so a mismatch here must bypass the pop.
__ cbnz(tmp1, NOT_EQUAL_NO_POP);
|
||||
__ bind(ALIGNED16);
|
||||
}
|
||||
// Main loop. The SIMD/non-SIMD choice is made once, when the stub is
// generated; both variants first run a prefetching loop (if software
// prefetch is enabled and enough data is left) and then a plain loop.
if (UseSIMDForArrayEquals) {
|
||||
if (SoftwarePrefetchHintDistance >= 0) {
|
||||
__ cmp(cnt1, prefetchLoopThreshold);
|
||||
__ br(__ LE, NO_PREFETCH_LARGE_LOOP);
|
||||
generate_large_array_equals_loop_simd(prefetchLoopThreshold,
|
||||
/* prfm = */ true, NOT_EQUAL);
|
||||
// Not enough data left for another full iteration: finish in TAIL.
__ cmp(cnt1, nonPrefetchLoopThreshold);
|
||||
__ br(__ LT, TAIL);
|
||||
}
|
||||
__ bind(NO_PREFETCH_LARGE_LOOP);
|
||||
generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
|
||||
/* prfm = */ false, NOT_EQUAL);
|
||||
} else {
|
||||
// The scalar loop needs tmp6-tmp8; save them (restored at NOT_EQUAL).
__ push(spilled_regs, sp);
|
||||
if (SoftwarePrefetchHintDistance >= 0) {
|
||||
__ cmp(cnt1, prefetchLoopThreshold);
|
||||
__ br(__ LE, NO_PREFETCH_LARGE_LOOP);
|
||||
generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
|
||||
/* prfm = */ true, NOT_EQUAL);
|
||||
// Not enough data left for another full iteration: finish in TAIL.
__ cmp(cnt1, nonPrefetchLoopThreshold);
|
||||
__ br(__ LT, TAIL);
|
||||
}
|
||||
__ bind(NO_PREFETCH_LARGE_LOOP);
|
||||
generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
|
||||
/* prfm = */ false, NOT_EQUAL);
|
||||
}
|
||||
// Tail: compare what is left one word at a time.
__ bind(TAIL);
|
||||
__ cbz(cnt1, EQUAL);
|
||||
__ subs(cnt1, cnt1, wordSize);
|
||||
// At most one word left: handle it with the final load in POST_LOOP.
__ br(__ LE, POST_LOOP);
|
||||
__ bind(SMALL_LOOP);
|
||||
__ ldr(tmp1, Address(__ post(a1, wordSize)));
|
||||
__ ldr(tmp2, Address(__ post(a2, wordSize)));
|
||||
// Flags from this subs are consumed by br(GT) below.
__ subs(cnt1, cnt1, wordSize);
|
||||
__ eor(tmp1, tmp1, tmp2);
|
||||
__ cbnz(tmp1, NOT_EQUAL);
|
||||
__ br(__ GT, SMALL_LOOP);
|
||||
__ bind(POST_LOOP);
|
||||
// cnt1 is <= 0 here, so these loads read the word that ends exactly at the
// end of the data; it may overlap bytes that were already compared.
__ ldr(tmp1, Address(a1, cnt1));
|
||||
__ ldr(tmp2, Address(a2, cnt1));
|
||||
__ eor(tmp1, tmp1, tmp2);
|
||||
__ cbnz(tmp1, NOT_EQUAL);
|
||||
__ bind(EQUAL);
|
||||
__ mov(result, true);
|
||||
// Common exit. On the mismatch paths result is left untouched by the stub.
__ bind(NOT_EQUAL);
|
||||
if (!UseSIMDForArrayEquals) {
|
||||
__ pop(spilled_regs, sp);
|
||||
}
|
||||
__ bind(NOT_EQUAL_NO_POP);
|
||||
__ leave();
|
||||
__ ret(lr);
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
@ -4895,6 +5071,11 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// has negatives stub for large arrays.
|
||||
StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
|
||||
|
||||
// array equals stub for large arrays.
|
||||
if (!UseSimpleArrayEquals) {
|
||||
StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
|
||||
}
|
||||
|
||||
if (UseMultiplyToLenIntrinsic) {
|
||||
StubRoutines::_multiplyToLen = generate_multiplyToLen();
|
||||
}
|
||||
|
@ -46,6 +46,7 @@ address StubRoutines::aarch64::_double_sign_flip = NULL;
|
||||
address StubRoutines::aarch64::_zero_blocks = NULL;
|
||||
address StubRoutines::aarch64::_has_negatives = NULL;
|
||||
address StubRoutines::aarch64::_has_negatives_long = NULL;
|
||||
address StubRoutines::aarch64::_large_array_equals = NULL;
|
||||
bool StubRoutines::aarch64::_completed = false;
|
||||
|
||||
/**
|
||||
|
@ -65,6 +65,7 @@ class aarch64 {
|
||||
|
||||
static address _has_negatives;
|
||||
static address _has_negatives_long;
|
||||
static address _large_array_equals;
|
||||
static bool _completed;
|
||||
|
||||
public:
|
||||
@ -131,6 +132,10 @@ class aarch64 {
|
||||
return _has_negatives_long;
|
||||
}
|
||||
|
||||
// Accessor for the large-array-equals stub entry point; hands back the
// current value of _large_array_equals unchanged.
static address large_array_equals() { return _large_array_equals; }
|
||||
|
||||
static bool complete() {
|
||||
return _completed;
|
||||
}
|
||||
|
@ -203,7 +203,11 @@ void VM_Version::get_processor_features() {
|
||||
if (FLAG_IS_DEFAULT(UseSIMDForMemoryOps)) {
|
||||
FLAG_SET_DEFAULT(UseSIMDForMemoryOps, (_variant > 0));
|
||||
}
|
||||
if (FLAG_IS_DEFAULT(UseSIMDForArrayEquals)) {
|
||||
FLAG_SET_DEFAULT(UseSIMDForArrayEquals, false);
|
||||
}
|
||||
}
|
||||
|
||||
// ThunderX2
|
||||
if ((_cpu == CPU_CAVIUM && (_model == 0xAF)) ||
|
||||
(_cpu == CPU_BROADCOM && (_model == 0x516))) {
|
||||
@ -218,7 +222,25 @@ void VM_Version::get_processor_features() {
|
||||
}
|
||||
}
|
||||
|
||||
if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) _features |= CPU_A53MAC;
|
||||
// Cortex A53
|
||||
if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) {
|
||||
_features |= CPU_A53MAC;
|
||||
if (FLAG_IS_DEFAULT(UseSIMDForArrayEquals)) {
|
||||
FLAG_SET_DEFAULT(UseSIMDForArrayEquals, false);
|
||||
}
|
||||
}
|
||||
|
||||
// Cortex A73
|
||||
if (_cpu == CPU_ARM && (_model == 0xd09 || _model2 == 0xd09)) {
|
||||
if (FLAG_IS_DEFAULT(SoftwarePrefetchHintDistance)) {
|
||||
FLAG_SET_DEFAULT(SoftwarePrefetchHintDistance, -1);
|
||||
}
|
||||
// A73 is faster with short-and-easy-for-speculative-execution-loop
|
||||
if (FLAG_IS_DEFAULT(UseSimpleArrayEquals)) {
|
||||
FLAG_SET_DEFAULT(UseSimpleArrayEquals, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (_cpu == CPU_ARM && (_model == 0xd07 || _model2 == 0xd07)) _features |= CPU_STXR_PREFETCH;
|
||||
// If an old-style /proc/cpuinfo (cpu_lines == 1) then if _model is an A57 (0xd07)
|
||||
// we assume the worst and assume we could be on a big little system and have
|
||||
|
Loading…
Reference in New Issue
Block a user