From 0fdec9c25ed0f345a1c84a90ef6ece4693bff52d Mon Sep 17 00:00:00 2001 From: Dmitrij Pochepko Date: Mon, 9 Apr 2018 18:43:40 +0300 Subject: [PATCH] 8187472: AARCH64: array_equals intrinsic doesn't use prefetch for large arrays Reviewed-by: dsamersoff --- src/hotspot/cpu/aarch64/aarch64.ad | 29 +- src/hotspot/cpu/aarch64/globals_aarch64.hpp | 4 + .../cpu/aarch64/macroAssembler_aarch64.cpp | 276 ++++++++++++++---- .../cpu/aarch64/macroAssembler_aarch64.hpp | 8 +- .../cpu/aarch64/stubGenerator_aarch64.cpp | 181 ++++++++++++ .../cpu/aarch64/stubRoutines_aarch64.cpp | 1 + .../cpu/aarch64/stubRoutines_aarch64.hpp | 5 + .../cpu/aarch64/vm_version_aarch64.cpp | 24 +- 8 files changed, 455 insertions(+), 73 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index e3c0a72eaf3..3c53bf00628 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -16167,9 +16167,8 @@ instruct string_equalsL(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt, format %{ "String Equals $str1,$str2,$cnt -> $result" %} ins_encode %{ // Count is in 8-bit bytes; non-Compact chars are 16 bits. - __ arrays_equals($str1$$Register, $str2$$Register, - $result$$Register, $cnt$$Register, - 1, /*is_string*/true); + __ string_equals($str1$$Register, $str2$$Register, + $result$$Register, $cnt$$Register, 1); %} ins_pipe(pipe_class_memory); %} @@ -16184,42 +16183,42 @@ instruct string_equalsU(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt, format %{ "String Equals $str1,$str2,$cnt -> $result" %} ins_encode %{ // Count is in 8-bit bytes; non-Compact chars are 16 bits. 
- __ asrw($cnt$$Register, $cnt$$Register, 1); - __ arrays_equals($str1$$Register, $str2$$Register, - $result$$Register, $cnt$$Register, - 2, /*is_string*/true); + __ string_equals($str1$$Register, $str2$$Register, + $result$$Register, $cnt$$Register, 2); %} ins_pipe(pipe_class_memory); %} instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, - iRegP_R10 tmp, rFlagsReg cr) + iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %} ins_encode %{ __ arrays_equals($ary1$$Register, $ary2$$Register, - $result$$Register, $tmp$$Register, - 1, /*is_string*/false); + $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, + $result$$Register, $tmp$$Register, 1); %} ins_pipe(pipe_class_memory); %} instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, - iRegP_R10 tmp, rFlagsReg cr) + iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %} ins_encode %{ __ arrays_equals($ary1$$Register, $ary2$$Register, - $result$$Register, $tmp$$Register, - 2, /*is_string*/false); + $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, + $result$$Register, $tmp$$Register, 2); %} ins_pipe(pipe_class_memory); %} diff --git a/src/hotspot/cpu/aarch64/globals_aarch64.hpp b/src/hotspot/cpu/aarch64/globals_aarch64.hpp index dfd984e4fb0..e35b75bb97d 100644 --- a/src/hotspot/cpu/aarch64/globals_aarch64.hpp +++ 
b/src/hotspot/cpu/aarch64/globals_aarch64.hpp @@ -147,6 +147,10 @@ define_pd_global(intx, InlineSmallCode, 1000); "Use CRC32 instructions for CRC32 computation") \ product(bool, UseSIMDForMemoryOps, false, \ "Use SIMD instructions in generated memory move code") \ + product(bool, UseSIMDForArrayEquals, true, \ + "Use SIMD instructions in generated array equals code") \ + product(bool, UseSimpleArrayEquals, false, \ + "Use simplest and shortest implementation for array equals") \ product(bool, AvoidUnalignedAccesses, false, \ "Avoid generating unaligned memory accesses") \ product(bool, UseLSE, false, \ diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index aae6ca22c11..9d73b897868 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -5182,28 +5182,11 @@ void MacroAssembler::has_negatives(Register ary1, Register len, Register result) BIND(DONE); } -// Compare Strings or char/byte arrays. - -// is_string is true iff this is a string comparison. - -// For Strings we're passed the address of the first characters in a1 -// and a2 and the length in cnt1. - -// For byte and char arrays we're passed the arrays themselves and we -// have to extract length fields and do null checks here. - -// elem_size is the element size in bytes: either 1 or 2. - -// There are two implementations. For arrays >= 8 bytes, all -// comparisons (including the final one, which may overlap) are -// performed 8 bytes at a time. For arrays < 8 bytes, we compare a -// halfword, then a short, and then a byte.
- -void MacroAssembler::arrays_equals(Register a1, Register a2, - Register result, Register cnt1, - int elem_size, bool is_string) +void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, + Register tmp4, Register tmp5, Register result, + Register cnt1, int elem_size) { - Label SAME, DONE, SHORT, NEXT_WORD, ONE; + Label DONE; Register tmp1 = rscratch1; Register tmp2 = rscratch2; Register cnt2 = tmp2; // cnt2 only used in array length compare @@ -5212,6 +5195,7 @@ void MacroAssembler::arrays_equals(Register a1, Register a2, int length_offset = arrayOopDesc::length_offset_in_bytes(); int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); + int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16); assert(elem_size == 1 || elem_size == 2, "must be char or byte"); assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); @@ -5220,43 +5204,229 @@ void MacroAssembler::arrays_equals(Register a1, Register a2, { const char kind = (elem_size == 2) ? 'U' : 'L'; char comment[64]; - snprintf(comment, sizeof comment, "%s%c%s {", - is_string ? 
"string_equals" : "array_equals", - kind, "{"); + snprintf(comment, sizeof comment, "array_equals%c{", kind); + BLOCK_COMMENT(comment); + } +#endif + if (UseSimpleArrayEquals) { + Label NEXT_WORD, SHORT, SAME, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL; + // if (a1==a2) + // return true; + // if (a==null || a2==null) + // return false; + // a1 & a2 == 0 means (some-pointer is null) or + // (very-rare-or-even-probably-impossible-pointer-values) + // so, we can save one branch in most cases + eor(rscratch1, a1, a2); + tst(a1, a2); + mov(result, false); + cbz(rscratch1, SAME); + br(EQ, A_MIGHT_BE_NULL); + // if (a1.length != a2.length) + // return false; + bind(A_IS_NOT_NULL); + ldrw(cnt1, Address(a1, length_offset)); + ldrw(cnt2, Address(a2, length_offset)); + eorw(tmp5, cnt1, cnt2); + cbnzw(tmp5, DONE); + lea(a1, Address(a1, base_offset)); + lea(a2, Address(a2, base_offset)); + // Check for short strings, i.e. smaller than wordSize. + subs(cnt1, cnt1, elem_per_word); + br(Assembler::LT, SHORT); + // Main 8 byte comparison loop. + bind(NEXT_WORD); { + ldr(tmp1, Address(post(a1, wordSize))); + ldr(tmp2, Address(post(a2, wordSize))); + subs(cnt1, cnt1, elem_per_word); + eor(tmp5, tmp1, tmp2); + cbnz(tmp5, DONE); + } br(GT, NEXT_WORD); + // Last longword. In the case where length == 4 we compare the + // same longword twice, but that's still faster than another + // conditional branch. + // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when + // length == 4. + if (log_elem_size > 0) + lsl(cnt1, cnt1, log_elem_size); + ldr(tmp3, Address(a1, cnt1)); + ldr(tmp4, Address(a2, cnt1)); + eor(tmp5, tmp3, tmp4); + cbnz(tmp5, DONE); + b(SAME); + bind(A_MIGHT_BE_NULL); + // in case both a1 and a2 are not-null, proceed with loads + cbz(a1, DONE); + cbz(a2, DONE); + b(A_IS_NOT_NULL); + bind(SHORT); + + tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left. 
+ { + ldrw(tmp1, Address(post(a1, 4))); + ldrw(tmp2, Address(post(a2, 4))); + eorw(tmp5, tmp1, tmp2); + cbnzw(tmp5, DONE); + } + bind(TAIL03); + tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left. + { + ldrh(tmp3, Address(post(a1, 2))); + ldrh(tmp4, Address(post(a2, 2))); + eorw(tmp5, tmp3, tmp4); + cbnzw(tmp5, DONE); + } + bind(TAIL01); + if (elem_size == 1) { // Only needed when comparing byte arrays. + tbz(cnt1, 0, SAME); // 0-1 bytes left. + { + ldrb(tmp1, a1); + ldrb(tmp2, a2); + eorw(tmp5, tmp1, tmp2); + cbnzw(tmp5, DONE); + } + } + bind(SAME); + mov(result, true); + } else { + Label NEXT_DWORD, A_IS_NULL, SHORT, TAIL, TAIL2, STUB, EARLY_OUT, + CSET_EQ, LAST_CHECK, LEN_IS_ZERO, SAME; + cbz(a1, A_IS_NULL); + ldrw(cnt1, Address(a1, length_offset)); + cbz(a2, A_IS_NULL); + ldrw(cnt2, Address(a2, length_offset)); + mov(result, false); + // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's + // faster to perform another branch before comparing a1 and a2 + cmp(cnt1, elem_per_word); + br(LE, SHORT); // short or same + cmp(a1, a2); + br(EQ, SAME); + ldr(tmp3, Address(pre(a1, base_offset))); + cmp(cnt1, stubBytesThreshold); + br(GE, STUB); + ldr(tmp4, Address(pre(a2, base_offset))); + sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); + cmp(cnt2, cnt1); + br(NE, DONE); + + // Main 16 byte comparison loop with 2 exits + bind(NEXT_DWORD); { + ldr(tmp1, Address(pre(a1, wordSize))); + ldr(tmp2, Address(pre(a2, wordSize))); + subs(cnt1, cnt1, 2 * elem_per_word); + br(LE, TAIL); + eor(tmp4, tmp3, tmp4); + cbnz(tmp4, DONE); + ldr(tmp3, Address(pre(a1, wordSize))); + ldr(tmp4, Address(pre(a2, wordSize))); + cmp(cnt1, elem_per_word); + br(LE, TAIL2); + cmp(tmp1, tmp2); + } br(EQ, NEXT_DWORD); + b(DONE); + + bind(TAIL); + eor(tmp4, tmp3, tmp4); + eor(tmp2, tmp1, tmp2); + lslv(tmp2, tmp2, tmp5); + orr(tmp5, tmp4, tmp2); + cmp(tmp5, zr); + b(CSET_EQ); + + bind(TAIL2); + eor(tmp2, tmp1, tmp2); + cbnz(tmp2, DONE); + b(LAST_CHECK); + + bind(STUB); + ldr(tmp4, 
Address(pre(a2, base_offset))); + cmp(cnt2, cnt1); + br(NE, DONE); + if (elem_size == 2) { // convert to byte counter + lsl(cnt1, cnt1, 1); + } + eor(tmp5, tmp3, tmp4); + cbnz(tmp5, DONE); + RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals()); + assert(stub.target() != NULL, "array_equals_long stub has not been generated"); + trampoline_call(stub); + b(DONE); + + bind(SAME); + mov(result, true); + b(DONE); + bind(A_IS_NULL); + // a1 or a2 is null. if a1 == a2 (both null) then return true. else return false + cmp(a1, a2); + b(CSET_EQ); + bind(EARLY_OUT); + // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2) + // so, if a2 == null => return false(0), else return true, so we can return a2 + mov(result, a2); + b(DONE); + bind(SHORT); + // lengths must match before any data is compared; result is already false + cmp(cnt2, cnt1); + br(NE, DONE); + cbz(cnt1, SAME); + sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); + ldr(tmp3, Address(a1, base_offset)); + ldr(tmp4, Address(a2, base_offset)); + bind(LAST_CHECK); + eor(tmp4, tmp3, tmp4); + lslv(tmp5, tmp4, tmp5); + cmp(tmp5, zr); + bind(CSET_EQ); + cset(result, EQ); + } + + // That's it. + bind(DONE); + + BLOCK_COMMENT("} array_equals"); +} + +// Compare Strings + +// For Strings we're passed the address of the first characters in a1 +// and a2 and the length in cnt1. +// elem_size is the element size in bytes: either 1 or 2. +// There are two implementations. For arrays >= 8 bytes, all +// comparisons (including the final one, which may overlap) are +// performed 8 bytes at a time. For strings < 8 bytes, we compare a +// halfword, then a short, and then a byte.
+ +void MacroAssembler::string_equals(Register a1, Register a2, + Register result, Register cnt1, int elem_size) +{ + Label SAME, DONE, SHORT, NEXT_WORD; + Register tmp1 = rscratch1; + Register tmp2 = rscratch2; + Register cnt2 = tmp2; // cnt2 only used in array length compare + + assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte"); + assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); + +#ifndef PRODUCT + { + const char kind = (elem_size == 2) ? 'U' : 'L'; + char comment[64]; + snprintf(comment, sizeof comment, "{string_equals%c", kind); BLOCK_COMMENT(comment); } #endif mov(result, false); - if (!is_string) { - // if (a==a2) - // return true; - eor(rscratch1, a1, a2); - cbz(rscratch1, SAME); - // if (a==null || a2==null) - // return false; - cbz(a1, DONE); - cbz(a2, DONE); - // if (a1.length != a2.length) - // return false; - ldrw(cnt1, Address(a1, length_offset)); - ldrw(cnt2, Address(a2, length_offset)); - eorw(tmp1, cnt1, cnt2); - cbnzw(tmp1, DONE); - - lea(a1, Address(a1, base_offset)); - lea(a2, Address(a2, base_offset)); - } - // Check for short strings, i.e. smaller than wordSize. - subs(cnt1, cnt1, elem_per_word); + subs(cnt1, cnt1, wordSize); br(Assembler::LT, SHORT); // Main 8 byte comparison loop. bind(NEXT_WORD); { ldr(tmp1, Address(post(a1, wordSize))); ldr(tmp2, Address(post(a2, wordSize))); - subs(cnt1, cnt1, elem_per_word); + subs(cnt1, cnt1, wordSize); eor(tmp1, tmp1, tmp2); cbnz(tmp1, DONE); } br(GT, NEXT_WORD); @@ -5265,18 +5435,16 @@ void MacroAssembler::arrays_equals(Register a1, Register a2, // conditional branch. // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when // length == 4. - if (log_elem_size > 0) - lsl(cnt1, cnt1, log_elem_size); ldr(tmp1, Address(a1, cnt1)); ldr(tmp2, Address(a2, cnt1)); - eor(tmp1, tmp1, tmp2); - cbnz(tmp1, DONE); + eor(tmp2, tmp1, tmp2); + cbnz(tmp2, DONE); b(SAME); bind(SHORT); Label TAIL03, TAIL01; - tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left. 
+ tbz(cnt1, 2, TAIL03); // 0-7 bytes left. { ldrw(tmp1, Address(post(a1, 4))); ldrw(tmp2, Address(post(a2, 4))); @@ -5284,7 +5452,7 @@ void MacroAssembler::arrays_equals(Register a1, Register a2, cbnzw(tmp1, DONE); } bind(TAIL03); - tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left. + tbz(cnt1, 1, TAIL01); // 0-3 bytes left. { ldrh(tmp1, Address(post(a1, 2))); ldrh(tmp2, Address(post(a2, 2))); @@ -5292,7 +5460,7 @@ void MacroAssembler::arrays_equals(Register a1, Register a2, cbnzw(tmp1, DONE); } bind(TAIL01); - if (elem_size == 1) { // Only needed when comparing byte arrays. + if (elem_size == 1) { // Only needed when comparing 1-byte elements tbz(cnt1, 0, SAME); // 0-1 bytes left. { ldrb(tmp1, a1); @@ -5307,7 +5475,7 @@ void MacroAssembler::arrays_equals(Register a1, Register a2, // That's it. bind(DONE); - BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals"); + BLOCK_COMMENT("} string_equals"); } diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index 9cf6bf6f8c5..8ac1ad74fcc 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -1225,9 +1225,11 @@ public: void has_negatives(Register ary1, Register len, Register result); - void arrays_equals(Register a1, Register a2, - Register result, Register cnt1, - int elem_size, bool is_string); + void arrays_equals(Register a1, Register a2, Register tmp3, Register tmp4, + Register tmp5, Register result, Register cnt1, int elem_size); + + void string_equals(Register a1, Register a2, Register result, Register cnt1, + int elem_size); void fill_words(Register base, Register cnt, Register value); void zero_words(Register base, u_int64_t cnt); diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 35b8c46debf..b73d017c91d 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++
b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -3813,6 +3813,182 @@ class StubGenerator: public StubCodeGenerator { __ ret(lr); return entry; } + + void generate_large_array_equals_loop_nonsimd(int loopThreshold, + bool usePrefetch, Label &NOT_EQUAL) { + Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, + tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, + tmp7 = r12, tmp8 = r13; + Label LOOP; + + __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); + __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); + __ bind(LOOP); + if (usePrefetch) { + __ prfm(Address(a1, SoftwarePrefetchHintDistance)); + __ prfm(Address(a2, SoftwarePrefetchHintDistance)); + } + __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); + __ eor(tmp1, tmp1, tmp2); + __ eor(tmp3, tmp3, tmp4); + __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); + __ orr(tmp1, tmp1, tmp3); + __ cbnz(tmp1, NOT_EQUAL); + __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); + __ eor(tmp5, tmp5, tmp6); + __ eor(tmp7, tmp7, tmp8); + __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); + __ orr(tmp5, tmp5, tmp7); + __ cbnz(tmp5, NOT_EQUAL); + __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); + __ eor(tmp1, tmp1, tmp2); + __ eor(tmp3, tmp3, tmp4); + __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); + __ orr(tmp1, tmp1, tmp3); + __ cbnz(tmp1, NOT_EQUAL); + __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); + __ eor(tmp5, tmp5, tmp6); + __ sub(cnt1, cnt1, 8 * wordSize); + __ eor(tmp7, tmp7, tmp8); + __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); + __ cmp(cnt1, loopThreshold); + __ orr(tmp5, tmp5, tmp7); + __ cbnz(tmp5, NOT_EQUAL); + __ br(__ GE, LOOP); + // post-loop + __ eor(tmp1, tmp1, tmp2); + __ eor(tmp3, tmp3, tmp4); + __ orr(tmp1, tmp1, tmp3); + __ sub(cnt1, cnt1, 2 * wordSize); + __ cbnz(tmp1, NOT_EQUAL); + } + + void generate_large_array_equals_loop_simd(int loopThreshold, + bool usePrefetch, Label &NOT_EQUAL) { + Register a1 = r1, 
a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, + tmp2 = rscratch2; + Label LOOP; + + __ bind(LOOP); + if (usePrefetch) { + __ prfm(Address(a1, SoftwarePrefetchHintDistance)); + __ prfm(Address(a2, SoftwarePrefetchHintDistance)); + } + __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); + __ sub(cnt1, cnt1, 8 * wordSize); + __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); + __ cmp(cnt1, loopThreshold); + __ eor(v0, __ T16B, v0, v4); + __ eor(v1, __ T16B, v1, v5); + __ eor(v2, __ T16B, v2, v6); + __ eor(v3, __ T16B, v3, v7); + __ orr(v0, __ T16B, v0, v1); + __ orr(v1, __ T16B, v2, v3); + __ orr(v0, __ T16B, v0, v1); + __ umov(tmp1, v0, __ D, 0); + __ umov(tmp2, v0, __ D, 1); + __ orr(tmp1, tmp1, tmp2); + __ cbnz(tmp1, NOT_EQUAL); + __ br(__ GE, LOOP); + } + + // a1 = r1 - array1 address + // a2 = r2 - array2 address + // result = r0 - return value. Already contains "false" + // cnt1 = r10 - array length in bytes; the first wordSize bytes were already compared by the caller + // r3-r5 are reserved temporary registers + address generate_large_array_equals() { + StubCodeMark mark(this, "StubRoutines", "large_array_equals"); + Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, + tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, + tmp7 = r12, tmp8 = r13; + Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, + SMALL_LOOP, POST_LOOP; + const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; + // calculate if at least 32 prefetched bytes are used + int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; + int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); + RegSet spilled_regs = RegSet::range(tmp6, tmp8); + assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, + tmp5, tmp6, tmp7, tmp8); + + __ align(CodeEntryAlignment); + address entry = __ pc(); + __ enter(); + __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub + // also advance pointers to use post-increment instead of pre-increment + __ add(a1, a1, wordSize); + __ add(a2, a2, wordSize); + if (AvoidUnalignedAccesses) { + // both implementations (SIMD/nonSIMD) are using relatively large load + // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) + // on some CPUs in case of address is not at least 16-byte aligned. + // Arrays are 8-byte aligned currently, so, we can make additional 8-byte + // load if needed at least for 1st address and make it 16-byte aligned.
+ Label ALIGNED16; + __ tbz(a1, 3, ALIGNED16); + __ ldr(tmp1, Address(__ post(a1, wordSize))); + __ ldr(tmp2, Address(__ post(a2, wordSize))); + __ sub(cnt1, cnt1, wordSize); + __ eor(tmp1, tmp1, tmp2); + __ cbnz(tmp1, NOT_EQUAL_NO_POP); + __ bind(ALIGNED16); + } + if (UseSIMDForArrayEquals) { + if (SoftwarePrefetchHintDistance >= 0) { + __ cmp(cnt1, prefetchLoopThreshold); + __ br(__ LE, NO_PREFETCH_LARGE_LOOP); + generate_large_array_equals_loop_simd(prefetchLoopThreshold, + /* prfm = */ true, NOT_EQUAL); + __ cmp(cnt1, nonPrefetchLoopThreshold); + __ br(__ LT, TAIL); + } + __ bind(NO_PREFETCH_LARGE_LOOP); + generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, + /* prfm = */ false, NOT_EQUAL); + } else { + __ push(spilled_regs, sp); + if (SoftwarePrefetchHintDistance >= 0) { + __ cmp(cnt1, prefetchLoopThreshold); + __ br(__ LE, NO_PREFETCH_LARGE_LOOP); + generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, + /* prfm = */ true, NOT_EQUAL); + __ cmp(cnt1, nonPrefetchLoopThreshold); + __ br(__ LT, TAIL); + } + __ bind(NO_PREFETCH_LARGE_LOOP); + generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, + /* prfm = */ false, NOT_EQUAL); + } + __ bind(TAIL); + __ cbz(cnt1, EQUAL); + __ subs(cnt1, cnt1, wordSize); + __ br(__ LE, POST_LOOP); + __ bind(SMALL_LOOP); + __ ldr(tmp1, Address(__ post(a1, wordSize))); + __ ldr(tmp2, Address(__ post(a2, wordSize))); + __ subs(cnt1, cnt1, wordSize); + __ eor(tmp1, tmp1, tmp2); + __ cbnz(tmp1, NOT_EQUAL); + __ br(__ GT, SMALL_LOOP); + __ bind(POST_LOOP); + __ ldr(tmp1, Address(a1, cnt1)); + __ ldr(tmp2, Address(a2, cnt1)); + __ eor(tmp1, tmp1, tmp2); + __ cbnz(tmp1, NOT_EQUAL); + __ bind(EQUAL); + __ mov(result, true); + __ bind(NOT_EQUAL); + if (!UseSIMDForArrayEquals) { + __ pop(spilled_regs, sp); + } + __ bind(NOT_EQUAL_NO_POP); + __ leave(); + __ ret(lr); + return entry; + } + + /** * Arguments: * @@ -4895,6 +5071,11 @@ class StubGenerator: public StubCodeGenerator { // has negatives stub for 
large arrays. StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); + // array equals stub for large arrays. + if (!UseSimpleArrayEquals) { + StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); + } + if (UseMultiplyToLenIntrinsic) { StubRoutines::_multiplyToLen = generate_multiplyToLen(); } diff --git a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp index 1313166ca3a..2741888c13c 100644 --- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp @@ -46,6 +46,7 @@ address StubRoutines::aarch64::_double_sign_flip = NULL; address StubRoutines::aarch64::_zero_blocks = NULL; address StubRoutines::aarch64::_has_negatives = NULL; address StubRoutines::aarch64::_has_negatives_long = NULL; +address StubRoutines::aarch64::_large_array_equals = NULL; bool StubRoutines::aarch64::_completed = false; /** diff --git a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp index e7a6bc3c850..92378f15e2c 100644 --- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp @@ -65,6 +65,7 @@ class aarch64 { static address _has_negatives; static address _has_negatives_long; + static address _large_array_equals; static bool _completed; public: @@ -131,6 +132,10 @@ class aarch64 { return _has_negatives_long; } + static address large_array_equals() { + return _large_array_equals; + } + static bool complete() { return _completed; } diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp index b5911b559a8..0cd4c3565ad 100644 --- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp @@ -203,7 +203,11 @@ void VM_Version::get_processor_features() { if (FLAG_IS_DEFAULT(UseSIMDForMemoryOps)) { 
FLAG_SET_DEFAULT(UseSIMDForMemoryOps, (_variant > 0)); } + if (FLAG_IS_DEFAULT(UseSIMDForArrayEquals)) { + FLAG_SET_DEFAULT(UseSIMDForArrayEquals, false); + } } + // ThunderX2 if ((_cpu == CPU_CAVIUM && (_model == 0xAF)) || (_cpu == CPU_BROADCOM && (_model == 0x516))) { @@ -218,7 +222,25 @@ void VM_Version::get_processor_features() { } } - if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) _features |= CPU_A53MAC; + // Cortex A53 + if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) { + _features |= CPU_A53MAC; + if (FLAG_IS_DEFAULT(UseSIMDForArrayEquals)) { + FLAG_SET_DEFAULT(UseSIMDForArrayEquals, false); + } + } + + // Cortex A73 + if (_cpu == CPU_ARM && (_model == 0xd09 || _model2 == 0xd09)) { + if (FLAG_IS_DEFAULT(SoftwarePrefetchHintDistance)) { + FLAG_SET_DEFAULT(SoftwarePrefetchHintDistance, -1); + } + // A73 is faster with short-and-easy-for-speculative-execution-loop + if (FLAG_IS_DEFAULT(UseSimpleArrayEquals)) { + FLAG_SET_DEFAULT(UseSimpleArrayEquals, true); + } + } + if (_cpu == CPU_ARM && (_model == 0xd07 || _model2 == 0xd07)) _features |= CPU_STXR_PREFETCH; // If an olde style /proc/cpuinfo (cpu_lines == 1) then if _model is an A57 (0xd07) // we assume the worst and assume we could be on a big little system and have