From ddddec7d74745905230282124524a0dbdd1bd1c1 Mon Sep 17 00:00:00 2001 From: Patric Hedlin Date: Wed, 12 Jan 2022 15:30:54 +0000 Subject: [PATCH] 8274243: Implement fast-path for ASCII-compatible CharsetEncoders on aarch64 Reviewed-by: neliasso, redestad --- src/hotspot/cpu/aarch64/aarch64.ad | 48 ++-- src/hotspot/cpu/aarch64/assembler_aarch64.hpp | 6 + .../cpu/aarch64/macroAssembler_aarch64.cpp | 211 +++++++++--------- .../cpu/aarch64/macroAssembler_aarch64.hpp | 13 +- src/hotspot/cpu/aarch64/matcher_aarch64.hpp | 2 +- .../bench/java/nio/CharsetEncodeDecode.java | 2 +- 6 files changed, 159 insertions(+), 123 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 75af9ed04b2..b82ee59b99c 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -17018,16 +17018,17 @@ instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, iRegI_R0 result, rFlagsReg cr) %{ match(Set result (StrCompressedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, + USE_KILL src, USE_KILL dst, USE len, KILL cr); - format %{ "String Compress $src,$dst -> $result // KILL R1, R2, R3, R4" %} + format %{ "String Compress $src,$dst,$len -> $result // KILL $src,$dst" %} ins_encode %{ __ char_array_compress($src$$Register, $dst$$Register, $len$$Register, + $result$$Register, $tmp1$$FloatRegister, $tmp2$$FloatRegister, - $tmp3$$FloatRegister, $tmp4$$FloatRegister, - $result$$Register); + $tmp3$$FloatRegister, $tmp4$$FloatRegister); %} - ins_pipe( pipe_slow ); + ins_pipe(pipe_slow); %} // fast byte[] to char[] inflation @@ -17052,22 +17053,43 @@ instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len // encode char[] to byte[] in ISO_8859_1 instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 Vtmp1, vRegD_V1 Vtmp2, - vRegD_V2 Vtmp3, vRegD_V3 Vtmp4, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, + vRegD_V2 vtmp2, vRegD_V3 vtmp3, iRegI_R0 result, rFlagsReg cr) %{ predicate(!((EncodeISOArrayNode*)n)->is_ascii()); match(Set result (EncodeISOArray src (Binary dst len))); - effect(USE_KILL src, USE_KILL dst, USE_KILL len, - KILL Vtmp1, KILL Vtmp2, KILL Vtmp3, KILL Vtmp4, KILL cr); + effect(USE_KILL src, USE_KILL dst, USE len, + KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3, KILL cr); - format %{ "Encode array $src,$dst,$len -> $result" %} + format %{ "Encode ISO array $src,$dst,$len -> $result" %} ins_encode %{ __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, - $result$$Register, $Vtmp1$$FloatRegister, $Vtmp2$$FloatRegister, - $Vtmp3$$FloatRegister, $Vtmp4$$FloatRegister); + $result$$Register, false, + $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister); %} - ins_pipe( pipe_class_memory ); + ins_pipe(pipe_class_memory); +%} + +instruct encode_ascii_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, + vRegD_V2 vtmp2, vRegD_V3 vtmp3, + iRegI_R0 result, rFlagsReg cr) +%{ + predicate(((EncodeISOArrayNode*)n)->is_ascii()); + match(Set result (EncodeISOArray src (Binary dst len))); + effect(USE_KILL src, USE_KILL dst, USE len, + KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3, KILL cr); + + format %{ "Encode ASCII array $src,$dst,$len -> $result" %} + ins_encode %{ + __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, + $result$$Register, true, + $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister); + %} + ins_pipe(pipe_class_memory); %} // ============================================================================ diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 95edeadbf18..fb4b7b87141 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -2424,6 +2424,12 @@ public: INSN(cnt, 0, 0b100000010110, 0); // accepted arrangements: T8B, T16B INSN(uaddlp, 1, 0b100000001010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S INSN(uaddlv, 1, 0b110000001110, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S + // Zero compare. + INSN(cmeq, 0, 0b100000100110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D + INSN(cmge, 1, 0b100000100010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D + INSN(cmgt, 0, 0b100000100010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D + INSN(cmle, 1, 0b100000100110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D + INSN(cmlt, 0, 0b100000101010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D #undef INSN diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index bcb5eb92a0b..cce3d497bbc 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -4923,112 +4923,119 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value) bind(fini); } -// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and -// java/lang/StringUTF16.compress. +// Intrinsic for +// +// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray +// return the number of characters copied. +// - java/lang/StringUTF16.compress +// return zero (0) if copy fails, otherwise 'len'. +// +// This version always returns the number of characters copied, and does not +// clobber the 'len' register. A successful copy will complete with the post- +// condition: 'res' == 'len', while an unsuccessful copy will exit with the +// post-condition: 0 <= 'res' < 'len'. +// +// NOTE: Attempts to use 'ld2' (and 'umaxv' in the ISO part) has proven to +// degrade performance (on Ampere Altra - Neoverse N1), to an extent +// beyond the acceptable, even though the footprint would be smaller. +// Using 'umaxv' in the ASCII-case comes with a small penalty but does +// avoid additional bloat. +// void MacroAssembler::encode_iso_array(Register src, Register dst, - Register len, Register result, - FloatRegister Vtmp1, FloatRegister Vtmp2, - FloatRegister Vtmp3, FloatRegister Vtmp4) + Register len, Register res, bool ascii, + FloatRegister vtmp0, FloatRegister vtmp1, + FloatRegister vtmp2, FloatRegister vtmp3) { - Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1, - NEXT_32_START, NEXT_32_PRFM_START; - Register tmp1 = rscratch1, tmp2 = rscratch2; + Register cnt = res; + Register max = rscratch1; + Register chk = rscratch2; - mov(result, len); // Save initial len + prfm(Address(src), PLDL1STRM); + movw(cnt, len); - cmp(len, (u1)8); // handle shortest strings first - br(LT, LOOP_1); - cmp(len, (u1)32); - br(LT, NEXT_8); - // The following code uses the SIMD 'uzp1' and 'uzp2' instructions - // to convert chars to bytes - if (SoftwarePrefetchHintDistance >= 0) { - ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); - subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16); - br(LE, NEXT_32_START); - b(NEXT_32_PRFM_START); - BIND(NEXT_32_PRFM); - ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); - BIND(NEXT_32_PRFM_START); - prfm(Address(src, SoftwarePrefetchHintDistance)); - orr(v4, T16B, Vtmp1, Vtmp2); - orr(v5, T16B, Vtmp3, Vtmp4); - uzp1(Vtmp1, T16B, Vtmp1, Vtmp2); - uzp1(Vtmp3, T16B, Vtmp3, Vtmp4); - uzp2(v5, T16B, v4, v5); // high bytes - umov(tmp2, v5, D, 1); - fmovd(tmp1, v5); - orr(tmp1, tmp1, tmp2); - cbnz(tmp1, LOOP_8); - stpq(Vtmp1, Vtmp3, dst); - sub(len, len, 32); - add(dst, dst, 32); - add(src, src, 64); - subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16); - br(GE, NEXT_32_PRFM); - cmp(len, (u1)32); - br(LT, LOOP_8); - BIND(NEXT_32); - ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); - BIND(NEXT_32_START); - } else { - BIND(NEXT_32); - ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); - } - prfm(Address(src, SoftwarePrefetchHintDistance)); - uzp1(v4, T16B, Vtmp1, Vtmp2); - uzp1(v5, T16B, Vtmp3, Vtmp4); - orr(Vtmp1, T16B, Vtmp1, Vtmp2); - orr(Vtmp3, T16B, Vtmp3, Vtmp4); - uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes - umov(tmp2, Vtmp1, D, 1); - fmovd(tmp1, Vtmp1); - orr(tmp1, tmp1, tmp2); - cbnz(tmp1, LOOP_8); - stpq(v4, v5, dst); - sub(len, len, 32); - add(dst, dst, 32); - add(src, src, 64); - cmp(len, (u1)32); - br(GE, NEXT_32); - cbz(len, DONE); +#define ASCII(insn) do { if (ascii) { insn; } } while (0) - BIND(LOOP_8); - cmp(len, (u1)8); - br(LT, LOOP_1); - BIND(NEXT_8); - ld1(Vtmp1, T8H, src); - uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes - uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes - fmovd(tmp1, Vtmp3); - cbnz(tmp1, NEXT_1); - strd(Vtmp2, dst); + Label LOOP_32, DONE_32, FAIL_32; - sub(len, len, 8); - add(dst, dst, 8); - add(src, src, 16); - cmp(len, (u1)8); - br(GE, NEXT_8); + BIND(LOOP_32); + { + cmpw(cnt, 32); + br(LT, DONE_32); + ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64))); + // Extract lower bytes. + FloatRegister vlo0 = v4; + FloatRegister vlo1 = v5; + uzp1(vlo0, T16B, vtmp0, vtmp1); + uzp1(vlo1, T16B, vtmp2, vtmp3); + // Merge bits... + orr(vtmp0, T16B, vtmp0, vtmp1); + orr(vtmp2, T16B, vtmp2, vtmp3); + // Extract merged upper bytes. + FloatRegister vhix = vtmp0; + uzp2(vhix, T16B, vtmp0, vtmp2); + // ISO-check on hi-parts (all zero). + // ASCII-check on lo-parts (no sign). + FloatRegister vlox = vtmp1; // Merge lower bytes. + ASCII(orr(vlox, T16B, vlo0, vlo1)); + umov(chk, vhix, D, 1); ASCII(cmlt(vlox, T16B, vlox)); + fmovd(max, vhix); ASCII(umaxv(vlox, T16B, vlox)); + orr(chk, chk, max); ASCII(umov(max, vlox, B, 0)); + ASCII(orr(chk, chk, max)); + cbnz(chk, FAIL_32); + subw(cnt, cnt, 32); + st1(vlo0, vlo1, T16B, Address(post(dst, 32))); + b(LOOP_32); + } + BIND(FAIL_32); + sub(src, src, 64); + BIND(DONE_32); - BIND(LOOP_1); + Label LOOP_8, SKIP_8; - cbz(len, DONE); - BIND(NEXT_1); - ldrh(tmp1, Address(post(src, 2))); - tst(tmp1, 0xff00); - br(NE, SET_RESULT); - strb(tmp1, Address(post(dst, 1))); - subs(len, len, 1); - br(GT, NEXT_1); + BIND(LOOP_8); + { + cmpw(cnt, 8); + br(LT, SKIP_8); + FloatRegister vhi = vtmp0; + FloatRegister vlo = vtmp1; + ld1(vtmp3, T8H, src); + uzp1(vlo, T16B, vtmp3, vtmp3); + uzp2(vhi, T16B, vtmp3, vtmp3); + // ISO-check on hi-parts (all zero). + // ASCII-check on lo-parts (no sign). + ASCII(cmlt(vtmp2, T16B, vlo)); + fmovd(chk, vhi); ASCII(umaxv(vtmp2, T16B, vtmp2)); + ASCII(umov(max, vtmp2, B, 0)); + ASCII(orr(chk, chk, max)); + cbnz(chk, SKIP_8); - BIND(SET_RESULT); - sub(result, result, len); // Return index where we stopped - // Return len == 0 if we processed all - // characters - BIND(DONE); + strd(vlo, Address(post(dst, 8))); + subw(cnt, cnt, 8); + add(src, src, 16); + b(LOOP_8); + } + BIND(SKIP_8); + +#undef ASCII + + Label LOOP, DONE; + + cbz(cnt, DONE); + BIND(LOOP); + { + Register chr = rscratch1; + ldrh(chr, Address(post(src, 2))); + tst(chr, ascii ? 0xff80 : 0xff00); + br(NE, DONE); + strb(chr, Address(post(dst, 1))); + subs(cnt, cnt, 1); + br(GT, LOOP); + } + BIND(DONE); + // Return index where we stopped. + subw(res, len, cnt); } - // Inflate byte[] array to char[]. address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, FloatRegister vtmp1, FloatRegister vtmp2, @@ -5136,13 +5143,13 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register // Compress char[] array to byte[]. void MacroAssembler::char_array_compress(Register src, Register dst, Register len, - FloatRegister tmp1Reg, FloatRegister tmp2Reg, - FloatRegister tmp3Reg, FloatRegister tmp4Reg, - Register result) { - encode_iso_array(src, dst, len, result, - tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); - cmp(len, zr); - csel(result, result, zr, EQ); + Register res, + FloatRegister tmp0, FloatRegister tmp1, + FloatRegister tmp2, FloatRegister tmp3) { + encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3); + // Adjust result: res == len ? len : 0 + cmp(len, res); + csel(res, res, zr, EQ); } // get_thread() can be called anywhere inside generated code so we diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index a9ebc8bdc69..8a2521f629c 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -1255,14 +1255,15 @@ public: FloatRegister vtmp3, Register tmp4); void char_array_compress(Register src, Register dst, Register len, - FloatRegister tmp1Reg, FloatRegister tmp2Reg, - FloatRegister tmp3Reg, FloatRegister tmp4Reg, - Register result); + Register res, + FloatRegister vtmp0, FloatRegister vtmp1, + FloatRegister vtmp2, FloatRegister vtmp3); void encode_iso_array(Register src, Register dst, - Register len, Register result, - FloatRegister Vtmp1, FloatRegister Vtmp2, - FloatRegister Vtmp3, FloatRegister Vtmp4); + Register len, Register res, bool ascii, + FloatRegister vtmp0, FloatRegister vtmp1, + FloatRegister vtmp2, FloatRegister vtmp3); + void fast_log(FloatRegister vtmp0, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, FloatRegister vtmp4, FloatRegister vtmp5, FloatRegister tmpC1, FloatRegister tmpC2, FloatRegister tmpC3, diff --git a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp index f2f55e504ac..aca82240a57 100644 --- a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp @@ -161,6 +161,6 @@ } // Implements a variant of EncodeISOArrayNode that encode ASCII only - static const bool supports_encode_ascii_array = false; + static const bool supports_encode_ascii_array = true; #endif // CPU_AARCH64_MATCHER_AARCH64_HPP diff --git a/test/micro/org/openjdk/bench/java/nio/CharsetEncodeDecode.java b/test/micro/org/openjdk/bench/java/nio/CharsetEncodeDecode.java index 6e129a5466e..404bb0583fd 100644 --- a/test/micro/org/openjdk/bench/java/nio/CharsetEncodeDecode.java +++ b/test/micro/org/openjdk/bench/java/nio/CharsetEncodeDecode.java @@ -61,7 +61,7 @@ public class CharsetEncodeDecode { private CharsetEncoder encoder; private CharsetDecoder decoder; - @Param({"UTF-8", "BIG5", "ISO-8859-15", "ASCII", "UTF-16"}) + @Param({"UTF-8", "BIG5", "ISO-8859-15", "ISO-8859-1", "ASCII", "UTF-16"}) private String type; @Param("16384")