8274243: Implement fast-path for ASCII-compatible CharsetEncoders on aarch64

Reviewed-by: neliasso, redestad
This commit is contained in:
Patric Hedlin 2022-01-12 15:30:54 +00:00
parent 8fed8ab29c
commit ddddec7d74
6 changed files with 159 additions and 123 deletions

View File

@ -17018,16 +17018,17 @@ instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
iRegI_R0 result, rFlagsReg cr)
%{
match(Set result (StrCompressedCopy src (Binary dst len)));
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4,
USE_KILL src, USE_KILL dst, USE len, KILL cr);
format %{ "String Compress $src,$dst -> $result // KILL R1, R2, R3, R4" %}
format %{ "String Compress $src,$dst,$len -> $result // KILL $src,$dst" %}
ins_encode %{
__ char_array_compress($src$$Register, $dst$$Register, $len$$Register,
$result$$Register,
$tmp1$$FloatRegister, $tmp2$$FloatRegister,
$tmp3$$FloatRegister, $tmp4$$FloatRegister,
$result$$Register);
$tmp3$$FloatRegister, $tmp4$$FloatRegister);
%}
ins_pipe( pipe_slow );
ins_pipe(pipe_slow);
%}
// fast byte[] to char[] inflation
@ -17052,22 +17053,43 @@ instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len
// encode char[] to byte[] in ISO_8859_1
instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
vRegD_V0 Vtmp1, vRegD_V1 Vtmp2,
vRegD_V2 Vtmp3, vRegD_V3 Vtmp4,
vRegD_V0 vtmp0, vRegD_V1 vtmp1,
vRegD_V2 vtmp2, vRegD_V3 vtmp3,
iRegI_R0 result, rFlagsReg cr)
%{
predicate(!((EncodeISOArrayNode*)n)->is_ascii());
match(Set result (EncodeISOArray src (Binary dst len)));
effect(USE_KILL src, USE_KILL dst, USE_KILL len,
KILL Vtmp1, KILL Vtmp2, KILL Vtmp3, KILL Vtmp4, KILL cr);
effect(USE_KILL src, USE_KILL dst, USE len,
KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3, KILL cr);
format %{ "Encode array $src,$dst,$len -> $result" %}
format %{ "Encode ISO array $src,$dst,$len -> $result" %}
ins_encode %{
__ encode_iso_array($src$$Register, $dst$$Register, $len$$Register,
$result$$Register, $Vtmp1$$FloatRegister, $Vtmp2$$FloatRegister,
$Vtmp3$$FloatRegister, $Vtmp4$$FloatRegister);
$result$$Register, false,
$vtmp0$$FloatRegister, $vtmp1$$FloatRegister,
$vtmp2$$FloatRegister, $vtmp3$$FloatRegister);
%}
ins_pipe( pipe_class_memory );
ins_pipe(pipe_class_memory);
%}
// encode char[] to byte[] in ASCII
//
// Selected for EncodeISOArray nodes for which is_ascii() holds; emits
// MacroAssembler::encode_iso_array with its 'ascii' flag set to true.
// The stub post-increments src and dst (hence USE_KILL) and only reads len.
//
// Besides the four vector temporaries passed in, the stub writes v4 and v5
// as scratch for the extracted low bytes. They are therefore declared here
// as killed operands (vtmp4, vtmp5) even though they are not passed to the
// stub, so that no live value is kept in V4/V5 across this instruction.
instruct encode_ascii_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
                            vRegD_V0 vtmp0, vRegD_V1 vtmp1,
                            vRegD_V2 vtmp2, vRegD_V3 vtmp3,
                            vRegD_V4 vtmp4, vRegD_V5 vtmp5,
                            iRegI_R0 result, rFlagsReg cr)
%{
  predicate(((EncodeISOArrayNode*)n)->is_ascii());
  match(Set result (EncodeISOArray src (Binary dst len)));
  effect(USE_KILL src, USE_KILL dst, USE len,
         KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3,
         KILL vtmp4, KILL vtmp5, KILL cr);

  format %{ "Encode ASCII array $src,$dst,$len -> $result" %}
  ins_encode %{
    __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register,
                        $result$$Register, true,
                        $vtmp0$$FloatRegister, $vtmp1$$FloatRegister,
                        $vtmp2$$FloatRegister, $vtmp3$$FloatRegister);
  %}
  ins_pipe(pipe_class_memory);
%}
// ============================================================================

View File

@ -2424,6 +2424,12 @@ public:
INSN(cnt, 0, 0b100000010110, 0); // accepted arrangements: T8B, T16B
INSN(uaddlp, 1, 0b100000001010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(uaddlv, 1, 0b110000001110, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S
// Zero compare.
INSN(cmeq, 0, 0b100000100110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
INSN(cmge, 1, 0b100000100010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
INSN(cmgt, 0, 0b100000100010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
INSN(cmle, 1, 0b100000100110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
INSN(cmlt, 0, 0b100000101010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
#undef INSN

View File

@ -4923,112 +4923,119 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value)
bind(fini);
}
// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Intrinsic for
//
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
// return the number of characters copied.
// - java/lang/StringUTF16.compress
// return zero (0) if copy fails, otherwise 'len'.
//
// This version always returns the number of characters copied, and does not
// clobber the 'len' register. A successful copy will complete with the post-
// condition: 'res' == 'len', while an unsuccessful copy will exit with the
// post-condition: 0 <= 'res' < 'len'.
//
// NOTE: Attempts to use 'ld2' (and 'umaxv' in the ISO part) have proven to
// degrade performance (on Ampere Altra - Neoverse N1), to an extent
// beyond the acceptable, even though the footprint would be smaller.
// Using 'umaxv' in the ASCII-case comes with a small penalty but does
// avoid additional bloat.
//
void MacroAssembler::encode_iso_array(Register src, Register dst,
Register len, Register result,
FloatRegister Vtmp1, FloatRegister Vtmp2,
FloatRegister Vtmp3, FloatRegister Vtmp4)
Register len, Register res, bool ascii,
FloatRegister vtmp0, FloatRegister vtmp1,
FloatRegister vtmp2, FloatRegister vtmp3)
{
Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
NEXT_32_START, NEXT_32_PRFM_START;
Register tmp1 = rscratch1, tmp2 = rscratch2;
Register cnt = res;
Register max = rscratch1;
Register chk = rscratch2;
mov(result, len); // Save initial len
prfm(Address(src), PLDL1STRM);
movw(cnt, len);
cmp(len, (u1)8); // handle shortest strings first
br(LT, LOOP_1);
cmp(len, (u1)32);
br(LT, NEXT_8);
// The following code uses the SIMD 'uzp1' and 'uzp2' instructions
// to convert chars to bytes
if (SoftwarePrefetchHintDistance >= 0) {
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
br(LE, NEXT_32_START);
b(NEXT_32_PRFM_START);
BIND(NEXT_32_PRFM);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
BIND(NEXT_32_PRFM_START);
prfm(Address(src, SoftwarePrefetchHintDistance));
orr(v4, T16B, Vtmp1, Vtmp2);
orr(v5, T16B, Vtmp3, Vtmp4);
uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
uzp2(v5, T16B, v4, v5); // high bytes
umov(tmp2, v5, D, 1);
fmovd(tmp1, v5);
orr(tmp1, tmp1, tmp2);
cbnz(tmp1, LOOP_8);
stpq(Vtmp1, Vtmp3, dst);
sub(len, len, 32);
add(dst, dst, 32);
add(src, src, 64);
subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
br(GE, NEXT_32_PRFM);
cmp(len, (u1)32);
br(LT, LOOP_8);
BIND(NEXT_32);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
BIND(NEXT_32_START);
} else {
BIND(NEXT_32);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
#define ASCII(insn) do { if (ascii) { insn; } } while (0)
Label LOOP_32, DONE_32, FAIL_32;
BIND(LOOP_32);
{
cmpw(cnt, 32);
br(LT, DONE_32);
ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64)));
// Extract lower bytes.
FloatRegister vlo0 = v4;
FloatRegister vlo1 = v5;
uzp1(vlo0, T16B, vtmp0, vtmp1);
uzp1(vlo1, T16B, vtmp2, vtmp3);
// Merge bits...
orr(vtmp0, T16B, vtmp0, vtmp1);
orr(vtmp2, T16B, vtmp2, vtmp3);
// Extract merged upper bytes.
FloatRegister vhix = vtmp0;
uzp2(vhix, T16B, vtmp0, vtmp2);
// ISO-check on hi-parts (all zero).
// ASCII-check on lo-parts (no sign).
FloatRegister vlox = vtmp1; // Merge lower bytes.
ASCII(orr(vlox, T16B, vlo0, vlo1));
umov(chk, vhix, D, 1); ASCII(cmlt(vlox, T16B, vlox));
fmovd(max, vhix); ASCII(umaxv(vlox, T16B, vlox));
orr(chk, chk, max); ASCII(umov(max, vlox, B, 0));
ASCII(orr(chk, chk, max));
cbnz(chk, FAIL_32);
subw(cnt, cnt, 32);
st1(vlo0, vlo1, T16B, Address(post(dst, 32)));
b(LOOP_32);
}
prfm(Address(src, SoftwarePrefetchHintDistance));
uzp1(v4, T16B, Vtmp1, Vtmp2);
uzp1(v5, T16B, Vtmp3, Vtmp4);
orr(Vtmp1, T16B, Vtmp1, Vtmp2);
orr(Vtmp3, T16B, Vtmp3, Vtmp4);
uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
umov(tmp2, Vtmp1, D, 1);
fmovd(tmp1, Vtmp1);
orr(tmp1, tmp1, tmp2);
cbnz(tmp1, LOOP_8);
stpq(v4, v5, dst);
sub(len, len, 32);
add(dst, dst, 32);
add(src, src, 64);
cmp(len, (u1)32);
br(GE, NEXT_32);
cbz(len, DONE);
BIND(FAIL_32);
sub(src, src, 64);
BIND(DONE_32);
Label LOOP_8, SKIP_8;
BIND(LOOP_8);
cmp(len, (u1)8);
br(LT, LOOP_1);
BIND(NEXT_8);
ld1(Vtmp1, T8H, src);
uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
fmovd(tmp1, Vtmp3);
cbnz(tmp1, NEXT_1);
strd(Vtmp2, dst);
{
cmpw(cnt, 8);
br(LT, SKIP_8);
FloatRegister vhi = vtmp0;
FloatRegister vlo = vtmp1;
ld1(vtmp3, T8H, src);
uzp1(vlo, T16B, vtmp3, vtmp3);
uzp2(vhi, T16B, vtmp3, vtmp3);
// ISO-check on hi-parts (all zero).
// ASCII-check on lo-parts (no sign).
ASCII(cmlt(vtmp2, T16B, vlo));
fmovd(chk, vhi); ASCII(umaxv(vtmp2, T16B, vtmp2));
ASCII(umov(max, vtmp2, B, 0));
ASCII(orr(chk, chk, max));
cbnz(chk, SKIP_8);
sub(len, len, 8);
add(dst, dst, 8);
strd(vlo, Address(post(dst, 8)));
subw(cnt, cnt, 8);
add(src, src, 16);
cmp(len, (u1)8);
br(GE, NEXT_8);
b(LOOP_8);
}
BIND(SKIP_8);
BIND(LOOP_1);
#undef ASCII
cbz(len, DONE);
BIND(NEXT_1);
ldrh(tmp1, Address(post(src, 2)));
tst(tmp1, 0xff00);
br(NE, SET_RESULT);
strb(tmp1, Address(post(dst, 1)));
subs(len, len, 1);
br(GT, NEXT_1);
Label LOOP, DONE;
BIND(SET_RESULT);
sub(result, result, len); // Return index where we stopped
// Return len == 0 if we processed all
// characters
cbz(cnt, DONE);
BIND(LOOP);
{
Register chr = rscratch1;
ldrh(chr, Address(post(src, 2)));
tst(chr, ascii ? 0xff80 : 0xff00);
br(NE, DONE);
strb(chr, Address(post(dst, 1)));
subs(cnt, cnt, 1);
br(GT, LOOP);
}
BIND(DONE);
// Return index where we stopped.
subw(res, len, cnt);
}
// Inflate byte[] array to char[].
address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
FloatRegister vtmp1, FloatRegister vtmp2,
@ -5136,13 +5143,13 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register
// Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
FloatRegister tmp1Reg, FloatRegister tmp2Reg,
FloatRegister tmp3Reg, FloatRegister tmp4Reg,
Register result) {
encode_iso_array(src, dst, len, result,
tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
cmp(len, zr);
csel(result, result, zr, EQ);
Register res,
FloatRegister tmp0, FloatRegister tmp1,
FloatRegister tmp2, FloatRegister tmp3) {
encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3);
// Adjust result: res == len ? len : 0
cmp(len, res);
csel(res, res, zr, EQ);
}
// get_thread() can be called anywhere inside generated code so we

View File

@ -1255,14 +1255,15 @@ public:
FloatRegister vtmp3, Register tmp4);
void char_array_compress(Register src, Register dst, Register len,
FloatRegister tmp1Reg, FloatRegister tmp2Reg,
FloatRegister tmp3Reg, FloatRegister tmp4Reg,
Register result);
Register res,
FloatRegister vtmp0, FloatRegister vtmp1,
FloatRegister vtmp2, FloatRegister vtmp3);
void encode_iso_array(Register src, Register dst,
Register len, Register result,
FloatRegister Vtmp1, FloatRegister Vtmp2,
FloatRegister Vtmp3, FloatRegister Vtmp4);
Register len, Register res, bool ascii,
FloatRegister vtmp0, FloatRegister vtmp1,
FloatRegister vtmp2, FloatRegister vtmp3);
void fast_log(FloatRegister vtmp0, FloatRegister vtmp1, FloatRegister vtmp2,
FloatRegister vtmp3, FloatRegister vtmp4, FloatRegister vtmp5,
FloatRegister tmpC1, FloatRegister tmpC2, FloatRegister tmpC3,

View File

@ -161,6 +161,6 @@
}
// Implements a variant of EncodeISOArrayNode that encodes ASCII only
static const bool supports_encode_ascii_array = false;
static const bool supports_encode_ascii_array = true;
#endif // CPU_AARCH64_MATCHER_AARCH64_HPP

View File

@ -61,7 +61,7 @@ public class CharsetEncodeDecode {
private CharsetEncoder encoder;
private CharsetDecoder decoder;
@Param({"UTF-8", "BIG5", "ISO-8859-15", "ASCII", "UTF-16"})
@Param({"UTF-8", "BIG5", "ISO-8859-15", "ISO-8859-1", "ASCII", "UTF-16"})
private String type;
@Param("16384")