8274243: Implement fast-path for ASCII-compatible CharsetEncoders on aarch64
Reviewed-by: neliasso, redestad
This commit is contained in:
parent
8fed8ab29c
commit
ddddec7d74
@ -17018,16 +17018,17 @@ instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
|
||||
iRegI_R0 result, rFlagsReg cr)
|
||||
%{
|
||||
match(Set result (StrCompressedCopy src (Binary dst len)));
|
||||
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);
|
||||
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4,
|
||||
USE_KILL src, USE_KILL dst, USE len, KILL cr);
|
||||
|
||||
format %{ "String Compress $src,$dst -> $result // KILL R1, R2, R3, R4" %}
|
||||
format %{ "String Compress $src,$dst,$len -> $result // KILL $src,$dst" %}
|
||||
ins_encode %{
|
||||
__ char_array_compress($src$$Register, $dst$$Register, $len$$Register,
|
||||
$result$$Register,
|
||||
$tmp1$$FloatRegister, $tmp2$$FloatRegister,
|
||||
$tmp3$$FloatRegister, $tmp4$$FloatRegister,
|
||||
$result$$Register);
|
||||
$tmp3$$FloatRegister, $tmp4$$FloatRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
ins_pipe(pipe_slow);
|
||||
%}
|
||||
|
||||
// fast byte[] to char[] inflation
|
||||
@ -17052,22 +17053,43 @@ instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len
|
||||
|
||||
// encode char[] to byte[] in ISO_8859_1
|
||||
instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
|
||||
vRegD_V0 Vtmp1, vRegD_V1 Vtmp2,
|
||||
vRegD_V2 Vtmp3, vRegD_V3 Vtmp4,
|
||||
vRegD_V0 vtmp0, vRegD_V1 vtmp1,
|
||||
vRegD_V2 vtmp2, vRegD_V3 vtmp3,
|
||||
iRegI_R0 result, rFlagsReg cr)
|
||||
%{
|
||||
predicate(!((EncodeISOArrayNode*)n)->is_ascii());
|
||||
match(Set result (EncodeISOArray src (Binary dst len)));
|
||||
effect(USE_KILL src, USE_KILL dst, USE_KILL len,
|
||||
KILL Vtmp1, KILL Vtmp2, KILL Vtmp3, KILL Vtmp4, KILL cr);
|
||||
effect(USE_KILL src, USE_KILL dst, USE len,
|
||||
KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3, KILL cr);
|
||||
|
||||
format %{ "Encode array $src,$dst,$len -> $result" %}
|
||||
format %{ "Encode ISO array $src,$dst,$len -> $result" %}
|
||||
ins_encode %{
|
||||
__ encode_iso_array($src$$Register, $dst$$Register, $len$$Register,
|
||||
$result$$Register, $Vtmp1$$FloatRegister, $Vtmp2$$FloatRegister,
|
||||
$Vtmp3$$FloatRegister, $Vtmp4$$FloatRegister);
|
||||
$result$$Register, false,
|
||||
$vtmp0$$FloatRegister, $vtmp1$$FloatRegister,
|
||||
$vtmp2$$FloatRegister, $vtmp3$$FloatRegister);
|
||||
%}
|
||||
ins_pipe( pipe_class_memory );
|
||||
ins_pipe(pipe_class_memory);
|
||||
%}
|
||||
|
||||
// ASCII variant of the EncodeISOArray match rule. It matches the same ideal
// shape as the ISO rule (Set result (EncodeISOArray src (Binary dst len))),
// but is chosen only when the node reports is_ascii(). Code generation is
// delegated to MacroAssembler::encode_iso_array with the 'ascii' flag set.
instruct encode_ascii_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
|
||||
vRegD_V0 vtmp0, vRegD_V1 vtmp1,
|
||||
vRegD_V2 vtmp2, vRegD_V3 vtmp3,
|
||||
iRegI_R0 result, rFlagsReg cr)
|
||||
%{
|
||||
// Select this rule only for EncodeISOArray nodes flagged as ASCII.
predicate(((EncodeISOArrayNode*)n)->is_ascii());
|
||||
match(Set result (EncodeISOArray src (Binary dst len)));
|
||||
// src and dst are consumed and clobbered; len is declared USE only (read,
// not killed). The four vector temporaries and the flags are killed.
// NOTE(review): operand classes vRegD_V0..V3 presumably pin the temporaries
// to v0..v3 - confirm against the operand definitions.
effect(USE_KILL src, USE_KILL dst, USE len,
|
||||
KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3, KILL cr);
|
||||
|
||||
format %{ "Encode ASCII array $src,$dst,$len -> $result" %}
|
||||
ins_encode %{
|
||||
// The literal 'true' below is the 'ascii' argument of encode_iso_array.
__ encode_iso_array($src$$Register, $dst$$Register, $len$$Register,
|
||||
$result$$Register, true,
|
||||
$vtmp0$$FloatRegister, $vtmp1$$FloatRegister,
|
||||
$vtmp2$$FloatRegister, $vtmp3$$FloatRegister);
|
||||
%}
|
||||
ins_pipe(pipe_class_memory);
|
||||
%}
|
||||
|
||||
// ============================================================================
|
||||
|
@ -2424,6 +2424,12 @@ public:
|
||||
INSN(cnt, 0, 0b100000010110, 0); // accepted arrangements: T8B, T16B
|
||||
INSN(uaddlp, 1, 0b100000001010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
|
||||
INSN(uaddlv, 1, 0b110000001110, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S
|
||||
// Zero compare.
|
||||
INSN(cmeq, 0, 0b100000100110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
|
||||
INSN(cmge, 1, 0b100000100010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
|
||||
INSN(cmgt, 0, 0b100000100010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
|
||||
INSN(cmle, 1, 0b100000100110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
|
||||
INSN(cmlt, 0, 0b100000101010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
|
||||
|
||||
#undef INSN
|
||||
|
||||
|
@ -4923,112 +4923,119 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value)
|
||||
bind(fini);
|
||||
}
|
||||
|
||||
// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
|
||||
// java/lang/StringUTF16.compress.
|
||||
// Intrinsic for
|
||||
//
|
||||
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
|
||||
// return the number of characters copied.
|
||||
// - java/lang/StringUTF16.compress
|
||||
// return zero (0) if copy fails, otherwise 'len'.
|
||||
//
|
||||
// This version always returns the number of characters copied, and does not
|
||||
// clobber the 'len' register. A successful copy will complete with the post-
|
||||
// condition: 'res' == 'len', while an unsuccessful copy will exit with the
|
||||
// post-condition: 0 <= 'res' < 'len'.
|
||||
//
|
||||
// NOTE: Attempts to use 'ld2' (and 'umaxv' in the ISO part) have proven to
|
||||
// degrade performance (on Ampere Altra - Neoverse N1), to an extent
|
||||
// beyond the acceptable, even though the footprint would be smaller.
|
||||
// Using 'umaxv' in the ASCII-case comes with a small penalty but does
|
||||
// avoid additional bloat.
|
||||
//
|
||||
void MacroAssembler::encode_iso_array(Register src, Register dst,
|
||||
Register len, Register result,
|
||||
FloatRegister Vtmp1, FloatRegister Vtmp2,
|
||||
FloatRegister Vtmp3, FloatRegister Vtmp4)
|
||||
Register len, Register res, bool ascii,
|
||||
FloatRegister vtmp0, FloatRegister vtmp1,
|
||||
FloatRegister vtmp2, FloatRegister vtmp3)
|
||||
{
|
||||
Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
|
||||
NEXT_32_START, NEXT_32_PRFM_START;
|
||||
Register tmp1 = rscratch1, tmp2 = rscratch2;
|
||||
Register cnt = res;
|
||||
Register max = rscratch1;
|
||||
Register chk = rscratch2;
|
||||
|
||||
mov(result, len); // Save initial len
|
||||
prfm(Address(src), PLDL1STRM);
|
||||
movw(cnt, len);
|
||||
|
||||
cmp(len, (u1)8); // handle shortest strings first
|
||||
br(LT, LOOP_1);
|
||||
cmp(len, (u1)32);
|
||||
br(LT, NEXT_8);
|
||||
// The following code uses the SIMD 'uzp1' and 'uzp2' instructions
|
||||
// to convert chars to bytes
|
||||
if (SoftwarePrefetchHintDistance >= 0) {
|
||||
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
|
||||
subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
|
||||
br(LE, NEXT_32_START);
|
||||
b(NEXT_32_PRFM_START);
|
||||
BIND(NEXT_32_PRFM);
|
||||
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
|
||||
BIND(NEXT_32_PRFM_START);
|
||||
prfm(Address(src, SoftwarePrefetchHintDistance));
|
||||
orr(v4, T16B, Vtmp1, Vtmp2);
|
||||
orr(v5, T16B, Vtmp3, Vtmp4);
|
||||
uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
|
||||
uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
|
||||
uzp2(v5, T16B, v4, v5); // high bytes
|
||||
umov(tmp2, v5, D, 1);
|
||||
fmovd(tmp1, v5);
|
||||
orr(tmp1, tmp1, tmp2);
|
||||
cbnz(tmp1, LOOP_8);
|
||||
stpq(Vtmp1, Vtmp3, dst);
|
||||
sub(len, len, 32);
|
||||
add(dst, dst, 32);
|
||||
add(src, src, 64);
|
||||
subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
|
||||
br(GE, NEXT_32_PRFM);
|
||||
cmp(len, (u1)32);
|
||||
br(LT, LOOP_8);
|
||||
BIND(NEXT_32);
|
||||
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
|
||||
BIND(NEXT_32_START);
|
||||
} else {
|
||||
BIND(NEXT_32);
|
||||
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
|
||||
}
|
||||
prfm(Address(src, SoftwarePrefetchHintDistance));
|
||||
uzp1(v4, T16B, Vtmp1, Vtmp2);
|
||||
uzp1(v5, T16B, Vtmp3, Vtmp4);
|
||||
orr(Vtmp1, T16B, Vtmp1, Vtmp2);
|
||||
orr(Vtmp3, T16B, Vtmp3, Vtmp4);
|
||||
uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
|
||||
umov(tmp2, Vtmp1, D, 1);
|
||||
fmovd(tmp1, Vtmp1);
|
||||
orr(tmp1, tmp1, tmp2);
|
||||
cbnz(tmp1, LOOP_8);
|
||||
stpq(v4, v5, dst);
|
||||
sub(len, len, 32);
|
||||
add(dst, dst, 32);
|
||||
add(src, src, 64);
|
||||
cmp(len, (u1)32);
|
||||
br(GE, NEXT_32);
|
||||
cbz(len, DONE);
|
||||
#define ASCII(insn) do { if (ascii) { insn; } } while (0)
|
||||
|
||||
BIND(LOOP_8);
|
||||
cmp(len, (u1)8);
|
||||
br(LT, LOOP_1);
|
||||
BIND(NEXT_8);
|
||||
ld1(Vtmp1, T8H, src);
|
||||
uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
|
||||
uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
|
||||
fmovd(tmp1, Vtmp3);
|
||||
cbnz(tmp1, NEXT_1);
|
||||
strd(Vtmp2, dst);
|
||||
Label LOOP_32, DONE_32, FAIL_32;
|
||||
|
||||
sub(len, len, 8);
|
||||
add(dst, dst, 8);
|
||||
add(src, src, 16);
|
||||
cmp(len, (u1)8);
|
||||
br(GE, NEXT_8);
|
||||
BIND(LOOP_32);
|
||||
{
|
||||
cmpw(cnt, 32);
|
||||
br(LT, DONE_32);
|
||||
ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64)));
|
||||
// Extract lower bytes.
|
||||
FloatRegister vlo0 = v4;
|
||||
FloatRegister vlo1 = v5;
|
||||
uzp1(vlo0, T16B, vtmp0, vtmp1);
|
||||
uzp1(vlo1, T16B, vtmp2, vtmp3);
|
||||
// Merge bits...
|
||||
orr(vtmp0, T16B, vtmp0, vtmp1);
|
||||
orr(vtmp2, T16B, vtmp2, vtmp3);
|
||||
// Extract merged upper bytes.
|
||||
FloatRegister vhix = vtmp0;
|
||||
uzp2(vhix, T16B, vtmp0, vtmp2);
|
||||
// ISO-check on hi-parts (all zero).
|
||||
// ASCII-check on lo-parts (no sign).
|
||||
FloatRegister vlox = vtmp1; // Merge lower bytes.
|
||||
ASCII(orr(vlox, T16B, vlo0, vlo1));
|
||||
umov(chk, vhix, D, 1); ASCII(cmlt(vlox, T16B, vlox));
|
||||
fmovd(max, vhix); ASCII(umaxv(vlox, T16B, vlox));
|
||||
orr(chk, chk, max); ASCII(umov(max, vlox, B, 0));
|
||||
ASCII(orr(chk, chk, max));
|
||||
cbnz(chk, FAIL_32);
|
||||
subw(cnt, cnt, 32);
|
||||
st1(vlo0, vlo1, T16B, Address(post(dst, 32)));
|
||||
b(LOOP_32);
|
||||
}
|
||||
BIND(FAIL_32);
|
||||
sub(src, src, 64);
|
||||
BIND(DONE_32);
|
||||
|
||||
BIND(LOOP_1);
|
||||
Label LOOP_8, SKIP_8;
|
||||
|
||||
cbz(len, DONE);
|
||||
BIND(NEXT_1);
|
||||
ldrh(tmp1, Address(post(src, 2)));
|
||||
tst(tmp1, 0xff00);
|
||||
br(NE, SET_RESULT);
|
||||
strb(tmp1, Address(post(dst, 1)));
|
||||
subs(len, len, 1);
|
||||
br(GT, NEXT_1);
|
||||
BIND(LOOP_8);
|
||||
{
|
||||
cmpw(cnt, 8);
|
||||
br(LT, SKIP_8);
|
||||
FloatRegister vhi = vtmp0;
|
||||
FloatRegister vlo = vtmp1;
|
||||
ld1(vtmp3, T8H, src);
|
||||
uzp1(vlo, T16B, vtmp3, vtmp3);
|
||||
uzp2(vhi, T16B, vtmp3, vtmp3);
|
||||
// ISO-check on hi-parts (all zero).
|
||||
// ASCII-check on lo-parts (no sign).
|
||||
ASCII(cmlt(vtmp2, T16B, vlo));
|
||||
fmovd(chk, vhi); ASCII(umaxv(vtmp2, T16B, vtmp2));
|
||||
ASCII(umov(max, vtmp2, B, 0));
|
||||
ASCII(orr(chk, chk, max));
|
||||
cbnz(chk, SKIP_8);
|
||||
|
||||
BIND(SET_RESULT);
|
||||
sub(result, result, len); // Return index where we stopped
|
||||
// Return len == 0 if we processed all
|
||||
// characters
|
||||
BIND(DONE);
|
||||
strd(vlo, Address(post(dst, 8)));
|
||||
subw(cnt, cnt, 8);
|
||||
add(src, src, 16);
|
||||
b(LOOP_8);
|
||||
}
|
||||
BIND(SKIP_8);
|
||||
|
||||
#undef ASCII
|
||||
|
||||
Label LOOP, DONE;
|
||||
|
||||
cbz(cnt, DONE);
|
||||
BIND(LOOP);
|
||||
{
|
||||
Register chr = rscratch1;
|
||||
ldrh(chr, Address(post(src, 2)));
|
||||
tst(chr, ascii ? 0xff80 : 0xff00);
|
||||
br(NE, DONE);
|
||||
strb(chr, Address(post(dst, 1)));
|
||||
subs(cnt, cnt, 1);
|
||||
br(GT, LOOP);
|
||||
}
|
||||
BIND(DONE);
|
||||
// Return index where we stopped.
|
||||
subw(res, len, cnt);
|
||||
}
|
||||
|
||||
|
||||
// Inflate byte[] array to char[].
|
||||
address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
|
||||
FloatRegister vtmp1, FloatRegister vtmp2,
|
||||
@ -5136,13 +5143,13 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register
|
||||
|
||||
// Compress char[] array to byte[].
|
||||
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
|
||||
FloatRegister tmp1Reg, FloatRegister tmp2Reg,
|
||||
FloatRegister tmp3Reg, FloatRegister tmp4Reg,
|
||||
Register result) {
|
||||
encode_iso_array(src, dst, len, result,
|
||||
tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
|
||||
cmp(len, zr);
|
||||
csel(result, result, zr, EQ);
|
||||
Register res,
|
||||
FloatRegister tmp0, FloatRegister tmp1,
|
||||
FloatRegister tmp2, FloatRegister tmp3) {
|
||||
encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3);
|
||||
// Adjust result: res == len ? len : 0
|
||||
cmp(len, res);
|
||||
csel(res, res, zr, EQ);
|
||||
}
|
||||
|
||||
// get_thread() can be called anywhere inside generated code so we
|
||||
|
@ -1255,14 +1255,15 @@ public:
|
||||
FloatRegister vtmp3, Register tmp4);
|
||||
|
||||
void char_array_compress(Register src, Register dst, Register len,
|
||||
FloatRegister tmp1Reg, FloatRegister tmp2Reg,
|
||||
FloatRegister tmp3Reg, FloatRegister tmp4Reg,
|
||||
Register result);
|
||||
Register res,
|
||||
FloatRegister vtmp0, FloatRegister vtmp1,
|
||||
FloatRegister vtmp2, FloatRegister vtmp3);
|
||||
|
||||
void encode_iso_array(Register src, Register dst,
|
||||
Register len, Register result,
|
||||
FloatRegister Vtmp1, FloatRegister Vtmp2,
|
||||
FloatRegister Vtmp3, FloatRegister Vtmp4);
|
||||
Register len, Register res, bool ascii,
|
||||
FloatRegister vtmp0, FloatRegister vtmp1,
|
||||
FloatRegister vtmp2, FloatRegister vtmp3);
|
||||
|
||||
void fast_log(FloatRegister vtmp0, FloatRegister vtmp1, FloatRegister vtmp2,
|
||||
FloatRegister vtmp3, FloatRegister vtmp4, FloatRegister vtmp5,
|
||||
FloatRegister tmpC1, FloatRegister tmpC2, FloatRegister tmpC3,
|
||||
|
@ -161,6 +161,6 @@
|
||||
}
|
||||
|
||||
// Implements a variant of EncodeISOArrayNode that encodes ASCII only
|
||||
static const bool supports_encode_ascii_array = false;
|
||||
static const bool supports_encode_ascii_array = true;
|
||||
|
||||
#endif // CPU_AARCH64_MATCHER_AARCH64_HPP
|
||||
|
@ -61,7 +61,7 @@ public class CharsetEncodeDecode {
|
||||
private CharsetEncoder encoder;
|
||||
private CharsetDecoder decoder;
|
||||
|
||||
@Param({"UTF-8", "BIG5", "ISO-8859-15", "ASCII", "UTF-16"})
|
||||
@Param({"UTF-8", "BIG5", "ISO-8859-15", "ISO-8859-1", "ASCII", "UTF-16"})
|
||||
private String type;
|
||||
|
||||
@Param("16384")
|
||||
|
Loading…
Reference in New Issue
Block a user