8189112: AARCH64: optimize StringUTF16 compress intrinsic
Reviewed-by: aph
This commit is contained in:
parent
1d79d38007
commit
0d65441316
@ -5408,65 +5408,103 @@ void MacroAssembler::encode_iso_array(Register src, Register dst,
|
|||||||
FloatRegister Vtmp1, FloatRegister Vtmp2,
|
FloatRegister Vtmp1, FloatRegister Vtmp2,
|
||||||
FloatRegister Vtmp3, FloatRegister Vtmp4)
|
FloatRegister Vtmp3, FloatRegister Vtmp4)
|
||||||
{
|
{
|
||||||
Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
|
Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
|
||||||
Register tmp1 = rscratch1;
|
NEXT_32_START, NEXT_32_PRFM_START;
|
||||||
|
Register tmp1 = rscratch1, tmp2 = rscratch2;
|
||||||
|
|
||||||
mov(result, len); // Save initial len
|
mov(result, len); // Save initial len
|
||||||
|
|
||||||
#ifndef BUILTIN_SIM
|
#ifndef BUILTIN_SIM
|
||||||
subs(len, len, 32);
|
cmp(len, 8); // handle shortest strings first
|
||||||
br(LT, LOOP_8);
|
br(LT, LOOP_1);
|
||||||
|
cmp(len, 32);
|
||||||
// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
|
br(LT, NEXT_8);
|
||||||
// to convert chars to bytes. These set the 'QC' bit in the FPSR if
|
// The following code uses the SIMD 'uzp1' and 'uzp2' instructions
|
||||||
// any char could not fit in a byte, so clear the FPSR so we can test it.
|
// to convert chars to bytes
|
||||||
clear_fpsr();
|
if (SoftwarePrefetchHintDistance >= 0) {
|
||||||
|
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
|
||||||
BIND(NEXT_32);
|
cmp(len, SoftwarePrefetchHintDistance/2 + 16);
|
||||||
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
|
br(LE, NEXT_32_START);
|
||||||
uqxtn(Vtmp1, T8B, Vtmp1, T8H); // uqxtn - write bottom half
|
b(NEXT_32_PRFM_START);
|
||||||
uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
|
BIND(NEXT_32_PRFM);
|
||||||
uqxtn(Vtmp2, T8B, Vtmp3, T8H);
|
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
|
||||||
uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
|
BIND(NEXT_32_PRFM_START);
|
||||||
get_fpsr(tmp1);
|
prfm(Address(src, SoftwarePrefetchHintDistance));
|
||||||
cbnzw(tmp1, LOOP_8);
|
orr(v4, T16B, Vtmp1, Vtmp2);
|
||||||
st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
|
orr(v5, T16B, Vtmp3, Vtmp4);
|
||||||
subs(len, len, 32);
|
uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
|
||||||
|
uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
|
||||||
|
stpq(Vtmp1, Vtmp3, dst);
|
||||||
|
uzp2(v5, T16B, v4, v5); // high bytes
|
||||||
|
umov(tmp2, v5, D, 1);
|
||||||
|
fmovd(tmp1, v5);
|
||||||
|
orr(tmp1, tmp1, tmp2);
|
||||||
|
cbnz(tmp1, LOOP_8);
|
||||||
|
sub(len, len, 32);
|
||||||
|
add(dst, dst, 32);
|
||||||
|
add(src, src, 64);
|
||||||
|
cmp(len, SoftwarePrefetchHintDistance/2 + 16);
|
||||||
|
br(GE, NEXT_32_PRFM);
|
||||||
|
cmp(len, 32);
|
||||||
|
br(LT, LOOP_8);
|
||||||
|
BIND(NEXT_32);
|
||||||
|
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
|
||||||
|
BIND(NEXT_32_START);
|
||||||
|
} else {
|
||||||
|
BIND(NEXT_32);
|
||||||
|
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
|
||||||
|
}
|
||||||
|
prfm(Address(src, SoftwarePrefetchHintDistance));
|
||||||
|
uzp1(v4, T16B, Vtmp1, Vtmp2);
|
||||||
|
uzp1(v5, T16B, Vtmp3, Vtmp4);
|
||||||
|
stpq(v4, v5, dst);
|
||||||
|
orr(Vtmp1, T16B, Vtmp1, Vtmp2);
|
||||||
|
orr(Vtmp3, T16B, Vtmp3, Vtmp4);
|
||||||
|
uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
|
||||||
|
umov(tmp2, Vtmp1, D, 1);
|
||||||
|
fmovd(tmp1, Vtmp1);
|
||||||
|
orr(tmp1, tmp1, tmp2);
|
||||||
|
cbnz(tmp1, LOOP_8);
|
||||||
|
sub(len, len, 32);
|
||||||
|
add(dst, dst, 32);
|
||||||
add(src, src, 64);
|
add(src, src, 64);
|
||||||
|
cmp(len, 32);
|
||||||
br(GE, NEXT_32);
|
br(GE, NEXT_32);
|
||||||
|
cbz(len, DONE);
|
||||||
|
|
||||||
BIND(LOOP_8);
|
BIND(LOOP_8);
|
||||||
adds(len, len, 32-8);
|
cmp(len, 8);
|
||||||
br(LT, LOOP_1);
|
br(LT, LOOP_1);
|
||||||
clear_fpsr(); // QC may be set from loop above, clear again
|
|
||||||
BIND(NEXT_8);
|
BIND(NEXT_8);
|
||||||
ld1(Vtmp1, T8H, src);
|
ld1(Vtmp1, T8H, src);
|
||||||
uqxtn(Vtmp1, T8B, Vtmp1, T8H);
|
uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
|
||||||
get_fpsr(tmp1);
|
uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
|
||||||
cbnzw(tmp1, LOOP_1);
|
strd(Vtmp2, dst);
|
||||||
st1(Vtmp1, T8B, post(dst, 8));
|
fmovd(tmp1, Vtmp3);
|
||||||
subs(len, len, 8);
|
cbnz(tmp1, NEXT_1);
|
||||||
|
|
||||||
|
sub(len, len, 8);
|
||||||
|
add(dst, dst, 8);
|
||||||
add(src, src, 16);
|
add(src, src, 16);
|
||||||
|
cmp(len, 8);
|
||||||
br(GE, NEXT_8);
|
br(GE, NEXT_8);
|
||||||
|
|
||||||
BIND(LOOP_1);
|
BIND(LOOP_1);
|
||||||
adds(len, len, 8);
|
|
||||||
br(LE, DONE);
|
|
||||||
#else
|
|
||||||
cbz(len, DONE);
|
|
||||||
#endif
|
#endif
|
||||||
|
cbz(len, DONE);
|
||||||
BIND(NEXT_1);
|
BIND(NEXT_1);
|
||||||
ldrh(tmp1, Address(post(src, 2)));
|
ldrh(tmp1, Address(post(src, 2)));
|
||||||
tst(tmp1, 0xff00);
|
|
||||||
br(NE, DONE);
|
|
||||||
strb(tmp1, Address(post(dst, 1)));
|
strb(tmp1, Address(post(dst, 1)));
|
||||||
|
tst(tmp1, 0xff00);
|
||||||
|
br(NE, SET_RESULT);
|
||||||
subs(len, len, 1);
|
subs(len, len, 1);
|
||||||
br(GT, NEXT_1);
|
br(GT, NEXT_1);
|
||||||
|
|
||||||
BIND(DONE);
|
BIND(SET_RESULT);
|
||||||
sub(result, result, len); // Return index where we stopped
|
sub(result, result, len); // Return index where we stopped
|
||||||
// Return len == 0 if we processed all
|
// Return len == 0 if we processed all
|
||||||
// characters
|
// characters
|
||||||
|
BIND(DONE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user