8189112: AARCH64: optimize StringUTF16 compress intrinsic

Reviewed-by: aph
This commit is contained in:
Dmitrij Pochepko 2018-06-22 20:17:02 +03:00
parent 1d79d38007
commit 0d65441316

View File

@@ -5408,65 +5408,103 @@ void MacroAssembler::encode_iso_array(Register src, Register dst,
FloatRegister Vtmp1, FloatRegister Vtmp2, FloatRegister Vtmp1, FloatRegister Vtmp2,
FloatRegister Vtmp3, FloatRegister Vtmp4) FloatRegister Vtmp3, FloatRegister Vtmp4)
{ {
Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1; Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
Register tmp1 = rscratch1; NEXT_32_START, NEXT_32_PRFM_START;
Register tmp1 = rscratch1, tmp2 = rscratch2;
mov(result, len); // Save initial len mov(result, len); // Save initial len
#ifndef BUILTIN_SIM #ifndef BUILTIN_SIM
subs(len, len, 32); cmp(len, 8); // handle shortest strings first
br(LT, LOOP_8); br(LT, LOOP_1);
cmp(len, 32);
// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions br(LT, NEXT_8);
// to convert chars to bytes. These set the 'QC' bit in the FPSR if // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
// any char could not fit in a byte, so clear the FPSR so we can test it. // to convert chars to bytes
clear_fpsr(); if (SoftwarePrefetchHintDistance >= 0) {
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
BIND(NEXT_32); cmp(len, SoftwarePrefetchHintDistance/2 + 16);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); br(LE, NEXT_32_START);
uqxtn(Vtmp1, T8B, Vtmp1, T8H); // uqxtn - write bottom half b(NEXT_32_PRFM_START);
uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half BIND(NEXT_32_PRFM);
uqxtn(Vtmp2, T8B, Vtmp3, T8H); ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2 BIND(NEXT_32_PRFM_START);
get_fpsr(tmp1); prfm(Address(src, SoftwarePrefetchHintDistance));
cbnzw(tmp1, LOOP_8); orr(v4, T16B, Vtmp1, Vtmp2);
st1(Vtmp1, Vtmp2, T16B, post(dst, 32)); orr(v5, T16B, Vtmp3, Vtmp4);
subs(len, len, 32); uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
stpq(Vtmp1, Vtmp3, dst);
uzp2(v5, T16B, v4, v5); // high bytes
umov(tmp2, v5, D, 1);
fmovd(tmp1, v5);
orr(tmp1, tmp1, tmp2);
cbnz(tmp1, LOOP_8);
sub(len, len, 32);
add(dst, dst, 32);
add(src, src, 64);
cmp(len, SoftwarePrefetchHintDistance/2 + 16);
br(GE, NEXT_32_PRFM);
cmp(len, 32);
br(LT, LOOP_8);
BIND(NEXT_32);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
BIND(NEXT_32_START);
} else {
BIND(NEXT_32);
ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
}
prfm(Address(src, SoftwarePrefetchHintDistance));
uzp1(v4, T16B, Vtmp1, Vtmp2);
uzp1(v5, T16B, Vtmp3, Vtmp4);
stpq(v4, v5, dst);
orr(Vtmp1, T16B, Vtmp1, Vtmp2);
orr(Vtmp3, T16B, Vtmp3, Vtmp4);
uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
umov(tmp2, Vtmp1, D, 1);
fmovd(tmp1, Vtmp1);
orr(tmp1, tmp1, tmp2);
cbnz(tmp1, LOOP_8);
sub(len, len, 32);
add(dst, dst, 32);
add(src, src, 64); add(src, src, 64);
cmp(len, 32);
br(GE, NEXT_32); br(GE, NEXT_32);
cbz(len, DONE);
BIND(LOOP_8); BIND(LOOP_8);
adds(len, len, 32-8); cmp(len, 8);
br(LT, LOOP_1); br(LT, LOOP_1);
clear_fpsr(); // QC may be set from loop above, clear again
BIND(NEXT_8); BIND(NEXT_8);
ld1(Vtmp1, T8H, src); ld1(Vtmp1, T8H, src);
uqxtn(Vtmp1, T8B, Vtmp1, T8H); uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
get_fpsr(tmp1); uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
cbnzw(tmp1, LOOP_1); strd(Vtmp2, dst);
st1(Vtmp1, T8B, post(dst, 8)); fmovd(tmp1, Vtmp3);
subs(len, len, 8); cbnz(tmp1, NEXT_1);
sub(len, len, 8);
add(dst, dst, 8);
add(src, src, 16); add(src, src, 16);
cmp(len, 8);
br(GE, NEXT_8); br(GE, NEXT_8);
BIND(LOOP_1); BIND(LOOP_1);
adds(len, len, 8);
br(LE, DONE);
#else
cbz(len, DONE);
#endif #endif
cbz(len, DONE);
BIND(NEXT_1); BIND(NEXT_1);
ldrh(tmp1, Address(post(src, 2))); ldrh(tmp1, Address(post(src, 2)));
tst(tmp1, 0xff00);
br(NE, DONE);
strb(tmp1, Address(post(dst, 1))); strb(tmp1, Address(post(dst, 1)));
tst(tmp1, 0xff00);
br(NE, SET_RESULT);
subs(len, len, 1); subs(len, len, 1);
br(GT, NEXT_1); br(GT, NEXT_1);
BIND(DONE); BIND(SET_RESULT);
sub(result, result, len); // Return index where we stopped sub(result, result, len); // Return index where we stopped
// Return len == 0 if we processed all // Return len == 0 if we processed all
// characters // characters
BIND(DONE);
} }