8310268: RISC-V: misaligned memory access in String.Compare intrinsic

Co-authored-by: Feilong Jiang <fjiang@openjdk.org>
Reviewed-by: fyang
This commit is contained in:
Vladimir Kempik 2023-07-28 21:55:33 +00:00
parent 402cb6a550
commit d6245b6832
4 changed files with 100 additions and 119 deletions

@ -868,9 +868,10 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2,
// load first parts of strings and finish initialization while loading
{
if (str1_isL == str2_isL) { // LL or UU
// check if str1 and str2 is same pointer
beq(str1, str2, DONE);
// load 8 bytes once to compare
ld(tmp1, Address(str1));
beq(str1, str2, DONE);
ld(tmp2, Address(str2));
mv(t0, STUB_THRESHOLD);
bge(cnt2, t0, STUB);
@ -913,9 +914,8 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2,
addi(cnt1, cnt1, 8);
}
addi(cnt2, cnt2, isUL ? 4 : 8);
bne(tmp1, tmp2, DIFFERENCE);
bgez(cnt2, TAIL);
xorr(tmp3, tmp1, tmp2);
bnez(tmp3, DIFFERENCE);
// main loop
bind(NEXT_WORD);
@ -944,38 +944,30 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2,
addi(cnt1, cnt1, 8);
addi(cnt2, cnt2, 4);
}
bgez(cnt2, TAIL);
xorr(tmp3, tmp1, tmp2);
beqz(tmp3, NEXT_WORD);
j(DIFFERENCE);
bne(tmp1, tmp2, DIFFERENCE);
bltz(cnt2, NEXT_WORD);
bind(TAIL);
xorr(tmp3, tmp1, tmp2);
bnez(tmp3, DIFFERENCE);
// Last longword. In the case where length == 4 we compare the
// same longword twice, but that's still faster than another
// conditional branch.
if (str1_isL == str2_isL) { // LL or UU
ld(tmp1, Address(str1));
ld(tmp2, Address(str2));
load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
} else if (isLU) { // LU case
lwu(tmp1, Address(str1));
ld(tmp2, Address(str2));
load_int_misaligned(tmp1, Address(str1), tmp3, false);
load_long_misaligned(tmp2, Address(str2), tmp3, 2);
inflate_lo32(tmp3, tmp1);
mv(tmp1, tmp3);
} else { // UL case
lwu(tmp2, Address(str2));
ld(tmp1, Address(str1));
load_int_misaligned(tmp2, Address(str2), tmp3, false);
load_long_misaligned(tmp1, Address(str1), tmp3, 2);
inflate_lo32(tmp3, tmp2);
mv(tmp2, tmp3);
}
bind(TAIL_CHECK);
xorr(tmp3, tmp1, tmp2);
beqz(tmp3, DONE);
beq(tmp1, tmp2, DONE);
// Find the first different characters in the longwords and
// compute their difference.
bind(DIFFERENCE);
xorr(tmp3, tmp1, tmp2);
ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
srl(tmp1, tmp1, result);
srl(tmp2, tmp2, result);

@ -3967,18 +3967,17 @@ void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1
void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
assert_different_registers(Rd, Rs, tmp1, tmp2);
mv(tmp1, 0xFF);
mv(Rd, zr);
for (int i = 0; i <= 3; i++) {
mv(tmp1, 0xFF000000); // first byte mask at lower word
andr(Rd, Rs, tmp1);
for (int i = 0; i < 2; i++) {
slli(Rd, Rd, wordSize);
srli(tmp1, tmp1, wordSize);
andr(tmp2, Rs, tmp1);
if (i) {
slli(tmp2, tmp2, i * 8);
}
orr(Rd, Rd, tmp2);
if (i != 3) {
slli(tmp1, tmp1, 8);
}
}
slli(Rd, Rd, wordSize);
andi(tmp2, Rs, 0xFF); // last byte mask at lower word
orr(Rd, Rd, tmp2);
}
// This instruction reads adjacent 4 bytes from the upper half of source register,
@ -3987,17 +3986,8 @@ void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Regis
// Rd: 00A700A600A500A4
void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
assert_different_registers(Rd, Rs, tmp1, tmp2);
mv(tmp1, 0xFF00000000);
mv(Rd, zr);
for (int i = 0; i <= 3; i++) {
andr(tmp2, Rs, tmp1);
orr(Rd, Rd, tmp2);
srli(Rd, Rd, 8);
if (i != 3) {
slli(tmp1, tmp1, 8);
}
}
srli(Rs, Rs, 32); // only upper 32 bits are needed
inflate_lo32(Rd, Rs, tmp1, tmp2);
}
// The size of the blocks erased by the zero_blocks stub. We must

@ -2275,24 +2275,21 @@ class StubGenerator: public StubCodeGenerator {
}
// code for comparing 8 characters of strings with Latin1 and Utf16 encoding
void compare_string_8_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
Label &DIFF2) {
const Register strU = x12, curU = x7, strL = x29, tmp = x30;
__ ld(tmpL, Address(strL));
__ addi(strL, strL, 8);
void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) {
const Register tmp = x30, tmpLval = x12;
__ ld(tmpLval, Address(strL));
__ addi(strL, strL, wordSize);
__ ld(tmpU, Address(strU));
__ addi(strU, strU, 8);
__ inflate_lo32(tmp, tmpL);
__ mv(t0, tmp);
__ xorr(tmp, curU, t0);
__ bnez(tmp, DIFF2);
__ addi(strU, strU, wordSize);
__ inflate_lo32(tmpL, tmpLval);
__ xorr(tmp, tmpU, tmpL);
__ bnez(tmp, DIFF);
__ ld(curU, Address(strU));
__ addi(strU, strU, 8);
__ inflate_hi32(tmp, tmpL);
__ mv(t0, tmp);
__ xorr(tmp, tmpU, t0);
__ bnez(tmp, DIFF1);
__ ld(tmpU, Address(strU));
__ addi(strU, strU, wordSize);
__ inflate_hi32(tmpL, tmpLval);
__ xorr(tmp, tmpU, tmpL);
__ bnez(tmp, DIFF);
}
// x10 = result
@ -2307,11 +2304,9 @@ class StubGenerator: public StubCodeGenerator {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL");
address entry = __ pc();
Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
DONE, CALCULATE_DIFFERENCE;
const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
RegSet spilled_regs = RegSet::of(tmp4, tmp5);
Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
// cnt2 == amount of characters left to compare
// Check already loaded first 4 symbols
@ -2319,77 +2314,81 @@ class StubGenerator: public StubCodeGenerator {
__ mv(isLU ? tmp1 : tmp2, tmp3);
__ addi(str1, str1, isLU ? wordSize / 2 : wordSize);
__ addi(str2, str2, isLU ? wordSize : wordSize / 2);
__ sub(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
__ push_reg(spilled_regs, sp);
__ sub(cnt2, cnt2, wordSize / 2); // Already loaded 4 symbols
if (isLU) {
__ add(str1, str1, cnt2);
__ shadd(str2, cnt2, str2, t0, 1);
} else {
__ shadd(str1, cnt2, str1, t0, 1);
__ add(str2, str2, cnt2);
}
__ xorr(tmp3, tmp1, tmp2);
__ mv(tmp5, tmp2);
__ bnez(tmp3, CALCULATE_DIFFERENCE);
Register strU = isLU ? str2 : str1,
strL = isLU ? str1 : str2,
tmpU = isLU ? tmp5 : tmp1, // where to keep U for comparison
tmpL = isLU ? tmp1 : tmp5; // where to keep L for comparison
tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
__ sub(tmp2, strL, cnt2); // strL pointer to load from
__ slli(t0, cnt2, 1);
__ sub(cnt1, strU, t0); // strU pointer to load from
// make sure main loop is 8 byte-aligned, we should load another 4 bytes from strL
// cnt2 is >= 68 here, no need to check it for >= 0
__ lwu(tmpL, Address(strL));
__ addi(strL, strL, wordSize / 2);
__ ld(tmpU, Address(strU));
__ addi(strU, strU, wordSize);
__ inflate_lo32(tmp3, tmpL);
__ mv(tmpL, tmp3);
__ xorr(tmp3, tmpU, tmpL);
__ bnez(tmp3, CALCULATE_DIFFERENCE);
__ addi(cnt2, cnt2, -wordSize / 2);
__ ld(tmp4, Address(cnt1));
__ addi(cnt1, cnt1, 8);
__ beqz(cnt2, LOAD_LAST); // no characters left except last load
__ sub(cnt2, cnt2, 16);
// we are now 8-bytes aligned on strL
__ sub(cnt2, cnt2, wordSize * 2);
__ bltz(cnt2, TAIL);
__ bind(SMALL_LOOP); // smaller loop
__ sub(cnt2, cnt2, 16);
compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
__ sub(cnt2, cnt2, wordSize * 2);
compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
__ bgez(cnt2, SMALL_LOOP);
__ addi(t0, cnt2, 16);
__ beqz(t0, LOAD_LAST);
__ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
// Address of 8 bytes before last 4 characters in UTF-16 string
__ shadd(cnt1, cnt2, cnt1, t0, 1);
// Address of 16 bytes before last 4 characters in Latin1 string
__ add(tmp2, tmp2, cnt2);
__ ld(tmp4, Address(cnt1, -8));
// last 16 characters before last load
compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2);
__ j(LOAD_LAST);
__ bind(DIFF2);
__ mv(tmpU, tmp4);
__ bind(DIFF1);
__ mv(tmpL, t0);
__ j(CALCULATE_DIFFERENCE);
__ bind(LOAD_LAST);
// Last 4 UTF-16 characters are already pre-loaded into tmp4 by compare_string_8_x_LU.
// No need to load it again
__ mv(tmpU, tmp4);
__ ld(tmpL, Address(strL));
__ addi(t0, cnt2, wordSize * 2);
__ beqz(t0, DONE);
__ bind(TAIL); // 1..15 characters left
// Aligned access. Load bytes in portions - 4, 2, 1.
__ addi(t0, cnt2, wordSize);
__ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
__ bltz(t0, LOAD_LAST);
// remaining characters are greater than or equals to 8, we can do one compare_string_8_x_LU
compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
__ addi(cnt2, cnt2, -wordSize);
__ beqz(cnt2, DONE); // no character left
__ bind(LOAD_LAST); // cnt2 = 1..7 characters left
__ addi(cnt2, cnt2, -wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
__ slli(t0, cnt2, 1); // t0 is now an offset in strU which points to last 16 bytes
__ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
__ add(strU, strU, t0); // Address of last 16 bytes in UTF-16 string
__ load_int_misaligned(tmpL, Address(strL), t0, false);
__ load_long_misaligned(tmpU, Address(strU), t0, 2);
__ inflate_lo32(tmp3, tmpL);
__ mv(tmpL, tmp3);
__ xorr(tmp3, tmpU, tmpL);
__ beqz(tmp3, DONE);
__ bnez(tmp3, CALCULATE_DIFFERENCE);
__ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
__ addi(strU, strU, wordSize); // Address of last 8 bytes in UTF-16 string
__ load_int_misaligned(tmpL, Address(strL), t0, false);
__ load_long_misaligned(tmpU, Address(strU), t0, 2);
__ inflate_lo32(tmp3, tmpL);
__ mv(tmpL, tmp3);
__ xorr(tmp3, tmpU, tmpL);
__ bnez(tmp3, CALCULATE_DIFFERENCE);
__ j(DONE); // no character left
// Find the first different characters in the longwords and
// compute their difference.
__ bind(CALCULATE_DIFFERENCE);
__ ctzc_bit(tmp4, tmp3);
__ srl(tmp1, tmp1, tmp4);
__ srl(tmp5, tmp5, tmp4);
__ srl(tmp2, tmp2, tmp4);
__ andi(tmp1, tmp1, 0xFFFF);
__ andi(tmp5, tmp5, 0xFFFF);
__ sub(result, tmp1, tmp5);
__ andi(tmp2, tmp2, 0xFFFF);
__ sub(result, tmp1, tmp2);
__ bind(DONE);
__ pop_reg(spilled_regs, sp);
__ ret();
return entry;
}
@ -2502,9 +2501,9 @@ class StubGenerator: public StubCodeGenerator {
__ xorr(tmp4, tmp1, tmp2);
__ bnez(tmp4, DIFF);
__ add(str1, str1, cnt2);
__ ld(tmp5, Address(str1));
__ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
__ add(str2, str2, cnt2);
__ ld(cnt1, Address(str2));
__ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
__ xorr(tmp4, tmp5, cnt1);
__ beqz(tmp4, LENGTH_DIFF);
// Find the first different characters in the longwords and

@ -24,20 +24,20 @@
/*
* @test
* @requires os.arch=="aarch64"
* @requires os.arch=="aarch64" | os.arch=="riscv64"
* @summary String::compareTo implementation uses different algorithms for
* different string length. This test creates string with specified
* size and longer string, which is same at beginning.
* Expecting length delta to be returned. Test class takes 2
* parameters: <string length>, <maximum string length delta>
* Input parameters for this test are set according to Aarch64
* Input parameters for this test are set according to Aarch64/RISC-V
* String::compareTo intrinsic implementation specifics. Aarch64
* implementation has 1, 4, 8 -bytes loops for length < 72 and
* 16, 32, 64 -characters loops for length >= 72. Code is also affected
* 16, 32, 64 -characters loops for length >= 72. Aarch64 Code is also affected
* by SoftwarePrefetchHintDistance vm flag value.
* @run main/othervm -XX:SoftwarePrefetchHintDistance=192 compiler.intrinsics.string.TestStringCompareToDifferentLength 4 2 5 10 13 17 20 23 24 25 71 72 73 88 90 192 193 208 209
* @run main/othervm -XX:SoftwarePrefetchHintDistance=16 compiler.intrinsics.string.TestStringCompareToDifferentLength 4 2 5 10 13 17 20 23 24 25 71 72 73 88 90
* @run main/othervm -XX:SoftwarePrefetchHintDistance=-1 compiler.intrinsics.string.TestStringCompareToDifferentLength 4 2 5 10 13 17 20 23 24 25 71 72 73 88 90
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:SoftwarePrefetchHintDistance=192 compiler.intrinsics.string.TestStringCompareToDifferentLength 4 2 5 10 13 17 20 23 24 25 71 72 73 88 90 192 193 208 209
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:SoftwarePrefetchHintDistance=16 compiler.intrinsics.string.TestStringCompareToDifferentLength 4 2 5 10 13 17 20 23 24 25 71 72 73 88 90
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:SoftwarePrefetchHintDistance=-1 compiler.intrinsics.string.TestStringCompareToDifferentLength 4 2 5 10 13 17 20 23 24 25 71 72 73 88 90
*/
package compiler.intrinsics.string;