8311906: Improve robustness of String constructors with mutable array inputs

Co-authored-by: Damon Fenacci <dfenacci@openjdk.org>
Co-authored-by: Claes Redestad <redestad@openjdk.org>
Co-authored-by: Amit Kumar <amitkumar@openjdk.org>
Co-authored-by: Martin Doerr <mdoerr@openjdk.org>
Reviewed-by: rgiulietti, thartmann, redestad, dfenacci
This commit is contained in:
Roger Riggs 2023-12-04 18:28:59 +00:00
parent 316b78336c
commit 155abc576a
15 changed files with 1300 additions and 248 deletions

View File

@ -5651,7 +5651,7 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value)
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
// return the number of characters copied. // return the number of characters copied.
// - java/lang/StringUTF16.compress // - java/lang/StringUTF16.compress
// return zero (0) if copy fails, otherwise 'len'. // return index of non-latin1 character if copy fails, otherwise 'len'.
// //
// This version always returns the number of characters copied, and does not // This version always returns the number of characters copied, and does not
// clobber the 'len' register. A successful copy will complete with the post- // clobber the 'len' register. A successful copy will complete with the post-
@ -5868,15 +5868,15 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register
} }
// Compress char[] array to byte[]. // Compress char[] array to byte[].
// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
// Return the array length if every element in array can be encoded,
// otherwise, the index of first non-latin1 (> 0xff) character.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len, void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
Register res, Register res,
FloatRegister tmp0, FloatRegister tmp1, FloatRegister tmp0, FloatRegister tmp1,
FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp2, FloatRegister tmp3,
FloatRegister tmp4, FloatRegister tmp5) { FloatRegister tmp4, FloatRegister tmp5) {
encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
// Adjust result: res == len ? len : 0
cmp(len, res);
csel(res, res, zr, EQ);
} }
// java.math.round(double a) // java.math.round(double a)

View File

@ -12727,16 +12727,8 @@ instruct string_compress(rarg1RegP src, rarg2RegP dst, iRegIsrc len, iRegIdst re
ins_cost(300); ins_cost(300);
format %{ "String Compress $src,$dst,$len -> $result \t// KILL $tmp1, $tmp2, $tmp3, $tmp4, $tmp5" %} format %{ "String Compress $src,$dst,$len -> $result \t// KILL $tmp1, $tmp2, $tmp3, $tmp4, $tmp5" %}
ins_encode %{ ins_encode %{
Label Lskip, Ldone; __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, $tmp1$$Register, $tmp2$$Register,
__ li($result$$Register, 0); $tmp3$$Register, $tmp4$$Register, $tmp5$$Register, $result$$Register, false);
__ string_compress_16($src$$Register, $dst$$Register, $len$$Register, $tmp1$$Register,
$tmp2$$Register, $tmp3$$Register, $tmp4$$Register, $tmp5$$Register, Ldone);
__ rldicl_($tmp1$$Register, $len$$Register, 0, 64-3); // Remaining characters.
__ beq(CCR0, Lskip);
__ string_compress($src$$Register, $dst$$Register, $tmp1$$Register, $tmp2$$Register, Ldone);
__ bind(Lskip);
__ mr($result$$Register, $len$$Register);
__ bind(Ldone);
%} %}
ins_pipe(pipe_class_default); ins_pipe(pipe_class_default);
%} %}

View File

@ -1933,14 +1933,12 @@ void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Registe
} }
// Compress char[] array to byte[]. // Compress char[] array to byte[].
// result: the array length if every element in array can be encoded; 0, otherwise. // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
// result: the array length if every element in array can be encoded,
// otherwise, the index of first non-latin1 (> 0xff) character.
void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
Register result, Register tmp) { Register result, Register tmp) {
Label done;
encode_iso_array_v(src, dst, len, result, tmp, false); encode_iso_array_v(src, dst, len, result, tmp, false);
beqz(len, done);
mv(result, zr);
bind(done);
} }
// Intrinsic for // Intrinsic for
@ -1948,7 +1946,7 @@ void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Regist
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
// return the number of characters copied. // return the number of characters copied.
// - java/lang/StringUTF16.compress // - java/lang/StringUTF16.compress
// return zero (0) if copy fails, otherwise 'len'. // return index of non-latin1 character if copy fails, otherwise 'len'.
// //
// This version always returns the number of characters copied. A successful // This version always returns the number of characters copied. A successful
// copy will complete with the post-condition: 'res' == 'len', while an // copy will complete with the post-condition: 'res' == 'len', while an

View File

@ -10190,7 +10190,7 @@ instruct string_compress(iRegP src, iRegP dst, iRegI result, iRegI len, iRegI tm
format %{ "String Compress $src->$dst($len) -> $result" %} format %{ "String Compress $src->$dst($len) -> $result" %}
ins_encode %{ ins_encode %{
__ string_compress($result$$Register, $src$$Register, $dst$$Register, $len$$Register, __ string_compress($result$$Register, $src$$Register, $dst$$Register, $len$$Register,
$tmp$$Register, false, false); $tmp$$Register, true, false);
%} %}
ins_pipe(pipe_class_dummy); ins_pipe(pipe_class_dummy);
%} %}

View File

@ -8628,15 +8628,19 @@ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Registe
#undef BLOCK_COMMENT #undef BLOCK_COMMENT
// Compress char[] array to byte[]. // Compress char[] array to byte[].
// ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
// Return the array length if every element in array can be encoded,
// otherwise, the index of first non-latin1 (> 0xff) character.
// @IntrinsicCandidate // @IntrinsicCandidate
// private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { // public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
// for (int i = 0; i < len; i++) { // for (int i = 0; i < len; i++) {
// int c = src[srcOff++]; // char c = src[srcOff];
// if (c >>> 8 != 0) { // if (c > 0xff) {
// return 0; // return i; // return index of non-latin1 char
// } // }
// dst[dstOff++] = (byte)c; // dst[dstOff] = (byte)c;
// srcOff++;
// dstOff++;
// } // }
// return len; // return len;
// } // }
@ -8644,7 +8648,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
XMMRegister tmp1Reg, XMMRegister tmp2Reg, XMMRegister tmp1Reg, XMMRegister tmp2Reg,
XMMRegister tmp3Reg, XMMRegister tmp4Reg, XMMRegister tmp3Reg, XMMRegister tmp4Reg,
Register tmp5, Register result, KRegister mask1, KRegister mask2) { Register tmp5, Register result, KRegister mask1, KRegister mask2) {
Label copy_chars_loop, return_length, return_zero, done; Label copy_chars_loop, done, reset_sp, copy_tail;
// rsi: src // rsi: src
// rdi: dst // rdi: dst
@ -8659,28 +8663,28 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
assert(len != result, ""); assert(len != result, "");
// save length for return // save length for return
push(len); movl(result, len);
if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() && VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) { VM_Version::supports_bmi2()) {
Label copy_32_loop, copy_loop_tail, below_threshold; Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;
// alignment // alignment
Label post_alignment; Label post_alignment;
// if length of the string is less than 16, handle it in an old fashioned way // if length of the string is less than 32, handle it the old fashioned way
testl(len, -32); testl(len, -32);
jcc(Assembler::zero, below_threshold); jcc(Assembler::zero, below_threshold);
// First check whether a character is compressible ( <= 0xFF). // First check whether a character is compressible ( <= 0xFF).
// Create mask to test for Unicode chars inside zmm vector // Create mask to test for Unicode chars inside zmm vector
movl(result, 0x00FF); movl(tmp5, 0x00FF);
evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit); evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);
testl(len, -64); testl(len, -64);
jcc(Assembler::zero, post_alignment); jccb(Assembler::zero, post_alignment);
movl(tmp5, dst); movl(tmp5, dst);
andl(tmp5, (32 - 1)); andl(tmp5, (32 - 1));
@ -8689,18 +8693,19 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
// bail out when there is nothing to be done // bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF); testl(tmp5, 0xFFFFFFFF);
jcc(Assembler::zero, post_alignment); jccb(Assembler::zero, post_alignment);
// ~(~0 << len), where len is the # of remaining elements to process // ~(~0 << len), where len is the # of remaining elements to process
movl(result, 0xFFFFFFFF); movl(len, 0xFFFFFFFF);
shlxl(result, result, tmp5); shlxl(len, len, tmp5);
notl(result); notl(len);
kmovdl(mask2, result); kmovdl(mask2, len);
movl(len, result);
evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
ktestd(mask1, mask2); ktestd(mask1, mask2);
jcc(Assembler::carryClear, return_zero); jcc(Assembler::carryClear, copy_tail);
evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
@ -8715,7 +8720,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
movl(tmp5, len); movl(tmp5, len);
andl(tmp5, (32 - 1)); // tail count (in chars) andl(tmp5, (32 - 1)); // tail count (in chars)
andl(len, ~(32 - 1)); // vector count (in chars) andl(len, ~(32 - 1)); // vector count (in chars)
jcc(Assembler::zero, copy_loop_tail); jccb(Assembler::zero, copy_loop_tail);
lea(src, Address(src, len, Address::times_2)); lea(src, Address(src, len, Address::times_2));
lea(dst, Address(dst, len, Address::times_1)); lea(dst, Address(dst, len, Address::times_1));
@ -8725,55 +8730,60 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit); evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
kortestdl(mask1, mask1); kortestdl(mask1, mask1);
jcc(Assembler::carryClear, return_zero); jccb(Assembler::carryClear, reset_for_copy_tail);
// All elements in current processed chunk are valid candidates for // All elements in current processed chunk are valid candidates for
// compression. Write the truncated byte elements to memory. // compression. Write the truncated byte elements to memory.
evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit); evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
addptr(len, 32); addptr(len, 32);
jcc(Assembler::notZero, copy_32_loop); jccb(Assembler::notZero, copy_32_loop);
bind(copy_loop_tail); bind(copy_loop_tail);
// bail out when there is nothing to be done // bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF); testl(tmp5, 0xFFFFFFFF);
jcc(Assembler::zero, return_length); jcc(Assembler::zero, done);
movl(len, tmp5); movl(len, tmp5);
// ~(~0 << len), where len is the # of remaining elements to process // ~(~0 << len), where len is the # of remaining elements to process
movl(result, 0xFFFFFFFF); movl(tmp5, 0xFFFFFFFF);
shlxl(result, result, len); shlxl(tmp5, tmp5, len);
notl(result); notl(tmp5);
kmovdl(mask2, result); kmovdl(mask2, tmp5);
evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
ktestd(mask1, mask2); ktestd(mask1, mask2);
jcc(Assembler::carryClear, return_zero); jcc(Assembler::carryClear, copy_tail);
evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
jmp(return_length); jmp(done);
bind(reset_for_copy_tail);
lea(src, Address(src, tmp5, Address::times_2));
lea(dst, Address(dst, tmp5, Address::times_1));
subptr(len, tmp5);
jmp(copy_chars_loop);
bind(below_threshold); bind(below_threshold);
} }
if (UseSSE42Intrinsics) { if (UseSSE42Intrinsics) {
Label copy_32_loop, copy_16, copy_tail; Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;
movl(result, len);
movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
// vectored compression // vectored compression
andl(len, 0xfffffff0); // vector count (in chars) testl(len, 0xfffffff8);
andl(result, 0x0000000f); // tail count (in chars) jcc(Assembler::zero, copy_tail);
testl(len, len);
jcc(Assembler::zero, copy_16);
// compress 16 chars per iter movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
movdl(tmp1Reg, tmp5); movdl(tmp1Reg, tmp5);
pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
andl(len, 0xfffffff0);
jccb(Assembler::zero, copy_16);
// compress 16 chars per iter
pxor(tmp4Reg, tmp4Reg); pxor(tmp4Reg, tmp4Reg);
lea(src, Address(src, len, Address::times_2)); lea(src, Address(src, len, Address::times_2));
@ -8786,59 +8796,60 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
por(tmp4Reg, tmp3Reg); por(tmp4Reg, tmp3Reg);
ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
jcc(Assembler::notZero, return_zero); jccb(Assembler::notZero, reset_for_copy_tail);
packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte
movdqu(Address(dst, len, Address::times_1), tmp2Reg); movdqu(Address(dst, len, Address::times_1), tmp2Reg);
addptr(len, 16); addptr(len, 16);
jcc(Assembler::notZero, copy_32_loop); jccb(Assembler::notZero, copy_32_loop);
// compress next vector of 8 chars (if any) // compress next vector of 8 chars (if any)
bind(copy_16); bind(copy_16);
movl(len, result); // len = 0
andl(len, 0xfffffff8); // vector count (in chars) testl(result, 0x00000008); // check if there's a block of 8 chars to compress
andl(result, 0x00000007); // tail count (in chars) jccb(Assembler::zero, copy_tail_sse);
testl(len, len);
jccb(Assembler::zero, copy_tail);
movdl(tmp1Reg, tmp5);
pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
pxor(tmp3Reg, tmp3Reg); pxor(tmp3Reg, tmp3Reg);
movdqu(tmp2Reg, Address(src, 0)); movdqu(tmp2Reg, Address(src, 0));
ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
jccb(Assembler::notZero, return_zero); jccb(Assembler::notZero, reset_for_copy_tail);
packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
movq(Address(dst, 0), tmp2Reg); movq(Address(dst, 0), tmp2Reg);
addptr(src, 16); addptr(src, 16);
addptr(dst, 8); addptr(dst, 8);
jmpb(copy_tail_sse);
bind(copy_tail); bind(reset_for_copy_tail);
movl(tmp5, result);
andl(tmp5, 0x0000000f);
lea(src, Address(src, tmp5, Address::times_2));
lea(dst, Address(dst, tmp5, Address::times_1));
subptr(len, tmp5);
jmpb(copy_chars_loop);
bind(copy_tail_sse);
movl(len, result); movl(len, result);
andl(len, 0x00000007); // tail count (in chars)
} }
// compress 1 char per iter // compress 1 char per iter
bind(copy_tail);
testl(len, len); testl(len, len);
jccb(Assembler::zero, return_length); jccb(Assembler::zero, done);
lea(src, Address(src, len, Address::times_2)); lea(src, Address(src, len, Address::times_2));
lea(dst, Address(dst, len, Address::times_1)); lea(dst, Address(dst, len, Address::times_1));
negptr(len); negptr(len);
bind(copy_chars_loop); bind(copy_chars_loop);
load_unsigned_short(result, Address(src, len, Address::times_2)); load_unsigned_short(tmp5, Address(src, len, Address::times_2));
testl(result, 0xff00); // check if Unicode char testl(tmp5, 0xff00); // check if Unicode char
jccb(Assembler::notZero, return_zero); jccb(Assembler::notZero, reset_sp);
movb(Address(dst, len, Address::times_1), result); // ASCII char; compress to 1 byte movb(Address(dst, len, Address::times_1), tmp5); // ASCII char; compress to 1 byte
increment(len); increment(len);
jcc(Assembler::notZero, copy_chars_loop); jccb(Assembler::notZero, copy_chars_loop);
// if compression succeeded, return length // add len then return (len will be zero if compress succeeded, otherwise negative)
bind(return_length); bind(reset_sp);
pop(result); addl(result, len);
jmpb(done);
// if compression failed, return 0
bind(return_zero);
xorl(result, result);
addptr(rsp, wordSize);
bind(done); bind(done);
} }

View File

@ -130,6 +130,9 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
* as the specified {@code CharSequence}. The initial capacity of * as the specified {@code CharSequence}. The initial capacity of
* the string builder is {@code 16} plus the length of the * the string builder is {@code 16} plus the length of the
* {@code CharSequence} argument. * {@code CharSequence} argument.
* <p>
* The contents are unspecified if the {@code CharSequence}
* is modified during string construction.
* *
* @param seq the sequence to copy. * @param seq the sequence to copy.
*/ */
@ -666,6 +669,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
* If {@code s} is {@code null}, then this method appends * If {@code s} is {@code null}, then this method appends
* characters as if the s parameter was a sequence containing the four * characters as if the s parameter was a sequence containing the four
* characters {@code "null"}. * characters {@code "null"}.
* <p>
* The contents are unspecified if the {@code CharSequence}
* is modified during the method call or an exception is thrown
* when accessing the {@code CharSequence}.
* *
* @param s the sequence to append. * @param s the sequence to append.
* @param start the starting index of the subsequence to be appended. * @param start the starting index of the subsequence to be appended.
@ -1241,6 +1248,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
* invocation of this object's * invocation of this object's
* {@link #insert(int,CharSequence,int,int) insert}(dstOffset, s, 0, s.length()) * {@link #insert(int,CharSequence,int,int) insert}(dstOffset, s, 0, s.length())
* method. * method.
* <p>
* The contents are unspecified if the {@code CharSequence}
* is modified during the method call or an exception is thrown
* when accessing the {@code CharSequence}.
* *
* <p>If {@code s} is {@code null}, then the four characters * <p>If {@code s} is {@code null}, then the four characters
* {@code "null"} are inserted into this sequence. * {@code "null"} are inserted into this sequence.
@ -1289,6 +1300,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
* <p>If {@code s} is {@code null}, then this method inserts * <p>If {@code s} is {@code null}, then this method inserts
* characters as if the s parameter was a sequence containing the four * characters as if the s parameter was a sequence containing the four
* characters {@code "null"}. * characters {@code "null"}.
* <p>
* The contents are unspecified if the {@code CharSequence}
* is modified during the method call or an exception is thrown
* when accessing the {@code CharSequence}.
* *
* @param dstOffset the offset in this sequence. * @param dstOffset the offset in this sequence.
* @param s the sequence to be inserted. * @param s the sequence to be inserted.
@ -1675,11 +1690,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
/* for readObject() */ /* for readObject() */
void initBytes(char[] value, int off, int len) { void initBytes(char[] value, int off, int len) {
if (String.COMPACT_STRINGS) { if (String.COMPACT_STRINGS) {
this.value = StringUTF16.compress(value, off, len); byte[] val = StringUTF16.compress(value, off, len);
if (this.value != null) { this.coder = StringUTF16.coderFromArrayLen(val, len);
this.coder = LATIN1; this.value = val;
return; return;
}
} }
this.coder = UTF16; this.coder = UTF16;
this.value = StringUTF16.toBytes(value, off, len); this.value = StringUTF16.toBytes(value, off, len);
@ -1720,6 +1734,9 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
val[j++] = (byte)c; val[j++] = (byte)c;
} else { } else {
inflate(); inflate();
// store c to make sure it has a UTF16 char
StringUTF16.putChar(this.value, j++, c);
i++;
StringUTF16.putCharsSB(this.value, j, s, i, end); StringUTF16.putCharsSB(this.value, j, s, i, end);
return; return;
} }
@ -1812,6 +1829,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
} else { } else {
count = j; count = j;
inflate(); inflate();
// Store c to make sure sb has a UTF16 char
StringUTF16.putChar(this.value, j++, c);
count = j;
i++;
StringUTF16.putCharsSB(this.value, j, s, i, end); StringUTF16.putCharsSB(this.value, j, s, i, end);
count += end - i; count += end - i;
return; return;
@ -1923,6 +1944,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
* <p> * <p>
* If {@code cs} is {@code null}, then the four characters * If {@code cs} is {@code null}, then the four characters
* {@code "null"} are repeated into this sequence. * {@code "null"} are repeated into this sequence.
* <p>
* The contents are unspecified if the {@code CharSequence}
* is modified during the method call or an exception is thrown
* when accessing the {@code CharSequence}.
* *
* @param cs a {@code CharSequence} * @param cs a {@code CharSequence}
* @param count number of times to copy * @param count number of times to copy

View File

@ -57,6 +57,10 @@ public interface Appendable {
* {@code csq}, the entire sequence may not be appended. For * {@code csq}, the entire sequence may not be appended. For
* instance, if {@code csq} is a {@link java.nio.CharBuffer} then * instance, if {@code csq} is a {@link java.nio.CharBuffer} then
* the subsequence to append is defined by the buffer's position and limit. * the subsequence to append is defined by the buffer's position and limit.
* <p>
* The contents of this {@code Appendable} are unspecified if the {@code CharSequence}
* is modified during the method call or an exception is thrown
* when accessing the {@code CharSequence}.
* *
* @param csq * @param csq
* The character sequence to append. If {@code csq} is * The character sequence to append. If {@code csq} is
@ -81,6 +85,10 @@ public interface Appendable {
* <pre> * <pre>
* out.append(csq.subSequence(start, end)) </pre> * out.append(csq.subSequence(start, end)) </pre>
* *
* <p>
* The contents of this {@code Appendable} are unspecified if the {@code CharSequence}
* is modified during the method call or an exception is thrown
* when accessing the {@code CharSequence}.
* @param csq * @param csq
* The character sequence from which a subsequence will be * The character sequence from which a subsequence will be
* appended. If {@code csq} is {@code null}, then characters * appended. If {@code csq} is {@code null}, then characters

View File

@ -273,6 +273,9 @@ public final class String
* contents of the character array are copied; subsequent modification of * contents of the character array are copied; subsequent modification of
* the character array does not affect the newly created string. * the character array does not affect the newly created string.
* *
* <p> The contents of the string are unspecified if the character array
* is modified during string construction.
*
* @param value * @param value
* The initial value of the string * The initial value of the string
*/ */
@ -288,6 +291,9 @@ public final class String
* subarray are copied; subsequent modification of the character array does * subarray are copied; subsequent modification of the character array does
* not affect the newly created string. * not affect the newly created string.
* *
* <p> The contents of the string are unspecified if the character array
* is modified during string construction.
*
* @param value * @param value
* Array that is the source of characters * Array that is the source of characters
* *
@ -319,6 +325,9 @@ public final class String
* {@code char}s; subsequent modification of the {@code int} array does not * {@code char}s; subsequent modification of the {@code int} array does not
* affect the newly created string. * affect the newly created string.
* *
* <p> The contents of the string are unspecified if the code points array
* is modified during string construction.
*
* @param codePoints * @param codePoints
* Array that is the source of Unicode code points * Array that is the source of Unicode code points
* *
@ -346,12 +355,10 @@ public final class String
return; return;
} }
if (COMPACT_STRINGS) { if (COMPACT_STRINGS) {
byte[] val = StringLatin1.toBytes(codePoints, offset, count); byte[] val = StringUTF16.compress(codePoints, offset, count);
if (val != null) { this.coder = StringUTF16.coderFromArrayLen(val, count);
this.coder = LATIN1; this.value = val;
this.value = val; return;
return;
}
} }
this.coder = UTF16; this.coder = UTF16;
this.value = StringUTF16.toBytes(codePoints, offset, count); this.value = StringUTF16.toBytes(codePoints, offset, count);
@ -368,6 +375,9 @@ public final class String
* <p> Each {@code byte} in the subarray is converted to a {@code char} as * <p> Each {@code byte} in the subarray is converted to a {@code char} as
* specified in the {@link #String(byte[],int) String(byte[],int)} constructor. * specified in the {@link #String(byte[],int) String(byte[],int)} constructor.
* *
* <p> The contents of the string are unspecified if the byte array
* is modified during string construction.
*
* @deprecated This method does not properly convert bytes into characters. * @deprecated This method does not properly convert bytes into characters.
* As of JDK&nbsp;1.1, the preferred way to do this is via the * As of JDK&nbsp;1.1, the preferred way to do this is via the
* {@code String} constructors that take a {@link Charset}, charset name, * {@code String} constructors that take a {@link Charset}, charset name,
@ -429,6 +439,9 @@ public final class String
* | (<b><i>b</i></b> &amp; 0xff)) * | (<b><i>b</i></b> &amp; 0xff))
* </pre></blockquote> * </pre></blockquote>
* *
* <p> The contents of the string are unspecified if the byte array
* is modified during string construction.
*
* @deprecated This method does not properly convert bytes into * @deprecated This method does not properly convert bytes into
* characters. As of JDK&nbsp;1.1, the preferred way to do this is via the * characters. As of JDK&nbsp;1.1, the preferred way to do this is via the
* {@code String} constructors that take a {@link Charset}, charset name, * {@code String} constructors that take a {@link Charset}, charset name,
@ -463,6 +476,9 @@ public final class String
* java.nio.charset.CharsetDecoder} class should be used when more control * java.nio.charset.CharsetDecoder} class should be used when more control
* over the decoding process is required. * over the decoding process is required.
* *
* <p> The contents of the string are unspecified if the byte array
* is modified during string construction.
*
* @param bytes * @param bytes
* The bytes to be decoded into characters * The bytes to be decoded into characters
* *
@ -501,6 +517,9 @@ public final class String
* java.nio.charset.CharsetDecoder} class should be used when more control * java.nio.charset.CharsetDecoder} class should be used when more control
* over the decoding process is required. * over the decoding process is required.
* *
* <p> The contents of the string are unspecified if the byte array
* is modified during string construction.
*
* @param bytes * @param bytes
* The bytes to be decoded into characters * The bytes to be decoded into characters
* *
@ -543,47 +562,43 @@ public final class String
this.coder = LATIN1; this.coder = LATIN1;
return; return;
} }
int sl = offset + length; // Decode with a stable copy, to be the result if the decoded length is the same
byte[] dst = new byte[length]; byte[] latin1 = Arrays.copyOfRange(bytes, offset, offset + length);
if (dp > 0) { int sp = dp; // first dp bytes are already in the copy
System.arraycopy(bytes, offset, dst, 0, dp); while (sp < length) {
offset += dp; int b1 = latin1[sp++];
}
while (offset < sl) {
int b1 = bytes[offset++];
if (b1 >= 0) { if (b1 >= 0) {
dst[dp++] = (byte)b1; latin1[dp++] = (byte)b1;
continue; continue;
} }
if ((b1 & 0xfe) == 0xc2 && offset < sl) { // b1 either 0xc2 or 0xc3 if ((b1 & 0xfe) == 0xc2 && sp < length) { // b1 either 0xc2 or 0xc3
int b2 = bytes[offset]; int b2 = latin1[sp];
if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65 if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65
dst[dp++] = (byte)decode2(b1, b2); latin1[dp++] = (byte)decode2(b1, b2);
offset++; sp++;
continue; continue;
} }
} }
// anything not a latin1, including the REPL // anything not a latin1, including the REPL
// we have to go with the utf16 // we have to go with the utf16
offset--; sp--;
break; break;
} }
if (offset == sl) { if (sp == length) {
if (dp != dst.length) { if (dp != latin1.length) {
dst = Arrays.copyOf(dst, dp); latin1 = Arrays.copyOf(latin1, dp);
} }
this.value = dst; this.value = latin1;
this.coder = LATIN1; this.coder = LATIN1;
return; return;
} }
byte[] buf = new byte[length << 1]; byte[] utf16 = new byte[length << 1];
StringLatin1.inflate(dst, 0, buf, 0, dp); StringLatin1.inflate(latin1, 0, utf16, 0, dp);
dst = buf; dp = decodeUTF8_UTF16(latin1, sp, length, utf16, dp, true);
dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true);
if (dp != length) { if (dp != length) {
dst = Arrays.copyOf(dst, dp << 1); utf16 = Arrays.copyOf(utf16, dp << 1);
} }
this.value = dst; this.value = utf16;
this.coder = UTF16; this.coder = UTF16;
} else { // !COMPACT_STRINGS } else { // !COMPACT_STRINGS
byte[] dst = new byte[length << 1]; byte[] dst = new byte[length << 1];
@ -655,12 +670,10 @@ public final class String
char[] ca = new char[en]; char[] ca = new char[en];
int clen = ad.decode(bytes, offset, length, ca); int clen = ad.decode(bytes, offset, length, ca);
if (COMPACT_STRINGS) {
    // compress() hands back either a latin1 array of exactly clen bytes or,
    // when a non-latin1 char is present, a UTF16 array; the coder is derived
    // from the returned array's length relative to clen.
    byte[] val = StringUTF16.compress(ca, 0, clen);
    this.coder = StringUTF16.coderFromArrayLen(val, clen);
    this.value = val;
    return;
}
coder = UTF16; coder = UTF16;
value = StringUTF16.toBytes(ca, 0, clen); value = StringUTF16.toBytes(ca, 0, clen);
@ -686,12 +699,10 @@ public final class String
throw new Error(x); throw new Error(x);
} }
if (COMPACT_STRINGS) { if (COMPACT_STRINGS) {
byte[] bs = StringUTF16.compress(ca, 0, caLen); byte[] val = StringUTF16.compress(ca, 0, caLen);
if (bs != null) { this.coder = StringUTF16.coderFromArrayLen(val, caLen);
value = bs; this.value = val;
coder = LATIN1; return;
return;
}
} }
coder = UTF16; coder = UTF16;
value = StringUTF16.toBytes(ca, 0, caLen); value = StringUTF16.toBytes(ca, 0, caLen);
@ -829,10 +840,9 @@ public final class String
throw new IllegalArgumentException(x); throw new IllegalArgumentException(x);
} }
if (COMPACT_STRINGS) {
    byte[] val = StringUTF16.compress(ca, 0, caLen);
    // The coder must be derived from the same char count that was handed to
    // compress() (caLen); comparing against any other length can classify a
    // UTF16 array as latin1. It must also be a byte: with an int argument
    // overload resolution would pick the public deprecated
    // String(byte[] ascii, int hibyte) constructor instead of the
    // (byte[] value, byte coder) one.
    byte coder = StringUTF16.coderFromArrayLen(val, caLen);
    return new String(val, coder);
}
return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16); return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16);
} }
@ -1386,6 +1396,9 @@ public final class String
* java.nio.charset.CharsetDecoder} class should be used when more control * java.nio.charset.CharsetDecoder} class should be used when more control
* over the decoding process is required. * over the decoding process is required.
* *
* <p> The contents of the string are unspecified if the byte array
* is modified during string construction.
*
* @param bytes * @param bytes
* The bytes to be decoded into characters * The bytes to be decoded into characters
* *
@ -1414,6 +1427,9 @@ public final class String
* java.nio.charset.CharsetDecoder} class should be used when more control * java.nio.charset.CharsetDecoder} class should be used when more control
* over the decoding process is required. * over the decoding process is required.
* *
* <p> The contents of the string are unspecified if the byte array
* is modified during string construction.
*
* @param bytes * @param bytes
* The bytes to be decoded into characters * The bytes to be decoded into characters
* *
@ -1438,6 +1454,9 @@ public final class String
* java.nio.charset.CharsetDecoder} class should be used when more control * java.nio.charset.CharsetDecoder} class should be used when more control
* over the decoding process is required. * over the decoding process is required.
* *
* <p> The contents of the string are unspecified if the byte array
* is modified during string construction.
*
* @param bytes * @param bytes
* The bytes to be decoded into characters * The bytes to be decoded into characters
* *
@ -1468,6 +1487,9 @@ public final class String
* java.nio.charset.CharsetDecoder} class should be used when more control * java.nio.charset.CharsetDecoder} class should be used when more control
* over the decoding process is required. * over the decoding process is required.
* *
* <p> The contents of the string are unspecified if the byte array
* is modified during string construction.
*
* @param bytes * @param bytes
* The bytes to be decoded into characters * The bytes to be decoded into characters
* *
@ -1496,6 +1518,9 @@ public final class String
* string builder are copied; subsequent modification of the string builder * string builder are copied; subsequent modification of the string builder
* does not affect the newly created string. * does not affect the newly created string.
* *
* <p> The contents of the string are unspecified if the {@code StringBuilder}
* is modified during string construction.
*
* <p> This constructor is provided to ease migration to {@code * <p> This constructor is provided to ease migration to {@code
* StringBuilder}. Obtaining a string from a string builder via the {@code * StringBuilder}. Obtaining a string from a string builder via the {@code
* toString} method is likely to run faster and is generally preferred. * toString} method is likely to run faster and is generally preferred.
@ -4488,6 +4513,9 @@ public final class String
* modification of the character array does not affect the returned * modification of the character array does not affect the returned
* string. * string.
* *
* <p> The contents of the string are unspecified if the character array
* is modified during string construction.
*
* @param data the character array. * @param data the character array.
* @return a {@code String} that contains the characters of the * @return a {@code String} that contains the characters of the
* character array. * character array.
@ -4506,6 +4534,9 @@ public final class String
* are copied; subsequent modification of the character array does not * are copied; subsequent modification of the character array does not
* affect the returned string. * affect the returned string.
* *
* <p> The contents of the string are unspecified if the character array
* is modified during string construction.
*
* @param data the character array. * @param data the character array.
* @param offset initial offset of the subarray. * @param offset initial offset of the subarray.
* @param count length of the subarray. * @param count length of the subarray.
@ -4767,15 +4798,18 @@ public final class String
} }
/* /*
* Package private constructor. Trailing Void argument is there for * Private constructor. Trailing Void argument is there for
* disambiguating it against other (public) constructors. * disambiguating it against other (public) constructors.
* *
* Stores the char[] value into a byte[] that each byte represents * Stores the char[] value into a byte[] that each byte represents
* the 8 low-order bits of the corresponding character, if the char[] * the 8 low-order bits of the corresponding character, if the char[]
* contains only latin1 character. Or a byte[] that stores all * contains only latin1 character. Or a byte[] that stores all
* characters in their byte sequences defined by the {@code StringUTF16}. * characters in their byte sequences defined by the {@code StringUTF16}.
*
* <p> The contents of the string are unspecified if the character array
* is modified during string construction.
*/ */
String(char[] value, int off, int len, Void sig) { private String(char[] value, int off, int len, Void sig) {
if (len == 0) { if (len == 0) {
this.value = "".value; this.value = "".value;
this.coder = "".coder; this.coder = "".coder;
@ -4783,11 +4817,9 @@ public final class String
} }
if (COMPACT_STRINGS) { if (COMPACT_STRINGS) {
byte[] val = StringUTF16.compress(value, off, len); byte[] val = StringUTF16.compress(value, off, len);
if (val != null) { this.coder = StringUTF16.coderFromArrayLen(val, len);
this.value = val; this.value = val;
this.coder = LATIN1; return;
return;
}
} }
this.coder = UTF16; this.coder = UTF16;
this.value = StringUTF16.toBytes(value, off, len); this.value = StringUTF16.toBytes(value, off, len);
@ -4796,6 +4828,9 @@ public final class String
/* /*
* Package private constructor. Trailing Void argument is there for * Package private constructor. Trailing Void argument is there for
* disambiguating it against other (public) constructors. * disambiguating it against other (public) constructors.
*
* <p> The contents of the string are unspecified if the {@code StringBuilder}
* is modified during string construction.
*/ */
String(AbstractStringBuilder asb, Void sig) { String(AbstractStringBuilder asb, Void sig) {
byte[] val = asb.getValue(); byte[] val = asb.getValue();
@ -4806,12 +4841,9 @@ public final class String
} else { } else {
// only try to compress val if some characters were deleted. // only try to compress val if some characters were deleted.
if (COMPACT_STRINGS && asb.maybeLatin1) { if (COMPACT_STRINGS && asb.maybeLatin1) {
byte[] buf = StringUTF16.compress(val, 0, length); this.value = StringUTF16.compress(val, 0, length);
if (buf != null) { this.coder = StringUTF16.coderFromArrayLen(this.value, length);
this.coder = LATIN1; return;
this.value = buf;
return;
}
} }
this.coder = UTF16; this.coder = UTF16;
this.value = Arrays.copyOfRange(val, 0, length << 1); this.value = Arrays.copyOfRange(val, 0, length << 1);

View File

@ -47,8 +47,12 @@ final class StringLatin1 {
return (char)(value[index] & 0xff); return (char)(value[index] & 0xff);
} }
/**
 * Tells whether a char can be stored in a single latin1 byte.
 *
 * @param cp the char to test
 * @return {@code true} if {@code cp} is not greater than 0xff
 */
public static boolean canEncode(char cp) {
    // A char is unsigned, so it fits in one byte exactly when no bit above
    // the low eight is set.
    final boolean fitsInOneByte = (cp >>> 8) == 0;
    return fitsInOneByte;
}
public static boolean canEncode(int cp) { public static boolean canEncode(int cp) {
return cp >>> 8 == 0; return cp >=0 && cp <= 0xff;
} }
public static int length(byte[] value) { public static int length(byte[] value) {

View File

@ -34,7 +34,6 @@ import java.util.stream.Stream;
import java.util.stream.StreamSupport; import java.util.stream.StreamSupport;
import jdk.internal.util.ArraysSupport; import jdk.internal.util.ArraysSupport;
import jdk.internal.util.DecimalDigits; import jdk.internal.util.DecimalDigits;
import jdk.internal.vm.annotation.DontInline;
import jdk.internal.vm.annotation.ForceInline; import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.annotation.IntrinsicCandidate; import jdk.internal.vm.annotation.IntrinsicCandidate;
@ -54,6 +53,19 @@ final class StringUTF16 {
return new byte[len << 1]; return new byte[len << 1];
} }
/**
 * Checks the size of a UTF16-coded string and converts it to a byte length.
 *
 * @param len the number of chars
 * @return {@code len << 1}, the number of bytes needed for {@code len} chars
 * @throws NegativeArraySizeException if {@code len} is negative
 * @throws OutOfMemoryError if {@code len} is not less than MAX_LENGTH
 */
public static int newBytesLength(int len) {
    if (len < 0) {
        throw new NegativeArraySizeException();
    }
    // Use >= so the check agrees with the message below ("should be less
    // than"): len == MAX_LENGTH is rejected here rather than being passed on
    // to an array allocation.
    if (len >= MAX_LENGTH) {
        throw new OutOfMemoryError("UTF16 String size is " + len +
                                   ", should be less than " + MAX_LENGTH);
    }
    return len << 1;
}
@IntrinsicCandidate @IntrinsicCandidate
// intrinsic performs no bounds checks // intrinsic performs no bounds checks
static void putChar(byte[] val, int index, int c) { static void putChar(byte[] val, int index, int c) {
@ -148,6 +160,13 @@ final class StringUTF16 {
return dst; return dst;
} }
/**
* {@return an encoded byte[] for the UTF16 characters in char[]}
* No checking is done on the characters, some may or may not be latin1.
* @param value a char array
* @param off an offset
* @param len a length
*/
@IntrinsicCandidate @IntrinsicCandidate
public static byte[] toBytes(char[] value, int off, int len) { public static byte[] toBytes(char[] value, int off, int len) {
byte[] val = newBytesFor(len); byte[] val = newBytesFor(len);
@ -158,20 +177,209 @@ final class StringUTF16 {
return val; return val;
} }
public static byte[] compress(char[] val, int off, int len) { // Clever way to get the coder from a byte array returned from compress
byte[] ret = new byte[len]; // that maybe either latin1 or UTF16-coded
if (compress(val, off, ret, 0, len) == len) { // Equivalent to (len == val.length) ? LATIN1 : UTF16
return ret; @ForceInline
} static byte coderFromArrayLen(byte[] value, int len) {
return null; return (byte) ((len - value.length) >>> Integer.SIZE - 1);
} }
public static byte[] compress(byte[] val, int off, int len) { /**
byte[] ret = new byte[len]; * {@return Compress the char array (containing UTF16) into a compact strings byte array}
if (compress(val, off, ret, 0, len) == len) { * If all the chars are LATIN1, it returns an array with len == count,
return ret; * otherwise, it contains UTF16 characters.
* <p>
* A UTF16 array is returned *only* if at least 1 non-latin1 character is present.
* This must be true even if the input array is modified while this method is executing.
* This is assured by copying the characters while checking for latin1.
* If all characters are latin1, a byte array with length equals count is returned,
* indicating all latin1 chars. The scan may be implemented as an intrinsic,
* which returns the index of the first non-latin1 character.
* When the first non-latin1 character is found, it switches to creating a new
* buffer; the saved prefix of latin1 characters is copied to the new buffer;
* and the remaining input characters are copied to the buffer.
* The index of the known non-latin1 character is checked, if it is latin1,
* the input has been changed. In this case, a second attempt is made to compress to
* latin1 from the copy made in the first pass to the originally allocated latin1 buffer.
* If it succeeds the return value is latin1, otherwise, the utf16 value is returned.
* In this unusual case, the result is correct for the snapshot of the value.
* The resulting string contents are unspecified if the input array is modified during this
* operation, but it is ensured that at least 1 non-latin1 character is present in
* the non-latin1 buffer.
*
* @param val a char array
* @param off starting offset
* @param count count of chars to be compressed, {@code count} > 0
*/
@ForceInline
public static byte[] compress(final char[] val, final int off, final int count) {
byte[] latin1 = new byte[count];
int ndx = compress(val, off, latin1, 0, count);
if (ndx != count) {
// Switch to UTF16
byte[] utf16 = toBytes(val, off, count);
// If the original character that was found to be non-latin1 is latin1 in the copy
// try to make a latin1 string from the copy
if (getChar(utf16, ndx) > 0xff
|| compress(utf16, 0, latin1, 0, count) != count) {
return utf16;
}
} }
return null; return latin1; // latin1 success
}
/**
 * Compresses a range of an internal byte array (containing UTF16) into a
 * compact strings byte array.
 * <p>
 * If every char in the range is latin1, the returned array has exactly
 * {@code count} bytes; otherwise it holds the UTF16 coding of the range.
 * The range is first compressed into a latin1 buffer; if a non-latin1 char
 * is reported, the range is copied and the copy is re-checked so that a
 * UTF16 result always comes from a copy that really contains a non-latin1
 * char. See also {@link #compress(char[], int, int)}.
 *
 * @param val   a byte array with UTF16 coding
 * @param off   starting offset, in chars
 * @param count count of chars to be compressed, {@code count} > 0
 * @return a latin1 array of length {@code count}, or a UTF16-coded array
 */
public static byte[] compress(final byte[] val, final int off, final int count) {
    final byte[] latin1 = new byte[count];
    final int firstNonLatin1 = compress(val, off, latin1, 0, count);
    if (firstNonLatin1 == count) {
        return latin1;      // latin1 success
    }

    // Switch to UTF16: copy the byte range that backs chars [off, off + count).
    final int fromByte = off << 1;
    final int toByte = newBytesLength(off + count);
    final byte[] utf16 = Arrays.copyOfRange(val, fromByte, toByte);
    if (getChar(utf16, firstNonLatin1) > 0xff) {
        return utf16;       // the copy still holds the non-latin1 char
    }

    // The char that stopped the first pass is latin1 in the copy, so try
    // to make a latin1 string from the copy instead.
    if (compress(utf16, 0, latin1, 0, count) != count) {
        return utf16;
    }
    return latin1;          // latin1 success
}
/**
 * Compresses a range of code points into a compact strings byte array.
 * <p>
 * If every code point in the range is latin1 (0..0xff), the returned array
 * has exactly {@code count} bytes. Otherwise a new byte array is allocated
 * and the code points are converted to UTF16.
 * The algorithm is similar to that of {@link #compress(char[], int, int)}.
 * <p>
 * The resulting encoding is attempted in several steps:
 * <UL>
 * <LI>If no non-latin1 code points are found, the encoding is latin1.</LI>
 * <LI>If the computed number of chars needed to represent the code points is
 *     equal to {@code count}, they are all BMP with at least one UTF16
 *     character and are copied to the result.</LI>
 * <LI>Otherwise {@code extractCodepoints} is called to carefully expand
 *     surrogates.</LI>
 * </UL>
 *
 * @param val an int array of code points
 * @param off starting offset
 * @param count length of code points to be compressed, length > 0
 * @return a latin1 array of length {@code count}, or a UTF16-coded array
 * @throws IllegalArgumentException if {@code computeCodePointSize} meets an
 *         invalid code point
 */
public static byte[] compress(final int[] val, int off, final int count) {
    // Optimistically copy all latin1 code points to the destination
    byte[] latin1 = new byte[count];
    final int end = off + count;
    // ndx indexes the output, off the input; both advance together
    for (int ndx = 0; ndx < count; ndx++, off++) {
        int cp = val[off];
        if (cp >= 0 && cp <= 0xff) {
            latin1[ndx] = (byte)cp;
        } else {
            // Pass 1: Compute precise size of char[]; see extractCodePoints for caveat
            int estSize = ndx + computeCodePointSize(val, off, end);

            // Pass 2: Switch to UTF16
            // cp (read from val[off] above) is at least one code point known to be UTF16
            byte[] utf16 = newBytesFor(estSize);
            if (ndx > 0) {
                StringLatin1.inflate(latin1, 0, utf16, 0, ndx); // inflate latin1 bytes
            }

            if (estSize == count) {
                // Based on the computed size, all remaining code points are BMP and
                // can be copied without checking again
                putChar(utf16, ndx, cp);    // ensure utf16 has a UTF16 char
                off++;
                for (int i = ndx + 1; i < count; i++, off++) {
                    putChar(utf16, i, val[off]);
                }
            } else {
                // Some codepoint is a surrogate pair
                utf16 = extractCodepoints(val, off, end, utf16, ndx);

                // The original character that was found to be UTF16 is not UTF16 in the copy
                // Try to make a latin1 string from the copy
                if (getChar(utf16, ndx) <= 0xff &&
                        compress(utf16, 0, latin1, 0, count) == count) {
                    return latin1;      // latin1 success
                }
            }
            return utf16;
        }
    }
    return latin1;      // Latin1 success
}
// Extract code points into chars in the byte array.
//
// Parameters: val[off, end) is the input range (end is exclusive); dst is the
// UTF16-coded destination and dstOff the char index in dst at which writing
// starts. Returns the destination array, which may be a resized or truncated
// copy of dst.
//
// Guard against possible races with the input array changing between the previous
// computation of the required output size and storing the bmp or surrogates.
// If a BMP code point is changed to a supplementary code point it would require 2 chars
// in the output. Changing a supplementary char to BMP would reduce the size.
// If the utf16 destination is not large enough, it is resized to fit the
// remaining codepoints assuming they occupy 2 characters.
// The destination may be copied to return exactly the final length.
// The additional allocations and compression only occur if the input array is modified.
private static byte[] extractCodepoints(int[] val, int off, int end, byte[] dst, int dstOff) {
    while (off < end) {
        // Compute a minimum estimate on the number of characters that can be put into the dst
        // given the current codepoint and the number of remaining codepoints
        int codePoint = val[off];           // read each codepoint from val only once
        int dstLimit = dstOff
                       + Character.charCount(codePoint)
                       + (end - off - 1);
        // dst.length >> 1 is the capacity of dst in chars
        if (dstLimit > (dst.length >> 1)) {
            // Resize to hold the remaining codepoints assuming they are all surrogates.
            // By resizing to the maximum that might be needed, only a single resize will occur.
            // dstLimit includes only a single char per codepoint, pad with an additional for each.
            int maxRemaining = dstLimit + (end - off - 1);
            dst = Arrays.copyOf(dst, newBytesLength(maxRemaining));
        }
        // Efficiently copy as many codepoints as fit within the current estimated limit
        // The dst has at least enough space for the current codepoint.
        while (true) {
            if (Character.isBmpCodePoint(codePoint)) {
                putChar(dst, dstOff++, codePoint);
            } else {
                putChar(dst, dstOff++, Character.highSurrogate(codePoint));
                putChar(dst, dstOff++, Character.lowSurrogate(codePoint));
            }
            off++;
            // Stop before reading the next codepoint unless two more chars are
            // guaranteed to fit below the limit.
            if (dstOff + 2 > dstLimit)
                break;      // no space for another surrogate; recompute limit
            codePoint = val[off];
        }
    }
    if (dstOff != (dst.length >> 1)) {
        // Truncate to actual length; should only occur if a codepoint was racily
        // changed from a surrogate to a BMP character.
        return Arrays.copyOf(dst, newBytesLength(dstOff));
    }
    return dst;
}
// Compute the number of chars needed to represent the code points from off to end-1
private static int computeCodePointSize(int[] val, int off, int end) {
int n = end - off;
while (off < end) {
int codePoint = val[off++];
if (Character.isBmpCodePoint(codePoint)) {
continue;
} else if (Character.isValidCodePoint(codePoint)) {
n++;
} else {
throw new IllegalArgumentException(Integer.toString(codePoint));
}
}
return n;
} }
// compressedCopy char[] -> byte[] // compressedCopy char[] -> byte[]
@ -179,9 +387,8 @@ final class StringUTF16 {
public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
char c = src[srcOff]; char c = src[srcOff];
if (c > 0xFF) { if (c > 0xff) {
len = 0; return i; // return index of non-latin1 char
break;
} }
dst[dstOff] = (byte)c; dst[dstOff] = (byte)c;
srcOff++; srcOff++;
@ -197,9 +404,8 @@ final class StringUTF16 {
checkBoundsOffCount(srcOff, len, src); checkBoundsOffCount(srcOff, len, src);
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
char c = getChar(src, srcOff); char c = getChar(src, srcOff);
if (c > 0xFF) { if (c > 0xff) {
len = 0; return i; // return index of non-latin1 char
break;
} }
dst[dstOff] = (byte)c; dst[dstOff] = (byte)c;
srcOff++; srcOff++;
@ -208,31 +414,14 @@ final class StringUTF16 {
return len; return len;
} }
// Create the UTF16 buffer for !COMPACT_STRINGS
public static byte[] toBytes(int[] val, int index, int len) { public static byte[] toBytes(int[] val, int index, int len) {
final int end = index + len; final int end = index + len;
// Pass 1: Compute precise size of char[] int n = computeCodePointSize(val, index, end);
int n = len;
for (int i = index; i < end; i++) {
int cp = val[i];
if (Character.isBmpCodePoint(cp))
continue;
else if (Character.isValidCodePoint(cp))
n++;
else throw new IllegalArgumentException(Integer.toString(cp));
}
// Pass 2: Allocate and fill in <high, low> pair
byte[] buf = newBytesFor(n); byte[] buf = newBytesFor(n);
for (int i = index, j = 0; i < end; i++, j++) { return extractCodepoints(val, index, len, buf, 0);
int cp = val[i]; }
if (Character.isBmpCodePoint(cp)) {
putChar(buf, j, cp);
} else {
putChar(buf, j++, Character.highSurrogate(cp));
putChar(buf, j, Character.lowSurrogate(cp));
}
}
return buf;
}
public static byte[] toBytes(char c) { public static byte[] toBytes(char c) {
byte[] result = new byte[2]; byte[] result = new byte[2];
@ -653,10 +842,9 @@ final class StringUTF16 {
if (String.COMPACT_STRINGS && if (String.COMPACT_STRINGS &&
!StringLatin1.canEncode(oldChar) && !StringLatin1.canEncode(oldChar) &&
StringLatin1.canEncode(newChar)) { StringLatin1.canEncode(newChar)) {
byte[] val = compress(buf, 0, len); byte[] res = StringUTF16.compress(buf, 0, len);
if (val != null) { byte coder = StringUTF16.coderFromArrayLen(res, len);
return new String(val, LATIN1); return new String(res, coder);
}
} }
return new String(buf, UTF16); return new String(buf, UTF16);
} }
@ -771,10 +959,9 @@ final class StringUTF16 {
if (String.COMPACT_STRINGS && replLat1 && !targLat1) { if (String.COMPACT_STRINGS && replLat1 && !targLat1) {
// combination 6 // combination 6
byte[] lat1Result = compress(result, 0, resultLen); byte[] res = StringUTF16.compress(result, 0, resultLen);
if (lat1Result != null) { byte coder = StringUTF16.coderFromArrayLen(res, resultLen);
return new String(lat1Result, LATIN1); return new String(res, coder); // combination 6
}
} }
return new String(result, UTF16); return new String(result, UTF16);
} }
@ -838,7 +1025,7 @@ final class StringUTF16 {
bits |= cp; bits |= cp;
putChar(result, i, cp); putChar(result, i, cp);
} }
if (bits > 0xFF) { if (bits < 0 || bits > 0xff) {
return new String(result, UTF16); return new String(result, UTF16);
} else { } else {
return newString(result, 0, len); return newString(result, 0, len);
@ -939,7 +1126,7 @@ final class StringUTF16 {
bits |= cp; bits |= cp;
putChar(result, i, cp); putChar(result, i, cp);
} }
if (bits > 0xFF) { if (bits < 0 || bits > 0xff) {
return new String(result, UTF16); return new String(result, UTF16);
} else { } else {
return newString(result, 0, len); return newString(result, 0, len);
@ -1168,10 +1355,9 @@ final class StringUTF16 {
return ""; return "";
} }
if (String.COMPACT_STRINGS) { if (String.COMPACT_STRINGS) {
byte[] buf = compress(val, index, len); byte[] res = StringUTF16.compress(val, index, len);
if (buf != null) { byte coder = StringUTF16.coderFromArrayLen(res, len);
return new String(buf, LATIN1); return new String(res, coder);
}
} }
int last = index + len; int last = index + len;
return new String(Arrays.copyOfRange(val, index << 1, last << 1), UTF16); return new String(Arrays.copyOfRange(val, index << 1, last << 1), UTF16);
@ -1502,8 +1688,8 @@ final class StringUTF16 {
private static native boolean isBigEndian(); private static native boolean isBigEndian();
static final int HI_BYTE_SHIFT; private static final int HI_BYTE_SHIFT;
static final int LO_BYTE_SHIFT; private static final int LO_BYTE_SHIFT;
static { static {
if (isBigEndian()) { if (isBigEndian()) {
HI_BYTE_SHIFT = 8; HI_BYTE_SHIFT = 8;

View File

@ -0,0 +1,253 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 8311906
* @summary Validates String constructor intrinsics using varied input data.
* @key randomness
* @library /compiler/patches /test/lib
* @build java.base/java.lang.Helper
* @run main/othervm/timeout=1200 -Xbatch -XX:CompileThreshold=100 compiler.intrinsics.string.TestStringConstructionIntrinsics
*/
/*
* @test
* @bug 8311906
* @summary Validates String constructor intrinsic for AVX3 works with and without
* AVX3Threshold=0
* @key randomness
* @library /compiler/patches /test/lib
* @build java.base/java.lang.Helper
* @requires vm.cpu.features ~= ".*avx512.*"
* @run main/othervm/timeout=1200 -Xbatch -XX:CompileThreshold=100 -XX:UseAVX=3 compiler.intrinsics.string.TestStringConstructionIntrinsics
* @run main/othervm/timeout=1200 -Xbatch -XX:CompileThreshold=100 -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:AVX3Threshold=0 compiler.intrinsics.string.TestStringConstructionIntrinsics
*/
package compiler.intrinsics.string;
import java.lang.Helper;
import java.util.Random;
import jdk.test.lib.Utils;
public class TestStringConstructionIntrinsics {
private static byte[] bytes = new byte[2 * (4096 + 32)];
private static char[] chars = new char[4096 + 32];
// Used a scratch buffer, sized to accommodate inflated
private static byte[] dst = new byte[bytes.length * 2];
private static final Random RANDOM = Utils.getRandomInstance();
/**
* Completely initialize the bytes test array. The lowest index that will be
* non-latin1 is marked by nlOffset
*/
public static void initializeBytes(int off, int len, int nonLatin1, int nlOffset) {
int maxLen = bytes.length >> 1;
assert (len + off < maxLen);
// insert "canary" (non-latin1) values before offset
for (int i = 0; i < off; i++) {
Helper.putCharSB(bytes, i, ((i + 15) & 0x7F) | 0x180);
}
// fill the array segment
for (int i = off; i < len + off; i++) {
Helper.putCharSB(bytes, i, ((i - off + 15) & 0xFF));
}
if (nonLatin1 != 0) {
// modify a number disparate indexes to be non-latin1
for (int i = 0; i < nonLatin1; i++) {
int idx = off + RANDOM.nextInt(len - nlOffset) + nlOffset;
Helper.putCharSB(bytes, i, ((i + 15) & 0x7F) | 0x180);
}
}
// insert "canary" non-latin1 values after array segment
for (int i = len + off; i < maxLen; i++) {
Helper.putCharSB(bytes, i, ((i + 15) & 0x7F) | 0x180);
}
}
/**
* Completely initialize the char test array. The lowest index that will be
* non-latin1 is marked by nlOffset
*/
public static void initializeChars(int off, int len, int nonLatin1, int nlOffset) {
assert (len + off <= chars.length);
// insert "canary" non-latin1 values before offset
for (int i = 0; i < off; ++i) {
chars[i] = (char) (((i + 15) & 0x7F) | 0x180);
}
// fill the array segment
for (int i = off; i < len + off; ++i) {
chars[i] = (char) (((i - off + 15) & 0xFF));
}
if (nonLatin1 != 0) {
// modify a number disparate chars inside
// segment to be non-latin1.
for (int i = 0; i < nonLatin1; i++) {
int idx = off + RANDOM.nextInt(len - nlOffset) + nlOffset;
chars[idx] = (char) (0x180 | chars[idx]);
}
}
// insert "canary" non-latin1 values after array segment
for (int i = len + off; i < chars.length; ++i) {
chars[i] = (char) (((i + 15) & 0x7F) | 0x180);
}
}
/**
* Test different array segment sizes, offsets, and number of non-latin1
* chars.
*/
public static void testConstructBytes() throws Exception {
for (int off = 0; off < 16; off++) { // starting offset of array segment
// Test all array segment sizes 1-63
for (int len = 1; len < 64; len++) {
testConstructBytes(off, len, 0, 0);
testConstructBytes(off, len, 1, 0);
testConstructBytes(off, len, RANDOM.nextInt(30) + 2, 0);
}
// Test a random selection of sizes between 64 and 4099, inclusive
for (int i = 0; i < 20; i++) {
int len = 64 + RANDOM.nextInt(4100 - 64);
testConstructBytes(off, len, 0, 0);
testConstructBytes(off, len, 1, 0);
testConstructBytes(off, len, RANDOM.nextInt(len) + 2, 0);
}
for (int len : new int[] { 128, 2048 }) {
// test with negatives only in a 1-63 byte tail
int tail = RANDOM.nextInt(63) + 1;
int ng = RANDOM.nextInt(tail) + 1;
testConstructBytes(off, len + tail, ng, len);
}
}
}
private static void testConstructBytes(int off, int len, int ng, int ngOffset) throws Exception {
assert (len + off < bytes.length);
initializeBytes(off, len, ng, ngOffset);
byte[] dst = new byte[bytes.length];
int calculated = Helper.compress(bytes, off, dst, 0, len);
int expected = compress(bytes, off, dst, 0, len);
if (calculated != expected) {
if (expected != len && ng >= 0 && calculated >= 0 && calculated < expected) {
// allow intrinsics to return early with a lower value,
// but only if we're not expecting the full length (no
// negative bytes)
return;
}
throw new Exception("Failed testConstructBytes: " + "offset: " + off + " "
+ "length: " + len + " " + "return: " + calculated + " expected: " + expected + " negatives: "
+ ng + " offset: " + ngOffset);
}
}
private static int compress(byte[] src, int srcOff, byte[] dst, int dstOff, int len) {
for (int i = 0; i < len; i++) {
char c = Helper.charAt(src, srcOff);
if (c > 0xff) {
return i; // return index of non-latin1 char
}
dst[dstOff] = (byte)c;
srcOff++;
dstOff++;
}
return len;
}
/**
* Test different array segment sizes, offsets, and number of non-latin1
* chars.
*/
public static void testConstructChars() throws Exception {
for (int off = 0; off < 16; off++) { // starting offset of array segment
// Test all array segment sizes 1-63
for (int len = 1; len < 64; len++) {
testConstructChars(off, len, 0, 0);
testConstructChars(off, len, 1, 0);
testConstructChars(off, len, RANDOM.nextInt(30) + 2, 0);
}
// Test a random selection of sizes between 64 and 4099, inclusive
for (int i = 0; i < 20; i++) {
int len = 64 + RANDOM.nextInt(4100 - 64);
testConstructChars(off, len, 0, 0);
testConstructChars(off, len, 1, 0);
testConstructChars(off, len, RANDOM.nextInt(len) + 2, 0);
}
for (int len : new int[] { 128, 2048 }) {
// test with negatives only in a 1-63 byte tail
int tail = RANDOM.nextInt(63) + 1;
int ng = RANDOM.nextInt(tail) + 1;
testConstructChars(off, len + tail, ng, len);
}
}
}
private static void testConstructChars(int off, int len, int nonLatin1, int nlOffset) throws Exception {
assert (len + off < bytes.length);
initializeChars(off, len, nonLatin1, nlOffset);
int calculated = Helper.compress(chars, off, dst, 0, len);
int expected = compress(chars, off, dst, 0, len);
if (calculated != expected) {
if (expected != len && nonLatin1 >= 0 && calculated >= 0 && calculated < expected) {
// allow intrinsics to return early with a lower value,
// but only if we're not expecting the full length (no
// negative bytes)
return;
}
throw new Exception("Failed testConstructChars: " + "offset: " + off + " "
+ "length: " + len + " " + "return: " + calculated + " expected: " + expected + " non-latin1: "
+ nonLatin1 + " offset: " + nlOffset);
}
}
// Reference (non-intrinsic) char[] -> byte[] compression.
// Copies up to len chars from src (starting at srcOff) into dst (starting at
// dstOff), one byte per char.
// Returns len if every char was <= 0xff, otherwise the number of chars
// copied before the first char above 0xff was found.
private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
    for (int idx = 0; idx < len; idx++) {
        final char ch = src[srcOff + idx];
        if (ch > 0xff) {
            // idx is the position of the first non-latin1 char
            return idx;
        }
        dst[dstOff + idx] = (byte) ch;
    }
    return len;
}
// Runs both test groups 200 times so that the methods under test get hot
// enough for the intrinsic to eventually be inlined.
public void run() throws Exception {
    int rounds = 200;
    while (rounds-- > 0) {
        testConstructBytes();
        testConstructChars();
    }
}
// Entry point: runs the whole test and reports success on stdout.
public static void main(String[] args) throws Exception {
    final TestStringConstructionIntrinsics test = new TestStringConstructionIntrinsics();
    test.run();
    System.out.println("string construction intrinsics validated");
}
}

View File

@ -44,6 +44,11 @@ public class Helper {
return dst; return dst;
} }
// Test-visible entry point that forwards unchanged to
// StringUTF16.compress(byte[], int, byte[], int, int) and returns its result.
@jdk.internal.vm.annotation.ForceInline
public static int compress(byte[] src, int srcOff, byte[] dst, int dstOff, int len) {
return StringUTF16.compress(src, srcOff, dst, dstOff, len);
}
@jdk.internal.vm.annotation.ForceInline @jdk.internal.vm.annotation.ForceInline
public static byte[] compressChar(char[] src, int srcOff, int dstSize, int dstOff, int len) { public static byte[] compressChar(char[] src, int srcOff, int dstSize, int dstOff, int len) {
byte[] dst = new byte[dstSize]; byte[] dst = new byte[dstSize];
@ -51,6 +56,11 @@ public class Helper {
return dst; return dst;
} }
// Test-visible entry point that forwards unchanged to
// StringUTF16.compress(char[], int, byte[], int, int) and returns its result.
@jdk.internal.vm.annotation.ForceInline
public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
return StringUTF16.compress(src, srcOff, dst, dstOff, len);
}
@jdk.internal.vm.annotation.ForceInline @jdk.internal.vm.annotation.ForceInline
public static byte[] inflateByte(byte[] src, int srcOff, int dstSize, int dstOff, int len) { public static byte[] inflateByte(byte[] src, int srcOff, int dstSize, int dstOff, int len) {
byte[] dst = new byte[dstSize]; byte[] dst = new byte[dstSize];

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -22,10 +22,10 @@
*/ */
/* /*
@test * @test
@bug 8054307 * @bug 8054307 8311906
@summary test chars() and codePoints() * @summary test String chars() and codePoints()
*/ */
import java.util.Arrays; import java.util.Arrays;
import java.util.Random; import java.util.Random;
@ -44,6 +44,7 @@ public class Chars {
cc[j] = (char)(ccExp[j] = cpExp[j] = r.nextInt(0x80)); cc[j] = (char)(ccExp[j] = cpExp[j] = r.nextInt(0x80));
} }
testChars(cc, ccExp); testChars(cc, ccExp);
testCharsSubrange(cc, ccExp);
testCPs(cc, cpExp); testCPs(cc, cpExp);
// bmp without surrogates // bmp without surrogates
@ -51,6 +52,7 @@ public class Chars {
cc[j] = (char)(ccExp[j] = cpExp[j] = r.nextInt(0x8000)); cc[j] = (char)(ccExp[j] = cpExp[j] = r.nextInt(0x8000));
} }
testChars(cc, ccExp); testChars(cc, ccExp);
testCharsSubrange(cc, ccExp);
testCPs(cc, cpExp); testCPs(cc, cpExp);
// bmp with surrogates // bmp with surrogates
@ -69,6 +71,7 @@ public class Chars {
} }
cpExp = Arrays.copyOf(cpExp, k); cpExp = Arrays.copyOf(cpExp, k);
testChars(cc, ccExp); testChars(cc, ccExp);
testCharsSubrange(cc, ccExp);
testCPs(cc, cpExp); testCPs(cc, cpExp);
} }
} }
@ -76,14 +79,35 @@ public class Chars {
static void testChars(char[] cc, int[] expected) { static void testChars(char[] cc, int[] expected) {
String str = new String(cc); String str = new String(cc);
if (!Arrays.equals(expected, str.chars().toArray())) { if (!Arrays.equals(expected, str.chars().toArray())) {
throw new RuntimeException("chars/codePoints() failed!"); throw new RuntimeException("testChars failed!");
}
}
static void testCharsSubrange(char[] cc, int[] expected) {
int[] offsets = { 7, 31 }; // offsets to test
int LENGTH = 13;
for (int i = 0; i < offsets.length; i++) {
int offset = Math.max(0, offsets[i]); // confine to the input array
int count = Math.min(LENGTH, cc.length - offset);
String str = new String(cc, offset, count);
int[] actual = str.chars().toArray();
int errOffset = Arrays.mismatch(actual, 0, actual.length,
expected, offset, offset + count);
if (errOffset >= 0) {
System.err.printf("expected[%d] (%d) != actual[%d] (%d)%n",
offset + errOffset, expected[offset + errOffset],
errOffset, actual[errOffset]);
System.err.println("expected: " + Arrays.toString(expected));
System.err.println("actual: " + Arrays.toString(actual));
throw new RuntimeException("testCharsSubrange failed!");
}
} }
} }
static void testCPs(char[] cc, int[] expected) { static void testCPs(char[] cc, int[] expected) {
String str = new String(cc); String str = new String(cc);
if (!Arrays.equals(expected, str.codePoints().toArray())) { if (!Arrays.equals(expected, str.codePoints().toArray())) {
throw new RuntimeException("chars/codePoints() failed!"); throw new RuntimeException("testCPs failed!");
} }
} }
} }

View File

@ -0,0 +1,437 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package test.java.lang.String;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.Arrays;
import java.util.ConcurrentModificationException;
import java.util.List;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.EnabledIf;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
/*
* @test
* @bug 8311906
* @modules java.base/java.lang:open
* @summary check String's racy constructors
* @run junit/othervm -XX:+CompactStrings test.java.lang.String.StringRacyConstructor
* @run junit/othervm -XX:-CompactStrings test.java.lang.String.StringRacyConstructor
*/
public class StringRacyConstructor {
// Coder values that the reflected "coder" fields are compared against.
private static final byte LATIN1 = 0;
private static final byte UTF16 = 1;
// Reflective handle on String's private "coder" field.
private static final Field STRING_CODER_FIELD;
// Reflective handle on AbstractStringBuilder's "coder" field.
private static final Field SB_CODER_FIELD;
// Value of String.COMPACT_STRINGS as read by isCompactStrings().
private static final boolean COMPACT_STRINGS;
// Looks up the two "coder" fields and makes them accessible; any lookup
// failure aborts class initialization.
static {
try {
STRING_CODER_FIELD = String.class.getDeclaredField("coder");
STRING_CODER_FIELD.setAccessible(true);
SB_CODER_FIELD = Class.forName("java.lang.AbstractStringBuilder").getDeclaredField("coder");
SB_CODER_FIELD.setAccessible(true);
COMPACT_STRINGS = isCompactStrings();
} catch (NoSuchFieldException ex ) {
throw new ExceptionInInitializerError(ex);
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
/**
 * {@return true iff CompactStrings are enabled}
 * The value is read reflectively from the static {@code COMPACT_STRINGS}
 * field declared in {@code String}.
 */
public static boolean isCompactStrings() {
    final Field flag;
    try {
        flag = String.class.getDeclaredField("COMPACT_STRINGS");
    } catch (NoSuchFieldException ex) {
        throw new ExceptionInInitializerError(ex);
    }
    flag.setAccessible(true);
    try {
        // static field, so no receiver object is needed
        return flag.getBoolean(null);
    } catch (IllegalAccessException iae) {
        throw new AssertionError(iae);
    }
}
// Reads the "coder" field of the given String through reflection.
// An access failure is reported as an AssertionError.
private static int coder(String s) {
    final byte value;
    try {
        value = STRING_CODER_FIELD.getByte(s);
    } catch (IllegalAccessException iae) {
        throw new AssertionError(iae);
    }
    return value;
}
// Reads the "coder" field of the given StringBuilder through reflection.
// An access failure is reported as an AssertionError.
private static int sbCoder(StringBuilder sb) {
    final byte value;
    try {
        value = SB_CODER_FIELD.getByte(sb);
    } catch (IllegalAccessException iae) {
        throw new AssertionError(iae);
    }
    return value;
}
// Builds a diagnostic description of a String for failure messages:
// the text itself, the name of its coder, and an "INVALID CODER" marker when
// its coder differs from the coder of a String rebuilt from the same chars.
private static String inspectString(String s) {
    try {
        final String rebuilt = new String(s.toCharArray());
        final String marker = (coder(s) != coder(rebuilt)) ? "INVALID CODER" : "";
        final String coderName;
        if (STRING_CODER_FIELD.getByte(s) == 0) {
            coderName = "isLatin1";
        } else {
            coderName = "utf16";
        }
        return marker + " \"" + s + "\", coder: " + coderName;
    } catch (IllegalAccessException ex) {
        return "EXCEPTION: " + ex.getMessage();
    }
}
/**
 * {@return true if the coder matches the presence/lack of UTF16 characters}
 * If it returns false, the coder and the contents have failed the precondition for string.
 * <p>
 * When compact strings are disabled the only requirement is that the coder
 * is UTF16; that is asserted and the string is then reported as valid,
 * whatever its contents.
 * @param orig a string
 */
private static boolean validCoder(String orig) {
    if (!COMPACT_STRINGS) {
        assertEquals(UTF16, coder(orig), "Non-COMPACT STRINGS coder must be UTF16");
        // Without this return an all-latin1 string would be reported as
        // invalid below, because the content check would expect LATIN1.
        return true;
    }
    // OR all chars together: the result is below 256 only if every char is.
    int accum = 0;
    for (int i = 0; i < orig.length(); i++) {
        accum |= orig.charAt(i);
    }
    byte expectedCoder = (accum < 256) ? LATIN1 : UTF16;
    return expectedCoder == coder(orig);
}
// Returns true when the StringBuilder's coder agrees with its contents:
// LATIN1 if every char is below 256, UTF16 otherwise.
private static boolean validCoder(StringBuilder orig) {
    // OR all chars together: the result is below 256 only if every char is.
    int bits = 0;
    int pos = 0;
    while (pos < orig.length()) {
        bits |= orig.charAt(pos);
        pos++;
    }
    final byte expectedCoder;
    if (bits < 256) {
        expectedCoder = LATIN1;
    } else {
        expectedCoder = UTF16;
    }
    return expectedCoder == sbCoder(orig);
}
// Round-trips a String holding both latin-1 and non-latin1 chars through
// toCharArray()/new String(char[]) and checks content and coder.
@Test
@EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
public void checkStringRange() {
    final char[] mixed = {'a', 'b', 'c', 0xff21, 0xff22, 0xff23};
    final String orig = new String(mixed);
    final String roundTripped = new String(orig.toCharArray());
    assertEquals(orig, roundTripped, "mixed chars");
    final boolean coderOk = validCoder(roundTripped);
    assertTrue(coderOk, "invalid coder"
            + ", invalid coder: " + inspectString(roundTripped));
}
// Argument source for the parameterized tests: two latin-1 strings.
private static List<String> strings() {
    final String digits = "01234";
    final String singleSpace = " ";
    return List.of(digits, singleSpace);
}
// A String built from a char[] that is being modified concurrently must
// still end up with a coder that matches its contents.
@ParameterizedTest
@MethodSource("strings")
@EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
public void racyString(String orig) {
    final String raced = racyStringConstruction(orig);
    // Only the coder is checked; the race makes the contents unpredictable.
    final boolean coderOk = validCoder(raced);
    assertTrue(coderOk, orig + " string invalid"
            + ", racyString: " + inspectString(raced));
}
// A String built from an int[] of code points that is being modified
// concurrently must still end up with a coder that matches its contents.
@ParameterizedTest
@MethodSource("strings")
@EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
public void racyCodePoint(String orig) {
    final String raced = racyStringConstructionCodepoints(orig);
    // Only the coder is checked; the race makes the contents unpredictable.
    final boolean coderOk = validCoder(raced);
    assertTrue(coderOk, "invalid coder in non-deterministic string"
            + ", orig:" + inspectString(orig)
            + ", iffyString: " + inspectString(raced));
}
// Same check as racyCodePoint, but the concurrent modification switches the
// first code point between a latin-1 value and one needing a surrogate pair.
@ParameterizedTest
@MethodSource("strings")
@EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
public void racyCodePointSurrogates(String orig) {
    final String raced = racyStringConstructionCodepointsSurrogates(orig);
    // Only the coder is checked; the race makes the contents unpredictable.
    // A differing result is logged to help diagnose failures.
    final boolean unchanged = orig.equals(raced);
    if (!unchanged) {
        final String codePoints = Arrays.toString(raced.codePoints().toArray());
        System.err.println("orig: " + orig + ", iffy: " + raced + codePoints);
    }
    final boolean coderOk = validCoder(raced);
    assertTrue(coderOk, "invalid coder in non-deterministic string"
            + ", orig:" + inspectString(orig)
            + ", iffyString: " + inspectString(raced));
}
// Calls StringUTF16.compress(char[], int, byte[], int, int) reflectively and
// checks its return value: first in a long warm-up loop on one fixed input,
// then for every combination of start index and non-latin1 position in a
// 48-element array.
@Test
public void verifyUTF16CopyBytes()
throws ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {
Class<?> stringUTF16 = Class.forName("java.lang.StringUTF16");
Method mCompressChars = stringUTF16.getDeclaredMethod("compress",
char[].class, int.class, byte[].class, int.class, int.class);
mCompressChars.setAccessible(true);
// First warmup the intrinsic and check 1 case
char[] chars = {'a', 'b', 'c', 0xff21, 0xff22, 0xff23};
byte[] bytes = new byte[chars.length];
// Becomes 1 once the "returned 0" warning has been printed, so it is printed at most once.
int printWarningCount = 0;
for (int i = 0; i < 1_000_000; i++) { // repeat to get C2 to kick in
// Copy only latin1 chars from UTF-16 converted prefix (3 chars -> 3 bytes)
int intResult = (int) mCompressChars.invoke(null, chars, 0, bytes, 0, chars.length);
// A result of 0 is tolerated with a warning instead of failing the test;
// any other result must be 3, the index of the first non-latin1 char.
if (intResult == 0) {
if (printWarningCount == 0) {
printWarningCount = 1;
System.err.println("Intrinsic for StringUTF16.compress returned 0, may not have been updated.");
}
} else {
assertEquals(3, intResult, "return length not-equal, iteration: " + i);
}
}
// Exhaustively check compress returning the correct index of the non-latin1 char.
final int SIZE = 48;
final byte FILL_BYTE = 'R';
chars = new char[SIZE];
bytes = new byte[chars.length];
for (int i = 0; i < SIZE; i++) { // Every starting index
for (int j = i; j < SIZE; j++) { // Every location of non-latin1
// Reset both arrays so each case starts from the same state.
Arrays.fill(chars, 'A');
Arrays.fill(bytes, FILL_BYTE);
chars[j] = 0xFF21;
int intResult = (int) mCompressChars.invoke(null, chars, i, bytes, 0, chars.length - i);
// The source is read from index i, so the non-latin1 char sits j - i chars in.
assertEquals(j - i, intResult, "compress found wrong index");
// NOTE(review): the destination is written from index 0, so the slot matching
// the non-latin1 char looks like bytes[j - i]; bytes[j] only coincides with it
// when i == 0 - confirm which index is intended.
assertEquals(FILL_BYTE, bytes[j], "extra character stored");
}
}
}
// A racily constructed copy of "hello" (built from "hell" + "o") must have a
// valid coder; the interned copy and related literals appear only in the
// failure message.
@Test
@EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
public void checkConcatAndIntern() {
    // Statement order is kept exactly as in the original (intern() runs
    // before the "hello" literal is referenced) - presumably on purpose.
    final String helloWorld = "hello world";
    final String helloToo = racyStringConstruction("hell".concat("o"));
    final String interned = helloToo.intern();
    final String hello = "hello";
    final boolean coderOk = validCoder(helloToo);
    final String details = "startsWith: "
            + ", hell: " + inspectString(helloToo)
            + ", o: " + inspectString(interned)
            + ", hello: " + inspectString(hello)
            + ", hello world: " + inspectString(helloWorld);
    assertTrue(coderOk, details);
}
// Trimming a racily constructed single-space String must give a String with
// a valid coder.
@Test
@EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
public void racyEmptyString() {
    final String racedSpace = racyStringConstruction(" ");
    final String trimmed = racedSpace.trim();
    final boolean coderOk = validCoder(trimmed);
    assertTrue(coderOk, "empty string invalid coder"
            + ", trimmed: " + inspectString(trimmed));
}
// Appending a CharSequence whose charAt throws part-way through must not
// leave the StringBuilder with a coder that disagrees with its contents.
@Test
@EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
void charSequenceException() {
    // charAt throws on the third char, the replacement character 0xFFFD
    final ThrowingCharSequence throwing = new ThrowingCharSequence("A\u2030\uFFFD");
    final StringBuilder builder = new StringBuilder();
    try {
        builder.append(throwing);
        fail("An IllegalArgumentException should have been thrown");
    } catch (IllegalArgumentException expected) {
        // this is the exception raised by ThrowingCharSequence.charAt
    }
    assertTrue(validCoder(builder), "invalid coder in StringBuilder");
}
/**
 * Given a latin-1 String, attempt to create a copy that is
 * incorrectly encoded as UTF-16.
 * <p>
 * A helper thread keeps toggling bit 8 (value 256) of the first char of a
 * private char[] copy while this thread repeatedly calls
 * {@code new String(char[])} on that array. The helper thread is always
 * stopped and joined before this method exits, including when the
 * constructor throws.
 *
 * @param original a non-empty String whose chars are all below 256
 * @return the first constructed String whose first char is below 256 but
 *         which is not equal to {@code original}; if that never happens,
 *         the String constructed on the attempt following 1_000_000 tries
 * @throws IllegalArgumentException if {@code original} is empty or contains
 *         a char of 256 or above
 */
public static String racyStringConstruction(String original) throws ConcurrentModificationException {
    if (original.isEmpty()) {
        // there is no first char to flip, and max() below would have no value
        throw new IllegalArgumentException(
                "Cannot race on an empty String");
    }
    if (original.chars().max().getAsInt() >= 256) {
        throw new IllegalArgumentException(
                "Only work with latin-1 Strings");
    }

    char[] chars = original.toCharArray();

    // In another thread, flip the first character back
    // and forth between being latin-1 or not
    Thread thread = new Thread(() -> {
        while (!Thread.interrupted()) {
            chars[0] ^= 256;
        }
    });
    thread.start();

    try {
        // at the same time call the String constructor,
        // until we hit the race condition
        int i = 0;
        while (true) {
            i++;
            String s = new String(chars);
            if ((s.charAt(0) < 256 && !original.equals(s)) || i > 1_000_000) {
                return s;
            }
        }
    } finally {
        // Stop the flipper on every exit path; it is a non-daemon thread
        // and would otherwise spin forever if the constructor threw.
        thread.interrupt();
        try {
            thread.join();
        } catch (InterruptedException ie) {
            // ignore interrupt
        }
    }
}
/**
 * Given a latin-1 String, attempt to create a copy that is
 * incorrectly encoded as UTF-16 using the APIs for Codepoints.
 * <p>
 * A helper thread keeps toggling bit 8 (value 256) of the first element of
 * an int[] holding the String's chars while this thread repeatedly calls
 * {@code new String(int[], int, int)} on that array. The helper thread is
 * always stopped and joined before this method exits, including when the
 * constructor throws.
 *
 * @param original a non-empty String whose chars are all below 256
 * @return the first constructed String whose first char is below 256 but
 *         which is not equal to {@code original}; if that never happens,
 *         the String constructed on the attempt following 1_000_000 tries
 * @throws IllegalArgumentException if {@code original} is empty or contains
 *         a char of 256 or above
 */
public static String racyStringConstructionCodepoints(String original) throws ConcurrentModificationException {
    if (original.isEmpty()) {
        // there is no first code point to flip, and max() below would have no value
        throw new IllegalArgumentException(
                "Cannot race on an empty String");
    }
    if (original.chars().max().getAsInt() >= 256) {
        throw new IllegalArgumentException(
                "Can only work with latin-1 Strings");
    }

    int len = original.length();
    int[] codePoints = new int[len];
    for (int i = 0; i < len; i++) {
        codePoints[i] = original.charAt(i);
    }

    // In another thread, flip the first character back
    // and forth between being latin-1 or not
    Thread thread = new Thread(() -> {
        while (!Thread.interrupted()) {
            codePoints[0] ^= 256;
        }
    });
    thread.start();

    try {
        // at the same time call the String constructor,
        // until we hit the race condition
        int i = 0;
        while (true) {
            i++;
            String s = new String(codePoints, 0, len);
            if ((s.charAt(0) < 256 && !original.equals(s)) || i > 1_000_000) {
                return s;
            }
        }
    } finally {
        // Stop the flipper on every exit path; it is a non-daemon thread
        // and would otherwise spin forever if the constructor threw.
        thread.interrupt();
        try {
            thread.join();
        } catch (InterruptedException ie) {
            // ignore interrupt
        }
    }
}
/**
 * Returns a string created from a codepoint array that has been racily
 * modified to contain high and low surrogates. When the race is hit the
 * string is a different length than the original due to the surrogate
 * encoding.
 * <p>
 * A helper thread keeps toggling bit 16 (value 0x10000) of the first element
 * of an int[] holding the String's chars while this thread repeatedly calls
 * {@code new String(int[], int, int)} on that array. The helper thread is
 * always stopped and joined before this method exits, including when the
 * constructor throws.
 *
 * @param original a non-empty String whose chars are all below 256
 * @return the first constructed String whose length differs from that of
 *         {@code original}; if that never happens, the String constructed
 *         on the attempt following 1_000_000 tries
 * @throws IllegalArgumentException if {@code original} is empty or contains
 *         a char of 256 or above
 */
public static String racyStringConstructionCodepointsSurrogates(String original) throws ConcurrentModificationException {
    if (original.isEmpty()) {
        // there is no first code point to flip, and max() below would have no value
        throw new IllegalArgumentException(
                "Cannot race on an empty String");
    }
    if (original.chars().max().getAsInt() >= 256) {
        throw new IllegalArgumentException(
                "Can only work with latin-1 Strings");
    }

    int len = original.length();
    int[] codePoints = new int[len];
    for (int i = 0; i < len; i++) {
        codePoints[i] = original.charAt(i);
    }

    // In another thread, flip the first character back
    // and forth between being latin-1 or as a surrogate pair.
    Thread thread = new Thread(() -> {
        while (!Thread.interrupted()) {
            codePoints[0] ^= 0x10000;
        }
    });
    thread.start();

    try {
        // at the same time call the String constructor,
        // until we hit the race condition
        int i = 0;
        while (true) {
            i++;
            String s = new String(codePoints, 0, len);
            if ((s.length() != original.length()) || i > 1_000_000) {
                return s;
            }
        }
    } finally {
        // Stop the flipper on every exit path; it is a non-daemon thread
        // and would otherwise spin forever if the constructor threw.
        thread.interrupt();
        try {
            thread.join();
        } catch (InterruptedException ie) {
            // ignore interrupt
        }
    }
}
// A CharSequence view over a String whose charAt throws
// IllegalArgumentException whenever the requested char is 0xFFFD (the
// replacement character). Where 0xFFFD appears in the backing String
// therefore decides when the exception is raised.
static class ThrowingCharSequence implements CharSequence {
    private final String backing;

    ThrowingCharSequence(String aString) {
        backing = aString;
    }

    @Override
    public int length() {
        return backing.length();
    }

    @Override
    public char charAt(int index) {
        final char ch = backing.charAt(index);
        if (ch != 0xFFFD) {
            return ch;
        }
        throw new IllegalArgumentException("Replacement character at index " + index);
    }

    // Not used by the tests; ignores its arguments and returns this whole sequence.
    @Override
    public CharSequence subSequence(int start, int end) {
        return this;
    }
}
}

View File

@ -21,11 +21,13 @@
* questions. * questions.
*/ */
package micro.org.openjdk.bench.java.lang; package org.openjdk.bench.java.lang;
import org.openjdk.jmh.annotations.*; import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@State(Scope.Thread) @State(Scope.Thread)
@ -36,45 +38,115 @@ import java.util.concurrent.TimeUnit;
@Fork(3) @Fork(3)
public class StringConstructor { public class StringConstructor {
@Param({"7", "64"}) private static final char INTEROBANG = 0x2030;
public int size;
// Offset to use for ranged newStrings // Fixed offset to use for ranged newStrings
@Param("1") public final int offset = 1;
public int offset;
private byte[] array;
@Setup @Param({"7", "64"})
public void setup() { public int size;
if (offset > size) {
offset = size;
}
array = "a".repeat(size).getBytes(StandardCharsets.UTF_8);
}
@Benchmark private byte[] array;
public String newStringFromArray() { private char[] chars;
return new String(array); private char[] charsMixedBegin;
} private char[] charsMixedSmall;
private char[] charsMixedEnd;
private int[] codePointsLatin1;
private int[] codePointsMixedBegin;
private int[] codePointsMixedSmall;
@Benchmark private static int[] intCopyOfChars(char[] chars, int newLength) {
public String newStringFromArrayWithCharset() { int[] res = new int[newLength];
return new String(array, StandardCharsets.UTF_8); for (int i = 0; i < Math.min(chars.length, newLength); i++)
} res[i] = chars[i];
return res;
}
@Benchmark @Setup
public String newStringFromArrayWithCharsetName() throws Exception { public void setup() {
return new String(array, StandardCharsets.UTF_8.name()); String s = "a".repeat(size);
} array = s.getBytes(StandardCharsets.UTF_8);
chars = s.toCharArray();
charsMixedBegin = Arrays.copyOf(chars, array.length);
charsMixedBegin[0] = INTEROBANG;
charsMixedSmall = Arrays.copyOf(chars, array.length);
charsMixedSmall[Math.min(charsMixedSmall.length - 1, 7)] = INTEROBANG;
charsMixedEnd = new char[size + 7];
Arrays.fill(charsMixedEnd, 'a');
charsMixedEnd[charsMixedEnd.length - 1] = INTEROBANG;
@Benchmark codePointsLatin1 = intCopyOfChars(chars, array.length);
public String newStringFromRangedArray() { codePointsMixedBegin = intCopyOfChars(chars, array.length);
return new String(array, offset, array.length - offset); codePointsMixedBegin[0] = INTEROBANG;
} codePointsMixedSmall = intCopyOfChars(chars, array.length);
codePointsMixedSmall[Math.min(codePointsMixedSmall.length - 1, 7)] = INTEROBANG;
}
@Benchmark @Benchmark
public String newStringFromRangedArrayWithCharset() { public String newStringFromBytes() {
return new String(array, offset, array.length - offset, StandardCharsets.UTF_8); return new String(array);
} }
// String from the byte array with the first `offset` bytes skipped; no charset argument.
@Benchmark
public String newStringFromBytesRanged() {
return new String(array, offset, array.length - offset);
}
// Same byte range as above, decoded with an explicit UTF-8 Charset object.
@Benchmark
public String newStringFromBytesRangedWithCharsetUTF8() {
return new String(array, offset, array.length - offset, StandardCharsets.UTF_8);
}
// Whole byte array decoded with an explicit UTF-8 Charset object.
@Benchmark
public String newStringFromBytesWithCharsetUTF8() {
return new String(array, StandardCharsets.UTF_8);
}
// Whole byte array decoded with the UTF-8 charset passed by name.
@Benchmark
public String newStringFromBytesWithCharsetNameUTF8() throws Exception {
return new String(array, StandardCharsets.UTF_8.name());
}
// char[] made up only of 'a' characters.
@Benchmark
public String newStringFromCharsLatin1() {
return new String(chars);
}
// char[] whose first element is the non-latin1 INTEROBANG constant.
@Benchmark
public String newStringFromCharsMixedBegin() {
return new String(charsMixedBegin);
}
// char[] with INTEROBANG at index 7, or at the last index for shorter arrays.
@Benchmark
public String newStringFromCharsMixedSmall() {
return new String(charsMixedSmall);
}
// char[] of size + 7 elements with INTEROBANG as its last element.
@Benchmark
public String newStringFromCharsMixedEnd() {
return new String(charsMixedEnd);
}
// Builds three Strings per invocation (mixed-begin, mixed-small, all 'a') and
// hands each to the Blackhole; inlining of this method is disabled.
@Benchmark
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public void newStringFromCharsMixedAll(Blackhole bh) {
bh.consume(new String(charsMixedBegin));
bh.consume(new String(charsMixedSmall));
bh.consume(new String(chars));
}
// Code point array copied from the all-'a' chars, full range.
@Benchmark
public String newStringFromCodePointRangedLatin1() {
return new String(codePointsLatin1, 0, codePointsLatin1.length);
}
// Code point array whose first element is INTEROBANG, full range.
@Benchmark
public String newStringFromCodePointRangedMixedBegin() {
return new String(codePointsMixedBegin, 0, codePointsMixedBegin.length);
}
// Code point array with INTEROBANG at index 7, or at the last index for shorter arrays; full range.
@Benchmark
public String newStringFromCodePointRangedMixedSmall() {
return new String(codePointsMixedSmall, 0, codePointsMixedSmall.length);
}
} }