diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index 82316d5152a..4b465d7a8ca 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -5651,7 +5651,7 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value) // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray // return the number of characters copied. // - java/lang/StringUTF16.compress -// return zero (0) if copy fails, otherwise 'len'. +// return index of non-latin1 character if copy fails, otherwise 'len'. // // This version always returns the number of characters copied, and does not // clobber the 'len' register. A successful copy will complete with the post- @@ -5868,15 +5868,15 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register } // Compress char[] array to byte[]. +// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) +// Return the array length if every element in array can be encoded, +// otherwise, the index of first non-latin1 (> 0xff) character. void MacroAssembler::char_array_compress(Register src, Register dst, Register len, Register res, FloatRegister tmp0, FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4, FloatRegister tmp5) { encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); - // Adjust result: res == len ? 
len : 0 - cmp(len, res); - csel(res, res, zr, EQ); } // java.math.round(double a) diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index fb857182265..be37ff1785b 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -12727,16 +12727,8 @@ instruct string_compress(rarg1RegP src, rarg2RegP dst, iRegIsrc len, iRegIdst re ins_cost(300); format %{ "String Compress $src,$dst,$len -> $result \t// KILL $tmp1, $tmp2, $tmp3, $tmp4, $tmp5" %} ins_encode %{ - Label Lskip, Ldone; - __ li($result$$Register, 0); - __ string_compress_16($src$$Register, $dst$$Register, $len$$Register, $tmp1$$Register, - $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, $tmp5$$Register, Ldone); - __ rldicl_($tmp1$$Register, $len$$Register, 0, 64-3); // Remaining characters. - __ beq(CCR0, Lskip); - __ string_compress($src$$Register, $dst$$Register, $tmp1$$Register, $tmp2$$Register, Ldone); - __ bind(Lskip); - __ mr($result$$Register, $len$$Register); - __ bind(Ldone); + __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, $tmp1$$Register, $tmp2$$Register, + $tmp3$$Register, $tmp4$$Register, $tmp5$$Register, $result$$Register, false); %} ins_pipe(pipe_class_default); %} diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp index 5daeff51192..7a88c416f2a 100644 --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp @@ -1933,14 +1933,12 @@ void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Registe } // Compress char[] array to byte[]. -// result: the array length if every element in array can be encoded; 0, otherwise. +// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) +// result: the array length if every element in array can be encoded, +// otherwise, the index of first non-latin1 (> 0xff) character. 
void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, Register result, Register tmp) { - Label done; encode_iso_array_v(src, dst, len, result, tmp, false); - beqz(len, done); - mv(result, zr); - bind(done); } // Intrinsic for @@ -1948,7 +1946,7 @@ void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Regist // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray // return the number of characters copied. // - java/lang/StringUTF16.compress -// return zero (0) if copy fails, otherwise 'len'. +// return index of non-latin1 character if copy fails, otherwise 'len'. // // This version always returns the number of characters copied. A successful // copy will complete with the post-condition: 'res' == 'len', while an diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index e1d7104c6d6..9f4e182a9e4 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -10190,7 +10190,7 @@ instruct string_compress(iRegP src, iRegP dst, iRegI result, iRegI len, iRegI tm format %{ "String Compress $src->$dst($len) -> $result" %} ins_encode %{ __ string_compress($result$$Register, $src$$Register, $dst$$Register, $len$$Register, - $tmp$$Register, false, false); + $tmp$$Register, true, false); %} ins_pipe(pipe_class_dummy); %} diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index 6f04cdef508..aec1f3c9105 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -8628,15 +8628,19 @@ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Registe #undef BLOCK_COMMENT // Compress char[] array to byte[]. 
-// ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java +// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) +// Return the array length if every element in array can be encoded, +// otherwise, the index of first non-latin1 (> 0xff) character. // @IntrinsicCandidate -// private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { +// public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { // for (int i = 0; i < len; i++) { -// int c = src[srcOff++]; -// if (c >>> 8 != 0) { -// return 0; +// char c = src[srcOff]; +// if (c > 0xff) { +// return i; // return index of non-latin1 char // } -// dst[dstOff++] = (byte)c; +// dst[dstOff] = (byte)c; +// srcOff++; +// dstOff++; // } // return len; // } @@ -8644,7 +8648,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le XMMRegister tmp1Reg, XMMRegister tmp2Reg, XMMRegister tmp3Reg, XMMRegister tmp4Reg, Register tmp5, Register result, KRegister mask1, KRegister mask2) { - Label copy_chars_loop, return_length, return_zero, done; + Label copy_chars_loop, done, reset_sp, copy_tail; // rsi: src // rdi: dst @@ -8659,28 +8663,28 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le assert(len != result, ""); // save length for return - push(len); + movl(result, len); if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2()) { - Label copy_32_loop, copy_loop_tail, below_threshold; + Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail; // alignment Label post_alignment; - // if length of the string is less than 16, handle it in an old fashioned way + // if length of the string is less than 32, handle it the old fashioned way testl(len, -32); jcc(Assembler::zero, below_threshold); // First check whether a character is compressible ( <= 0xFF). 
// Create mask to test for Unicode chars inside zmm vector - movl(result, 0x00FF); - evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit); + movl(tmp5, 0x00FF); + evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit); testl(len, -64); - jcc(Assembler::zero, post_alignment); + jccb(Assembler::zero, post_alignment); movl(tmp5, dst); andl(tmp5, (32 - 1)); @@ -8689,18 +8693,19 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le // bail out when there is nothing to be done testl(tmp5, 0xFFFFFFFF); - jcc(Assembler::zero, post_alignment); + jccb(Assembler::zero, post_alignment); // ~(~0 << len), where len is the # of remaining elements to process - movl(result, 0xFFFFFFFF); - shlxl(result, result, tmp5); - notl(result); - kmovdl(mask2, result); + movl(len, 0xFFFFFFFF); + shlxl(len, len, tmp5); + notl(len); + kmovdl(mask2, len); + movl(len, result); evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); ktestd(mask1, mask2); - jcc(Assembler::carryClear, return_zero); + jcc(Assembler::carryClear, copy_tail); evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); @@ -8715,7 +8720,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le movl(tmp5, len); andl(tmp5, (32 - 1)); // tail count (in chars) andl(len, ~(32 - 1)); // vector count (in chars) - jcc(Assembler::zero, copy_loop_tail); + jccb(Assembler::zero, copy_loop_tail); lea(src, Address(src, len, Address::times_2)); lea(dst, Address(dst, len, Address::times_1)); @@ -8725,55 +8730,60 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit); evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); kortestdl(mask1, mask1); - jcc(Assembler::carryClear, return_zero); + jccb(Assembler::carryClear, 
reset_for_copy_tail); // All elements in current processed chunk are valid candidates for // compression. Write a truncated byte elements to the memory. evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit); addptr(len, 32); - jcc(Assembler::notZero, copy_32_loop); + jccb(Assembler::notZero, copy_32_loop); bind(copy_loop_tail); // bail out when there is nothing to be done testl(tmp5, 0xFFFFFFFF); - jcc(Assembler::zero, return_length); + jcc(Assembler::zero, done); movl(len, tmp5); // ~(~0 << len), where len is the # of remaining elements to process - movl(result, 0xFFFFFFFF); - shlxl(result, result, len); - notl(result); + movl(tmp5, 0xFFFFFFFF); + shlxl(tmp5, tmp5, len); + notl(tmp5); - kmovdl(mask2, result); + kmovdl(mask2, tmp5); evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); ktestd(mask1, mask2); - jcc(Assembler::carryClear, return_zero); + jcc(Assembler::carryClear, copy_tail); evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); - jmp(return_length); + jmp(done); + + bind(reset_for_copy_tail); + lea(src, Address(src, tmp5, Address::times_2)); + lea(dst, Address(dst, tmp5, Address::times_1)); + subptr(len, tmp5); + jmp(copy_chars_loop); bind(below_threshold); } if (UseSSE42Intrinsics) { - Label copy_32_loop, copy_16, copy_tail; - - movl(result, len); - - movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors + Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail; // vectored compression - andl(len, 0xfffffff0); // vector count (in chars) - andl(result, 0x0000000f); // tail count (in chars) - testl(len, len); - jcc(Assembler::zero, copy_16); + testl(len, 0xfffffff8); + jcc(Assembler::zero, copy_tail); - // compress 16 chars per iter + movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors movdl(tmp1Reg, tmp5); pshufd(tmp1Reg, tmp1Reg, 0); // 
store Unicode mask in tmp1Reg + + andl(len, 0xfffffff0); + jccb(Assembler::zero, copy_16); + + // compress 16 chars per iter pxor(tmp4Reg, tmp4Reg); lea(src, Address(src, len, Address::times_2)); @@ -8786,59 +8796,60 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters por(tmp4Reg, tmp3Reg); ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector - jcc(Assembler::notZero, return_zero); + jccb(Assembler::notZero, reset_for_copy_tail); packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte movdqu(Address(dst, len, Address::times_1), tmp2Reg); addptr(len, 16); - jcc(Assembler::notZero, copy_32_loop); + jccb(Assembler::notZero, copy_32_loop); // compress next vector of 8 chars (if any) bind(copy_16); - movl(len, result); - andl(len, 0xfffffff8); // vector count (in chars) - andl(result, 0x00000007); // tail count (in chars) - testl(len, len); - jccb(Assembler::zero, copy_tail); + // len = 0 + testl(result, 0x00000008); // check if there's a block of 8 chars to compress + jccb(Assembler::zero, copy_tail_sse); - movdl(tmp1Reg, tmp5); - pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg pxor(tmp3Reg, tmp3Reg); movdqu(tmp2Reg, Address(src, 0)); ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector - jccb(Assembler::notZero, return_zero); + jccb(Assembler::notZero, reset_for_copy_tail); packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte movq(Address(dst, 0), tmp2Reg); addptr(src, 16); addptr(dst, 8); + jmpb(copy_tail_sse); - bind(copy_tail); + bind(reset_for_copy_tail); + movl(tmp5, result); + andl(tmp5, 0x0000000f); + lea(src, Address(src, tmp5, Address::times_2)); + lea(dst, Address(dst, tmp5, Address::times_1)); + subptr(len, tmp5); + jmpb(copy_chars_loop); + + bind(copy_tail_sse); movl(len, result); + andl(len, 0x00000007); // tail count (in chars) } // compress 1 char per iter + 
bind(copy_tail); testl(len, len); - jccb(Assembler::zero, return_length); + jccb(Assembler::zero, done); lea(src, Address(src, len, Address::times_2)); lea(dst, Address(dst, len, Address::times_1)); negptr(len); bind(copy_chars_loop); - load_unsigned_short(result, Address(src, len, Address::times_2)); - testl(result, 0xff00); // check if Unicode char - jccb(Assembler::notZero, return_zero); - movb(Address(dst, len, Address::times_1), result); // ASCII char; compress to 1 byte + load_unsigned_short(tmp5, Address(src, len, Address::times_2)); + testl(tmp5, 0xff00); // check if Unicode char + jccb(Assembler::notZero, reset_sp); + movb(Address(dst, len, Address::times_1), tmp5); // ASCII char; compress to 1 byte increment(len); - jcc(Assembler::notZero, copy_chars_loop); + jccb(Assembler::notZero, copy_chars_loop); - // if compression succeeded, return length - bind(return_length); - pop(result); - jmpb(done); - - // if compression failed, return 0 - bind(return_zero); - xorl(result, result); - addptr(rsp, wordSize); + // add len then return (len will be zero if compress succeeded, otherwise negative) + bind(reset_sp); + addl(result, len); bind(done); } diff --git a/src/java.base/share/classes/java/lang/AbstractStringBuilder.java b/src/java.base/share/classes/java/lang/AbstractStringBuilder.java index b41b5db3783..187e041e674 100644 --- a/src/java.base/share/classes/java/lang/AbstractStringBuilder.java +++ b/src/java.base/share/classes/java/lang/AbstractStringBuilder.java @@ -130,6 +130,9 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence * as the specified {@code CharSequence}. The initial capacity of * the string builder is {@code 16} plus the length of the * {@code CharSequence} argument. + *

+ * The contents are unspecified if the {@code CharSequence} + * is modified during string construction. * * @param seq the sequence to copy. */ @@ -666,6 +669,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence * If {@code s} is {@code null}, then this method appends * characters as if the s parameter was a sequence containing the four * characters {@code "null"}. + *

+ * The contents are unspecified if the {@code CharSequence} + * is modified during the method call or an exception is thrown + * when accessing the {@code CharSequence}. * * @param s the sequence to append. * @param start the starting index of the subsequence to be appended. @@ -1241,6 +1248,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence * invocation of this object's * {@link #insert(int,CharSequence,int,int) insert}(dstOffset, s, 0, s.length()) * method. + *

+ * The contents are unspecified if the {@code CharSequence} + * is modified during the method call or an exception is thrown + * when accessing the {@code CharSequence}. * *

If {@code s} is {@code null}, then the four characters * {@code "null"} are inserted into this sequence. @@ -1289,6 +1300,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence *

If {@code s} is {@code null}, then this method inserts * characters as if the s parameter was a sequence containing the four * characters {@code "null"}. + *

+ * The contents are unspecified if the {@code CharSequence} + * is modified during the method call or an exception is thrown + * when accessing the {@code CharSequence}. * * @param dstOffset the offset in this sequence. * @param s the sequence to be inserted. @@ -1675,11 +1690,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence /* for readObject() */ void initBytes(char[] value, int off, int len) { if (String.COMPACT_STRINGS) { - this.value = StringUTF16.compress(value, off, len); - if (this.value != null) { - this.coder = LATIN1; - return; - } + byte[] val = StringUTF16.compress(value, off, len); + this.coder = StringUTF16.coderFromArrayLen(val, len); + this.value = val; + return; } this.coder = UTF16; this.value = StringUTF16.toBytes(value, off, len); @@ -1720,6 +1734,9 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence val[j++] = (byte)c; } else { inflate(); + // store c to make sure it has a UTF16 char + StringUTF16.putChar(this.value, j++, c); + i++; StringUTF16.putCharsSB(this.value, j, s, i, end); return; } @@ -1812,6 +1829,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence } else { count = j; inflate(); + // Store c to make sure sb has a UTF16 char + StringUTF16.putChar(this.value, j++, c); + count = j; + i++; StringUTF16.putCharsSB(this.value, j, s, i, end); count += end - i; return; @@ -1923,6 +1944,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence *

* If {@code cs} is {@code null}, then the four characters * {@code "null"} are repeated into this sequence. + *

+ * The contents are unspecified if the {@code CharSequence} + * is modified during the method call or an exception is thrown + * when accessing the {@code CharSequence}. * * @param cs a {@code CharSequence} * @param count number of times to copy diff --git a/src/java.base/share/classes/java/lang/Appendable.java b/src/java.base/share/classes/java/lang/Appendable.java index 0113d6f63f5..b1b40c4c3e8 100644 --- a/src/java.base/share/classes/java/lang/Appendable.java +++ b/src/java.base/share/classes/java/lang/Appendable.java @@ -57,6 +57,10 @@ public interface Appendable { * {@code csq}, the entire sequence may not be appended. For * instance, if {@code csq} is a {@link java.nio.CharBuffer} then * the subsequence to append is defined by the buffer's position and limit. + *

+ * The contents of this {@code Appendable} are unspecified if the {@code CharSequence} + * is modified during the method call or an exception is thrown + * when accessing the {@code CharSequence}. * * @param csq * The character sequence to append. If {@code csq} is @@ -81,6 +85,10 @@ public interface Appendable { *

      *     out.append(csq.subSequence(start, end)) 
* + *

+ * The contents of this {@code Appendable} are unspecified if the {@code CharSequence} + * is modified during the method call or an exception is thrown + * when accessing the {@code CharSequence}. * @param csq * The character sequence from which a subsequence will be * appended. If {@code csq} is {@code null}, then characters diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 96c9ec5f176..5869e086191 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -273,6 +273,9 @@ public final class String * contents of the character array are copied; subsequent modification of * the character array does not affect the newly created string. * + *

The contents of the string are unspecified if the character array + * is modified during string construction. + * * @param value * The initial value of the string */ @@ -288,6 +291,9 @@ public final class String * subarray are copied; subsequent modification of the character array does * not affect the newly created string. * + *

The contents of the string are unspecified if the character array + * is modified during string construction. + * * @param value * Array that is the source of characters * @@ -319,6 +325,9 @@ public final class String * {@code char}s; subsequent modification of the {@code int} array does not * affect the newly created string. * + *

The contents of the string are unspecified if the codepoints array + * is modified during string construction. + * * @param codePoints * Array that is the source of Unicode code points * @@ -346,12 +355,10 @@ public final class String return; } if (COMPACT_STRINGS) { - byte[] val = StringLatin1.toBytes(codePoints, offset, count); - if (val != null) { - this.coder = LATIN1; - this.value = val; - return; - } + byte[] val = StringUTF16.compress(codePoints, offset, count); + this.coder = StringUTF16.coderFromArrayLen(val, count); + this.value = val; + return; } this.coder = UTF16; this.value = StringUTF16.toBytes(codePoints, offset, count); @@ -368,6 +375,9 @@ public final class String *

Each {@code byte} in the subarray is converted to a {@code char} as * specified in the {@link #String(byte[],int) String(byte[],int)} constructor. * + *

The contents of the string are unspecified if the byte array + * is modified during string construction. + * * @deprecated This method does not properly convert bytes into characters. * As of JDK 1.1, the preferred way to do this is via the * {@code String} constructors that take a {@link Charset}, charset name, @@ -429,6 +439,9 @@ public final class String * | (b & 0xff)) * * + *

The contents of the string are unspecified if the byte array + * is modified during string construction. + * * @deprecated This method does not properly convert bytes into * characters. As of JDK 1.1, the preferred way to do this is via the * {@code String} constructors that take a {@link Charset}, charset name, @@ -463,6 +476,9 @@ public final class String * java.nio.charset.CharsetDecoder} class should be used when more control * over the decoding process is required. * + *

The contents of the string are unspecified if the byte array + * is modified during string construction. + * * @param bytes * The bytes to be decoded into characters * @@ -501,6 +517,9 @@ public final class String * java.nio.charset.CharsetDecoder} class should be used when more control * over the decoding process is required. * + *

The contents of the string are unspecified if the byte array + * is modified during string construction. + * * @param bytes * The bytes to be decoded into characters * @@ -543,47 +562,43 @@ public final class String this.coder = LATIN1; return; } - int sl = offset + length; - byte[] dst = new byte[length]; - if (dp > 0) { - System.arraycopy(bytes, offset, dst, 0, dp); - offset += dp; - } - while (offset < sl) { - int b1 = bytes[offset++]; + // Decode with a stable copy, to be the result if the decoded length is the same + byte[] latin1 = Arrays.copyOfRange(bytes, offset, offset + length); + int sp = dp; // first dp bytes are already in the copy + while (sp < length) { + int b1 = latin1[sp++]; if (b1 >= 0) { - dst[dp++] = (byte)b1; + latin1[dp++] = (byte)b1; continue; } - if ((b1 & 0xfe) == 0xc2 && offset < sl) { // b1 either 0xc2 or 0xc3 - int b2 = bytes[offset]; + if ((b1 & 0xfe) == 0xc2 && sp < length) { // b1 either 0xc2 or 0xc3 + int b2 = latin1[sp]; if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65 - dst[dp++] = (byte)decode2(b1, b2); - offset++; + latin1[dp++] = (byte)decode2(b1, b2); + sp++; continue; } } // anything not a latin1, including the REPL // we have to go with the utf16 - offset--; + sp--; break; } - if (offset == sl) { - if (dp != dst.length) { - dst = Arrays.copyOf(dst, dp); + if (sp == length) { + if (dp != latin1.length) { + latin1 = Arrays.copyOf(latin1, dp); } - this.value = dst; + this.value = latin1; this.coder = LATIN1; return; } - byte[] buf = new byte[length << 1]; - StringLatin1.inflate(dst, 0, buf, 0, dp); - dst = buf; - dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true); + byte[] utf16 = new byte[length << 1]; + StringLatin1.inflate(latin1, 0, utf16, 0, dp); + dp = decodeUTF8_UTF16(latin1, sp, length, utf16, dp, true); if (dp != length) { - dst = Arrays.copyOf(dst, dp << 1); + utf16 = Arrays.copyOf(utf16, dp << 1); } - this.value = dst; + this.value = utf16; this.coder = UTF16; } else { // 
!COMPACT_STRINGS byte[] dst = new byte[length << 1]; @@ -655,12 +670,10 @@ public final class String char[] ca = new char[en]; int clen = ad.decode(bytes, offset, length, ca); if (COMPACT_STRINGS) { - byte[] bs = StringUTF16.compress(ca, 0, clen); - if (bs != null) { - value = bs; - coder = LATIN1; - return; - } + byte[] val = StringUTF16.compress(ca, 0, clen); + this.coder = StringUTF16.coderFromArrayLen(val, clen); + this.value = val; + return; } coder = UTF16; value = StringUTF16.toBytes(ca, 0, clen); @@ -686,12 +699,10 @@ public final class String throw new Error(x); } if (COMPACT_STRINGS) { - byte[] bs = StringUTF16.compress(ca, 0, caLen); - if (bs != null) { - value = bs; - coder = LATIN1; - return; - } + byte[] val = StringUTF16.compress(ca, 0, caLen); + this.coder = StringUTF16.coderFromArrayLen(val, caLen); + this.value = val; + return; } coder = UTF16; value = StringUTF16.toBytes(ca, 0, caLen); @@ -829,10 +840,9 @@ public final class String throw new IllegalArgumentException(x); } if (COMPACT_STRINGS) { - byte[] bs = StringUTF16.compress(ca, 0, caLen); - if (bs != null) { - return new String(bs, LATIN1); - } + byte[] val = StringUTF16.compress(ca, 0, caLen); + byte coder = StringUTF16.coderFromArrayLen(val, caLen); + return new String(val, coder); } return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16); } @@ -1386,6 +1396,9 @@ public final class String * java.nio.charset.CharsetDecoder} class should be used when more control * over the decoding process is required. * + *

The contents of the string are unspecified if the byte array + * is modified during string construction. + * * @param bytes * The bytes to be decoded into characters * @@ -1414,6 +1427,9 @@ public final class String * java.nio.charset.CharsetDecoder} class should be used when more control * over the decoding process is required. * + *

The contents of the string are unspecified if the byte array + * is modified during string construction. + * * @param bytes * The bytes to be decoded into characters * @@ -1438,6 +1454,9 @@ public final class String * java.nio.charset.CharsetDecoder} class should be used when more control * over the decoding process is required. * + *

The contents of the string are unspecified if the byte array + * is modified during string construction. + * * @param bytes * The bytes to be decoded into characters * @@ -1468,6 +1487,9 @@ public final class String * java.nio.charset.CharsetDecoder} class should be used when more control * over the decoding process is required. * + *

The contents of the string are unspecified if the byte array + * is modified during string construction. + * * @param bytes * The bytes to be decoded into characters * @@ -1496,6 +1518,9 @@ public final class String * string builder are copied; subsequent modification of the string builder * does not affect the newly created string. * + *

The contents of the string are unspecified if the {@code StringBuilder} + * is modified during string construction. + * *

This constructor is provided to ease migration to {@code * StringBuilder}. Obtaining a string from a string builder via the {@code * toString} method is likely to run faster and is generally preferred. @@ -4488,6 +4513,9 @@ public final class String * modification of the character array does not affect the returned * string. * + *

The contents of the string are unspecified if the character array + * is modified during string construction. + * * @param data the character array. * @return a {@code String} that contains the characters of the * character array. @@ -4506,6 +4534,9 @@ public final class String * are copied; subsequent modification of the character array does not * affect the returned string. * + *

The contents of the string are unspecified if the character array + * is modified during string construction. + * * @param data the character array. * @param offset initial offset of the subarray. * @param count length of the subarray. @@ -4767,15 +4798,18 @@ public final class String } /* - * Package private constructor. Trailing Void argument is there for + * Private constructor. Trailing Void argument is there for * disambiguating it against other (public) constructors. * * Stores the char[] value into a byte[] that each byte represents * the8 low-order bits of the corresponding character, if the char[] * contains only latin1 character. Or a byte[] that stores all * characters in their byte sequences defined by the {@code StringUTF16}. + * + *

The contents of the string are unspecified if the character array + * is modified during string construction. */ - String(char[] value, int off, int len, Void sig) { + private String(char[] value, int off, int len, Void sig) { if (len == 0) { this.value = "".value; this.coder = "".coder; @@ -4783,11 +4817,9 @@ public final class String } if (COMPACT_STRINGS) { byte[] val = StringUTF16.compress(value, off, len); - if (val != null) { - this.value = val; - this.coder = LATIN1; - return; - } + this.coder = StringUTF16.coderFromArrayLen(val, len); + this.value = val; + return; } this.coder = UTF16; this.value = StringUTF16.toBytes(value, off, len); @@ -4796,6 +4828,9 @@ public final class String /* * Package private constructor. Trailing Void argument is there for * disambiguating it against other (public) constructors. + * + *

The contents of the string are unspecified if the {@code StringBuilder} + * is modified during string construction. */ String(AbstractStringBuilder asb, Void sig) { byte[] val = asb.getValue(); @@ -4806,12 +4841,9 @@ public final class String } else { // only try to compress val if some characters were deleted. if (COMPACT_STRINGS && asb.maybeLatin1) { - byte[] buf = StringUTF16.compress(val, 0, length); - if (buf != null) { - this.coder = LATIN1; - this.value = buf; - return; - } + this.value = StringUTF16.compress(val, 0, length); + this.coder = StringUTF16.coderFromArrayLen(this.value, length); + return; } this.coder = UTF16; this.value = Arrays.copyOfRange(val, 0, length << 1); diff --git a/src/java.base/share/classes/java/lang/StringLatin1.java b/src/java.base/share/classes/java/lang/StringLatin1.java index 71776cca485..a83cf65e140 100644 --- a/src/java.base/share/classes/java/lang/StringLatin1.java +++ b/src/java.base/share/classes/java/lang/StringLatin1.java @@ -47,8 +47,12 @@ final class StringLatin1 { return (char)(value[index] & 0xff); } + public static boolean canEncode(char cp) { + return cp <= 0xff; + } + public static boolean canEncode(int cp) { - return cp >>> 8 == 0; + return cp >=0 && cp <= 0xff; } public static int length(byte[] value) { diff --git a/src/java.base/share/classes/java/lang/StringUTF16.java b/src/java.base/share/classes/java/lang/StringUTF16.java index eca2a9b00cb..42a1fa5ed21 100644 --- a/src/java.base/share/classes/java/lang/StringUTF16.java +++ b/src/java.base/share/classes/java/lang/StringUTF16.java @@ -34,7 +34,6 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; import jdk.internal.util.ArraysSupport; import jdk.internal.util.DecimalDigits; -import jdk.internal.vm.annotation.DontInline; import jdk.internal.vm.annotation.ForceInline; import jdk.internal.vm.annotation.IntrinsicCandidate; @@ -54,6 +53,19 @@ final class StringUTF16 { return new byte[len << 1]; } + // Check the size of a UTF16-coded string + 
// Throw an exception if out of range + public static int newBytesLength(int len) { + if (len < 0) { + throw new NegativeArraySizeException(); + } + if (len > MAX_LENGTH) { + throw new OutOfMemoryError("UTF16 String size is " + len + + ", should be less than " + MAX_LENGTH); + } + return len << 1; + } + @IntrinsicCandidate // intrinsic performs no bounds checks static void putChar(byte[] val, int index, int c) { @@ -148,6 +160,13 @@ final class StringUTF16 { return dst; } + /** + * {@return an encoded byte[] for the UTF16 characters in char[]} + * No checking is done on the characters, some may or may not be latin1. + * @param value a char array + * @param off an offset + * @param len a length + */ @IntrinsicCandidate public static byte[] toBytes(char[] value, int off, int len) { byte[] val = newBytesFor(len); @@ -158,20 +177,209 @@ final class StringUTF16 { return val; } - public static byte[] compress(char[] val, int off, int len) { - byte[] ret = new byte[len]; - if (compress(val, off, ret, 0, len) == len) { - return ret; - } - return null; + // Clever way to get the coder from a byte array returned from compress + // that maybe either latin1 or UTF16-coded + // Equivalent to (len == val.length) ? LATIN1 : UTF16 + @ForceInline + static byte coderFromArrayLen(byte[] value, int len) { + return (byte) ((len - value.length) >>> Integer.SIZE - 1); } - public static byte[] compress(byte[] val, int off, int len) { - byte[] ret = new byte[len]; - if (compress(val, off, ret, 0, len) == len) { - return ret; + /** + * {@return Compress the char array (containing UTF16) into a compact strings byte array} + * If all the chars are LATIN1, it returns an array with len == count, + * otherwise, it contains UTF16 characters. + *

+ * A UTF16 array is returned *only* if at least 1 non-latin1 character is present. + * This must be true even if the input array is modified while this method is executing. + * This is assured by copying the characters while checking for latin1. + * If all characters are latin1, a byte array with length equal to count is returned, + * indicating all latin1 chars. The scan may be implemented as an intrinsic, + * which returns the index of the first non-latin1 character. + * When a non-latin1 character is found, it switches to creating a new + * UTF16 buffer; + * all of the input characters are copied to that buffer. + * The character at the index of the known non-latin1 character is then checked in the copy; if it is latin1, + * the input has been changed. In this case, a second attempt is made to compress to + * latin1 from the copy made in the first pass to the originally allocated latin1 buffer. + * If it succeeds, the return value is latin1; otherwise, the utf16 value is returned. + * In this unusual case, the result is correct for the snapshot of the value. + * The resulting string contents are unspecified if the input array is modified during this + * operation, but it is ensured that at least 1 non-latin1 character is present in + * the non-latin1 buffer.
+ * + * @param val a char array + * @param off starting offset + * @param count count of chars to be compressed, {@code count} > 0 + */ + @ForceInline + public static byte[] compress(final char[] val, final int off, final int count) { + byte[] latin1 = new byte[count]; + int ndx = compress(val, off, latin1, 0, count); + if (ndx != count) { + // Switch to UTF16 + byte[] utf16 = toBytes(val, off, count); + // If the original character that was found to be non-latin1 is latin1 in the copy + // try to make a latin1 string from the copy + if (getChar(utf16, ndx) > 0xff + || compress(utf16, 0, latin1, 0, count) != count) { + return utf16; + } } - return null; + return latin1; // latin1 success + } + + /** + * {@return Compress the internal byte array (containing UTF16) into a compact strings byte array} + * If all the chars are LATIN1, it returns an array with len == count, + * otherwise, it contains UTF16 characters. + *

+ * Refer to the description of the algorithm in {@link #compress(char[], int, int)}. + * + * @param val a byte array with UTF16 coding + * @param off starting offset + * @param count count of chars to be compressed, {@code count} > 0 + */ + public static byte[] compress(final byte[] val, final int off, final int count) { + byte[] latin1 = new byte[count]; + int ndx = compress(val, off, latin1, 0, count); + if (ndx != count) {// Switch to UTF16 + byte[] utf16 = Arrays.copyOfRange(val, off << 1, newBytesLength(off + count)); + // If the original character that was found to be non-latin1 is latin1 in the copy + // try to make a latin1 string from the copy + if (getChar(utf16, ndx) > 0xff + || compress(utf16, 0, latin1, 0, count) != count) { + return utf16; + } + } + return latin1; // latin1 success + } + + /** + * {@return compress the code points into a compact strings byte array} + * If all the chars are LATIN1, returns an array with len == count. + * If not, a new byte array is allocated and code points converted to UTF16. + * The algorithm is similar to that of {@link #compress(char[], int, int)}. + *

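+ * The resulting encoding is attempted in several steps: + * <ol> + * <li>Code points are optimistically stored into a latin1 array for as long as each one is in the range 0..0xff; if all of them are, the latin1 array is returned.</li> + * <li>At the first code point outside that range, the number of chars needed for the remaining code points is computed (an invalid code point throws IllegalArgumentException), a UTF16 array is allocated, and the latin1 prefix is inflated into it.</li> + * <li>If the computed size equals {@code count}, every remaining code point is a BMP character and is stored directly.</li> + * <li>Otherwise the remaining code points are expanded by extractCodepoints; if the char at the first non-latin1 index is latin1 in the copy, a latin1 encoding of the copy is attempted.</li> + * </ol> + *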

+ * + * @param val an int array of code points + * @param off starting offset + * @param count length of code points to be compressed, length > 0 + */ + public static byte[] compress(final int[] val, int off, final int count) { + // Optimistically copy all latin1 code points to the destination + byte[] latin1 = new byte[count]; + final int end = off + count; + for (int ndx = 0; ndx < count; ndx++, off++) { + int cp = val[off]; + if (cp >= 0 && cp <= 0xff) { + latin1[ndx] = (byte)cp; + } else { + // Pass 1: Compute precise size of char[]; see extractCodePoints for caveat + int estSize = ndx + computeCodePointSize(val, off, end); + + // Pass 2: Switch to UTF16 + // cp = val[ndx] is at least one code point known to be UTF16 + byte[] utf16 = newBytesFor(estSize); + if (ndx > 0) { + StringLatin1.inflate(latin1, 0, utf16, 0, ndx); // inflate latin1 bytes + } + + if (estSize == count) { + // Based on the computed size, all remaining code points are BMP and + // can be copied without checking again + putChar(utf16, ndx, cp); // ensure utf16 has a UTF16 char + off++; + for (int i = ndx + 1; i < count; i++, off++) { + putChar(utf16, i, val[off]); + } + } else { + // Some codepoint is a surrogate pair + utf16 = extractCodepoints(val, off, end, utf16, ndx); + + // The original character that was found to be UTF16 is not UTF16 in the copy + // Try to make a latin1 string from the copy + if (getChar(utf16, ndx) <= 0xff && + compress(utf16, 0, latin1, 0, count) == count) { + return latin1; // latin1 success + } + } + return utf16; + } + } + return latin1; // Latin1 success + } + + // Extract code points into chars in the byte array + // + // Guard against possible races with the input array changing between the previous + // computation of the required output size and storing the bmp or surrogates. + // If a BMP code point is changed to a supplementary code point it would require 2 chars + // in the output. Changing a supplementary char to BMP would reduce the size. 
+ // If the utf16 destination is not large enough, it is resized to fit the + // remaining codepoints assuming they occupy 2 characters. + // The destination may be copied to return exactly the final length. + // The additional allocations and compression only occur if the input array is modified. + private static byte[] extractCodepoints(int[] val, int off, int end, byte[] dst, int dstOff) { + while (off < end) { + // Compute a minimum estimate on the number of characters can be put into the dst + // given the current codepoint and the number of remaining codepoints + int codePoint = val[off]; // read each codepoint from val only once + int dstLimit = dstOff + + Character.charCount(codePoint) + + (end - off - 1); + if (dstLimit > (dst.length >> 1)) { + // Resize to hold the remaining codepoints assuming they are all surrogates. + // By resizing to the maximum that might be needed, only a single resize will occur. + // dstLimit includes only a single char per codepoint, pad with an additional for each. + int maxRemaining = dstLimit + (end - off - 1); + dst = Arrays.copyOf(dst, newBytesLength(maxRemaining)); + } + // Efficiently copy as many codepoints as fit within the current estimated limit + // The dst at least enough space for the current codepoint. + while (true) { + if (Character.isBmpCodePoint(codePoint)) { + putChar(dst, dstOff++, codePoint); + } else { + putChar(dst, dstOff++, Character.highSurrogate(codePoint)); + putChar(dst, dstOff++, Character.lowSurrogate(codePoint)); + } + off++; + if (dstOff + 2 > dstLimit) + break; // no space for another surrogate; recompute limit + codePoint = val[off]; + } + } + if (dstOff != (dst.length >> 1)) { + // Truncate to actual length; should only occur if a codepoint was racily + // changed from a surrogate to a BMP character. 
+ return Arrays.copyOf(dst, newBytesLength(dstOff)); + } + return dst; + } + + // Compute the number of chars needed to represent the code points from off to end-1 + private static int computeCodePointSize(int[] val, int off, int end) { + int n = end - off; + while (off < end) { + int codePoint = val[off++]; + if (Character.isBmpCodePoint(codePoint)) { + continue; + } else if (Character.isValidCodePoint(codePoint)) { + n++; + } else { + throw new IllegalArgumentException(Integer.toString(codePoint)); + } + } + return n; } // compressedCopy char[] -> byte[] @@ -179,9 +387,8 @@ final class StringUTF16 { public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { for (int i = 0; i < len; i++) { char c = src[srcOff]; - if (c > 0xFF) { - len = 0; - break; + if (c > 0xff) { + return i; // return index of non-latin1 char } dst[dstOff] = (byte)c; srcOff++; @@ -197,9 +404,8 @@ final class StringUTF16 { checkBoundsOffCount(srcOff, len, src); for (int i = 0; i < len; i++) { char c = getChar(src, srcOff); - if (c > 0xFF) { - len = 0; - break; + if (c > 0xff) { + return i; // return index of non-latin1 char } dst[dstOff] = (byte)c; srcOff++; @@ -208,31 +414,14 @@ final class StringUTF16 { return len; } + // Create the UTF16 buffer for !COMPACT_STRINGS public static byte[] toBytes(int[] val, int index, int len) { final int end = index + len; - // Pass 1: Compute precise size of char[] - int n = len; - for (int i = index; i < end; i++) { - int cp = val[i]; - if (Character.isBmpCodePoint(cp)) - continue; - else if (Character.isValidCodePoint(cp)) - n++; - else throw new IllegalArgumentException(Integer.toString(cp)); - } - // Pass 2: Allocate and fill in pair + int n = computeCodePointSize(val, index, end); + byte[] buf = newBytesFor(n); - for (int i = index, j = 0; i < end; i++, j++) { - int cp = val[i]; - if (Character.isBmpCodePoint(cp)) { - putChar(buf, j, cp); - } else { - putChar(buf, j++, Character.highSurrogate(cp)); - putChar(buf, j, 
Character.lowSurrogate(cp)); - } - } - return buf; - } + return extractCodepoints(val, index, end, buf, 0); + } public static byte[] toBytes(char c) { byte[] result = new byte[2]; @@ -653,10 +842,9 @@ final class StringUTF16 { if (String.COMPACT_STRINGS && !StringLatin1.canEncode(oldChar) && StringLatin1.canEncode(newChar)) { - byte[] val = compress(buf, 0, len); - if (val != null) { - return new String(val, LATIN1); - } + byte[] res = StringUTF16.compress(buf, 0, len); + byte coder = StringUTF16.coderFromArrayLen(res, len); + return new String(res, coder); } return new String(buf, UTF16); } @@ -771,10 +959,9 @@ final class StringUTF16 { if (String.COMPACT_STRINGS && replLat1 && !targLat1) { // combination 6 - byte[] lat1Result = compress(result, 0, resultLen); - if (lat1Result != null) { - return new String(lat1Result, LATIN1); - } + byte[] res = StringUTF16.compress(result, 0, resultLen); + byte coder = StringUTF16.coderFromArrayLen(res, resultLen); + return new String(res, coder); // combination 6 } return new String(result, UTF16); } @@ -838,7 +1025,7 @@ final class StringUTF16 { bits |= cp; putChar(result, i, cp); } - if (bits > 0xFF) { + if (bits < 0 || bits > 0xff) { return new String(result, UTF16); } else { return newString(result, 0, len); @@ -939,7 +1126,7 @@ final class StringUTF16 { bits |= cp; putChar(result, i, cp); } - if (bits > 0xFF) { + if (bits < 0 || bits > 0xff) { return new String(result, UTF16); } else { return newString(result, 0, len); @@ -1168,10 +1355,9 @@ final class StringUTF16 { return ""; } if (String.COMPACT_STRINGS) { - byte[] buf = compress(val, index, len); - if (buf != null) { - return new String(buf, LATIN1); - } + byte[] res = StringUTF16.compress(val, index, len); + byte coder = StringUTF16.coderFromArrayLen(res, len); + return new String(res, coder); } int last = index + len; return new String(Arrays.copyOfRange(val, index << 1, last << 1), UTF16); @@ -1502,8 +1688,8 @@ final class StringUTF16 { private static native boolean
isBigEndian(); - static final int HI_BYTE_SHIFT; - static final int LO_BYTE_SHIFT; + private static final int HI_BYTE_SHIFT; + private static final int LO_BYTE_SHIFT; static { if (isBigEndian()) { HI_BYTE_SHIFT = 8; diff --git a/test/hotspot/jtreg/compiler/intrinsics/string/TestStringConstructionIntrinsics.java b/test/hotspot/jtreg/compiler/intrinsics/string/TestStringConstructionIntrinsics.java new file mode 100644 index 00000000000..8bec9822462 --- /dev/null +++ b/test/hotspot/jtreg/compiler/intrinsics/string/TestStringConstructionIntrinsics.java @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8311906 + * @summary Validates String constructor intrinsics using varied input data. 
+ * @key randomness + * @library /compiler/patches /test/lib + * @build java.base/java.lang.Helper + * @run main/othervm/timeout=1200 -Xbatch -XX:CompileThreshold=100 compiler.intrinsics.string.TestStringConstructionIntrinsics + */ +/* + * @test + * @bug 8311906 + * @summary Validates String constructor intrinsic for AVX3 works with and without + * AVX3Threshold=0 + * @key randomness + * @library /compiler/patches /test/lib + * @build java.base/java.lang.Helper + * @requires vm.cpu.features ~= ".*avx512.*" + * @run main/othervm/timeout=1200 -Xbatch -XX:CompileThreshold=100 -XX:UseAVX=3 compiler.intrinsics.string.TestStringConstructionIntrinsics + * @run main/othervm/timeout=1200 -Xbatch -XX:CompileThreshold=100 -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:AVX3Threshold=0 compiler.intrinsics.string.TestStringConstructionIntrinsics + */ + +package compiler.intrinsics.string; + +import java.lang.Helper; +import java.util.Random; + +import jdk.test.lib.Utils; + +public class TestStringConstructionIntrinsics { + + private static byte[] bytes = new byte[2 * (4096 + 32)]; + + private static char[] chars = new char[4096 + 32]; + + // Used a scratch buffer, sized to accommodate inflated + private static byte[] dst = new byte[bytes.length * 2]; + + private static final Random RANDOM = Utils.getRandomInstance(); + + /** + * Completely initialize the bytes test array. 
The lowest index that will be + * non-latin1 is marked by nlOffset + */ + public static void initializeBytes(int off, int len, int nonLatin1, int nlOffset) { + int maxLen = bytes.length >> 1; + assert (len + off < maxLen); + // insert "canary" (non-latin1) values before offset + for (int i = 0; i < off; i++) { + Helper.putCharSB(bytes, i, ((i + 15) & 0x7F) | 0x180); + } + // fill the array segment + for (int i = off; i < len + off; i++) { + Helper.putCharSB(bytes, i, ((i - off + 15) & 0xFF)); + } + if (nonLatin1 != 0) { + // modify a number disparate indexes to be non-latin1 + for (int i = 0; i < nonLatin1; i++) { + int idx = off + RANDOM.nextInt(len - nlOffset) + nlOffset; + Helper.putCharSB(bytes, idx, ((i + 15) & 0x7F) | 0x180); + } + } + // insert "canary" non-latin1 values after array segment + for (int i = len + off; i < maxLen; i++) { + Helper.putCharSB(bytes, i, ((i + 15) & 0x7F) | 0x180); + } + } + + /** + * Completely initialize the char test array. The lowest index that will be + * non-latin1 is marked by nlOffset + */ + public static void initializeChars(int off, int len, int nonLatin1, int nlOffset) { + assert (len + off <= chars.length); + // insert "canary" non-latin1 values before offset + for (int i = 0; i < off; ++i) { + chars[i] = (char) (((i + 15) & 0x7F) | 0x180); + } + // fill the array segment + for (int i = off; i < len + off; ++i) { + chars[i] = (char) (((i - off + 15) & 0xFF)); + } + if (nonLatin1 != 0) { + // modify a number disparate chars inside + // segment to be non-latin1. + for (int i = 0; i < nonLatin1; i++) { + int idx = off + RANDOM.nextInt(len - nlOffset) + nlOffset; + chars[idx] = (char) (0x180 | chars[idx]); + } + } + // insert "canary" non-latin1 values after array segment + for (int i = len + off; i < chars.length; ++i) { + chars[i] = (char) (((i + 15) & 0x7F) | 0x180); + } + } + + /** + * Test different array segment sizes, offsets, and number of non-latin1 + * chars.
+ */ + public static void testConstructBytes() throws Exception { + for (int off = 0; off < 16; off++) { // starting offset of array segment + // Test all array segment sizes 1-63 + for (int len = 1; len < 64; len++) { + testConstructBytes(off, len, 0, 0); + testConstructBytes(off, len, 1, 0); + testConstructBytes(off, len, RANDOM.nextInt(30) + 2, 0); + } + // Test a random selection of sizes between 64 and 4099, inclusive + for (int i = 0; i < 20; i++) { + int len = 64 + RANDOM.nextInt(4100 - 64); + testConstructBytes(off, len, 0, 0); + testConstructBytes(off, len, 1, 0); + testConstructBytes(off, len, RANDOM.nextInt(len) + 2, 0); + } + for (int len : new int[] { 128, 2048 }) { + // test with negatives only in a 1-63 byte tail + int tail = RANDOM.nextInt(63) + 1; + int ng = RANDOM.nextInt(tail) + 1; + testConstructBytes(off, len + tail, ng, len); + } + } + } + + private static void testConstructBytes(int off, int len, int ng, int ngOffset) throws Exception { + assert (len + off < bytes.length); + initializeBytes(off, len, ng, ngOffset); + byte[] dst = new byte[bytes.length]; + + int calculated = Helper.compress(bytes, off, dst, 0, len); + int expected = compress(bytes, off, dst, 0, len); + if (calculated != expected) { + if (expected != len && ng >= 0 && calculated >= 0 && calculated < expected) { + // allow intrinsics to return early with a lower value, + // but only if we're not expecting the full length (no + // negative bytes) + return; + } + throw new Exception("Failed testConstructBytes: " + "offset: " + off + " " + + "length: " + len + " " + "return: " + calculated + " expected: " + expected + " negatives: " + + ng + " offset: " + ngOffset); + } + } + + private static int compress(byte[] src, int srcOff, byte[] dst, int dstOff, int len) { + for (int i = 0; i < len; i++) { + char c = Helper.charAt(src, srcOff); + if (c > 0xff) { + return i; // return index of non-latin1 char + } + dst[dstOff] = (byte)c; + srcOff++; + dstOff++; + } + return len; + } + + /** + 
* Test different array segment sizes, offsets, and number of non-latin1 + * chars. + */ + public static void testConstructChars() throws Exception { + for (int off = 0; off < 16; off++) { // starting offset of array segment + // Test all array segment sizes 1-63 + for (int len = 1; len < 64; len++) { + testConstructChars(off, len, 0, 0); + testConstructChars(off, len, 1, 0); + testConstructChars(off, len, RANDOM.nextInt(30) + 2, 0); + } + // Test a random selection of sizes between 64 and 4099, inclusive + for (int i = 0; i < 20; i++) { + int len = 64 + RANDOM.nextInt(4100 - 64); + testConstructChars(off, len, 0, 0); + testConstructChars(off, len, 1, 0); + testConstructChars(off, len, RANDOM.nextInt(len) + 2, 0); + } + for (int len : new int[] { 128, 2048 }) { + // test with negatives only in a 1-63 byte tail + int tail = RANDOM.nextInt(63) + 1; + int ng = RANDOM.nextInt(tail) + 1; + testConstructChars(off, len + tail, ng, len); + } + } + } + + private static void testConstructChars(int off, int len, int nonLatin1, int nlOffset) throws Exception { + assert (len + off < bytes.length); + initializeChars(off, len, nonLatin1, nlOffset); + + int calculated = Helper.compress(chars, off, dst, 0, len); + int expected = compress(chars, off, dst, 0, len); + if (calculated != expected) { + if (expected != len && nonLatin1 >= 0 && calculated >= 0 && calculated < expected) { + // allow intrinsics to return early with a lower value, + // but only if we're not expecting the full length (no + // negative bytes) + return; + } + throw new Exception("Failed testConstructChars: " + "offset: " + off + " " + + "length: " + len + " " + "return: " + calculated + " expected: " + expected + " non-latin1: " + + nonLatin1 + " offset: " + nlOffset); + } + } + + private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { + for (int i = 0; i < len; i++) { + char c = src[srcOff]; + if (c > 0xff) { + return i; // return index of non-latin1 char + } + dst[dstOff] = 
(byte)c; + srcOff++; + dstOff++; + } + return len; + } + + public void run() throws Exception { + // iterate to eventually get intrinsic inlined + for (int j = 0; j < 200; ++j) { + testConstructBytes(); + testConstructChars(); + } + } + + public static void main(String[] args) throws Exception { + (new TestStringConstructionIntrinsics()).run(); + System.out.println("string construction intrinsics validated"); + } +} diff --git a/test/hotspot/jtreg/compiler/patches/java.base/java/lang/Helper.java b/test/hotspot/jtreg/compiler/patches/java.base/java/lang/Helper.java index 49cb89b6f7f..a24d7b98ada 100644 --- a/test/hotspot/jtreg/compiler/patches/java.base/java/lang/Helper.java +++ b/test/hotspot/jtreg/compiler/patches/java.base/java/lang/Helper.java @@ -44,6 +44,11 @@ public class Helper { return dst; } + @jdk.internal.vm.annotation.ForceInline + public static int compress(byte[] src, int srcOff, byte[] dst, int dstOff, int len) { + return StringUTF16.compress(src, srcOff, dst, dstOff, len); + } + @jdk.internal.vm.annotation.ForceInline public static byte[] compressChar(char[] src, int srcOff, int dstSize, int dstOff, int len) { byte[] dst = new byte[dstSize]; @@ -51,6 +56,11 @@ public class Helper { return dst; } + @jdk.internal.vm.annotation.ForceInline + public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { + return StringUTF16.compress(src, srcOff, dst, dstOff, len); + } + @jdk.internal.vm.annotation.ForceInline public static byte[] inflateByte(byte[] src, int srcOff, int dstSize, int dstOff, int len) { byte[] dst = new byte[dstSize]; diff --git a/test/jdk/java/lang/String/Chars.java b/test/jdk/java/lang/String/Chars.java index ab6771b8e0b..20cde9d003d 100644 --- a/test/jdk/java/lang/String/Chars.java +++ b/test/jdk/java/lang/String/Chars.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,10 +22,10 @@ */ /* - @test - @bug 8054307 - @summary test chars() and codePoints() -*/ + * @test + * @bug 8054307 8311906 + * @summary test String chars() and codePoints() + */ import java.util.Arrays; import java.util.Random; @@ -44,6 +44,7 @@ public class Chars { cc[j] = (char)(ccExp[j] = cpExp[j] = r.nextInt(0x80)); } testChars(cc, ccExp); + testCharsSubrange(cc, ccExp); testCPs(cc, cpExp); // bmp without surrogates @@ -51,6 +52,7 @@ public class Chars { cc[j] = (char)(ccExp[j] = cpExp[j] = r.nextInt(0x8000)); } testChars(cc, ccExp); + testCharsSubrange(cc, ccExp); testCPs(cc, cpExp); // bmp with surrogates @@ -69,6 +71,7 @@ public class Chars { } cpExp = Arrays.copyOf(cpExp, k); testChars(cc, ccExp); + testCharsSubrange(cc, ccExp); testCPs(cc, cpExp); } } @@ -76,14 +79,35 @@ public class Chars { static void testChars(char[] cc, int[] expected) { String str = new String(cc); if (!Arrays.equals(expected, str.chars().toArray())) { - throw new RuntimeException("chars/codePoints() failed!"); + throw new RuntimeException("testChars failed!"); + } + } + + static void testCharsSubrange(char[] cc, int[] expected) { + int[] offsets = { 7, 31 }; // offsets to test + int LENGTH = 13; + for (int i = 0; i < offsets.length; i++) { + int offset = Math.max(0, offsets[i]); // confine to the input array + int count = Math.min(LENGTH, cc.length - offset); + String str = new String(cc, offset, count); + int[] actual = str.chars().toArray(); + int errOffset = Arrays.mismatch(actual, 0, actual.length, + expected, offset, offset + count); + if (errOffset >= 0) { + System.err.printf("expected[%d] (%d) != actual[%d] (%d)%n", + offset + errOffset, expected[offset + errOffset], + errOffset, actual[errOffset]); + System.err.println("expected: " + Arrays.toString(expected)); + System.err.println("actual: " + Arrays.toString(actual)); + throw new 
RuntimeException("testCharsSubrange failed!"); + } } } static void testCPs(char[] cc, int[] expected) { String str = new String(cc); if (!Arrays.equals(expected, str.codePoints().toArray())) { - throw new RuntimeException("chars/codePoints() failed!"); + throw new RuntimeException("testCPs failed!"); } } } diff --git a/test/jdk/java/lang/String/StringRacyConstructor.java b/test/jdk/java/lang/String/StringRacyConstructor.java new file mode 100644 index 00000000000..bfec99da75e --- /dev/null +++ b/test/jdk/java/lang/String/StringRacyConstructor.java @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package test.java.lang.String; + +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.Arrays; +import java.util.ConcurrentModificationException; +import java.util.List; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIf; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +/* + * @test + * @bug 8311906 + * @modules java.base/java.lang:open + * @summary check String's racy constructors + * @run junit/othervm -XX:+CompactStrings test.java.lang.String.StringRacyConstructor + * @run junit/othervm -XX:-CompactStrings test.java.lang.String.StringRacyConstructor + */ + +public class StringRacyConstructor { + private static final byte LATIN1 = 0; + private static final byte UTF16 = 1; + + private static final Field STRING_CODER_FIELD; + private static final Field SB_CODER_FIELD; + private static final boolean COMPACT_STRINGS; + + static { + try { + STRING_CODER_FIELD = String.class.getDeclaredField("coder"); + STRING_CODER_FIELD.setAccessible(true); + SB_CODER_FIELD = Class.forName("java.lang.AbstractStringBuilder").getDeclaredField("coder"); + SB_CODER_FIELD.setAccessible(true); + COMPACT_STRINGS = isCompactStrings(); + } catch (NoSuchFieldException ex ) { + throw new ExceptionInInitializerError(ex); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + + /* {@return true iff CompactStrings are enabled} + */ + public static boolean isCompactStrings() { + try { + Field compactStringField = String.class.getDeclaredField("COMPACT_STRINGS"); + compactStringField.setAccessible(true); + return compactStringField.getBoolean(null); + } catch (NoSuchFieldException ex) { + throw new 
ExceptionInInitializerError(ex); + } catch (IllegalAccessException iae) { + throw new AssertionError(iae); + } + } + + // Return the coder for the String + private static int coder(String s) { + try { + return STRING_CODER_FIELD.getByte(s); + } catch (IllegalAccessException iae) { + throw new AssertionError(iae); + } + } + + // Return the coder for the StringBuilder + private static int sbCoder(StringBuilder sb) { + try { + return SB_CODER_FIELD.getByte(sb); + } catch (IllegalAccessException iae) { + throw new AssertionError(iae); + } + } + + // Return a summary of the internals of the String + // The coder and indicate if the coder matches the string contents + private static String inspectString(String s) { + try { + char[] chars = s.toCharArray(); + String r = new String(chars); + + boolean invalidCoder = coder(s) != coder(r); + String coder = STRING_CODER_FIELD.getByte(s) == 0 ? "isLatin1" : "utf16"; + return (invalidCoder ? "INVALID CODER" : "" ) + " \"" + s + "\", coder: " + coder; + } catch (IllegalAccessException ex ) { + return "EXCEPTION: " + ex.getMessage(); + } + } + + /** + * {@return true if the coder matches the presence/lack of UTF16 characters} + * If it returns false, the coder and the contents have failed the precondition for string. + * @param orig a string + */ + private static boolean validCoder(String orig) { + if (!COMPACT_STRINGS) { + assertEquals(UTF16, coder(orig), "Non-COMPACT STRINGS coder must be UTF16"); + } + int accum = 0; + for (int i = 0; i < orig.length(); i++) + accum |= orig.charAt(i); + byte expectedCoder = (accum < 256) ? LATIN1 : UTF16; + return expectedCoder == coder(orig); + } + + // Check a StringBuilder for consistency of coder and latin1 vs UTF16 + private static boolean validCoder(StringBuilder orig) { + int accum = 0; + for (int i = 0; i < orig.length(); i++) + accum |= orig.charAt(i); + byte expectedCoder = (accum < 256) ? 
LATIN1 : UTF16; + return expectedCoder == sbCoder(orig); + } + + @Test + @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings") + public void checkStringRange() { + char[] chars = {'a', 'b', 'c', 0xff21, 0xff22, 0xff23}; + String orig = new String(chars); + char[] xx = orig.toCharArray(); + String stringFromChars = new String(xx); + assertEquals(orig, stringFromChars, "mixed chars"); + assertTrue(validCoder(stringFromChars), "invalid coder" + + ", invalid coder: " + inspectString(stringFromChars)); + } + + private static List strings() { + return List.of("01234", " "); + } + + @ParameterizedTest + @MethodSource("strings") + @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings") + public void racyString(String orig) { + String racyString = racyStringConstruction(orig); + // The contents are indeterminate due to the race + assertTrue(validCoder(racyString), orig + " string invalid" + + ", racyString: " + inspectString(racyString)); + } + + @ParameterizedTest + @MethodSource("strings") + @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings") + public void racyCodePoint(String orig) { + String iffyString = racyStringConstructionCodepoints(orig); + // The contents are indeterminate due to the race + assertTrue(validCoder(iffyString), "invalid coder in non-deterministic string" + + ", orig:" + inspectString(orig) + + ", iffyString: " + inspectString(iffyString)); + } + + @ParameterizedTest + @MethodSource("strings") + @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings") + public void racyCodePointSurrogates(String orig) { + String iffyString = racyStringConstructionCodepointsSurrogates(orig); + // The contents are indeterminate due to the race + if (!orig.equals(iffyString)) + System.err.println("orig: " + orig + ", iffy: " + iffyString + Arrays.toString(iffyString.codePoints().toArray())); + assertTrue(validCoder(iffyString), "invalid coder in non-deterministic string" + + ", orig:" + 
inspectString(orig) + + ", iffyString: " + inspectString(iffyString)); + } + + // Test the private methods of StringUTF16 that compress and copy COMPRESSED_STRING + // encoded byte arrays. + @Test + public void verifyUTF16CopyBytes() + throws ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException { + Class stringUTF16 = Class.forName("java.lang.StringUTF16"); + Method mCompressChars = stringUTF16.getDeclaredMethod("compress", + char[].class, int.class, byte[].class, int.class, int.class); + mCompressChars.setAccessible(true); + + // First warmup the intrinsic and check 1 case + char[] chars = {'a', 'b', 'c', 0xff21, 0xff22, 0xff23}; + byte[] bytes = new byte[chars.length]; + int printWarningCount = 0; + + for (int i = 0; i < 1_000_000; i++) { // repeat to get C2 to kick in + // Copy only latin1 chars from UTF-16 converted prefix (3 chars -> 3 bytes) + int intResult = (int) mCompressChars.invoke(null, chars, 0, bytes, 0, chars.length); + if (intResult == 0) { + if (printWarningCount == 0) { + printWarningCount = 1; + System.err.println("Intrinsic for StringUTF16.compress returned 0, may not have been updated."); + } + } else { + assertEquals(3, intResult, "return length not-equal, iteration: " + i); + } + } + + // Exhaustively check compress returning the correct index of the non-latin1 char. 
+ final int SIZE = 48; + final byte FILL_BYTE = 'R'; + chars = new char[SIZE]; + bytes = new byte[chars.length]; + for (int i = 0; i < SIZE; i++) { // Every starting index + for (int j = i; j < SIZE; j++) { // Every location of non-latin1 + Arrays.fill(chars, 'A'); + Arrays.fill(bytes, FILL_BYTE); + chars[j] = 0xFF21; + int intResult = (int) mCompressChars.invoke(null, chars, i, bytes, 0, chars.length - i); + assertEquals(j - i, intResult, "compress found wrong index"); + assertEquals(FILL_BYTE, bytes[j], "extra character stored"); + } + } + + } + + // Check that a concatenated "hello" has a valid coder + @Test + @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings") + public void checkConcatAndIntern() { + var helloWorld = "hello world"; + String helloToo = racyStringConstruction("hell".concat("o")); + String o = helloToo.intern(); + var hello = "hello"; + assertTrue(validCoder(helloToo), "startsWith: " + + ", hell: " + inspectString(helloToo) + + ", o: " + inspectString(o) + + ", hello: " + inspectString(hello) + + ", hello world: " + inspectString(helloWorld)); + } + + // Check that an empty string with racy construction has a valid coder + @Test + @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings") + public void racyEmptyString() { + var space = racyStringConstruction(" "); + var trimmed = space.trim(); + assertTrue(validCoder(trimmed), "empty string invalid coder" + + ", trimmed: " + inspectString(trimmed)); + } + + // Check that an exception in a user implemented CharSequence doesn't result in + // an invalid coder when appended to a StringBuilder + @Test + @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings") + void charSequenceException() { + ThrowingCharSequence bs = new ThrowingCharSequence("A\u2030\uFFFD"); + var sb = new StringBuilder(); + try { + sb.append(bs); + fail("An IllegalArgumentException should have been thrown"); + } catch (IllegalArgumentException ex) { + // ignore expected + } 
+ assertTrue(validCoder(sb), "invalid coder in StringBuilder"); + } + + /** + * Given a latin-1 String, attempt to create a copy that is + * incorrectly encoded as UTF-16. + */ + public static String racyStringConstruction(String original) throws ConcurrentModificationException { + if (original.chars().max().getAsInt() >= 256) { + throw new IllegalArgumentException( + "Only work with latin-1 Strings"); + } + + char[] chars = original.toCharArray(); + + // In another thread, flip the first character back + // and forth between being latin-1 or not + Thread thread = new Thread(() -> { + while (!Thread.interrupted()) { + chars[0] ^= 256; + } + }); + thread.start(); + + // at the same time call the String constructor, + // until we hit the race condition + int i = 0; + while (true) { + i++; + String s = new String(chars); + if ((s.charAt(0) < 256 && !original.equals(s)) || i > 1_000_000) { + thread.interrupt(); + try { + thread.join(); + } catch (InterruptedException ie) { + // ignore interrupt + } + return s; + } + } + } + + /** + * Given a latin-1 String, creates a copy that is + * incorrectly encoded as UTF-16 using the APIs for Codepoints. 
+ */ + public static String racyStringConstructionCodepoints(String original) throws ConcurrentModificationException { + if (original.chars().max().getAsInt() >= 256) { + throw new IllegalArgumentException( + "Can only work with latin-1 Strings"); + } + + int len = original.length(); + int[] codePoints = new int[len]; + for (int i = 0; i < len; i++) { + codePoints[i] = original.charAt(i); + } + + // In another thread, flip the first character back + // and forth between being latin-1 or not + Thread thread = new Thread(() -> { + while (!Thread.interrupted()) { + codePoints[0] ^= 256; + } + }); + thread.start(); + + // at the same time call the String constructor, + // until we hit the race condition + int i = 0; + while (true) { + i++; + String s = new String(codePoints, 0, len); + if ((s.charAt(0) < 256 && !original.equals(s)) || i > 1_000_000) { + thread.interrupt(); + try { + thread.join(); + } catch (InterruptedException ie) { + // ignore interrupt + } + return s; + } + } + } + + /** + * Returns a string created from a codepoint array that has been racily + * modified to contain high and low surrogates. The string is a different length + * than the original due to the surrogate encoding. + */ + public static String racyStringConstructionCodepointsSurrogates(String original) throws ConcurrentModificationException { + if (original.chars().max().getAsInt() >= 256) { + throw new IllegalArgumentException( + "Can only work with latin-1 Strings"); + } + + int len = original.length(); + int[] codePoints = new int[len]; + for (int i = 0; i < len; i++) { + codePoints[i] = original.charAt(i); + } + + // In another thread, flip the first character back + // and forth between being latin-1 or as a surrogate pair. 
+ Thread thread = new Thread(() -> { + while (!Thread.interrupted()) { + codePoints[0] ^= 0x10000; + } + }); + thread.start(); + + // at the same time call the String constructor, + // until we hit the race condition + int i = 0; + while (true) { + i++; + String s = new String(codePoints, 0, len); + if ((s.length() != original.length()) || i > 1_000_000) { + thread.interrupt(); + try { + thread.join(); + } catch (InterruptedException ie) { + // ignore interrupt + } + return s; + } + } + } + + // A CharSequence that returns characters from a string and throws IllegalArgumentException + // when the character requested is 0xFFFD (the replacement character) + // The string contents determine when the exception is thrown. + static class ThrowingCharSequence implements CharSequence { + private final String aString; + + ThrowingCharSequence(String aString) { + this.aString = aString; + } + + @Override + public int length() { + return aString.length(); + } + + @Override + public char charAt(int index) { + char ch = aString.charAt(index); + if (ch == 0xFFFD) { + throw new IllegalArgumentException("Replacement character at index " + index); + } + return ch; + } + + @Override + // Not used; returns the entire string + public CharSequence subSequence(int start, int end) { + return this; + } + } +} diff --git a/test/micro/org/openjdk/bench/java/lang/StringConstructor.java b/test/micro/org/openjdk/bench/java/lang/StringConstructor.java index 1509d6b798f..e9ed0022eda 100644 --- a/test/micro/org/openjdk/bench/java/lang/StringConstructor.java +++ b/test/micro/org/openjdk/bench/java/lang/StringConstructor.java @@ -21,11 +21,13 @@ * questions. 
*/ -package micro.org.openjdk.bench.java.lang; +package org.openjdk.bench.java.lang; import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; import java.nio.charset.StandardCharsets; +import java.util.Arrays; import java.util.concurrent.TimeUnit; @State(Scope.Thread) @@ -36,45 +38,115 @@ import java.util.concurrent.TimeUnit; @Fork(3) public class StringConstructor { - @Param({"7", "64"}) - public int size; + private static final char INTEROBANG = 0x2030; - // Offset to use for ranged newStrings - @Param("1") - public int offset; - private byte[] array; + // Fixed offset to use for ranged newStrings + public final int offset = 1; - @Setup - public void setup() { - if (offset > size) { - offset = size; - } - array = "a".repeat(size).getBytes(StandardCharsets.UTF_8); - } + @Param({"7", "64"}) + public int size; - @Benchmark - public String newStringFromArray() { - return new String(array); - } + private byte[] array; + private char[] chars; + private char[] charsMixedBegin; + private char[] charsMixedSmall; + private char[] charsMixedEnd; + private int[] codePointsLatin1; + private int[] codePointsMixedBegin; + private int[] codePointsMixedSmall; - @Benchmark - public String newStringFromArrayWithCharset() { - return new String(array, StandardCharsets.UTF_8); - } + private static int[] intCopyOfChars(char[] chars, int newLength) { + int[] res = new int[newLength]; + for (int i = 0; i < Math.min(chars.length, newLength); i++) + res[i] = chars[i]; + return res; + } - @Benchmark - public String newStringFromArrayWithCharsetName() throws Exception { - return new String(array, StandardCharsets.UTF_8.name()); - } + @Setup + public void setup() { + String s = "a".repeat(size); + array = s.getBytes(StandardCharsets.UTF_8); + chars = s.toCharArray(); + charsMixedBegin = Arrays.copyOf(chars, array.length); + charsMixedBegin[0] = INTEROBANG; + charsMixedSmall = Arrays.copyOf(chars, array.length); + charsMixedSmall[Math.min(charsMixedSmall.length - 1, 7)] = 
INTEROBANG; + charsMixedEnd = new char[size + 7]; + Arrays.fill(charsMixedEnd, 'a'); + charsMixedEnd[charsMixedEnd.length - 1] = INTEROBANG; - @Benchmark - public String newStringFromRangedArray() { - return new String(array, offset, array.length - offset); - } + codePointsLatin1 = intCopyOfChars(chars, array.length); + codePointsMixedBegin = intCopyOfChars(chars, array.length); + codePointsMixedBegin[0] = INTEROBANG; + codePointsMixedSmall = intCopyOfChars(chars, array.length); + codePointsMixedSmall[Math.min(codePointsMixedSmall.length - 1, 7)] = INTEROBANG; + } - @Benchmark - public String newStringFromRangedArrayWithCharset() { - return new String(array, offset, array.length - offset, StandardCharsets.UTF_8); - } + @Benchmark + public String newStringFromBytes() { + return new String(array); + } + @Benchmark + public String newStringFromBytesRanged() { + return new String(array, offset, array.length - offset); + } + + @Benchmark + public String newStringFromBytesRangedWithCharsetUTF8() { + return new String(array, offset, array.length - offset, StandardCharsets.UTF_8); + } + + @Benchmark + public String newStringFromBytesWithCharsetUTF8() { + return new String(array, StandardCharsets.UTF_8); + } + + @Benchmark + public String newStringFromBytesWithCharsetNameUTF8() throws Exception { + return new String(array, StandardCharsets.UTF_8.name()); + } + + @Benchmark + public String newStringFromCharsLatin1() { + return new String(chars); + } + + @Benchmark + public String newStringFromCharsMixedBegin() { + return new String(charsMixedBegin); + } + + @Benchmark + public String newStringFromCharsMixedSmall() { + return new String(charsMixedSmall); + } + + @Benchmark + public String newStringFromCharsMixedEnd() { + return new String(charsMixedEnd); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void newStringFromCharsMixedAll(Blackhole bh) { + bh.consume(new String(charsMixedBegin)); + bh.consume(new String(charsMixedSmall)); + 
bh.consume(new String(chars)); + } + + @Benchmark + public String newStringFromCodePointRangedLatin1() { + return new String(codePointsLatin1, 0, codePointsLatin1.length); + } + + @Benchmark + public String newStringFromCodePointRangedMixedBegin() { + return new String(codePointsMixedBegin, 0, codePointsMixedBegin.length); + } + + @Benchmark + public String newStringFromCodePointRangedMixedSmall() { + return new String(codePointsMixedSmall, 0, codePointsMixedSmall.length); + } }