8222955: Optimize String.replace(CharSequence, CharSequence) for common cases

Reviewed-by: redestad, tvaleev
This commit is contained in:
Ivan Gerasimov 2019-05-06 18:07:55 -07:00
parent 0c6f7e5516
commit 4f02d011b0
5 changed files with 408 additions and 34 deletions
src/java.base/share/classes/java/lang
test
jdk/java/lang/String
micro/org/openjdk/bench/java/lang

@ -75,7 +75,7 @@ import static java.util.function.Predicate.not;
* System.out.println("abc");
* String cde = "cde";
* System.out.println("abc" + cde);
* String c = "abc".substring(2,3);
* String c = "abc".substring(2, 3);
* String d = cde.substring(1, 2);
* </pre></blockquote>
* <p>
@ -2160,27 +2160,48 @@ public final class String
* @since 1.5
*/
public String replace(CharSequence target, CharSequence replacement) {
String tgtStr = target.toString();
String trgtStr = target.toString();
String replStr = replacement.toString();
int j = indexOf(tgtStr);
if (j < 0) {
return this;
}
int tgtLen = tgtStr.length();
int tgtLen1 = Math.max(tgtLen, 1);
int thisLen = length();
int trgtLen = trgtStr.length();
int replLen = replStr.length();
int newLenHint = thisLen - tgtLen + replStr.length();
if (newLenHint < 0) {
throw new OutOfMemoryError();
if (trgtLen > 0) {
if (trgtLen == 1 && replLen == 1) {
return replace(trgtStr.charAt(0), replStr.charAt(0));
}
boolean thisIsLatin1 = this.isLatin1();
boolean trgtIsLatin1 = trgtStr.isLatin1();
boolean replIsLatin1 = replStr.isLatin1();
String ret = (thisIsLatin1 && trgtIsLatin1 && replIsLatin1)
? StringLatin1.replace(value, thisLen,
trgtStr.value, trgtLen,
replStr.value, replLen)
: StringUTF16.replace(value, thisLen, thisIsLatin1,
trgtStr.value, trgtLen, trgtIsLatin1,
replStr.value, replLen, replIsLatin1);
if (ret != null) {
return ret;
}
return this;
} else { // trgtLen == 0
int resultLen;
try {
resultLen = Math.addExact(thisLen, Math.multiplyExact(
Math.addExact(thisLen, 1), replLen));
} catch (ArithmeticException ignored) {
throw new OutOfMemoryError();
}
StringBuilder sb = new StringBuilder(resultLen);
sb.append(replStr);
for (int i = 0; i < thisLen; ++i) {
sb.append(charAt(i)).append(replStr);
}
return sb.toString();
}
StringBuilder sb = new StringBuilder(newLenHint);
int i = 0;
do {
sb.append(this, i, j).append(replStr);
i = j + tgtLen;
} while (j < thisLen && (j = indexOf(tgtStr, j + tgtLen1)) > 0);
return sb.append(this, i, thisLen).toString();
}
/**

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -42,6 +42,14 @@ import static java.lang.String.checkOffset;
final class StringLatin1 {
/**
* The maximum size of array to allocate (unless necessary).
* Some VMs reserve some header words in an array.
* Attempts to allocate larger arrays may result in
* OutOfMemoryError: Requested array size exceeds VM limit
*/
private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
public static char charAt(byte[] value, int index) {
if (index < 0 || index >= value.length) {
throw new StringIndexOutOfBoundsException(index);
@ -304,7 +312,7 @@ final class StringLatin1 {
}
if (i < len) {
if (canEncode(newChar)) {
byte buf[] = new byte[len];
byte[] buf = StringConcatHelper.newArray(len);
for (int j = 0; j < i; j++) { // TBD arraycopy?
buf[j] = value[j];
}
@ -330,6 +338,64 @@ final class StringLatin1 {
return null; // for string to return this;
}
public static String replace(byte[] value, int valLen, byte[] targ,
int targLen, byte[] repl, int replLen)
{
assert targLen > 0;
int i, j, p = 0;
if (valLen == 0 || (i = indexOf(value, valLen, targ, targLen, 0)) < 0) {
return null; // for string to return this;
}
// find and store indices of substrings to replace
int[] pos = new int[16];
pos[0] = i;
i += targLen;
while ((j = indexOf(value, valLen, targ, targLen, i)) > 0) {
if (++p == pos.length) {
int cap = p + (p >> 1);
// overflow-conscious code
if (cap - MAX_ARRAY_SIZE > 0) {
if (p == MAX_ARRAY_SIZE) {
throw new OutOfMemoryError();
}
cap = MAX_ARRAY_SIZE;
}
pos = Arrays.copyOf(pos, cap);
}
pos[p] = j;
i = j + targLen;
}
int resultLen;
try {
resultLen = Math.addExact(valLen,
Math.multiplyExact(++p, replLen - targLen));
} catch (ArithmeticException ignored) {
throw new OutOfMemoryError();
}
if (resultLen == 0) {
return "";
}
byte[] result = StringConcatHelper.newArray(resultLen);
int posFrom = 0, posTo = 0;
for (int q = 0; q < p; ++q) {
int nextPos = pos[q];
while (posFrom < nextPos) {
result[posTo++] = value[posFrom++];
}
posFrom += targLen;
for (int k = 0; k < replLen; ++k) {
result[posTo++] = repl[k];
}
}
while (posFrom < valLen) {
result[posTo++] = value[posFrom++];
}
return new String(result, LATIN1);
}
// case insensitive
public static boolean regionMatchesCI(byte[] value, int toffset,
byte[] other, int ooffset, int len) {

@ -574,7 +574,7 @@ final class StringUTF16 {
}
}
if (i < len) {
byte buf[] = new byte[value.length];
byte[] buf = new byte[value.length];
for (int j = 0; j < i; j++) {
putChar(buf, j, getChar(value, j)); // TBD:arraycopy?
}
@ -582,21 +582,145 @@ final class StringUTF16 {
char c = getChar(value, i);
putChar(buf, i, c == oldChar ? newChar : c);
i++;
}
// Check if we should try to compress to latin1
if (String.COMPACT_STRINGS &&
!StringLatin1.canEncode(oldChar) &&
StringLatin1.canEncode(newChar)) {
byte[] val = compress(buf, 0, len);
if (val != null) {
return new String(val, LATIN1);
}
}
return new String(buf, UTF16);
}
// Check if we should try to compress to latin1
if (String.COMPACT_STRINGS &&
!StringLatin1.canEncode(oldChar) &&
StringLatin1.canEncode(newChar)) {
byte[] val = compress(buf, 0, len);
if (val != null) {
return new String(val, LATIN1);
}
}
return new String(buf, UTF16);
}
return null;
}
public static String replace(byte[] value, int valLen, boolean valLat1,
byte[] targ, int targLen, boolean targLat1,
byte[] repl, int replLen, boolean replLat1)
{
assert targLen > 0;
assert !valLat1 || !targLat1 || !replLat1;
// Possible combinations of the arguments/result encodings:
// +---+--------+--------+--------+-----------------------+
// | # | VALUE | TARGET | REPL | RESULT |
// +===+========+========+========+=======================+
// | 1 | Latin1 | Latin1 | UTF16 | null or UTF16 |
// +---+--------+--------+--------+-----------------------+
// | 2 | Latin1 | UTF16 | Latin1 | null |
// +---+--------+--------+--------+-----------------------+
// | 3 | Latin1 | UTF16 | UTF16 | null |
// +---+--------+--------+--------+-----------------------+
// | 4 | UTF16 | Latin1 | Latin1 | null or UTF16 |
// +---+--------+--------+--------+-----------------------+
// | 5 | UTF16 | Latin1 | UTF16 | null or UTF16 |
// +---+--------+--------+--------+-----------------------+
// | 6 | UTF16 | UTF16 | Latin1 | null, Latin1 or UTF16 |
// +---+--------+--------+--------+-----------------------+
// | 7 | UTF16 | UTF16 | UTF16 | null or UTF16 |
// +---+--------+--------+--------+-----------------------+
if (String.COMPACT_STRINGS && valLat1 && !targLat1) {
// combinations 2 or 3
return null; // for string to return this;
}
int i = (String.COMPACT_STRINGS && valLat1)
? StringLatin1.indexOf(value, targ) :
(String.COMPACT_STRINGS && targLat1)
? indexOfLatin1(value, targ)
: indexOf(value, targ);
if (i < 0) {
return null; // for string to return this;
}
// find and store indices of substrings to replace
int j, p = 0;
int[] pos = new int[16];
pos[0] = i;
i += targLen;
while ((j = ((String.COMPACT_STRINGS && valLat1)
? StringLatin1.indexOf(value, valLen, targ, targLen, i) :
(String.COMPACT_STRINGS && targLat1)
? indexOfLatin1(value, valLen, targ, targLen, i)
: indexOf(value, valLen, targ, targLen, i))) > 0)
{
if (++p == pos.length) {
int cap = p + (p >> 1);
// overflow-conscious code
if (cap - MAX_ARRAY_SIZE > 0) {
if (p == MAX_ARRAY_SIZE) {
throw new OutOfMemoryError();
}
cap = MAX_ARRAY_SIZE;
}
pos = Arrays.copyOf(pos, cap);
}
pos[p] = j;
i = j + targLen;
}
int resultLen;
try {
resultLen = Math.addExact(valLen,
Math.multiplyExact(++p, replLen - targLen));
} catch (ArithmeticException ignored) {
throw new OutOfMemoryError();
}
if (resultLen == 0) {
return "";
}
byte[] result = newBytesFor(resultLen);
int posFrom = 0, posTo = 0;
for (int q = 0; q < p; ++q) {
int nextPos = pos[q];
if (String.COMPACT_STRINGS && valLat1) {
while (posFrom < nextPos) {
char c = (char)(value[posFrom++] & 0xff);
putChar(result, posTo++, c);
}
} else {
while (posFrom < nextPos) {
putChar(result, posTo++, getChar(value, posFrom++));
}
}
posFrom += targLen;
if (String.COMPACT_STRINGS && replLat1) {
for (int k = 0; k < replLen; ++k) {
char c = (char)(repl[k] & 0xff);
putChar(result, posTo++, c);
}
} else {
for (int k = 0; k < replLen; ++k) {
putChar(result, posTo++, getChar(repl, k));
}
}
}
if (String.COMPACT_STRINGS && valLat1) {
while (posFrom < valLen) {
char c = (char)(value[posFrom++] & 0xff);
putChar(result, posTo++, c);
}
} else {
while (posFrom < valLen) {
putChar(result, posTo++, getChar(value, posFrom++));
}
}
if (String.COMPACT_STRINGS && replLat1 && !targLat1) {
// combination 6
byte[] lat1Result = compress(result, 0, resultLen);
if (lat1Result != null) {
return new String(lat1Result, LATIN1);
}
}
return new String(result, UTF16);
}
public static boolean regionMatchesCI(byte[] value, int toffset,
byte[] other, int ooffset, int len) {
int last = toffset + len;
@ -1430,6 +1554,15 @@ final class StringUTF16 {
static final int MAX_LENGTH = Integer.MAX_VALUE >> 1;
/**
* The maximum size of array to allocate (unless necessary).
* Some VMs reserve some header words in an array.
* Attempts to allocate larger arrays may result in
* OutOfMemoryError: Requested array size exceeds VM limit
*/
private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
// Used by trusted callers. Assumes all necessary bounds checks have
// been done by the caller.

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -22,7 +22,7 @@
*/
/* @test
* @bug 8058779 8054307
* @bug 8058779 8054307 8222955
* @library /test/lib
* @build jdk.test.lib.RandomFactory
* @run testng LiteralReplace
@ -79,6 +79,10 @@ public class LiteralReplace {
source.replace(target, replacement);
}
@Test(expectedExceptions = {OutOfMemoryError.class})
public void testOOM() {
"1".repeat(65537).replace("1", "2".repeat(65537));
}
@DataProvider
public static Object[][] sourceTargetReplacementExpected() {
@ -104,6 +108,14 @@ public class LiteralReplace {
{"abcdefgh", "[a-h]", "X", "abcdefgh"},
{"aa+", "a+", "", "a"},
{"^abc$", "abc", "x", "^x$"},
{"abc", "b", "_", "a_c"},
{"abc", "bc", "_", "a_"},
{"abc".repeat(65537) + "end", "b", "_XYZ_", "a_XYZ_c".repeat(65537) + "end"},
{"abc".repeat(65537) + "end", "a", "_", "_bc".repeat(65537) + "end"},
{"a".repeat(65537), "a", "", ""},
{"ab".repeat(65537), "a", "", "b".repeat(65537)},
{"ab".repeat(65537), "ab", "", ""},
{"b" + "ab".repeat(65537), "ab", "", "b"},
// more with non-latin1 characters
{"\u4e00\u4e00\u4e00",
@ -207,6 +219,42 @@ public class LiteralReplace {
"\u4e00\u4e01\u4e02\u4e03\u4e04\u4e05",
"",
""},
{"\u4e01\u4e02\u4e03", "\u4e02", "\u4e02", "\u4e01\u4e02\u4e03"},
{"\u4e01\u4e02\u4e03", "\u4e02", "\u4e04", "\u4e01\u4e04\u4e03"},
{"\u4e01\u4e02\u4e03", "\u4e02", "_", "\u4e01_\u4e03"},
{"a\u4e02c", "\u4e02", "_", "a_c"},
{"\u4e01@\u4e03", "@", "_", "\u4e01_\u4e03"},
{"\u4e01@\u4e03", "@", "\u4e02", "\u4e01\u4e02\u4e03"},
{"\u4e01\u4e02\u4e03", "\u4e02\u4e03", "\u4e02\u4e03", "\u4e01\u4e02\u4e03"},
{"\u4e01\u4e02\u4e03", "\u4e02\u4e03", "\u4e04\u4e05", "\u4e01\u4e04\u4e05"},
{"\u4e01\u4e02\u4e03", "\u4e02\u4e03", "\u4e06", "\u4e01\u4e06"},
{"\u4e01\u4e02\u4e03", "\u4e02\u4e03", "<>", "\u4e01<>"},
{"@\u4e02\u4e03", "\u4e02\u4e03", "<>", "@<>"},
{"\u4e01@@", "\u4e01@", "", "@"},
{"\u4e01@@", "\u4e01@", "#", "#@"},
{"\u4e01@@", "\u4e01@", "\u4e09", "\u4e09@"},
{"\u4e01@@", "\u4e01@", "#\u4e09", "#\u4e09@"},
{"\u4e01\u4e02\u4e03".repeat(32771) + "end", "\u4e02", "\u4e02", "\u4e01\u4e02\u4e03".repeat(32771) + "end"},
{"\u4e01\u4e02\u4e03".repeat(32771) + "end", "\u4e02", "\u4e04", "\u4e01\u4e04\u4e03".repeat(32771) + "end"},
{"\u4e01\u4e02\u4e03".repeat(32771) + "end", "\u4e02", "\u4e04\u4e05", "\u4e01\u4e04\u4e05\u4e03".repeat(32771) + "end"},
{"\u4e01\u4e02\u4e03".repeat(32771) + "end", "\u4e02", "_", "\u4e01_\u4e03".repeat(32771) + "end"},
{"\u4e01_\u4e03".repeat(32771) + "end", "_", "_", "\u4e01_\u4e03".repeat(32771) + "end"},
{"\u4e01_\u4e03".repeat(32771) + "end", "_", "\u4e06", "\u4e01\u4e06\u4e03".repeat(32771) + "end"},
{"\u4e01_\u4e03".repeat(32771) + "end", "_", "\u4e06\u4e06", "\u4e01\u4e06\u4e06\u4e03".repeat(32771) + "end"},
{"X_Y".repeat(32771) + "end", "_", "\u4e07", "X\u4e07Y".repeat(32771) + "end"},
{"X_Y".repeat(32771) + "end", "_", "\u4e07\u4e08", "X\u4e07\u4e08Y".repeat(32771) + "end"},
{"X_Y".repeat(32771) + "end", "_", ".\u4e08.", "X.\u4e08.Y".repeat(32771) + "end"},
{"\u4e0a".repeat(32771), "\u4e0a", "", ""},
{"\u4e0a\u4e0b".repeat(32771), "\u4e0a", "", "\u4e0b".repeat(32771)},
{"\u4e0a\u4e0b".repeat(32771), "\u4e0b", "", "\u4e0a".repeat(32771)},
{"\u4e0a\u4e0b".repeat(32771), "\u4e0a\u4e0b", "", ""},
{"\u4e0b" + "\u4e0a\u4e0b".repeat(32771), "\u4e0a\u4e0b", "", "\u4e0b"},
{"\u4e0a\u4e0b".repeat(32771) + "\u4e0a", "\u4e0a\u4e0b", "", "\u4e0a"},
{"\u4e0b" + "\u4e0a\u4e0b".repeat(32771) + "\u4e0a", "\u4e0a\u4e0b", "", "\u4e0b\u4e0a"},
{"b" + "\u4e0a\u4e0b".repeat(32771), "\u4e0a\u4e0b", "", "b"},
{"\u4e0a\u4e0b".repeat(32771) + "a", "\u4e0a\u4e0b", "", "a"},
{"b" + "\u4e0a\u4e0b".repeat(32771) + "a", "\u4e0a\u4e0b", "", "ba"},
};
}

@ -0,0 +1,106 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.lang;
import org.openjdk.jmh.annotations.*;
import java.util.concurrent.TimeUnit;
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
public class StringReplace {
public String slat1 = new String("java.lang.String");
public String sutf16 = new String("\u1D36\u1D43\u1D5B\u1D43 \uFF11\uFF13");
@Benchmark
public String replace0x1_1_Latin1() {
return slat1.replace("Z", "z");
}
@Benchmark
public String replace1x1_0_Latin1() {
return slat1.replace("g", "");
}
@Benchmark
public String replace1x1_1_Latin1() {
return slat1.replace("g", "k");
}
@Benchmark
public String replace1x1_2_Latin1() {
return slat1.replace("g", "th");
}
@Benchmark
public String replace2x1_0_Latin1() {
return slat1.replace(".", "");
}
@Benchmark
public String replace2x1_1_Latin1() {
return slat1.replace(".", "/");
}
@Benchmark
public String replace2x1_2_Latin1() {
return slat1.replace(".", "::");
}
@Benchmark
public String replace0x1_1_UTF16() {
return sutf16.replace("\u1D36", "\u1D38\u1D3A");
}
@Benchmark
public String replace1x1_0_UTF16() {
return sutf16.replace("\uFF11", "");
}
@Benchmark
public String replace1x1_1_UTF16() {
return sutf16.replace("\u1D36", "\u24BF");
}
@Benchmark
public String replace1x1_2_UTF16() {
return sutf16.replace("\uFF11", "\uFF11\uFF12");
}
@Benchmark
public String replace2x1_0_UTF16() {
return sutf16.replace("\u1D43", "");
}
@Benchmark
public String replace2x1_1_UTF16() {
return sutf16.replace("\u1D43", "\u1D2C");
}
@Benchmark
public String replace2x1_2_UTF16() {
return sutf16.replace("\u1D43", "\u21DB\u21DB");
}
}