6937112: String.lastIndexOf confused by unpaired trailing surrogate

Rewrite lastIndexOf for performance and correctness

Reviewed-by: sherman
This commit is contained in:
Ulf Zibis 2010-06-30 16:11:32 -07:00 committed by Martin Buchholz
parent a0f3e72c24
commit 9c81e25271
4 changed files with 147 additions and 90 deletions

View File

@ -37,6 +37,7 @@ import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import sun.nio.cs.Surrogate;
/**
@ -1575,9 +1576,6 @@ public final class String
* if the character does not occur.
*/
public int indexOf(int ch, int fromIndex) {
int max = offset + count;
char v[] = value;
if (fromIndex < 0) {
fromIndex = 0;
} else if (fromIndex >= count) {
@ -1585,29 +1583,36 @@ public final class String
return -1;
}
int i = offset + fromIndex;
if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
// handle most cases here (ch is a BMP code point or a
// negative value (invalid code point))
for (; i < max ; i++) {
if (v[i] == ch) {
final char[] value = this.value;
final int offset = this.offset;
final int max = offset + count;
for (int i = offset + fromIndex; i < max ; i++) {
if (value[i] == ch) {
return i - offset;
}
}
return -1;
} else {
return indexOfSupplementary(ch, fromIndex);
}
}
if (ch <= Character.MAX_CODE_POINT) {
// handle supplementary characters here
char[] surrogates = Character.toChars(ch);
for (; i < max; i++) {
if (v[i] == surrogates[0]) {
if (i + 1 == max) {
break;
}
if (v[i+1] == surrogates[1]) {
return i - offset;
}
/**
* Handles (rare) calls of indexOf with a supplementary character.
*/
private int indexOfSupplementary(int ch, int fromIndex) {
if (Character.isValidCodePoint(ch)) {
final char[] value = this.value;
final int offset = this.offset;
final char hi = Surrogate.high(ch);
final char lo = Surrogate.low(ch);
final int max = offset + count - 1;
for (int i = offset + fromIndex; i < max; i++) {
if (value[i] == hi && value[i+1] == lo) {
return i - offset;
}
}
}
@ -1676,34 +1681,36 @@ public final class String
* if the character does not occur before that point.
*/
public int lastIndexOf(int ch, int fromIndex) {
int min = offset;
char v[] = value;
int i = offset + ((fromIndex >= count) ? count - 1 : fromIndex);
if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
// handle most cases here (ch is a BMP code point or a
// negative value (invalid code point))
for (; i >= min ; i--) {
if (v[i] == ch) {
final char[] value = this.value;
final int offset = this.offset;
int i = offset + Math.min(fromIndex, count - 1);
for (; i >= offset ; i--) {
if (value[i] == ch) {
return i - offset;
}
}
return -1;
} else {
return lastIndexOfSupplementary(ch, fromIndex);
}
}
int max = offset + count;
if (ch <= Character.MAX_CODE_POINT) {
// handle supplementary characters here
char[] surrogates = Character.toChars(ch);
for (; i >= min; i--) {
if (v[i] == surrogates[0]) {
if (i + 1 == max) {
break;
}
if (v[i+1] == surrogates[1]) {
return i - offset;
}
/**
* Handles (rare) calls of lastIndexOf with a supplementary character.
*/
private int lastIndexOfSupplementary(int ch, int fromIndex) {
if (Character.isValidCodePoint(ch)) {
final char[] value = this.value;
final int offset = this.offset;
char hi = Surrogate.high(ch);
char lo = Surrogate.low(ch);
int i = offset + Math.min(fromIndex, count - 2);
for (; i >= offset; i--) {
if (value[i] == hi && value[i+1] == lo) {
return i - offset;
}
}
}

View File

@ -62,7 +62,7 @@ public class Supplementary {
0 1 2345 678 9 012 345678 9 01 2 */
"\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0
// includes an undefined supplementary character in Unicode 4.0.0
/* 1 11 1 1111 1
0 1 2345 6 789 0 12 3 4567 8 */
"\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02",
@ -168,7 +168,7 @@ public class Supplementary {
* string in input[m].
*
* The meaning of each element in golden3[][n]
* golden3[][0]: characater which is searched.
* golden3[][0]: character which is searched.
* golden3[][2]: the golden data for indexOf(int ch)
* From golden3[][2] to golden3[][n-1]:
* the golden data for indexOf(int ch, int fromIndex)
@ -201,17 +201,17 @@ public class Supplementary {
/*
* Normal case
*/
testIndexOf(First, s, golden3[i][0], golden3[i][2]);
testIndexOf(s, golden3[i][0], golden3[i][2]);
/*
* Abnormal case - char which isn't included in the string.
*/
testIndexOf(First, s, 'Z', -1);
testIndexOf(First, s, 0xDB98, -1);
testIndexOf(First, s, 0xDE76, -1);
testIndexOf(First, s, 0x12345, -1);
testIndexOf(First, s, -1, -1);
testIndexOf(First, s, 0x110000, -1);
testIndexOf(s, 'Z', -1);
testIndexOf(s, 0xDB98, -1);
testIndexOf(s, 0xDE76, -1);
testIndexOf(s, 0x12345, -1);
testIndexOf(s, -1, -1);
testIndexOf(s, 0x110000, -1);
}
}
@ -229,7 +229,7 @@ public class Supplementary {
*/
int fromIndex = 0;
for (int j = 2; j < golden3[i].length; j++) {
fromIndex = testIndexOf(First, s, fromIndex, ch,
fromIndex = testIndexOf(s, fromIndex, ch,
golden3[i][j]) + 1;
}
@ -237,19 +237,19 @@ public class Supplementary {
* Abnormal case1 - char is included in the string but fromIndex
* is incorrect.
*/
testIndexOf(First, s, -1, ch, golden3[i][2]);
testIndexOf(First, s, s.length(), ch,
testIndexOf(s, -1, ch, golden3[i][2]);
testIndexOf(s, s.length(), ch,
golden3[i][golden3[i].length-1]);
/*
* Abnormal case2 - char which isn't included in the string.
*/
testIndexOf(First, s, 0, 'Z', -1);
testIndexOf(First, s, 0, 0xDB98, -1);
testIndexOf(First, s, 0, 0xDE76, -1);
testIndexOf(First, s, 0, 0x12345, -1);
testIndexOf(First, s, 0, -1, -1);
testIndexOf(First, s, 0, 0x110000, -1);
testIndexOf(s, 0, 'Z', -1);
testIndexOf(s, 0, 0xDB98, -1);
testIndexOf(s, 0, 0xDE76, -1);
testIndexOf(s, 0, 0x12345, -1);
testIndexOf(s, 0, -1, -1);
testIndexOf(s, 0, 0x110000, -1);
}
}
@ -264,18 +264,18 @@ public class Supplementary {
/*
* Normal case
*/
testIndexOf(Last, s, golden3[i][0],
testLastIndexOf(s, golden3[i][0],
golden3[i][golden3[i].length-2]);
/*
* Abnormal case - char which isn't included in the string.
*/
testIndexOf(Last, s, 'Z', -1);
testIndexOf(Last, s, 0xDB98, -1);
testIndexOf(Last, s, 0xDE76, -1);
testIndexOf(Last, s, 0x12345, -1);
testIndexOf(Last, s, -1, -1);
testIndexOf(Last, s, 0x110000, -1);
testLastIndexOf(s, 'Z', -1);
testLastIndexOf(s, 0xDB98, -1);
testLastIndexOf(s, 0xDE76, -1);
testLastIndexOf(s, 0x12345, -1);
testLastIndexOf(s, -1, -1);
testLastIndexOf(s, 0x110000, -1);
}
}
@ -294,7 +294,7 @@ public class Supplementary {
*/
int fromIndex = len - 1;
for (int j = golden3[i].length - 2; j > 0; j--) {
fromIndex = testIndexOf(Last, s, fromIndex, ch,
fromIndex = testLastIndexOf(s, fromIndex, ch,
golden3[i][j]) - 1;
}
@ -302,18 +302,18 @@ public class Supplementary {
* Abnormal case1 - char is included in the string but fromIndex
* is incorrect.
*/
testIndexOf(Last, s, -1, ch, golden3[i][1]);
testIndexOf(Last, s, len, ch, golden3[i][golden3[i].length-2]);
testLastIndexOf(s, -1, ch, golden3[i][1]);
testLastIndexOf(s, len, ch, golden3[i][golden3[i].length-2]);
/*
* Abnormal case2 - char which isn't included in the string.
*/
testIndexOf(Last, s, len, 'Z', -1);
testIndexOf(Last, s, len, 0xDB98, -1);
testIndexOf(Last, s, len, 0xDE76, -1);
testIndexOf(Last, s, len, 0x12345, -1);
testIndexOf(Last, s, len, -1, -1);
testIndexOf(Last, s, len, 0x110000, -1);
testLastIndexOf(s, len, 'Z', -1);
testLastIndexOf(s, len, 0xDB98, -1);
testLastIndexOf(s, len, 0xDE76, -1);
testLastIndexOf(s, len, 0x12345, -1);
testLastIndexOf(s, len, -1, -1);
testLastIndexOf(s, len, 0x110000, -1);
}
}
@ -471,7 +471,7 @@ public class Supplementary {
result, expected);
result = str.offsetByCodePoints(j, -nCodePoints);
check(result != 0,
"offsetBycodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
"offsetByCodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
result, 0);
}
@ -531,7 +531,7 @@ public class Supplementary {
result, expected);
result = str.offsetByCodePoints(j, -nCodePoints);
check(result != 0,
"offsetBycodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
"offsetByCodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
result, 0);
}
}
@ -539,7 +539,7 @@ public class Supplementary {
static final boolean At = true, Before = false;
static final boolean First = true, Last = false;
static final boolean FIRST = true, LAST = false;
static void testCodePoint(boolean isAt, String s, int index, int expected) {
int c = isAt ? s.codePointAt(index) : s.codePointBefore(index);
@ -563,22 +563,72 @@ public class Supplementary {
+ s + "> should throw StringIndexOutOfBoundsPointerException.");
}
static void testIndexOf(boolean isFirst, String s, int c, int expected) {
int index = isFirst ? s.indexOf(c) : s.lastIndexOf(c);
check(index != expected,
(isFirst ? "i" : "lastI") + "ndexOf(" + toHexString(c)
+ ") for <" + s + ">", index, expected);
static void testIndexOf(String s, int c, int expected) {
testIndexOf2(s, c, expected);
if (s.indexOf(c) != -1) {
testIndexOf2(s + (char) c, c, expected);
if (Character.isSupplementaryCodePoint(c)) {
char[] surrogates = Character.toChars(c);
testIndexOf2(s + new String(surrogates), c, expected);
testIndexOf2(s + surrogates[0], c, expected);
testIndexOf2(s + surrogates[1], c, expected);
testIndexOf2(new String(surrogates) + s, c, 0);
testIndexOf2(surrogates[0] + s, c, expected + 1);
testIndexOf2(surrogates[1] + s, c, expected + 1);
}
}
}
static int testIndexOf(boolean isFirst, String s, int fromIndex, int c,
int expected) {
int index = isFirst ? s.indexOf(c, fromIndex) :
s.lastIndexOf(c, fromIndex);
static void testIndexOf2(String s, int c, int expected) {
int index = s.indexOf(c);
check(index != expected,
(isFirst ? "i" : "lastI") + "ndexOf(" + toHexString(c) + ", "
+ fromIndex + ") for <" + s + ">", index, expected);
"indexOf(" + toHexString(c) + ") for <" + s + ">",
index, expected);
}
static void testLastIndexOf(String s, int c, int expected) {
testLastIndexOf2(s, c, expected);
if (s.lastIndexOf(c) != -1) {
testLastIndexOf2((char) c + s, c, expected + 1);
if (Character.isSupplementaryCodePoint(c)) {
char[] surrogates = Character.toChars(c);
testLastIndexOf2(s + new String(surrogates), c, s.length());
testLastIndexOf2(s + surrogates[0], c, expected);
testLastIndexOf2(s + surrogates[1], c, expected);
testLastIndexOf2(new String(surrogates) + s, c, expected + 2);
testLastIndexOf2(surrogates[0] + s, c, expected + 1);
testLastIndexOf2(surrogates[1] + s, c, expected + 1);
}
}
}
static void testLastIndexOf2(String s, int c, int expected) {
int index = s.lastIndexOf(c);
check(index != expected,
"lastIndexOf(" + toHexString(c) + ") for <" + s + ">",
index, expected);
}
static int testIndexOf(String s, int fromIndex, int c, int expected) {
int index = s.indexOf(c, fromIndex);
check(index != expected,
"indexOf(" + toHexString(c) + ", "
+ fromIndex + ") for <" + s + ">",
index, expected);
return index;
}
static int testLastIndexOf(String s, int fromIndex, int c, int expected) {
int index = s.lastIndexOf(c, fromIndex);
check(index != expected,
"lastIndexOf(" + toHexString(c) + ", "
+ fromIndex + ") for <" + s + ">",
index, expected);
return index;
}

View File

@ -24,7 +24,7 @@
/*
*
* @test
* @bug 4533872 4915683 4985217 5017280
* @bug 4533872 4915683 4985217 5017280 6937112
* @summary Unit tests for supplementary character support (JSR-204)
*/
@ -57,7 +57,7 @@ public class Supplementary {
0 1 2345 678 9 012 345678 9 01 2 */
"\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0
// includes an undefined supplementary character in Unicode 4.0.0
/* 1 11 1 1111 1
0 1 2345 6 789 0 12 3 4567 8 */
"\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02",
@ -151,7 +151,7 @@ public class Supplementary {
"\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00",
"\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0
// includes an undefined supplementary character in Unicode 4.0.0
"\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00",
};

View File

@ -57,7 +57,7 @@ public class Supplementary {
0 1 2345 678 9 012 345678 9 01 2 */
"\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0
// includes an undefined supplementary character in Unicode 4.0.0
/* 1 11 1 1111 1
0 1 2345 6 789 0 12 3 4567 8 */
"\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02",
@ -151,7 +151,7 @@ public class Supplementary {
"\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00",
"\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0
// includes an undefined supplementary character in Unicode 4.0.0
"\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00",
};