6937112: String.lastIndexOf confused by unpaired trailing surrogate

Rewrite lastIndexOf for performance and correctness

Reviewed-by: sherman
This commit is contained in:
Ulf Zibis 2010-06-30 16:11:32 -07:00 committed by Martin Buchholz
parent a0f3e72c24
commit 9c81e25271
4 changed files with 147 additions and 90 deletions

View File

@ -37,6 +37,7 @@ import java.util.Locale;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
import sun.nio.cs.Surrogate;
/** /**
@ -1575,9 +1576,6 @@ public final class String
* if the character does not occur. * if the character does not occur.
*/ */
public int indexOf(int ch, int fromIndex) { public int indexOf(int ch, int fromIndex) {
int max = offset + count;
char v[] = value;
if (fromIndex < 0) { if (fromIndex < 0) {
fromIndex = 0; fromIndex = 0;
} else if (fromIndex >= count) { } else if (fromIndex >= count) {
@ -1585,29 +1583,36 @@ public final class String
return -1; return -1;
} }
int i = offset + fromIndex;
if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
// handle most cases here (ch is a BMP code point or a // handle most cases here (ch is a BMP code point or a
// negative value (invalid code point)) // negative value (invalid code point))
for (; i < max ; i++) { final char[] value = this.value;
if (v[i] == ch) { final int offset = this.offset;
final int max = offset + count;
for (int i = offset + fromIndex; i < max ; i++) {
if (value[i] == ch) {
return i - offset; return i - offset;
} }
} }
return -1; return -1;
} else {
return indexOfSupplementary(ch, fromIndex);
} }
}
if (ch <= Character.MAX_CODE_POINT) { /**
// handle supplementary characters here * Handles (rare) calls of indexOf with a supplementary character.
char[] surrogates = Character.toChars(ch); */
for (; i < max; i++) { private int indexOfSupplementary(int ch, int fromIndex) {
if (v[i] == surrogates[0]) { if (Character.isValidCodePoint(ch)) {
if (i + 1 == max) { final char[] value = this.value;
break; final int offset = this.offset;
} final char hi = Surrogate.high(ch);
if (v[i+1] == surrogates[1]) { final char lo = Surrogate.low(ch);
return i - offset; final int max = offset + count - 1;
} for (int i = offset + fromIndex; i < max; i++) {
if (value[i] == hi && value[i+1] == lo) {
return i - offset;
} }
} }
} }
@ -1676,34 +1681,36 @@ public final class String
* if the character does not occur before that point. * if the character does not occur before that point.
*/ */
public int lastIndexOf(int ch, int fromIndex) { public int lastIndexOf(int ch, int fromIndex) {
int min = offset;
char v[] = value;
int i = offset + ((fromIndex >= count) ? count - 1 : fromIndex);
if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
// handle most cases here (ch is a BMP code point or a // handle most cases here (ch is a BMP code point or a
// negative value (invalid code point)) // negative value (invalid code point))
for (; i >= min ; i--) { final char[] value = this.value;
if (v[i] == ch) { final int offset = this.offset;
int i = offset + Math.min(fromIndex, count - 1);
for (; i >= offset ; i--) {
if (value[i] == ch) {
return i - offset; return i - offset;
} }
} }
return -1; return -1;
} else {
return lastIndexOfSupplementary(ch, fromIndex);
} }
}
int max = offset + count; /**
if (ch <= Character.MAX_CODE_POINT) { * Handles (rare) calls of lastIndexOf with a supplementary character.
// handle supplementary characters here */
char[] surrogates = Character.toChars(ch); private int lastIndexOfSupplementary(int ch, int fromIndex) {
for (; i >= min; i--) { if (Character.isValidCodePoint(ch)) {
if (v[i] == surrogates[0]) { final char[] value = this.value;
if (i + 1 == max) { final int offset = this.offset;
break; char hi = Surrogate.high(ch);
} char lo = Surrogate.low(ch);
if (v[i+1] == surrogates[1]) { int i = offset + Math.min(fromIndex, count - 2);
return i - offset; for (; i >= offset; i--) {
} if (value[i] == hi && value[i+1] == lo) {
return i - offset;
} }
} }
} }

View File

@ -62,7 +62,7 @@ public class Supplementary {
0 1 2345 678 9 012 345678 9 01 2 */ 0 1 2345 678 9 012 345678 9 01 2 */
"\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00", "\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0 // includes an undefined supplementary character in Unicode 4.0.0
/* 1 11 1 1111 1 /* 1 11 1 1111 1
0 1 2345 6 789 0 12 3 4567 8 */ 0 1 2345 6 789 0 12 3 4567 8 */
"\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02", "\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02",
@ -168,7 +168,7 @@ public class Supplementary {
* string in input[m]. * string in input[m].
* *
* The meaning of each element in golden3[][n] * The meaning of each element in golden3[][n]
* golden3[][0]: characater which is searched. * golden3[][0]: character which is searched.
* golden3[][2]: the golden data for indexOf(int ch) * golden3[][2]: the golden data for indexOf(int ch)
* From golden3[][2] to golden3[][n-1]: * From golden3[][2] to golden3[][n-1]:
* the golden data for indexOf(int ch, int fromIndex) * the golden data for indexOf(int ch, int fromIndex)
@ -201,17 +201,17 @@ public class Supplementary {
/* /*
* Normal case * Normal case
*/ */
testIndexOf(First, s, golden3[i][0], golden3[i][2]); testIndexOf(s, golden3[i][0], golden3[i][2]);
/* /*
* Abnormal case - char which isn't included in the string. * Abnormal case - char which isn't included in the string.
*/ */
testIndexOf(First, s, 'Z', -1); testIndexOf(s, 'Z', -1);
testIndexOf(First, s, 0xDB98, -1); testIndexOf(s, 0xDB98, -1);
testIndexOf(First, s, 0xDE76, -1); testIndexOf(s, 0xDE76, -1);
testIndexOf(First, s, 0x12345, -1); testIndexOf(s, 0x12345, -1);
testIndexOf(First, s, -1, -1); testIndexOf(s, -1, -1);
testIndexOf(First, s, 0x110000, -1); testIndexOf(s, 0x110000, -1);
} }
} }
@ -229,7 +229,7 @@ public class Supplementary {
*/ */
int fromIndex = 0; int fromIndex = 0;
for (int j = 2; j < golden3[i].length; j++) { for (int j = 2; j < golden3[i].length; j++) {
fromIndex = testIndexOf(First, s, fromIndex, ch, fromIndex = testIndexOf(s, fromIndex, ch,
golden3[i][j]) + 1; golden3[i][j]) + 1;
} }
@ -237,19 +237,19 @@ public class Supplementary {
* Abnormal case1 - char is included in the string but fromIndex * Abnormal case1 - char is included in the string but fromIndex
* is incorrect. * is incorrect.
*/ */
testIndexOf(First, s, -1, ch, golden3[i][2]); testIndexOf(s, -1, ch, golden3[i][2]);
testIndexOf(First, s, s.length(), ch, testIndexOf(s, s.length(), ch,
golden3[i][golden3[i].length-1]); golden3[i][golden3[i].length-1]);
/* /*
* Abnormal case2 - char which isn't included in the string. * Abnormal case2 - char which isn't included in the string.
*/ */
testIndexOf(First, s, 0, 'Z', -1); testIndexOf(s, 0, 'Z', -1);
testIndexOf(First, s, 0, 0xDB98, -1); testIndexOf(s, 0, 0xDB98, -1);
testIndexOf(First, s, 0, 0xDE76, -1); testIndexOf(s, 0, 0xDE76, -1);
testIndexOf(First, s, 0, 0x12345, -1); testIndexOf(s, 0, 0x12345, -1);
testIndexOf(First, s, 0, -1, -1); testIndexOf(s, 0, -1, -1);
testIndexOf(First, s, 0, 0x110000, -1); testIndexOf(s, 0, 0x110000, -1);
} }
} }
@ -264,18 +264,18 @@ public class Supplementary {
/* /*
* Normal case * Normal case
*/ */
testIndexOf(Last, s, golden3[i][0], testLastIndexOf(s, golden3[i][0],
golden3[i][golden3[i].length-2]); golden3[i][golden3[i].length-2]);
/* /*
* Abnormal case - char which isn't included in the string. * Abnormal case - char which isn't included in the string.
*/ */
testIndexOf(Last, s, 'Z', -1); testLastIndexOf(s, 'Z', -1);
testIndexOf(Last, s, 0xDB98, -1); testLastIndexOf(s, 0xDB98, -1);
testIndexOf(Last, s, 0xDE76, -1); testLastIndexOf(s, 0xDE76, -1);
testIndexOf(Last, s, 0x12345, -1); testLastIndexOf(s, 0x12345, -1);
testIndexOf(Last, s, -1, -1); testLastIndexOf(s, -1, -1);
testIndexOf(Last, s, 0x110000, -1); testLastIndexOf(s, 0x110000, -1);
} }
} }
@ -294,7 +294,7 @@ public class Supplementary {
*/ */
int fromIndex = len - 1; int fromIndex = len - 1;
for (int j = golden3[i].length - 2; j > 0; j--) { for (int j = golden3[i].length - 2; j > 0; j--) {
fromIndex = testIndexOf(Last, s, fromIndex, ch, fromIndex = testLastIndexOf(s, fromIndex, ch,
golden3[i][j]) - 1; golden3[i][j]) - 1;
} }
@ -302,18 +302,18 @@ public class Supplementary {
* Abnormal case1 - char is included in the string but fromIndex * Abnormal case1 - char is included in the string but fromIndex
* is incorrect. * is incorrect.
*/ */
testIndexOf(Last, s, -1, ch, golden3[i][1]); testLastIndexOf(s, -1, ch, golden3[i][1]);
testIndexOf(Last, s, len, ch, golden3[i][golden3[i].length-2]); testLastIndexOf(s, len, ch, golden3[i][golden3[i].length-2]);
/* /*
* Abnormal case2 - char which isn't included in the string. * Abnormal case2 - char which isn't included in the string.
*/ */
testIndexOf(Last, s, len, 'Z', -1); testLastIndexOf(s, len, 'Z', -1);
testIndexOf(Last, s, len, 0xDB98, -1); testLastIndexOf(s, len, 0xDB98, -1);
testIndexOf(Last, s, len, 0xDE76, -1); testLastIndexOf(s, len, 0xDE76, -1);
testIndexOf(Last, s, len, 0x12345, -1); testLastIndexOf(s, len, 0x12345, -1);
testIndexOf(Last, s, len, -1, -1); testLastIndexOf(s, len, -1, -1);
testIndexOf(Last, s, len, 0x110000, -1); testLastIndexOf(s, len, 0x110000, -1);
} }
} }
@ -471,7 +471,7 @@ public class Supplementary {
result, expected); result, expected);
result = str.offsetByCodePoints(j, -nCodePoints); result = str.offsetByCodePoints(j, -nCodePoints);
check(result != 0, check(result != 0,
"offsetBycodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")", "offsetByCodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
result, 0); result, 0);
} }
@ -531,7 +531,7 @@ public class Supplementary {
result, expected); result, expected);
result = str.offsetByCodePoints(j, -nCodePoints); result = str.offsetByCodePoints(j, -nCodePoints);
check(result != 0, check(result != 0,
"offsetBycodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")", "offsetByCodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
result, 0); result, 0);
} }
} }
@ -539,7 +539,7 @@ public class Supplementary {
static final boolean At = true, Before = false; static final boolean At = true, Before = false;
static final boolean First = true, Last = false; static final boolean FIRST = true, LAST = false;
static void testCodePoint(boolean isAt, String s, int index, int expected) { static void testCodePoint(boolean isAt, String s, int index, int expected) {
int c = isAt ? s.codePointAt(index) : s.codePointBefore(index); int c = isAt ? s.codePointAt(index) : s.codePointBefore(index);
@ -563,22 +563,72 @@ public class Supplementary {
+ s + "> should throw StringIndexOutOfBoundsPointerException."); + s + "> should throw StringIndexOutOfBoundsPointerException.");
} }
static void testIndexOf(boolean isFirst, String s, int c, int expected) { static void testIndexOf(String s, int c, int expected) {
int index = isFirst ? s.indexOf(c) : s.lastIndexOf(c); testIndexOf2(s, c, expected);
if (s.indexOf(c) != -1) {
check(index != expected, testIndexOf2(s + (char) c, c, expected);
(isFirst ? "i" : "lastI") + "ndexOf(" + toHexString(c) if (Character.isSupplementaryCodePoint(c)) {
+ ") for <" + s + ">", index, expected); char[] surrogates = Character.toChars(c);
testIndexOf2(s + new String(surrogates), c, expected);
testIndexOf2(s + surrogates[0], c, expected);
testIndexOf2(s + surrogates[1], c, expected);
testIndexOf2(new String(surrogates) + s, c, 0);
testIndexOf2(surrogates[0] + s, c, expected + 1);
testIndexOf2(surrogates[1] + s, c, expected + 1);
}
}
} }
static int testIndexOf(boolean isFirst, String s, int fromIndex, int c, static void testIndexOf2(String s, int c, int expected) {
int expected) { int index = s.indexOf(c);
int index = isFirst ? s.indexOf(c, fromIndex) :
s.lastIndexOf(c, fromIndex);
check(index != expected, check(index != expected,
(isFirst ? "i" : "lastI") + "ndexOf(" + toHexString(c) + ", " "indexOf(" + toHexString(c) + ") for <" + s + ">",
+ fromIndex + ") for <" + s + ">", index, expected); index, expected);
}
static void testLastIndexOf(String s, int c, int expected) {
testLastIndexOf2(s, c, expected);
if (s.lastIndexOf(c) != -1) {
testLastIndexOf2((char) c + s, c, expected + 1);
if (Character.isSupplementaryCodePoint(c)) {
char[] surrogates = Character.toChars(c);
testLastIndexOf2(s + new String(surrogates), c, s.length());
testLastIndexOf2(s + surrogates[0], c, expected);
testLastIndexOf2(s + surrogates[1], c, expected);
testLastIndexOf2(new String(surrogates) + s, c, expected + 2);
testLastIndexOf2(surrogates[0] + s, c, expected + 1);
testLastIndexOf2(surrogates[1] + s, c, expected + 1);
}
}
}
static void testLastIndexOf2(String s, int c, int expected) {
int index = s.lastIndexOf(c);
check(index != expected,
"lastIndexOf(" + toHexString(c) + ") for <" + s + ">",
index, expected);
}
static int testIndexOf(String s, int fromIndex, int c, int expected) {
int index = s.indexOf(c, fromIndex);
check(index != expected,
"indexOf(" + toHexString(c) + ", "
+ fromIndex + ") for <" + s + ">",
index, expected);
return index;
}
static int testLastIndexOf(String s, int fromIndex, int c, int expected) {
int index = s.lastIndexOf(c, fromIndex);
check(index != expected,
"lastIndexOf(" + toHexString(c) + ", "
+ fromIndex + ") for <" + s + ">",
index, expected);
return index; return index;
} }

View File

@ -24,7 +24,7 @@
/* /*
* *
* @test * @test
* @bug 4533872 4915683 4985217 5017280 * @bug 4533872 4915683 4985217 5017280 6937112
* @summary Unit tests for supplementary character support (JSR-204) * @summary Unit tests for supplementary character support (JSR-204)
*/ */
@ -57,7 +57,7 @@ public class Supplementary {
0 1 2345 678 9 012 345678 9 01 2 */ 0 1 2345 678 9 012 345678 9 01 2 */
"\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00", "\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0 // includes an undefined supplementary character in Unicode 4.0.0
/* 1 11 1 1111 1 /* 1 11 1 1111 1
0 1 2345 6 789 0 12 3 4567 8 */ 0 1 2345 6 789 0 12 3 4567 8 */
"\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02", "\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02",
@ -151,7 +151,7 @@ public class Supplementary {
"\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00", "\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00",
"\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00", "\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0 // includes an undefined supplementary character in Unicode 4.0.0
"\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00", "\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00",
}; };

View File

@ -57,7 +57,7 @@ public class Supplementary {
0 1 2345 678 9 012 345678 9 01 2 */ 0 1 2345 678 9 012 345678 9 01 2 */
"\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00", "\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0 // includes an undefined supplementary character in Unicode 4.0.0
/* 1 11 1 1111 1 /* 1 11 1 1111 1
0 1 2345 6 789 0 12 3 4567 8 */ 0 1 2345 6 789 0 12 3 4567 8 */
"\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02", "\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02",
@ -151,7 +151,7 @@ public class Supplementary {
"\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00", "\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00",
"\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00", "\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00",
// includes an undefined supprementary characters in Unicode 4.0.0 // includes an undefined supplementary character in Unicode 4.0.0
"\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00", "\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00",
}; };