From 9c81e25271b17b30628d20b339351a7969fd141e Mon Sep 17 00:00:00 2001 From: Ulf Zibis Date: Wed, 30 Jun 2010 16:11:32 -0700 Subject: [PATCH] 6937112: String.lastIndexOf confused by unpaired trailing surrogate Rewrite lastIndexOf for performance and correctness Reviewed-by: sherman --- jdk/src/share/classes/java/lang/String.java | 79 +++++----- jdk/test/java/lang/String/Supplementary.java | 148 ++++++++++++------ .../java/lang/StringBuffer/Supplementary.java | 6 +- .../lang/StringBuilder/Supplementary.java | 4 +- 4 files changed, 147 insertions(+), 90 deletions(-) diff --git a/jdk/src/share/classes/java/lang/String.java b/jdk/src/share/classes/java/lang/String.java index 60a4351e01d..be421695e5e 100644 --- a/jdk/src/share/classes/java/lang/String.java +++ b/jdk/src/share/classes/java/lang/String.java @@ -37,6 +37,7 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import sun.nio.cs.Surrogate; /** @@ -1575,9 +1576,6 @@ public final class String * if the character does not occur. */ public int indexOf(int ch, int fromIndex) { - int max = offset + count; - char v[] = value; - if (fromIndex < 0) { fromIndex = 0; } else if (fromIndex >= count) { @@ -1585,29 +1583,36 @@ public final class String return -1; } - int i = offset + fromIndex; if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { // handle most cases here (ch is a BMP code point or a // negative value (invalid code point)) - for (; i < max ; i++) { - if (v[i] == ch) { + final char[] value = this.value; + final int offset = this.offset; + final int max = offset + count; + for (int i = offset + fromIndex; i < max ; i++) { + if (value[i] == ch) { return i - offset; } } return -1; + } else { + return indexOfSupplementary(ch, fromIndex); } + } - if (ch <= Character.MAX_CODE_POINT) { - // handle supplementary characters here - char[] surrogates = Character.toChars(ch); - for (; i < max; i++) { - if (v[i] == surrogates[0]) { - if (i + 1 == max) { - break; - } - if (v[i+1] == surrogates[1]) { - return i - offset; - } + /** + * Handles (rare) calls of indexOf with a supplementary character. + */ + private int indexOfSupplementary(int ch, int fromIndex) { + if (Character.isValidCodePoint(ch)) { + final char[] value = this.value; + final int offset = this.offset; + final char hi = Surrogate.high(ch); + final char lo = Surrogate.low(ch); + final int max = offset + count - 1; + for (int i = offset + fromIndex; i < max; i++) { + if (value[i] == hi && value[i+1] == lo) { + return i - offset; } } } @@ -1676,34 +1681,36 @@ public final class String * if the character does not occur before that point. */ public int lastIndexOf(int ch, int fromIndex) { - int min = offset; - char v[] = value; - - int i = offset + ((fromIndex >= count) ? count - 1 : fromIndex); - if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { // handle most cases here (ch is a BMP code point or a // negative value (invalid code point)) - for (; i >= min ; i--) { - if (v[i] == ch) { + final char[] value = this.value; + final int offset = this.offset; + int i = offset + Math.min(fromIndex, count - 1); + for (; i >= offset ; i--) { + if (value[i] == ch) { return i - offset; } } return -1; + } else { + return lastIndexOfSupplementary(ch, fromIndex); } + } - int max = offset + count; - if (ch <= Character.MAX_CODE_POINT) { - // handle supplementary characters here - char[] surrogates = Character.toChars(ch); - for (; i >= min; i--) { - if (v[i] == surrogates[0]) { - if (i + 1 == max) { - break; - } - if (v[i+1] == surrogates[1]) { - return i - offset; - } + /** + * Handles (rare) calls of lastIndexOf with a supplementary character. + */ + private int lastIndexOfSupplementary(int ch, int fromIndex) { + if (Character.isValidCodePoint(ch)) { + final char[] value = this.value; + final int offset = this.offset; + char hi = Surrogate.high(ch); + char lo = Surrogate.low(ch); + int i = offset + Math.min(fromIndex, count - 2); + for (; i >= offset; i--) { + if (value[i] == hi && value[i+1] == lo) { + return i - offset; } } } diff --git a/jdk/test/java/lang/String/Supplementary.java b/jdk/test/java/lang/String/Supplementary.java index 48e27d1012f..1049348c25c 100644 --- a/jdk/test/java/lang/String/Supplementary.java +++ b/jdk/test/java/lang/String/Supplementary.java @@ -62,7 +62,7 @@ public class Supplementary { 0 1 2345 678 9 012 345678 9 01 2 */ "\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00", - // includes an undefined supprementary characters in Unicode 4.0.0 + // includes an undefined supplementary character in Unicode 4.0.0 /* 1 11 1 1111 1 0 1 2345 6 789 0 12 3 4567 8 */ "\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02", @@ -168,7 +168,7 @@ public class Supplementary { * string in input[m]. * * The meaning of each element in golden3[][n] - * golden3[][0]: characater which is searched. + * golden3[][0]: character which is searched. * golden3[][2]: the golden data for indexOf(int ch) * From golden3[][2] to golden3[][n-1]: * the golden data for indexOf(int ch, int fromIndex) @@ -201,17 +201,17 @@ public class Supplementary { /* * Normal case */ - testIndexOf(First, s, golden3[i][0], golden3[i][2]); + testIndexOf(s, golden3[i][0], golden3[i][2]); /* * Abnormal case - char which isn't included in the string. */ - testIndexOf(First, s, 'Z', -1); - testIndexOf(First, s, 0xDB98, -1); - testIndexOf(First, s, 0xDE76, -1); - testIndexOf(First, s, 0x12345, -1); - testIndexOf(First, s, -1, -1); - testIndexOf(First, s, 0x110000, -1); + testIndexOf(s, 'Z', -1); + testIndexOf(s, 0xDB98, -1); + testIndexOf(s, 0xDE76, -1); + testIndexOf(s, 0x12345, -1); + testIndexOf(s, -1, -1); + testIndexOf(s, 0x110000, -1); } } @@ -229,7 +229,7 @@ public class Supplementary { */ int fromIndex = 0; for (int j = 2; j < golden3[i].length; j++) { - fromIndex = testIndexOf(First, s, fromIndex, ch, + fromIndex = testIndexOf(s, fromIndex, ch, golden3[i][j]) + 1; } @@ -237,19 +237,19 @@ public class Supplementary { * Abnormal case1 - char is included in the string but fromIndex * is incorrect. */ - testIndexOf(First, s, -1, ch, golden3[i][2]); - testIndexOf(First, s, s.length(), ch, + testIndexOf(s, -1, ch, golden3[i][2]); + testIndexOf(s, s.length(), ch, golden3[i][golden3[i].length-1]); /* * Abnormal case2 - char which isn't included in the string. */ - testIndexOf(First, s, 0, 'Z', -1); - testIndexOf(First, s, 0, 0xDB98, -1); - testIndexOf(First, s, 0, 0xDE76, -1); - testIndexOf(First, s, 0, 0x12345, -1); - testIndexOf(First, s, 0, -1, -1); - testIndexOf(First, s, 0, 0x110000, -1); + testIndexOf(s, 0, 'Z', -1); + testIndexOf(s, 0, 0xDB98, -1); + testIndexOf(s, 0, 0xDE76, -1); + testIndexOf(s, 0, 0x12345, -1); + testIndexOf(s, 0, -1, -1); + testIndexOf(s, 0, 0x110000, -1); } } @@ -264,18 +264,18 @@ public class Supplementary { /* * Normal case */ - testIndexOf(Last, s, golden3[i][0], + testLastIndexOf(s, golden3[i][0], golden3[i][golden3[i].length-2]); /* * Abnormal case - char which isn't included in the string. */ - testIndexOf(Last, s, 'Z', -1); - testIndexOf(Last, s, 0xDB98, -1); - testIndexOf(Last, s, 0xDE76, -1); - testIndexOf(Last, s, 0x12345, -1); - testIndexOf(Last, s, -1, -1); - testIndexOf(Last, s, 0x110000, -1); + testLastIndexOf(s, 'Z', -1); + testLastIndexOf(s, 0xDB98, -1); + testLastIndexOf(s, 0xDE76, -1); + testLastIndexOf(s, 0x12345, -1); + testLastIndexOf(s, -1, -1); + testLastIndexOf(s, 0x110000, -1); } } @@ -294,7 +294,7 @@ public class Supplementary { */ int fromIndex = len - 1; for (int j = golden3[i].length - 2; j > 0; j--) { - fromIndex = testIndexOf(Last, s, fromIndex, ch, + fromIndex = testLastIndexOf(s, fromIndex, ch, golden3[i][j]) - 1; } @@ -302,18 +302,18 @@ public class Supplementary { * Abnormal case1 - char is included in the string but fromIndex * is incorrect. */ - testIndexOf(Last, s, -1, ch, golden3[i][1]); - testIndexOf(Last, s, len, ch, golden3[i][golden3[i].length-2]); + testLastIndexOf(s, -1, ch, golden3[i][1]); + testLastIndexOf(s, len, ch, golden3[i][golden3[i].length-2]); /* * Abnormal case2 - char which isn't included in the string. */ - testIndexOf(Last, s, len, 'Z', -1); - testIndexOf(Last, s, len, 0xDB98, -1); - testIndexOf(Last, s, len, 0xDE76, -1); - testIndexOf(Last, s, len, 0x12345, -1); - testIndexOf(Last, s, len, -1, -1); - testIndexOf(Last, s, len, 0x110000, -1); + testLastIndexOf(s, len, 'Z', -1); + testLastIndexOf(s, len, 0xDB98, -1); + testLastIndexOf(s, len, 0xDE76, -1); + testLastIndexOf(s, len, 0x12345, -1); + testLastIndexOf(s, len, -1, -1); + testLastIndexOf(s, len, 0x110000, -1); } } @@ -471,7 +471,7 @@ public class Supplementary { result, expected); result = str.offsetByCodePoints(j, -nCodePoints); check(result != 0, - "offsetBycodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")", + "offsetByCodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")", result, 0); } @@ -531,7 +531,7 @@ public class Supplementary { result, expected); result = str.offsetByCodePoints(j, -nCodePoints); check(result != 0, - "offsetBycodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")", + "offsetByCodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")", result, 0); } } @@ -539,7 +539,7 @@ public class Supplementary { static final boolean At = true, Before = false; - static final boolean First = true, Last = false; + static final boolean FIRST = true, LAST = false; static void testCodePoint(boolean isAt, String s, int index, int expected) { int c = isAt ? s.codePointAt(index) : s.codePointBefore(index); @@ -563,22 +563,72 @@ public class Supplementary { + s + "> should throw StringIndexOutOfBoundsPointerException."); } - static void testIndexOf(boolean isFirst, String s, int c, int expected) { - int index = isFirst ? s.indexOf(c) : s.lastIndexOf(c); - - check(index != expected, - (isFirst ? "i" : "lastI") + "ndexOf(" + toHexString(c) - + ") for <" + s + ">", index, expected); + static void testIndexOf(String s, int c, int expected) { + testIndexOf2(s, c, expected); + if (s.indexOf(c) != -1) { + testIndexOf2(s + (char) c, c, expected); + if (Character.isSupplementaryCodePoint(c)) { + char[] surrogates = Character.toChars(c); + testIndexOf2(s + new String(surrogates), c, expected); + testIndexOf2(s + surrogates[0], c, expected); + testIndexOf2(s + surrogates[1], c, expected); + testIndexOf2(new String(surrogates) + s, c, 0); + testIndexOf2(surrogates[0] + s, c, expected + 1); + testIndexOf2(surrogates[1] + s, c, expected + 1); + } + } } - static int testIndexOf(boolean isFirst, String s, int fromIndex, int c, - int expected) { - int index = isFirst ? s.indexOf(c, fromIndex) : - s.lastIndexOf(c, fromIndex); + static void testIndexOf2(String s, int c, int expected) { + int index = s.indexOf(c); check(index != expected, - (isFirst ? "i" : "lastI") + "ndexOf(" + toHexString(c) + ", " - + fromIndex + ") for <" + s + ">", index, expected); + "indexOf(" + toHexString(c) + ") for <" + s + ">", + index, expected); + } + + static void testLastIndexOf(String s, int c, int expected) { + testLastIndexOf2(s, c, expected); + if (s.lastIndexOf(c) != -1) { + testLastIndexOf2((char) c + s, c, expected + 1); + if (Character.isSupplementaryCodePoint(c)) { + char[] surrogates = Character.toChars(c); + testLastIndexOf2(s + new String(surrogates), c, s.length()); + testLastIndexOf2(s + surrogates[0], c, expected); + testLastIndexOf2(s + surrogates[1], c, expected); + testLastIndexOf2(new String(surrogates) + s, c, expected + 2); + testLastIndexOf2(surrogates[0] + s, c, expected + 1); + testLastIndexOf2(surrogates[1] + s, c, expected + 1); + } + } + } + + static void testLastIndexOf2(String s, int c, int expected) { + int index = s.lastIndexOf(c); + + check(index != expected, + "lastIndexOf(" + toHexString(c) + ") for <" + s + ">", + index, expected); + } + + static int testIndexOf(String s, int fromIndex, int c, int expected) { + int index = s.indexOf(c, fromIndex); + + check(index != expected, + "indexOf(" + toHexString(c) + ", " + + fromIndex + ") for <" + s + ">", + index, expected); + + return index; + } + + static int testLastIndexOf(String s, int fromIndex, int c, int expected) { + int index = s.lastIndexOf(c, fromIndex); + + check(index != expected, + "lastIndexOf(" + toHexString(c) + ", " + + fromIndex + ") for <" + s + ">", + index, expected); return index; } diff --git a/jdk/test/java/lang/StringBuffer/Supplementary.java b/jdk/test/java/lang/StringBuffer/Supplementary.java index 1262afd001f..ce8d89c9a94 100644 --- a/jdk/test/java/lang/StringBuffer/Supplementary.java +++ b/jdk/test/java/lang/StringBuffer/Supplementary.java @@ -24,7 +24,7 @@ /* * * @test - * @bug 4533872 4915683 4985217 5017280 + * @bug 4533872 4915683 4985217 5017280 6937112 * @summary Unit tests for supplementary character support (JSR-204) */ @@ -57,7 +57,7 @@ public class Supplementary { 0 1 2345 678 9 012 345678 9 01 2 */ "\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00", - // includes an undefined supprementary characters in Unicode 4.0.0 + // includes an undefined supplementary character in Unicode 4.0.0 /* 1 11 1 1111 1 0 1 2345 6 789 0 12 3 4567 8 */ "\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02", @@ -151,7 +151,7 @@ public class Supplementary { "\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00", "\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00", - // includes an undefined supprementary characters in Unicode 4.0.0 + // includes an undefined supplementary character in Unicode 4.0.0 "\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00", }; diff --git a/jdk/test/java/lang/StringBuilder/Supplementary.java b/jdk/test/java/lang/StringBuilder/Supplementary.java index a544029b92e..283d9b97250 100644 --- a/jdk/test/java/lang/StringBuilder/Supplementary.java +++ b/jdk/test/java/lang/StringBuilder/Supplementary.java @@ -57,7 +57,7 @@ public class Supplementary { 0 1 2345 678 9 012 345678 9 01 2 */ "\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00", - // includes an undefined supprementary characters in Unicode 4.0.0 + // includes an undefined supplementary character in Unicode 4.0.0 /* 1 11 1 1111 1 0 1 2345 6 789 0 12 3 4567 8 */ "\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02", @@ -151,7 +151,7 @@ public class Supplementary { "\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00", "\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00", - // includes an undefined supprementary characters in Unicode 4.0.0 + // includes an undefined supplementary character in Unicode 4.0.0 "\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00", };