8214245: Case insensitive matching doesn't work correctly for some character classes

Reviewed-by: rriggs, darcy
This commit is contained in:
Ivan Gerasimov 2020-03-18 01:04:22 -07:00
parent e827f4ae1c
commit 1d4a4fed43
3 changed files with 164 additions and 66 deletions

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -169,11 +169,15 @@ class CharPredicates {
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
private static CharPredicate getPosixPredicate(String name) { private static CharPredicate getPosixPredicate(String name, boolean caseIns) {
switch (name) { switch (name) {
case "ALPHA": return ALPHABETIC(); case "ALPHA": return ALPHABETIC();
case "LOWER": return LOWERCASE(); case "LOWER": return caseIns
case "UPPER": return UPPERCASE(); ? LOWERCASE().union(UPPERCASE(), TITLECASE())
: LOWERCASE();
case "UPPER": return caseIns
? UPPERCASE().union(LOWERCASE(), TITLECASE())
: UPPERCASE();
case "SPACE": return WHITE_SPACE(); case "SPACE": return WHITE_SPACE();
case "PUNCT": return PUNCTUATION(); case "PUNCT": return PUNCTUATION();
case "XDIGIT": return HEX_DIGIT(); case "XDIGIT": return HEX_DIGIT();
@ -187,40 +191,46 @@ class CharPredicates {
} }
} }
private static CharPredicate getUnicodePredicate(String name) { private static CharPredicate getUnicodePredicate(String name, boolean caseIns) {
switch (name) { switch (name) {
case "ALPHABETIC": return ALPHABETIC(); case "ALPHABETIC": return ALPHABETIC();
case "ASSIGNED": return ASSIGNED(); case "ASSIGNED": return ASSIGNED();
case "CONTROL": return CONTROL(); case "CONTROL": return CONTROL();
case "HEXDIGIT": return HEX_DIGIT(); case "HEXDIGIT":
case "IDEOGRAPHIC": return IDEOGRAPHIC();
case "JOINCONTROL": return JOIN_CONTROL();
case "LETTER": return LETTER();
case "LOWERCASE": return LOWERCASE();
case "NONCHARACTERCODEPOINT": return NONCHARACTER_CODE_POINT();
case "TITLECASE": return TITLECASE();
case "PUNCTUATION": return PUNCTUATION();
case "UPPERCASE": return UPPERCASE();
case "WHITESPACE": return WHITE_SPACE();
case "WORD": return WORD();
case "WHITE_SPACE": return WHITE_SPACE();
case "HEX_DIGIT": return HEX_DIGIT(); case "HEX_DIGIT": return HEX_DIGIT();
case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT(); case "IDEOGRAPHIC": return IDEOGRAPHIC();
case "JOINCONTROL":
case "JOIN_CONTROL": return JOIN_CONTROL(); case "JOIN_CONTROL": return JOIN_CONTROL();
case "LETTER": return LETTER();
case "LOWERCASE": return caseIns
? LOWERCASE().union(UPPERCASE(), TITLECASE())
: LOWERCASE();
case "NONCHARACTERCODEPOINT":
case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT();
case "TITLECASE": return caseIns
? TITLECASE().union(LOWERCASE(), UPPERCASE())
: TITLECASE();
case "PUNCTUATION": return PUNCTUATION();
case "UPPERCASE": return caseIns
? UPPERCASE().union(LOWERCASE(), TITLECASE())
: UPPERCASE();
case "WHITESPACE":
case "WHITE_SPACE": return WHITE_SPACE();
case "WORD": return WORD();
default: return null; default: return null;
} }
} }
public static CharPredicate forUnicodeProperty(String propName) { public static CharPredicate forUnicodeProperty(String propName, boolean caseIns) {
propName = propName.toUpperCase(Locale.ROOT); propName = propName.toUpperCase(Locale.ROOT);
CharPredicate p = getUnicodePredicate(propName); CharPredicate p = getUnicodePredicate(propName, caseIns);
if (p != null) if (p != null)
return p; return p;
return getPosixPredicate(propName); return getPosixPredicate(propName, caseIns);
} }
public static CharPredicate forPOSIXName(String propName) { public static CharPredicate forPOSIXName(String propName, boolean caseIns) {
return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH)); return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH), caseIns);
} }
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
@ -254,14 +264,23 @@ class CharPredicates {
// unicode categories, aliases, properties, java methods ... // unicode categories, aliases, properties, java methods ...
static CharPredicate forProperty(String name) { static CharPredicate forProperty(String name, boolean caseIns) {
// Unicode character property aliases, defined in // Unicode character property aliases, defined in
// http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
switch (name) { switch (name) {
case "Cn": return category(1<<Character.UNASSIGNED); case "Cn": return category(1<<Character.UNASSIGNED);
case "Lu": return category(1<<Character.UPPERCASE_LETTER); case "Lu": return category(caseIns ? (1<<Character.LOWERCASE_LETTER) |
case "Ll": return category(1<<Character.LOWERCASE_LETTER); (1<<Character.UPPERCASE_LETTER) |
case "Lt": return category(1<<Character.TITLECASE_LETTER); (1<<Character.TITLECASE_LETTER)
: (1<<Character.UPPERCASE_LETTER));
case "Ll": return category(caseIns ? (1<<Character.LOWERCASE_LETTER) |
(1<<Character.UPPERCASE_LETTER) |
(1<<Character.TITLECASE_LETTER)
: (1<<Character.LOWERCASE_LETTER));
case "Lt": return category(caseIns ? (1<<Character.LOWERCASE_LETTER) |
(1<<Character.UPPERCASE_LETTER) |
(1<<Character.TITLECASE_LETTER)
: (1<<Character.TITLECASE_LETTER));
case "Lm": return category(1<<Character.MODIFIER_LETTER); case "Lm": return category(1<<Character.MODIFIER_LETTER);
case "Lo": return category(1<<Character.OTHER_LETTER); case "Lo": return category(1<<Character.OTHER_LETTER);
case "Mn": return category(1<<Character.NON_SPACING_MARK); case "Mn": return category(1<<Character.NON_SPACING_MARK);
@ -331,39 +350,50 @@ class CharPredicates {
case "all": return Pattern.ALL(); case "all": return Pattern.ALL();
// Posix regular expression character classes, defined in // Posix regular expression character classes, defined in
// http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
case "ASCII": return range(0x00, 0x7F); // ASCII case "ASCII": return range(0x00, 0x7F); // ASCII
case "Alnum": return ctype(ASCII.ALNUM); // Alphanumeric characters case "Alnum": return ctype(ASCII.ALNUM); // Alphanumeric characters
case "Alpha": return ctype(ASCII.ALPHA); // Alphabetic characters case "Alpha": return ctype(ASCII.ALPHA); // Alphabetic characters
case "Blank": return ctype(ASCII.BLANK); // Space and tab characters case "Blank": return ctype(ASCII.BLANK); // Space and tab characters
case "Cntrl": return ctype(ASCII.CNTRL); // Control characters case "Cntrl": return ctype(ASCII.CNTRL); // Control characters
case "Digit": return range('0', '9'); // Numeric characters case "Digit": return range('0', '9'); // Numeric characters
case "Graph": return ctype(ASCII.GRAPH); // printable and visible case "Graph": return ctype(ASCII.GRAPH); // printable and visible
case "Lower": return range('a', 'z'); // Lower-case alphabetic case "Lower": return caseIns ? ctype(ASCII.ALPHA)
case "Print": return range(0x20, 0x7E); // Printable characters : range('a', 'z'); // Lower-case alphabetic
case "Punct": return ctype(ASCII.PUNCT); // Punctuation characters case "Print": return range(0x20, 0x7E); // Printable characters
case "Space": return ctype(ASCII.SPACE); // Space characters case "Punct": return ctype(ASCII.PUNCT); // Punctuation characters
case "Upper": return range('A', 'Z'); // Upper-case alphabetic case "Space": return ctype(ASCII.SPACE); // Space characters
case "Upper": return caseIns ? ctype(ASCII.ALPHA)
: range('A', 'Z'); // Upper-case alphabetic
case "XDigit": return ctype(ASCII.XDIGIT); // hexadecimal digits case "XDigit": return ctype(ASCII.XDIGIT); // hexadecimal digits
// Java character properties, defined by methods in Character.java // Java character properties, defined by methods in Character.java
case "javaLowerCase": return java.lang.Character::isLowerCase; case "javaLowerCase": return caseIns ? c -> Character.isLowerCase(c) ||
case "javaUpperCase": return Character::isUpperCase; Character.isUpperCase(c) ||
case "javaAlphabetic": return java.lang.Character::isAlphabetic; Character.isTitleCase(c)
case "javaIdeographic": return java.lang.Character::isIdeographic; : Character::isLowerCase;
case "javaTitleCase": return java.lang.Character::isTitleCase; case "javaUpperCase": return caseIns ? c -> Character.isUpperCase(c) ||
case "javaDigit": return java.lang.Character::isDigit; Character.isLowerCase(c) ||
case "javaDefined": return java.lang.Character::isDefined; Character.isTitleCase(c)
case "javaLetter": return java.lang.Character::isLetter; : Character::isUpperCase;
case "javaLetterOrDigit": return java.lang.Character::isLetterOrDigit; case "javaAlphabetic": return Character::isAlphabetic;
case "javaJavaIdentifierStart": return java.lang.Character::isJavaIdentifierStart; case "javaIdeographic": return Character::isIdeographic;
case "javaJavaIdentifierPart": return java.lang.Character::isJavaIdentifierPart; case "javaTitleCase": return caseIns ? c -> Character.isTitleCase(c) ||
case "javaUnicodeIdentifierStart": return java.lang.Character::isUnicodeIdentifierStart; Character.isLowerCase(c) ||
case "javaUnicodeIdentifierPart": return java.lang.Character::isUnicodeIdentifierPart; Character.isUpperCase(c)
case "javaIdentifierIgnorable": return java.lang.Character::isIdentifierIgnorable; : Character::isTitleCase;
case "javaSpaceChar": return java.lang.Character::isSpaceChar; case "javaDigit": return Character::isDigit;
case "javaWhitespace": return java.lang.Character::isWhitespace; case "javaDefined": return Character::isDefined;
case "javaISOControl": return java.lang.Character::isISOControl; case "javaLetter": return Character::isLetter;
case "javaMirrored": return java.lang.Character::isMirrored; case "javaLetterOrDigit": return Character::isLetterOrDigit;
case "javaJavaIdentifierStart": return Character::isJavaIdentifierStart;
case "javaJavaIdentifierPart": return Character::isJavaIdentifierPart;
case "javaUnicodeIdentifierStart": return Character::isUnicodeIdentifierStart;
case "javaUnicodeIdentifierPart": return Character::isUnicodeIdentifierPart;
case "javaIdentifierIgnorable": return Character::isIdentifierIgnorable;
case "javaSpaceChar": return Character::isSpaceChar;
case "javaWhitespace": return Character::isWhitespace;
case "javaISOControl": return Character::isISOControl;
case "javaMirrored": return Character::isMirrored;
default: return null; default: return null;
} }
} }

View File

@ -2904,7 +2904,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
break; break;
case "gc": case "gc":
case "general_category": case "general_category":
p = CharPredicates.forProperty(value); p = CharPredicates.forProperty(value, has(CASE_INSENSITIVE));
break; break;
default: default:
break; break;
@ -2920,17 +2920,16 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} else if (name.startsWith("Is")) { } else if (name.startsWith("Is")) {
// \p{IsGeneralCategory} and \p{IsScriptName} // \p{IsGeneralCategory} and \p{IsScriptName}
String shortName = name.substring(2); String shortName = name.substring(2);
p = CharPredicates.forUnicodeProperty(shortName); p = CharPredicates.forUnicodeProperty(shortName, has(CASE_INSENSITIVE));
if (p == null) if (p == null)
p = CharPredicates.forProperty(shortName); p = CharPredicates.forProperty(shortName, has(CASE_INSENSITIVE));
if (p == null) if (p == null)
p = CharPredicates.forUnicodeScript(shortName); p = CharPredicates.forUnicodeScript(shortName);
} else { } else {
if (has(UNICODE_CHARACTER_CLASS)) { if (has(UNICODE_CHARACTER_CLASS))
p = CharPredicates.forPOSIXName(name); p = CharPredicates.forPOSIXName(name, has(CASE_INSENSITIVE));
}
if (p == null) if (p == null)
p = CharPredicates.forProperty(name); p = CharPredicates.forProperty(name, has(CASE_INSENSITIVE));
} }
if (p == null) if (p == null)
throw error("Unknown character property name {" + name + "}"); throw error("Unknown character property name {" + name + "}");
@ -5675,7 +5674,7 @@ NEXT: while (i <= last) {
return ch -> is(ch) || p.is(ch); return ch -> is(ch) || p.is(ch);
} }
default CharPredicate union(CharPredicate p1, default CharPredicate union(CharPredicate p1,
CharPredicate p2 ) { CharPredicate p2) {
return ch -> is(ch) || p1.is(ch) || p2.is(ch); return ch -> is(ch) || p1.is(ch) || p2.is(ch);
} }
default CharPredicate negate() { default CharPredicate negate() {

View File

@ -36,7 +36,7 @@
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895 * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706 * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812 * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
* 8216332 * 8216332 8214245
* *
* @library /test/lib * @library /test/lib
* @library /lib/testlibrary/java/lang * @library /lib/testlibrary/java/lang
@ -194,6 +194,7 @@ public class RegExTest {
illegalRepetitionRange(); illegalRepetitionRange();
surrogatePairWithCanonEq(); surrogatePairWithCanonEq();
lineBreakWithQuantifier(); lineBreakWithQuantifier();
caseInsensitivePMatch();
if (failure) { if (failure) {
throw new throw new
@ -5086,4 +5087,72 @@ public class RegExTest {
} }
report("lineBreakWithQuantifier"); report("lineBreakWithQuantifier");
} }
// Regression test for 8214245: with CASE_INSENSITIVE set, every case-related
// character class below is expected to match a letter whatever its case.
private static void caseInsensitivePMatch() {
    // Plain ASCII letters in lower, mixed and upper case.
    List<String> asciiInputs = List.of("abcd", "AbCd", "ABCD");
    List<String> asciiPatterns = List.of(
            // literals, ranges and alternations
            "abcd", "aBcD", "[a-d]{4}", "(?:a|b|c|d){4}",
            // lower-case properties
            "\\p{Lower}{4}", "\\p{Ll}{4}", "\\p{IsLl}{4}", "\\p{gc=Ll}{4}",
            "\\p{general_category=Ll}{4}", "\\p{IsLowercase}{4}",
            "\\p{javaLowerCase}{4}",
            // upper-case properties
            "\\p{Upper}{4}", "\\p{Lu}{4}", "\\p{IsLu}{4}", "\\p{gc=Lu}{4}",
            "\\p{general_category=Lu}{4}", "\\p{IsUppercase}{4}",
            "\\p{javaUpperCase}{4}",
            // title-case properties
            "\\p{Lt}{4}", "\\p{IsLt}{4}", "\\p{gc=Lt}{4}",
            "\\p{general_category=Lt}{4}", "\\p{IsTitlecase}{4}",
            "\\p{javaTitleCase}{4}",
            // the same properties nested inside a character class
            "[\\p{Lower}]{4}", "[\\p{Ll}]{4}", "[\\p{IsLl}]{4}",
            "[\\p{gc=Ll}]{4}", "[\\p{general_category=Ll}]{4}",
            "[\\p{IsLowercase}]{4}", "[\\p{javaLowerCase}]{4}",
            "[\\p{Upper}]{4}", "[\\p{Lu}]{4}", "[\\p{IsLu}]{4}",
            "[\\p{gc=Lu}]{4}", "[\\p{general_category=Lu}]{4}",
            "[\\p{IsUppercase}]{4}", "[\\p{javaUpperCase}]{4}",
            "[\\p{Lt}]{4}", "[\\p{IsLt}]{4}", "[\\p{gc=Lt}]{4}",
            "[\\p{general_category=Lt}]{4}", "[\\p{IsTitlecase}]{4}",
            "[\\p{javaTitleCase}]{4}");
    expectCaseInsensitiveMatches(asciiInputs, asciiPatterns,
            Pattern.CASE_INSENSITIVE);

    // The LJ digraph in its upper-, title- and lower-case forms
    // (U+01C7, U+01C8, U+01C9), matched with Unicode character classes on.
    List<String> digraphInputs = List.of("\u01c7", "\u01c8", "\u01c9");
    List<String> digraphPatterns = List.of(
            // literals, sets and ranges
            "\u01c7", "\u01c8", "\u01c9",
            "[\u01c7\u01c8]", "[\u01c7\u01c9]", "[\u01c8\u01c9]",
            "[\u01c7-\u01c8]", "[\u01c8-\u01c9]", "[\u01c7-\u01c9]",
            // lower-case properties
            "\\p{Lower}", "\\p{Ll}", "\\p{IsLl}", "\\p{gc=Ll}",
            "\\p{general_category=Ll}", "\\p{IsLowercase}",
            "\\p{javaLowerCase}",
            // upper-case properties
            "\\p{Upper}", "\\p{Lu}", "\\p{IsLu}", "\\p{gc=Lu}",
            "\\p{general_category=Lu}", "\\p{IsUppercase}",
            "\\p{javaUpperCase}",
            // title-case properties
            "\\p{Lt}", "\\p{IsLt}", "\\p{gc=Lt}",
            "\\p{general_category=Lt}", "\\p{IsTitlecase}",
            "\\p{javaTitleCase}",
            // the same properties nested inside a character class
            "[\\p{Lower}]", "[\\p{Ll}]", "[\\p{IsLl}]", "[\\p{gc=Ll}]",
            "[\\p{general_category=Ll}]", "[\\p{IsLowercase}]",
            "[\\p{javaLowerCase}]",
            "[\\p{Upper}]", "[\\p{Lu}]", "[\\p{IsLu}]", "[\\p{gc=Lu}]",
            "[\\p{general_category=Lu}]", "[\\p{IsUppercase}]",
            "[\\p{javaUpperCase}]",
            "[\\p{Lt}]", "[\\p{IsLt}]", "[\\p{gc=Lt}]",
            "[\\p{general_category=Lt}]", "[\\p{IsTitlecase}]",
            "[\\p{javaTitleCase}]");
    expectCaseInsensitiveMatches(digraphInputs, digraphPatterns,
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);

    report("caseInsensitivePMatch");
}

// Compiles each pattern with the given flags and requires it to match each
// input in full; every miss bumps failCount and is logged to System.err.
// Inputs form the outer loop so that failures are reported input by input.
private static void expectCaseInsensitiveMatches(List<String> inputs,
                                                 List<String> patterns,
                                                 int flags) {
    for (String input : inputs) {
        for (String pattern : patterns) {
            boolean matched = Pattern.compile(pattern, flags)
                                     .matcher(input)
                                     .matches();
            if (matched) {
                continue;
            }
            failCount++;
            System.err.println("Expected to match: '" + input
                    + "' =~ /" + pattern + "/");
        }
    }
}
} }