8264160: Regex \b is not consistent with \w without UNICODE_CHARACTER_CLASS
Reviewed-by: lancea, bpb, naoto
This commit is contained in:
parent
634800a536
commit
f01cce235b
@ -158,7 +158,8 @@ import jdk.internal.util.ArraysSupport;
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th>
|
||||
* <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th>
|
||||
* <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr>
|
||||
* <td headers="matches predef digit">A digit: {@code [0-9]} if <a href="#UNICODE_CHARACTER_CLASS">
|
||||
* UNICODE_CHARACTER_CLASS</a> is not set. See <a href="#unicodesupport">Unicode Support</a>.</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th>
|
||||
* <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th>
|
||||
@ -167,7 +168,9 @@ import jdk.internal.util.ArraysSupport;
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th>
|
||||
* <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th>
|
||||
* <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
|
||||
* <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]} if
|
||||
* <a href="#UNICODE_CHARACTER_CLASS">UNICODE_CHARACTER_CLASS</a> is not set. See
|
||||
* <a href="#unicodesupport">Unicode Support</a>.</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th>
|
||||
* <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th>
|
||||
@ -176,7 +179,8 @@ import jdk.internal.util.ArraysSupport;
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="non_vert_white">{@code \V}</th>
|
||||
* <td headers="matches predef non_vert_white">A non-vertical whitespace character: {@code [^\v]}</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="word">{@code \w}</th>
|
||||
* <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]}</td></tr>
|
||||
* <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]} if <a href="#UNICODE_CHARACTER_CLASS">
|
||||
* UNICODE_CHARACTER_CLASS</a> is not set. See <a href="#unicodesupport">Unicode Support</a>. </td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="non_word">{@code \W}</th>
|
||||
* <td headers="matches predef non_word">A non-word character: {@code [^\w]}</td></tr>
|
||||
*
|
||||
@ -246,11 +250,12 @@ import jdk.internal.util.ArraysSupport;
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="end_line">{@code $}</th>
|
||||
* <td headers="matches bounds end_line">The end of a line</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="word_boundary">{@code \b}</th>
|
||||
* <td headers="matches bounds word_boundary">A word boundary</td></tr>
|
||||
* <td headers="matches bounds word_boundary">A word boundary: {@code (?:(?<=\w)(?=\W)|(?<=\W)(?=\w))} (the location
|
||||
* where a non-word character abuts a word character)</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_cluster_boundary">{@code \b{g}}</th>
|
||||
* <td headers="matches bounds grapheme_cluster_boundary">A Unicode extended grapheme cluster boundary</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="non_word_boundary">{@code \B}</th>
|
||||
* <td headers="matches bounds non_word_boundary">A non-word boundary</td></tr>
|
||||
* <td headers="matches bounds non_word_boundary">A non-word boundary: {@code [^\b]}</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="begin_input">{@code \A}</th>
|
||||
* <td headers="matches bounds begin_input">The beginning of the input</td></tr>
|
||||
* <tr><th style="vertical-align:top; font-weight:normal" id="end_prev_match">{@code \G}</th>
|
||||
@ -535,7 +540,7 @@ import jdk.internal.util.ArraysSupport;
|
||||
* that do not capture text and do not count towards the group total, or
|
||||
* <i>named-capturing</i> group.
|
||||
*
|
||||
* <h2> Unicode support </h2>
|
||||
* <h2 id="unicodesupport"> Unicode support </h2>
|
||||
*
|
||||
* <p> This class is in conformance with Level 1 of <a
|
||||
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
|
||||
@ -5377,7 +5382,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
|
||||
boolean isWord(int ch) {
|
||||
return useUWORD ? CharPredicates.WORD().is(ch)
|
||||
: (ch == '_' || Character.isLetterOrDigit(ch));
|
||||
: CharPredicates.ASCII_WORD().is(ch);
|
||||
}
|
||||
|
||||
int check(Matcher matcher, int i, CharSequence seq) {
|
||||
|
@ -36,7 +36,7 @@
|
||||
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
|
||||
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
|
||||
* 8216332 8214245 8237599 8241055 8247546 8258259 8037397 8269753 8276694
|
||||
*
|
||||
* 8280403 8264160 8281315
|
||||
* @library /test/lib
|
||||
* @library /lib/testlibrary/java/lang
|
||||
* @build jdk.test.lib.RandomFactory
|
||||
@ -51,14 +51,9 @@ import java.nio.CharBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.Scanner;
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.IntFunction;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.MatchResult;
|
||||
@ -3854,11 +3849,11 @@ public class RegExTest {
|
||||
}
|
||||
|
||||
// bounds/word align
|
||||
twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10);
|
||||
twoFindIndexes(" \u0180sherman\u0400 ", boundU, 1, 10);
|
||||
assertTrue(bwbU.reset("\u0180sherman\u0400").matches());
|
||||
twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11);
|
||||
twoFindIndexes(" \u0180sh\u0345erman\u0400 ", boundU, 1, 11);
|
||||
assertTrue(bwbU.reset("\u0180sh\u0345erman\u0400").matches());
|
||||
twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4);
|
||||
twoFindIndexes(" \u0724\u0739\u0724 ", boundU, 1, 4);
|
||||
assertTrue(bwbU.reset("\u0724\u0739\u0724").matches());
|
||||
assertTrue(bwbEU.reset("\u0724\u0739\u0724").matches());
|
||||
}
|
||||
@ -4503,6 +4498,8 @@ public class RegExTest {
|
||||
}
|
||||
|
||||
//This test is for 8037397
|
||||
//Ensure we don't drop nested interior character classes to the right of an
|
||||
//intersection operator.
|
||||
@Test
|
||||
public static void droppedClassesWithIntersection() {
|
||||
String rx = "[A-Z&&[A-Z]0-9]";
|
||||
@ -4530,6 +4527,9 @@ public class RegExTest {
|
||||
}
|
||||
|
||||
//This test is for 8269753
|
||||
//This is for ensuring that the caret doesn't point at the wrong character
|
||||
//in a syntax exception message because we previously didn't compensate for
|
||||
//tabs when rendering the offending string that contained tab characters.
|
||||
@Test
|
||||
public static void errorMessageCaretIndentation() {
|
||||
String pattern = "\t**";
|
||||
@ -4540,6 +4540,8 @@ public class RegExTest {
|
||||
}
|
||||
|
||||
//This test is for 8276694
|
||||
//Ensure our error message indicates we have an unescaped backslash when we
|
||||
//encounter one.
|
||||
@Test
|
||||
public static void unescapedBackslash() {
|
||||
String pattern = "\\";
|
||||
@ -4549,6 +4551,7 @@ public class RegExTest {
|
||||
}
|
||||
|
||||
//This test is for 8280403
|
||||
//Given bad intersection syntax, we should throw a PatternSyntaxException.
|
||||
@Test
|
||||
public static void badIntersectionSyntax() {
|
||||
String pattern = "[˜\\H +F&&]";
|
||||
@ -4557,7 +4560,70 @@ public class RegExTest {
|
||||
assertTrue(e.getMessage().contains("Bad intersection syntax"));
|
||||
}
|
||||
|
||||
//This test is for 8264160
|
||||
//Here we check for inconsistencies between the behavior of \w and the
|
||||
//behavior of \b. Prior to this fix, the two constructs did not behave in a
|
||||
//consistent way, i.e. \b would recognize non-\w characters as part of a word
|
||||
//in some cases. This test verifies that the two behave consistently
|
||||
//for all codepoints we support.
|
||||
@Test
|
||||
public static void wordBoundaryInconsistencies() {
|
||||
Pattern basicWordCharPattern = Pattern.compile("\\w");
|
||||
Pattern basicWordCharBoundaryPattern =
|
||||
Pattern.compile(";\\b.", Pattern.DOTALL);
|
||||
|
||||
Pattern unicodeWordCharPattern =
|
||||
Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS);
|
||||
|
||||
Pattern unicodeWordCharBoundaryPattern =
|
||||
Pattern.compile(";\\b.",
|
||||
Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS);
|
||||
|
||||
IntFunction<Boolean> basicWordCharCheck =
|
||||
(cp) -> cpMatches(basicWordCharPattern, cp, false);
|
||||
|
||||
IntFunction<Boolean> basicBoundaryCharCheck =
|
||||
(cp) -> cpMatches(basicWordCharBoundaryPattern,
|
||||
cp, true);
|
||||
|
||||
IntFunction<Boolean> unicodeWordCharCheck =
|
||||
(cp) -> cpMatches(unicodeWordCharPattern, cp, false);
|
||||
|
||||
IntFunction<Boolean> unicodeBoundaryCharCheck =
|
||||
(cp) -> cpMatches(unicodeWordCharBoundaryPattern,
|
||||
cp,true);
|
||||
|
||||
//basic pattern comparison
|
||||
for(int cp = 0; cp <= Character.MAX_CODE_POINT; cp++){
|
||||
assertEquals(basicWordCharCheck.apply(cp),
|
||||
basicBoundaryCharCheck.apply(cp),
|
||||
"Codepoint: " + cp);
|
||||
assertEquals(unicodeWordCharCheck.apply(cp),
|
||||
unicodeBoundaryCharCheck.apply(cp),
|
||||
"Codepoint: " + cp);
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean cpMatches(Pattern p, int cp, boolean boundary) {
|
||||
String cpString;
|
||||
if (Character.isBmpCodePoint(cp)) {
|
||||
cpString = "" + ((char) cp);
|
||||
} else {
|
||||
cpString = "" + Character.highSurrogate(cp) +
|
||||
Character.lowSurrogate(cp);
|
||||
}
|
||||
|
||||
if (boundary) {
|
||||
return p.matcher(";" + cpString).matches();
|
||||
} else {
|
||||
return p.matcher(cpString).matches();
|
||||
}
|
||||
}
|
||||
|
||||
//This test is for 8281560
|
||||
//Checks that when the Canonical Equivalence flag is set, the behavior for
|
||||
//Matcher::hitEnd is equivalent for these similar patterns that saw
|
||||
//inconsistencies.
|
||||
@Test
|
||||
public static void prematureHitEndInNFCCharProperty() {
|
||||
var testInput = "a1a1";
|
||||
@ -4582,6 +4648,8 @@ public class RegExTest {
|
||||
}
|
||||
|
||||
//This test is for 8281315
|
||||
//Checks that we are able to correctly match this case with a backref
|
||||
//without encountering an IndexOutOfBoundsException.
|
||||
@Test
|
||||
public static void iOOBForCIBackrefs(){
|
||||
String line = "\ud83d\udc95\ud83d\udc95\ud83d\udc95";
|
||||
|
Loading…
x
Reference in New Issue
Block a user