8264160: Regex \b is not consistent with \w without UNICODE_CHARACTER_CLASS

Reviewed-by: lancea, bpb, naoto
This commit is contained in:
Ian Graves 2022-03-29 00:01:57 +00:00
parent 634800a536
commit f01cce235b
2 changed files with 91 additions and 18 deletions
src/java.base/share/classes/java/util/regex
test/jdk/java/util/regex

@ -158,7 +158,8 @@ import jdk.internal.util.ArraysSupport;
* <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th>
* <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th>
* <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr>
* <td headers="matches predef digit">A digit: {@code [0-9]} if <a href="#UNICODE_CHARACTER_CLASS">
* UNICODE_CHARACTER_CLASS</a> is not set. See <a href="#unicodesupport">Unicode Support</a>.</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th>
* <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th>
@ -167,7 +168,9 @@ import jdk.internal.util.ArraysSupport;
* <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th>
* <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th>
* <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
* <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]} if
* <a href="#UNICODE_CHARACTER_CLASS"> UNICODE_CHARACTER_CLASS</a> is not set. See
* <a href="#unicodesupport">Unicode Support</a>.</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th>
* <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th>
@ -176,7 +179,8 @@ import jdk.internal.util.ArraysSupport;
* <tr><th style="vertical-align:top; font-weight:normal" id="non_vert_white">{@code \V}</th>
* <td headers="matches predef non_vert_white">A non-vertical whitespace character: {@code [^\v]}</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="word">{@code \w}</th>
* <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]}</td></tr>
* <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]} if <a href="#UNICODE_CHARACTER_CLASS">
* UNICODE_CHARACTER_CLASS</a> is not set. See <a href="#unicodesupport">Unicode Support</a>. </td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="non_word">{@code \W}</th>
* <td headers="matches predef non_word">A non-word character: {@code [^\w]}</td></tr>
*
@ -246,11 +250,12 @@ import jdk.internal.util.ArraysSupport;
* <tr><th style="vertical-align:top; font-weight:normal" id="end_line">{@code $}</th>
* <td headers="matches bounds end_line">The end of a line</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="word_boundary">{@code \b}</th>
* <td headers="matches bounds word_boundary">A word boundary</td></tr>
* <td headers="matches bounds word_boundary">A word boundary: {@code (?:(?<=\w)(?=\W)|(?<=\W)(?=\w))} (the location
* where a non-word character abuts a word character)</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_cluster_boundary">{@code \b{g}}</th>
* <td headers="matches bounds grapheme_cluster_boundary">A Unicode extended grapheme cluster boundary</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="non_word_boundary">{@code \B}</th>
* <td headers="matches bounds non_word_boundary">A non-word boundary</td></tr>
* <td headers="matches bounds non_word_boundary">A non-word boundary: {@code [^\b]} (any position that is not a word boundary)</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="begin_input">{@code \A}</th>
* <td headers="matches bounds begin_input">The beginning of the input</td></tr>
* <tr><th style="vertical-align:top; font-weight:normal" id="end_prev_match">{@code \G}</th>
@ -535,7 +540,7 @@ import jdk.internal.util.ArraysSupport;
* that do not capture text and do not count towards the group total, or
* <i>named-capturing</i> group.
*
* <h2> Unicode support </h2>
* <h2 id="unicodesupport"> Unicode support </h2>
*
* <p> This class is in conformance with Level 1 of <a
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
@ -5377,7 +5382,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
/**
 * Tells whether {@code ch} is treated as a word character by this check.
 * When {@code useUWORD} is set the decision is delegated to
 * {@code CharPredicates.WORD()}; otherwise the fallback arm below is used.
 */
boolean isWord(int ch) {
return useUWORD ? CharPredicates.WORD().is(ch)
// NOTE(review): the next two lines appear to be the removed and the added
// side of this hunk (underscore-or-Character.isLetterOrDigit fallback, then
// its CharPredicates.ASCII_WORD() replacement) - confirm against the real file.
: (ch == '_' || Character.isLetterOrDigit(ch));
: CharPredicates.ASCII_WORD().is(ch);
}
int check(Matcher matcher, int i, CharSequence seq) {

@ -36,7 +36,7 @@
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
* 8216332 8214245 8237599 8241055 8247546 8258259 8037397 8269753 8276694
*
* 8280403 8264160 8281315
* @library /test/lib
* @library /lib/testlibrary/java/lang
* @build jdk.test.lib.RandomFactory
@ -51,14 +51,9 @@ import java.nio.CharBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Scanner;
import java.util.*;
import java.util.function.Function;
import java.util.function.IntFunction;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.MatchResult;
@ -3854,11 +3849,11 @@ public class RegExTest {
}
// bounds/word align
twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10);
twoFindIndexes(" \u0180sherman\u0400 ", boundU, 1, 10);
assertTrue(bwbU.reset("\u0180sherman\u0400").matches());
twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11);
twoFindIndexes(" \u0180sh\u0345erman\u0400 ", boundU, 1, 11);
assertTrue(bwbU.reset("\u0180sh\u0345erman\u0400").matches());
twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4);
twoFindIndexes(" \u0724\u0739\u0724 ", boundU, 1, 4);
assertTrue(bwbU.reset("\u0724\u0739\u0724").matches());
assertTrue(bwbEU.reset("\u0724\u0739\u0724").matches());
}
@ -4503,6 +4498,8 @@ public class RegExTest {
}
//This test is for 8037397
//Ensure we don't drop nested interior character classes to the right of an
//intersection operator.
@Test
public static void droppedClassesWithIntersection() {
String rx = "[A-Z&&[A-Z]0-9]";
@ -4530,6 +4527,9 @@ public class RegExTest {
}
//This test is for 8269753
//This is for ensuring that the caret doesn't point at the wrong character
//in a syntax exception message because we previously didn't compensate for
//tabs when rendering the offending string that contained tab characters.
@Test
public static void errorMessageCaretIndentation() {
String pattern = "\t**";
@ -4540,6 +4540,8 @@ public class RegExTest {
}
//This test is for 8276694
//Ensure our error message indicates we have an unescaped backslash when we
//encounter one.
@Test
public static void unescapedBackslash() {
String pattern = "\\";
@ -4549,6 +4551,7 @@ public class RegExTest {
}
//This test is for 8280403
//Given bad intersection syntax, we should throw a PatternSyntaxException.
@Test
public static void badIntersectionSyntax() {
String pattern = "[˜\\H +F&&]";
@ -4557,7 +4560,70 @@ public class RegExTest {
assertTrue(e.getMessage().contains("Bad intersection syntax"));
}
//This test is for 8264160
//Here we check for inconsistencies between the behavior of \w and the
//behavior of \b. Prior to this fix, the two constructs did not behave in a
//consistent way, i.e. \b would recognize non-\w characters as part of a word
//in some cases. This test verifies that the two behave consistently
//for all codepoints we support, both with and without
//UNICODE_CHARACTER_CLASS.
@Test
public static void wordBoundaryInconsistencies() {
    Pattern basicWordCharPattern = Pattern.compile("\\w");
    //";\b." can only match ";X" when a word boundary sits between the
    //non-word ';' and X, i.e. when \b considers X a word character.
    Pattern basicWordCharBoundaryPattern =
            Pattern.compile(";\\b.", Pattern.DOTALL);

    Pattern unicodeWordCharPattern =
            Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS);
    Pattern unicodeWordCharBoundaryPattern =
            Pattern.compile(";\\b.",
                    Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS);

    for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) {
        //Each mode gets its own message so a failure identifies which
        //flavor of \w and \b disagreed, and on which code point.
        String where = "Codepoint: " + cp + " (0x" + Integer.toHexString(cp) + ")";

        //default (ASCII) character classes
        assertEquals(cpMatches(basicWordCharPattern, cp, false),
                cpMatches(basicWordCharBoundaryPattern, cp, true),
                where + ", UNICODE_CHARACTER_CLASS not set");

        //Unicode character classes
        assertEquals(cpMatches(unicodeWordCharPattern, cp, false),
                cpMatches(unicodeWordCharBoundaryPattern, cp, true),
                where + ", UNICODE_CHARACTER_CLASS set");
    }
}
/**
 * Tests whether {@code p} matches an input built from a single code point.
 *
 * @param p        the pattern; it must match the entire input
 * @param cp       the code point to test, BMP or supplementary
 * @param boundary if {@code true} the input is {@code ";"} followed by the
 *                 code point, otherwise the code point alone
 * @return {@code true} if {@code p} matches the whole input
 * @throws IllegalArgumentException if {@code cp} is not a valid code point
 */
private static boolean cpMatches(Pattern p, int cp, boolean boundary) {
    //Character.toString(int) yields one char for BMP code points and a
    //surrogate pair for supplementary ones.
    String cpString = Character.toString(cp);
    String input = boundary ? ";" + cpString : cpString;
    return p.matcher(input).matches();
}
//This test is for 8281560
//Checks that when the Canonical Equivalence flag is set, the behavior for
//Matcher::hitEnd is equivalent for these similar patterns that saw
//inconsistencies.
@Test
public static void prematureHitEndInNFCCharProperty() {
var testInput = "a1a1";
@ -4582,6 +4648,8 @@ public class RegExTest {
}
//This test is for 8281315
//Checks that we are able to correctly match this case with a backref
//without encountering an IndexOutOfBoundsException.
@Test
public static void iOOBForCIBackrefs(){
String line = "\ud83d\udc95\ud83d\udc95\ud83d\udc95";