8216332: Grapheme regex does not work with emoji sequences
Reviewed-by: rriggs
This commit is contained in:
parent
add18914fb
commit
eeaafbe141
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
*
|
*
|
||||||
* This code is free software; you can redistribute it and/or modify it
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
@ -71,7 +71,7 @@ final class Grapheme {
|
|||||||
int t1 = getGraphemeType(ch1);
|
int t1 = getGraphemeType(ch1);
|
||||||
|
|
||||||
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
|
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
|
||||||
gb11 = false;
|
// continue for gb11
|
||||||
} else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
|
} else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
|
||||||
// continue for gb12
|
// continue for gb12
|
||||||
} else if (rules[t0][t1]) {
|
} else if (rules[t0][t1]) {
|
||||||
|
33
test/jdk/java/util/regex/GraphemeTestCases.txt
Normal file
33
test/jdk/java/util/regex/GraphemeTestCases.txt
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
#
|
||||||
|
# This code is free software; you can redistribute it and/or modify it
|
||||||
|
# under the terms of the GNU General Public License version 2 only, as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
#
|
||||||
|
# This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
# version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
# accompanied this code).
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License version
|
||||||
|
# 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
#
|
||||||
|
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
# or visit www.oracle.com if you need additional information or have any
|
||||||
|
# questions.
|
||||||
|
#
|
||||||
|
|
||||||
|
# test cases for Grapheme support. Format follows GraphemeBreakTest.txt
|
||||||
|
# from Unicode.
|
||||||
|
# https://unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
|
||||||
|
|
||||||
|
# bug 8216332
|
||||||
|
÷ 1F468 × 1F3FE ÷ 1F468 × 200D × 1F469 × 200D × 1F466 ÷
|
||||||
|
÷ 1F468 × 200D × 1F469 × 200D × 1F466 × 200d ÷ 0041 ÷
|
||||||
|
÷ 1F468 × 200D × 1F469 × 200D × 1F466 ÷ 0041 ÷
|
||||||
|
÷ 0041 ÷ 1F468 × 200D × 1F469 × 200D × 1F466 ÷ 0041 ÷
|
||||||
|
÷ 1F468 × 200D × 1F3FE × 200D × 1F469 × 1F3FE × 200D × 1F466 ÷ 0041 ÷
|
@ -36,6 +36,7 @@
|
|||||||
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
|
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
|
||||||
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
|
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
|
||||||
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
|
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
|
||||||
|
* 8216332
|
||||||
*
|
*
|
||||||
* @library /test/lib
|
* @library /test/lib
|
||||||
* @library /lib/testlibrary/java/lang
|
* @library /lib/testlibrary/java/lang
|
||||||
@ -55,6 +56,8 @@ import java.io.ObjectOutputStream;
|
|||||||
import java.math.BigInteger;
|
import java.math.BigInteger;
|
||||||
import java.nio.CharBuffer;
|
import java.nio.CharBuffer;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@ -68,6 +71,8 @@ import java.util.regex.Matcher;
|
|||||||
import java.util.regex.MatchResult;
|
import java.util.regex.MatchResult;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.regex.PatternSyntaxException;
|
import java.util.regex.PatternSyntaxException;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import jdk.test.lib.RandomFactory;
|
import jdk.test.lib.RandomFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -4790,47 +4795,48 @@ public class RegExTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static void grapheme() throws Exception {
|
private static void grapheme() throws Exception {
|
||||||
Files.lines(UCDFiles.GRAPHEME_BREAK_TEST)
|
Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST),
|
||||||
|
Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt")))
|
||||||
.filter( ln -> ln.length() != 0 && !ln.startsWith("#") )
|
.filter( ln -> ln.length() != 0 && !ln.startsWith("#") )
|
||||||
.forEach( ln -> {
|
.forEach( ln -> {
|
||||||
ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
|
ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
|
||||||
// System.out.println(str);
|
// System.out.println(str);
|
||||||
String[] strs = ln.split("\u00f7|\u00d7");
|
String[] strs = ln.split("\u00f7|\u00d7");
|
||||||
StringBuilder src = new StringBuilder();
|
StringBuilder src = new StringBuilder();
|
||||||
ArrayList<String> graphemes = new ArrayList<>();
|
ArrayList<String> graphemes = new ArrayList<>();
|
||||||
StringBuilder buf = new StringBuilder();
|
StringBuilder buf = new StringBuilder();
|
||||||
int offBk = 0;
|
int offBk = 0;
|
||||||
for (String str : strs) {
|
for (String str : strs) {
|
||||||
if (str.length() == 0) // first empty str
|
if (str.length() == 0) // first empty str
|
||||||
continue;
|
continue;
|
||||||
int cp = Integer.parseInt(str, 16);
|
int cp = Integer.parseInt(str, 16);
|
||||||
src.appendCodePoint(cp);
|
src.appendCodePoint(cp);
|
||||||
buf.appendCodePoint(cp);
|
buf.appendCodePoint(cp);
|
||||||
offBk += (str.length() + 1);
|
offBk += (str.length() + 1);
|
||||||
if (ln.charAt(offBk) == '\u00f7') { // DIV
|
if (ln.charAt(offBk) == '\u00f7') { // DIV
|
||||||
graphemes.add(buf.toString());
|
graphemes.add(buf.toString());
|
||||||
buf = new StringBuilder();
|
buf = new StringBuilder();
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Pattern p = Pattern.compile("\\X");
|
}
|
||||||
Matcher m = p.matcher(src.toString());
|
Pattern p = Pattern.compile("\\X");
|
||||||
Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
|
Matcher m = p.matcher(src.toString());
|
||||||
for (String g : graphemes) {
|
Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
|
||||||
// System.out.printf(" grapheme:=[%s]%n", g);
|
for (String g : graphemes) {
|
||||||
// (1) test \\X directly
|
// System.out.printf(" grapheme:=[%s]%n", g);
|
||||||
if (!m.find() || !m.group().equals(g)) {
|
// (1) test \\X directly
|
||||||
System.out.println("Failed \\X [" + ln + "] : " + g);
|
if (!m.find() || !m.group().equals(g)) {
|
||||||
failCount++;
|
System.out.println("Failed \\X [" + ln + "] : " + g);
|
||||||
}
|
failCount++;
|
||||||
// (2) test \\b{g} + \\X via Scanner
|
|
||||||
boolean hasNext = s.hasNext(p);
|
|
||||||
// if (!s.hasNext() || !s.next().equals(next)) {
|
|
||||||
if (!s.hasNext(p) || !s.next(p).equals(g)) {
|
|
||||||
System.out.println("Failed b{g} [" + ln + "] : " + g);
|
|
||||||
failCount++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
});
|
// (2) test \\b{g} + \\X via Scanner
|
||||||
|
boolean hasNext = s.hasNext(p);
|
||||||
|
// if (!s.hasNext() || !s.next().equals(next)) {
|
||||||
|
if (!s.hasNext(p) || !s.next(p).equals(g)) {
|
||||||
|
System.out.println("Failed b{g} [" + ln + "] : " + g);
|
||||||
|
failCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
// some sanity checks
|
// some sanity checks
|
||||||
if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() ||
|
if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() ||
|
||||||
!Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||
|
!Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||
|
||||||
|
Loading…
Reference in New Issue
Block a user