From eeaafbe1412041303eee5ee68dd4dd9173a6dea6 Mon Sep 17 00:00:00 2001 From: Naoto Sato Date: Thu, 12 Mar 2020 08:31:26 -0700 Subject: [PATCH] 8216332: Grapheme regex does not work with emoji sequences Reviewed-by: rriggs --- .../classes/java/util/regex/Grapheme.java | 4 +- .../jdk/java/util/regex/GraphemeTestCases.txt | 33 ++++++++ test/jdk/java/util/regex/RegExTest.java | 80 ++++++++++--------- 3 files changed, 78 insertions(+), 39 deletions(-) create mode 100644 test/jdk/java/util/regex/GraphemeTestCases.txt diff --git a/src/java.base/share/classes/java/util/regex/Grapheme.java b/src/java.base/share/classes/java/util/regex/Grapheme.java index 2e4bf0fddac..95e5dba2533 100644 --- a/src/java.base/share/classes/java/util/regex/Grapheme.java +++ b/src/java.base/share/classes/java/util/regex/Grapheme.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -71,7 +71,7 @@ final class Grapheme { int t1 = getGraphemeType(ch1); if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { - gb11 = false; + // continue for gb11 } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) { // continue for gb12 } else if (rules[t0][t1]) { diff --git a/test/jdk/java/util/regex/GraphemeTestCases.txt b/test/jdk/java/util/regex/GraphemeTestCases.txt new file mode 100644 index 00000000000..1d8f1a0827f --- /dev/null +++ b/test/jdk/java/util/regex/GraphemeTestCases.txt @@ -0,0 +1,33 @@ +# +# Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# This code is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License version 2 only, as +# published by the Free Software Foundation. 
+# +# This code is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# version 2 for more details (a copy is included in the LICENSE file that +# accompanied this code). +# +# You should have received a copy of the GNU General Public License version +# 2 along with this work; if not, write to the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +# +# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +# or visit www.oracle.com if you need additional information or have any +# questions. +# + +# test cases for Grapheme support. Format follows GraphemeBreakTest.txt +# from Unicode. +# https://unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html + +# bug 8216332 +÷ 1F468 × 1F3FE ÷ 1F468 × 200D × 1F469 × 200D × 1F466 ÷ +÷ 1F468 × 200D × 1F469 × 200D × 1F466 × 200d ÷ 0041 ÷ +÷ 1F468 × 200D × 1F469 × 200D × 1F466 ÷ 0041 ÷ +÷ 0041 ÷ 1F468 × 200D × 1F469 × 200D × 1F466 ÷ 0041 ÷ +÷ 1F468 × 200D × 1F3FE × 200D × 1F469 × 1F3FE × 200D × 1F466 ÷ 0041 ÷ diff --git a/test/jdk/java/util/regex/RegExTest.java b/test/jdk/java/util/regex/RegExTest.java index 6a0c7a583b5..9ca95005d87 100644 --- a/test/jdk/java/util/regex/RegExTest.java +++ b/test/jdk/java/util/regex/RegExTest.java @@ -36,6 +36,7 @@ * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895 * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706 * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812 + * 8216332 * * @library /test/lib * @library /lib/testlibrary/java/lang @@ -55,6 +56,8 @@ import java.io.ObjectOutputStream; import java.math.BigInteger; import java.nio.CharBuffer; import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; 
@@ -68,6 +71,8 @@ import java.util.regex.Matcher; import java.util.regex.MatchResult; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import java.util.stream.Stream; + import jdk.test.lib.RandomFactory; /** @@ -4790,47 +4795,48 @@ public class RegExTest { } private static void grapheme() throws Exception { - Files.lines(UCDFiles.GRAPHEME_BREAK_TEST) + Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST), + Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt"))) .filter( ln -> ln.length() != 0 && !ln.startsWith("#") ) .forEach( ln -> { - ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", ""); - // System.out.println(str); - String[] strs = ln.split("\u00f7|\u00d7"); - StringBuilder src = new StringBuilder(); - ArrayList graphemes = new ArrayList<>(); - StringBuilder buf = new StringBuilder(); - int offBk = 0; - for (String str : strs) { - if (str.length() == 0) // first empty str - continue; - int cp = Integer.parseInt(str, 16); - src.appendCodePoint(cp); - buf.appendCodePoint(cp); - offBk += (str.length() + 1); - if (ln.charAt(offBk) == '\u00f7') { // DIV - graphemes.add(buf.toString()); - buf = new StringBuilder(); - } + ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", ""); + // System.out.println(str); + String[] strs = ln.split("\u00f7|\u00d7"); + StringBuilder src = new StringBuilder(); + ArrayList graphemes = new ArrayList<>(); + StringBuilder buf = new StringBuilder(); + int offBk = 0; + for (String str : strs) { + if (str.length() == 0) // first empty str + continue; + int cp = Integer.parseInt(str, 16); + src.appendCodePoint(cp); + buf.appendCodePoint(cp); + offBk += (str.length() + 1); + if (ln.charAt(offBk) == '\u00f7') { // DIV + graphemes.add(buf.toString()); + buf = new StringBuilder(); } - Pattern p = Pattern.compile("\\X"); - Matcher m = p.matcher(src.toString()); - Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}"); - for (String g : graphemes) { 
- // System.out.printf(" grapheme:=[%s]%n", g); - // (1) test \\X directly - if (!m.find() || !m.group().equals(g)) { - System.out.println("Failed \\X [" + ln + "] : " + g); - failCount++; - } - // (2) test \\b{g} + \\X via Scanner - boolean hasNext = s.hasNext(p); - // if (!s.hasNext() || !s.next().equals(next)) { - if (!s.hasNext(p) || !s.next(p).equals(g)) { - System.out.println("Failed b{g} [" + ln + "] : " + g); - failCount++; - } + } + Pattern p = Pattern.compile("\\X"); + Matcher m = p.matcher(src.toString()); + Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}"); + for (String g : graphemes) { + // System.out.printf(" grapheme:=[%s]%n", g); + // (1) test \\X directly + if (!m.find() || !m.group().equals(g)) { + System.out.println("Failed \\X [" + ln + "] : " + g); + failCount++; } - }); + // (2) test \\b{g} + \\X via Scanner + boolean hasNext = s.hasNext(p); + // if (!s.hasNext() || !s.next().equals(next)) { + if (!s.hasNext(p) || !s.next(p).equals(g)) { + System.out.println("Failed b{g} [" + ln + "] : " + g); + failCount++; + } + } + }); // some sanity checks if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() || !Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||