diff --git a/src/java.base/share/classes/java/util/regex/Grapheme.java b/src/java.base/share/classes/java/util/regex/Grapheme.java index 95e5dba2533..af5bd6b0413 100644 --- a/src/java.base/share/classes/java/util/regex/Grapheme.java +++ b/src/java.base/share/classes/java/util/regex/Grapheme.java @@ -30,21 +30,8 @@ import java.util.Objects; final class Grapheme { /** - * Determines if there is an extended grapheme cluster boundary between two - * continuing characters {@code cp1} and {@code cp2}. - *

- * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification - * for the extended grapheme cluster boundary rules - *

- * Note: this method does not take care of stateful breaking. - */ - static boolean isBoundary(int cp1, int cp2) { - return rules[getType(cp1)][getType(cp2)]; - } - - /** - * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes - * the start of the char sequence is a boundary. + * Look for the next extended grapheme cluster boundary in a CharSequence. + * It assumes the start of the char sequence at offset {@code off} is a boundary. *

* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification * for the extended grapheme cluster boundary rules. The following implementation @@ -54,21 +41,20 @@ final class Grapheme { * @param src the {@code CharSequence} to be scanned * @param off offset to start looking for the next boundary in the src * @param limit limit offset in the src (exclusive) - * @return the next possible boundary + * @return the next grapheme boundary */ static int nextBoundary(CharSequence src, int off, int limit) { Objects.checkFromToIndex(off, limit, src.length()); - int ch0 = Character.codePointAt(src, 0); - int ret = Character.charCount(ch0); - int ch1; + int ch0 = Character.codePointAt(src, off); + int ret = off + Character.charCount(ch0); // indicates whether gb11 or gb12 is underway - int t0 = getGraphemeType(ch0); + int t0 = getType(ch0); int riCount = t0 == RI ? 1 : 0; boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC; while (ret < limit) { - ch1 = Character.codePointAt(src, ret); - int t1 = getGraphemeType(ch1); + int ch1 = Character.codePointAt(src, ret); + int t1 = getType(ch1); if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { // continue for gb11 @@ -177,7 +163,8 @@ final class Grapheme { cp == 0xAA7B || cp == 0xAA7D; } - private static int getGraphemeType(int cp) { + @SuppressWarnings("fallthrough") + private static int getType(int cp) { if (cp < 0x007F) { // ASCII if (cp < 32) { // Control characters if (cp == 0x000D) @@ -188,11 +175,7 @@ final class Grapheme { } return OTHER; } - return getType(cp); - } - @SuppressWarnings("fallthrough") - private static int getType(int cp) { if (EmojiData.isExtendedPictographic(cp)) { return EXTENDED_PICTOGRAPHIC; } diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java index 8f30cd25bf9..0d7d0738965 100644 --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -4035,17 +4035,8 @@ loop: for(int x=0, offset=0; x i) { + // continue with return below + } else if (i < endIndex) { + if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) { + return false; + } + if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) { return false; } } else { diff --git a/test/jdk/java/util/regex/RegExTest.java b/test/jdk/java/util/regex/RegExTest.java index 658880288d7..acac49bd4e9 100644 --- a/test/jdk/java/util/regex/RegExTest.java +++ b/test/jdk/java/util/regex/RegExTest.java @@ -36,7 +36,7 @@ * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895 * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706 * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812 - * 8216332 8214245 8237599 + * 8216332 8214245 8237599 8241055 * * @library /test/lib * @library /lib/testlibrary/java/lang @@ -4797,48 +4797,99 @@ public class RegExTest { } private static void grapheme() throws Exception { + final int[] lineNumber = new int[1]; Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST), Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt"))) - .filter( ln -> ln.length() != 0 && !ln.startsWith("#") ) .forEach( ln -> { - ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", ""); - // System.out.println(str); - String[] strs = ln.split("\u00f7|\u00d7"); - StringBuilder src = new StringBuilder(); - ArrayList graphemes = new ArrayList<>(); - StringBuilder buf = new StringBuilder(); - int offBk = 0; - for (String str : strs) { - if (str.length() == 0) // first empty str - continue; - int cp = Integer.parseInt(str, 16); - src.appendCodePoint(cp); - buf.appendCodePoint(cp); - offBk += (str.length() + 1); - if (ln.charAt(offBk) == '\u00f7') { // DIV - graphemes.add(buf.toString()); - buf = new StringBuilder(); + lineNumber[0]++; + if (ln.length() == 0 || ln.startsWith("#")) { + return; } - } - Pattern p = Pattern.compile("\\X"); - Matcher m = p.matcher(src.toString()); - Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}"); - for (String g : graphemes) { - // System.out.printf(" grapheme:=[%s]%n", g); - // (1) test \\X directly - if (!m.find() || !m.group().equals(g)) { - System.out.println("Failed \\X [" + ln + "] : " + g); + ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", ""); + // System.out.println(str); + String[] strs = ln.split("\u00f7|\u00d7"); + StringBuilder src = new StringBuilder(); + ArrayList graphemes = new ArrayList<>(); + StringBuilder buf = new StringBuilder(); + int offBk = 0; + for (String str : strs) { + if (str.length() == 0) // first empty str + continue; + int cp = Integer.parseInt(str, 16); + src.appendCodePoint(cp); + buf.appendCodePoint(cp); + offBk += (str.length() + 1); + if (ln.charAt(offBk) == '\u00f7') { // DIV + graphemes.add(buf.toString()); + buf = new StringBuilder(); + } + } + Pattern p = Pattern.compile("\\X"); + // (1) test \X directly + Matcher m = p.matcher(src.toString()); + for (String g : graphemes) { + // System.out.printf(" grapheme:=[%s]%n", g); + String group = null; + if (!m.find() || !(group = m.group()).equals(g)) { + System.out.println("Failed pattern \\X [" + ln + "] : " + + "expected: " + g + " - actual: " + group + + "(line " + lineNumber[0] + ")"); + failCount++; + } + } + if (m.find()) { failCount++; } - // (2) test \\b{g} + \\X via Scanner - boolean hasNext = s.hasNext(p); - // if (!s.hasNext() || !s.next().equals(next)) { - if (!s.hasNext(p) || !s.next(p).equals(g)) { - System.out.println("Failed b{g} [" + ln + "] : " + g); + // test \b{g} without \X via Pattern + Pattern pbg = Pattern.compile("\\b{g}"); + m = pbg.matcher(src.toString()); + m.find(); + int prev = m.end(); + for (String g : graphemes) { + String group = null; + if (!m.find() || !(group = src.substring(prev, m.end())).equals(g)) { + System.out.println("Failed pattern \\b{g} [" + ln + "] : " + + "expected: " + g + " - actual: " + group + + "(line " + lineNumber[0] + ")"); + failCount++; + } + if (!"".equals(m.group())) { + failCount++; + } + prev = m.end(); + } + if (m.find()) { failCount++; } - } - }); + // (2) test \b{g} + \X via Scanner + Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}"); + for (String g : graphemes) { + String next = null; + if (!s.hasNext(p) || !(next = s.next(p)).equals(g)) { + System.out.println("Failed \\b{g} [" + ln + "] : " + + "expected: " + g + " - actual: " + next + + " (line " + lineNumber[0] + ")"); + failCount++; + } + } + if (s.hasNext(p)) { + failCount++; + } + // test \b{g} without \X via Scanner + s = new Scanner(src.toString()).useDelimiter("\\b{g}"); + for (String g : graphemes) { + String next = null; + if (!s.hasNext() || !(next = s.next()).equals(g)) { + System.out.println("Failed \\b{g} [" + ln + "] : " + + "expected: " + g + " - actual: " + next + + " (line " + lineNumber[0] + ")"); + failCount++; + } + } + if (s.hasNext()) { + failCount++; + } + }); // some sanity checks if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() || !Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() || diff --git a/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java b/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java index cc14ed7f0e0..4d75be53b4c 100644 --- a/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java +++ b/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -46,6 +46,7 @@ public class PatternBench { public String flagsString; public Pattern graphemePattern; + public Pattern graphemeBoundaryPattern; public Pattern jmodPattern; public Pattern jmodCanonicalPattern; @@ -57,7 +58,8 @@ public class PatternBench { public void setup() { flagsString = "\ud83c\udde6\ud83c\uddec\ud83c\uddec\ud83c\udde6\ud83c\uddfa\ud83c\uddf8\ud83c\uddeb\ud83c\uddf7"; fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode"; - graphemePattern = Pattern.compile("\\b{g}"); + graphemePattern = Pattern.compile("\\X"); + graphemeBoundaryPattern = Pattern.compile("\\b{g}"); String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$"; jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ); @@ -68,11 +70,18 @@ public class PatternBench { charPattern = Pattern.compile(charPatternRegex); } + @Benchmark + @Warmup(iterations = 3) + @Measurement(iterations = 3) + public long longStringGraphemeMatches() { + return graphemePattern.matcher(flagsString.repeat(3)).results().count(); + } + @Benchmark @Warmup(iterations = 3) @Measurement(iterations = 3) public int splitFlags() { - return graphemePattern.split(flagsString).length; + return graphemeBoundaryPattern.split(flagsString).length; } @Benchmark