diff --git a/src/java.base/share/classes/java/util/regex/Grapheme.java b/src/java.base/share/classes/java/util/regex/Grapheme.java
index 95e5dba2533..af5bd6b0413 100644
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java
@@ -30,21 +30,8 @@ import java.util.Objects;
final class Grapheme {
/**
- * Determines if there is an extended grapheme cluster boundary between two
- * continuing characters {@code cp1} and {@code cp2}.
- *
- * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
- * for the extended grapheme cluster boundary rules
- *
- * Note: this method does not take care of stateful breaking.
- */
- static boolean isBoundary(int cp1, int cp2) {
- return rules[getType(cp1)][getType(cp2)];
- }
-
- /**
- * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
- * the start of the char sequence is a boundary.
+ * Look for the next extended grapheme cluster boundary in a CharSequence.
+ * It assumes the start of the char sequence at offset {@code off} is a boundary.
*
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
* for the extended grapheme cluster boundary rules. The following implementation
@@ -54,21 +41,20 @@ final class Grapheme {
* @param src the {@code CharSequence} to be scanned
* @param off offset to start looking for the next boundary in the src
* @param limit limit offset in the src (exclusive)
- * @return the next possible boundary
+ * @return the next grapheme boundary
*/
static int nextBoundary(CharSequence src, int off, int limit) {
Objects.checkFromToIndex(off, limit, src.length());
- int ch0 = Character.codePointAt(src, 0);
- int ret = Character.charCount(ch0);
- int ch1;
+ int ch0 = Character.codePointAt(src, off);
+ int ret = off + Character.charCount(ch0);
// indicates whether gb11 or gb12 is underway
- int t0 = getGraphemeType(ch0);
+ int t0 = getType(ch0);
int riCount = t0 == RI ? 1 : 0;
boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
while (ret < limit) {
- ch1 = Character.codePointAt(src, ret);
- int t1 = getGraphemeType(ch1);
+ int ch1 = Character.codePointAt(src, ret);
+ int t1 = getType(ch1);
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
// continue for gb11
@@ -177,7 +163,8 @@ final class Grapheme {
cp == 0xAA7B || cp == 0xAA7D;
}
- private static int getGraphemeType(int cp) {
+ @SuppressWarnings("fallthrough")
+ private static int getType(int cp) {
if (cp < 0x007F) { // ASCII
if (cp < 32) { // Control characters
if (cp == 0x000D)
@@ -188,11 +175,7 @@ final class Grapheme {
}
return OTHER;
}
- return getType(cp);
- }
- @SuppressWarnings("fallthrough")
- private static int getType(int cp) {
if (EmojiData.isExtendedPictographic(cp)) {
return EXTENDED_PICTOGRAPHIC;
}
diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java
index 8f30cd25bf9..0d7d0738965 100644
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@@ -4035,17 +4035,8 @@ loop: for(int x=0, offset=0; x i) {
+ // continue with return below
+ } else if (i < endIndex) {
+ if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) {
+ return false;
+ }
+ if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) {
return false;
}
} else {
diff --git a/test/jdk/java/util/regex/RegExTest.java b/test/jdk/java/util/regex/RegExTest.java
index 658880288d7..acac49bd4e9 100644
--- a/test/jdk/java/util/regex/RegExTest.java
+++ b/test/jdk/java/util/regex/RegExTest.java
@@ -36,7 +36,7 @@
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
- * 8216332 8214245 8237599
+ * 8216332 8214245 8237599 8241055
*
* @library /test/lib
* @library /lib/testlibrary/java/lang
@@ -4797,48 +4797,99 @@ public class RegExTest {
}
private static void grapheme() throws Exception {
+ final int[] lineNumber = new int[1];
Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST),
Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt")))
- .filter( ln -> ln.length() != 0 && !ln.startsWith("#") )
.forEach( ln -> {
- ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
- // System.out.println(str);
- String[] strs = ln.split("\u00f7|\u00d7");
- StringBuilder src = new StringBuilder();
- ArrayList<String> graphemes = new ArrayList<>();
- StringBuilder buf = new StringBuilder();
- int offBk = 0;
- for (String str : strs) {
- if (str.length() == 0) // first empty str
- continue;
- int cp = Integer.parseInt(str, 16);
- src.appendCodePoint(cp);
- buf.appendCodePoint(cp);
- offBk += (str.length() + 1);
- if (ln.charAt(offBk) == '\u00f7') { // DIV
- graphemes.add(buf.toString());
- buf = new StringBuilder();
+ lineNumber[0]++;
+ if (ln.length() == 0 || ln.startsWith("#")) {
+ return;
}
- }
- Pattern p = Pattern.compile("\\X");
- Matcher m = p.matcher(src.toString());
- Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
- for (String g : graphemes) {
- // System.out.printf(" grapheme:=[%s]%n", g);
- // (1) test \\X directly
- if (!m.find() || !m.group().equals(g)) {
- System.out.println("Failed \\X [" + ln + "] : " + g);
+ ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
+ // System.out.println(str);
+ String[] strs = ln.split("\u00f7|\u00d7");
+ StringBuilder src = new StringBuilder();
+ ArrayList<String> graphemes = new ArrayList<>();
+ StringBuilder buf = new StringBuilder();
+ int offBk = 0;
+ for (String str : strs) {
+ if (str.length() == 0) // first empty str
+ continue;
+ int cp = Integer.parseInt(str, 16);
+ src.appendCodePoint(cp);
+ buf.appendCodePoint(cp);
+ offBk += (str.length() + 1);
+ if (ln.charAt(offBk) == '\u00f7') { // DIV
+ graphemes.add(buf.toString());
+ buf = new StringBuilder();
+ }
+ }
+ Pattern p = Pattern.compile("\\X");
+ // (1) test \X directly
+ Matcher m = p.matcher(src.toString());
+ for (String g : graphemes) {
+ // System.out.printf(" grapheme:=[%s]%n", g);
+ String group = null;
+ if (!m.find() || !(group = m.group()).equals(g)) {
+ System.out.println("Failed pattern \\X [" + ln + "] : "
+ + "expected: " + g + " - actual: " + group
+ + "(line " + lineNumber[0] + ")");
+ failCount++;
+ }
+ }
+ if (m.find()) {
failCount++;
}
- // (2) test \\b{g} + \\X via Scanner
- boolean hasNext = s.hasNext(p);
- // if (!s.hasNext() || !s.next().equals(next)) {
- if (!s.hasNext(p) || !s.next(p).equals(g)) {
- System.out.println("Failed b{g} [" + ln + "] : " + g);
+ // test \b{g} without \X via Pattern
+ Pattern pbg = Pattern.compile("\\b{g}");
+ m = pbg.matcher(src.toString());
+ m.find();
+ int prev = m.end();
+ for (String g : graphemes) {
+ String group = null;
+ if (!m.find() || !(group = src.substring(prev, m.end())).equals(g)) {
+ System.out.println("Failed pattern \\b{g} [" + ln + "] : "
+ + "expected: " + g + " - actual: " + group
+ + "(line " + lineNumber[0] + ")");
+ failCount++;
+ }
+ if (!"".equals(m.group())) {
+ failCount++;
+ }
+ prev = m.end();
+ }
+ if (m.find()) {
failCount++;
}
- }
- });
+ // (2) test \b{g} + \X via Scanner
+ Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
+ for (String g : graphemes) {
+ String next = null;
+ if (!s.hasNext(p) || !(next = s.next(p)).equals(g)) {
+ System.out.println("Failed \\b{g} [" + ln + "] : "
+ + "expected: " + g + " - actual: " + next
+ + " (line " + lineNumber[0] + ")");
+ failCount++;
+ }
+ }
+ if (s.hasNext(p)) {
+ failCount++;
+ }
+ // test \b{g} without \X via Scanner
+ s = new Scanner(src.toString()).useDelimiter("\\b{g}");
+ for (String g : graphemes) {
+ String next = null;
+ if (!s.hasNext() || !(next = s.next()).equals(g)) {
+ System.out.println("Failed \\b{g} [" + ln + "] : "
+ + "expected: " + g + " - actual: " + next
+ + " (line " + lineNumber[0] + ")");
+ failCount++;
+ }
+ }
+ if (s.hasNext()) {
+ failCount++;
+ }
+ });
// some sanity checks
if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() ||
!Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||
diff --git a/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java b/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java
index cc14ed7f0e0..4d75be53b4c 100644
--- a/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java
+++ b/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -46,6 +46,7 @@ public class PatternBench {
public String flagsString;
public Pattern graphemePattern;
+ public Pattern graphemeBoundaryPattern;
public Pattern jmodPattern;
public Pattern jmodCanonicalPattern;
@@ -57,7 +58,8 @@ public class PatternBench {
public void setup() {
flagsString = "\ud83c\udde6\ud83c\uddec\ud83c\uddec\ud83c\udde6\ud83c\uddfa\ud83c\uddf8\ud83c\uddeb\ud83c\uddf7";
fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode";
- graphemePattern = Pattern.compile("\\b{g}");
+ graphemePattern = Pattern.compile("\\X");
+ graphemeBoundaryPattern = Pattern.compile("\\b{g}");
String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
@@ -68,11 +70,18 @@ public class PatternBench {
charPattern = Pattern.compile(charPatternRegex);
}
+ @Benchmark
+ @Warmup(iterations = 3)
+ @Measurement(iterations = 3)
+ public long longStringGraphemeMatches() {
+ return graphemePattern.matcher(flagsString.repeat(3)).results().count();
+ }
+
@Benchmark
@Warmup(iterations = 3)
@Measurement(iterations = 3)
public int splitFlags() {
- return graphemePattern.split(flagsString).length;
+ return graphemeBoundaryPattern.split(flagsString).length;
}
@Benchmark