8241055: Regex Grapheme Matcher Performance Depends too much on Total Input Sequence Size
Reviewed-by: naoto
This commit is contained in:
parent
d0f5c5c6bb
commit
455eaca215
@ -30,21 +30,8 @@ import java.util.Objects;
|
||||
final class Grapheme {
|
||||
|
||||
/**
|
||||
* Determines if there is an extended grapheme cluster boundary between two
|
||||
* continuing characters {@code cp1} and {@code cp2}.
|
||||
* <p>
|
||||
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
|
||||
* for the extended grapheme cluster boundary rules
|
||||
* <p>
|
||||
* Note: this method does not take care of stateful breaking.
|
||||
*/
|
||||
static boolean isBoundary(int cp1, int cp2) {
|
||||
return rules[getType(cp1)][getType(cp2)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
|
||||
* the start of the char sequence is a boundary.
|
||||
* Look for the next extended grapheme cluster boundary in a CharSequence.
|
||||
* It assumes the start of the char sequence at offset {@code off} is a boundary.
|
||||
* <p>
|
||||
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
|
||||
* for the extended grapheme cluster boundary rules. The following implementation
|
||||
@ -54,21 +41,20 @@ final class Grapheme {
|
||||
* @param src the {@code CharSequence} to be scanned
|
||||
* @param off offset to start looking for the next boundary in the src
|
||||
* @param limit limit offset in the src (exclusive)
|
||||
* @return the next possible boundary
|
||||
* @return the next grapheme boundary
|
||||
*/
|
||||
static int nextBoundary(CharSequence src, int off, int limit) {
|
||||
Objects.checkFromToIndex(off, limit, src.length());
|
||||
|
||||
int ch0 = Character.codePointAt(src, 0);
|
||||
int ret = Character.charCount(ch0);
|
||||
int ch1;
|
||||
int ch0 = Character.codePointAt(src, off);
|
||||
int ret = off + Character.charCount(ch0);
|
||||
// indicates whether gb11 or gb12 is underway
|
||||
int t0 = getGraphemeType(ch0);
|
||||
int t0 = getType(ch0);
|
||||
int riCount = t0 == RI ? 1 : 0;
|
||||
boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
|
||||
while (ret < limit) {
|
||||
ch1 = Character.codePointAt(src, ret);
|
||||
int t1 = getGraphemeType(ch1);
|
||||
int ch1 = Character.codePointAt(src, ret);
|
||||
int t1 = getType(ch1);
|
||||
|
||||
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
|
||||
// continue for gb11
|
||||
@ -177,7 +163,8 @@ final class Grapheme {
|
||||
cp == 0xAA7B || cp == 0xAA7D;
|
||||
}
|
||||
|
||||
private static int getGraphemeType(int cp) {
|
||||
@SuppressWarnings("fallthrough")
|
||||
private static int getType(int cp) {
|
||||
if (cp < 0x007F) { // ASCII
|
||||
if (cp < 32) { // Control characters
|
||||
if (cp == 0x000D)
|
||||
@ -188,11 +175,7 @@ final class Grapheme {
|
||||
}
|
||||
return OTHER;
|
||||
}
|
||||
return getType(cp);
|
||||
}
|
||||
|
||||
@SuppressWarnings("fallthrough")
|
||||
private static int getType(int cp) {
|
||||
if (EmojiData.isExtendedPictographic(cp)) {
|
||||
return EXTENDED_PICTOGRAPHIC;
|
||||
}
|
||||
|
@ -4035,17 +4035,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
if (i < matcher.to) {
|
||||
int ch0 = Character.codePointAt(seq, i);
|
||||
int n = Character.charCount(ch0);
|
||||
int j = i + n;
|
||||
// Fast check if it's necessary to call Normalizer;
|
||||
// testing Grapheme.isBoundary is enough for this case
|
||||
while (j < matcher.to) {
|
||||
int ch1 = Character.codePointAt(seq, j);
|
||||
if (Grapheme.isBoundary(ch0, ch1))
|
||||
break;
|
||||
ch0 = ch1;
|
||||
j += Character.charCount(ch1);
|
||||
}
|
||||
if (i + n == j) { // single, assume nfc cp
|
||||
int j = Grapheme.nextBoundary(seq, i, matcher.to);
|
||||
if (i + n == j) { // single cp grapheme, assume nfc
|
||||
if (predicate.is(ch0))
|
||||
return next.match(matcher, j, seq);
|
||||
} else {
|
||||
@ -4109,13 +4100,12 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
endIndex = matcher.getTextLength();
|
||||
}
|
||||
if (i == startIndex) {
|
||||
return next.match(matcher, i, seq);
|
||||
}
|
||||
if (i < endIndex) {
|
||||
if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
|
||||
Grapheme.nextBoundary(seq,
|
||||
i - Character.charCount(Character.codePointBefore(seq, i)),
|
||||
i + Character.charCount(Character.codePointAt(seq, i))) > i) {
|
||||
// continue with return below
|
||||
} else if (i < endIndex) {
|
||||
if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) {
|
||||
return false;
|
||||
}
|
||||
if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
|
@ -36,7 +36,7 @@
|
||||
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
|
||||
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
|
||||
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
|
||||
* 8216332 8214245 8237599
|
||||
* 8216332 8214245 8237599 8241055
|
||||
*
|
||||
* @library /test/lib
|
||||
* @library /lib/testlibrary/java/lang
|
||||
@ -4797,48 +4797,99 @@ public class RegExTest {
|
||||
}
|
||||
|
||||
private static void grapheme() throws Exception {
|
||||
final int[] lineNumber = new int[1];
|
||||
Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST),
|
||||
Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt")))
|
||||
.filter( ln -> ln.length() != 0 && !ln.startsWith("#") )
|
||||
.forEach( ln -> {
|
||||
ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
|
||||
// System.out.println(str);
|
||||
String[] strs = ln.split("\u00f7|\u00d7");
|
||||
StringBuilder src = new StringBuilder();
|
||||
ArrayList<String> graphemes = new ArrayList<>();
|
||||
StringBuilder buf = new StringBuilder();
|
||||
int offBk = 0;
|
||||
for (String str : strs) {
|
||||
if (str.length() == 0) // first empty str
|
||||
continue;
|
||||
int cp = Integer.parseInt(str, 16);
|
||||
src.appendCodePoint(cp);
|
||||
buf.appendCodePoint(cp);
|
||||
offBk += (str.length() + 1);
|
||||
if (ln.charAt(offBk) == '\u00f7') { // DIV
|
||||
graphemes.add(buf.toString());
|
||||
buf = new StringBuilder();
|
||||
lineNumber[0]++;
|
||||
if (ln.length() == 0 || ln.startsWith("#")) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
Pattern p = Pattern.compile("\\X");
|
||||
Matcher m = p.matcher(src.toString());
|
||||
Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
|
||||
for (String g : graphemes) {
|
||||
// System.out.printf(" grapheme:=[%s]%n", g);
|
||||
// (1) test \\X directly
|
||||
if (!m.find() || !m.group().equals(g)) {
|
||||
System.out.println("Failed \\X [" + ln + "] : " + g);
|
||||
ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
|
||||
// System.out.println(str);
|
||||
String[] strs = ln.split("\u00f7|\u00d7");
|
||||
StringBuilder src = new StringBuilder();
|
||||
ArrayList<String> graphemes = new ArrayList<>();
|
||||
StringBuilder buf = new StringBuilder();
|
||||
int offBk = 0;
|
||||
for (String str : strs) {
|
||||
if (str.length() == 0) // first empty str
|
||||
continue;
|
||||
int cp = Integer.parseInt(str, 16);
|
||||
src.appendCodePoint(cp);
|
||||
buf.appendCodePoint(cp);
|
||||
offBk += (str.length() + 1);
|
||||
if (ln.charAt(offBk) == '\u00f7') { // DIV
|
||||
graphemes.add(buf.toString());
|
||||
buf = new StringBuilder();
|
||||
}
|
||||
}
|
||||
Pattern p = Pattern.compile("\\X");
|
||||
// (1) test \X directly
|
||||
Matcher m = p.matcher(src.toString());
|
||||
for (String g : graphemes) {
|
||||
// System.out.printf(" grapheme:=[%s]%n", g);
|
||||
String group = null;
|
||||
if (!m.find() || !(group = m.group()).equals(g)) {
|
||||
System.out.println("Failed pattern \\X [" + ln + "] : "
|
||||
+ "expected: " + g + " - actual: " + group
|
||||
+ "(line " + lineNumber[0] + ")");
|
||||
failCount++;
|
||||
}
|
||||
}
|
||||
if (m.find()) {
|
||||
failCount++;
|
||||
}
|
||||
// (2) test \\b{g} + \\X via Scanner
|
||||
boolean hasNext = s.hasNext(p);
|
||||
// if (!s.hasNext() || !s.next().equals(next)) {
|
||||
if (!s.hasNext(p) || !s.next(p).equals(g)) {
|
||||
System.out.println("Failed b{g} [" + ln + "] : " + g);
|
||||
// test \b{g} without \X via Pattern
|
||||
Pattern pbg = Pattern.compile("\\b{g}");
|
||||
m = pbg.matcher(src.toString());
|
||||
m.find();
|
||||
int prev = m.end();
|
||||
for (String g : graphemes) {
|
||||
String group = null;
|
||||
if (!m.find() || !(group = src.substring(prev, m.end())).equals(g)) {
|
||||
System.out.println("Failed pattern \\b{g} [" + ln + "] : "
|
||||
+ "expected: " + g + " - actual: " + group
|
||||
+ "(line " + lineNumber[0] + ")");
|
||||
failCount++;
|
||||
}
|
||||
if (!"".equals(m.group())) {
|
||||
failCount++;
|
||||
}
|
||||
prev = m.end();
|
||||
}
|
||||
if (m.find()) {
|
||||
failCount++;
|
||||
}
|
||||
}
|
||||
});
|
||||
// (2) test \b{g} + \X via Scanner
|
||||
Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
|
||||
for (String g : graphemes) {
|
||||
String next = null;
|
||||
if (!s.hasNext(p) || !(next = s.next(p)).equals(g)) {
|
||||
System.out.println("Failed \\b{g} [" + ln + "] : "
|
||||
+ "expected: " + g + " - actual: " + next
|
||||
+ " (line " + lineNumber[0] + ")");
|
||||
failCount++;
|
||||
}
|
||||
}
|
||||
if (s.hasNext(p)) {
|
||||
failCount++;
|
||||
}
|
||||
// test \b{g} without \X via Scanner
|
||||
s = new Scanner(src.toString()).useDelimiter("\\b{g}");
|
||||
for (String g : graphemes) {
|
||||
String next = null;
|
||||
if (!s.hasNext() || !(next = s.next()).equals(g)) {
|
||||
System.out.println("Failed \\b{g} [" + ln + "] : "
|
||||
+ "expected: " + g + " - actual: " + next
|
||||
+ " (line " + lineNumber[0] + ")");
|
||||
failCount++;
|
||||
}
|
||||
}
|
||||
if (s.hasNext()) {
|
||||
failCount++;
|
||||
}
|
||||
});
|
||||
// some sanity checks
|
||||
if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() ||
|
||||
!Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -46,6 +46,7 @@ public class PatternBench {
|
||||
public String flagsString;
|
||||
|
||||
public Pattern graphemePattern;
|
||||
public Pattern graphemeBoundaryPattern;
|
||||
public Pattern jmodPattern;
|
||||
public Pattern jmodCanonicalPattern;
|
||||
|
||||
@ -57,7 +58,8 @@ public class PatternBench {
|
||||
public void setup() {
|
||||
flagsString = "\ud83c\udde6\ud83c\uddec\ud83c\uddec\ud83c\udde6\ud83c\uddfa\ud83c\uddf8\ud83c\uddeb\ud83c\uddf7";
|
||||
fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode";
|
||||
graphemePattern = Pattern.compile("\\b{g}");
|
||||
graphemePattern = Pattern.compile("\\X");
|
||||
graphemeBoundaryPattern = Pattern.compile("\\b{g}");
|
||||
|
||||
String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
|
||||
jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
|
||||
@ -68,11 +70,18 @@ public class PatternBench {
|
||||
charPattern = Pattern.compile(charPatternRegex);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@Warmup(iterations = 3)
|
||||
@Measurement(iterations = 3)
|
||||
public long longStringGraphemeMatches() {
|
||||
return graphemePattern.matcher(flagsString.repeat(3)).results().count();
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@Warmup(iterations = 3)
|
||||
@Measurement(iterations = 3)
|
||||
public int splitFlags() {
|
||||
return graphemePattern.split(flagsString).length;
|
||||
return graphemeBoundaryPattern.split(flagsString).length;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
|
Loading…
Reference in New Issue
Block a user