8225061: Performance regression in Regex
Co-authored-by: Naoto Sato <naoto.sato@oracle.com>
Reviewed-by: naoto, alanb
This commit is contained in:
parent
d2ad9dabdf
commit
1813ce706a
@ -67,27 +67,28 @@ public class GenerateEmojiData {
|
|||||||
},
|
},
|
||||||
ArrayList<Range>::addAll);
|
ArrayList<Range>::addAll);
|
||||||
|
|
||||||
|
|
||||||
// make the code point conditions
|
// make the code point conditions
|
||||||
String extPictCodePoints = extPictRanges.stream()
|
// only very few codepoints below 0x2000 are "emojis", so separate them
|
||||||
.map(r -> {
|
// out to generate a fast-path check that can be efficiently inlined
|
||||||
if (r.start == r.last) {
|
String lowExtPictCodePoints = extPictRanges.stream()
|
||||||
return (" ".repeat(12) + "cp == 0x" + toHexString(r.start));
|
.takeWhile(r -> r.last < 0x2000)
|
||||||
} else if (r.start == r.last - 1) {
|
.map(r -> rangeToString(r))
|
||||||
return " ".repeat(12) + "cp == 0x" + toHexString(r.start) + " ||\n" +
|
.collect(Collectors.joining(" ||\n", "", ";\n"));
|
||||||
" ".repeat(12) + "cp == 0x" + toHexString(r.last);
|
|
||||||
} else {
|
String highExtPictCodePoints = extPictRanges.stream()
|
||||||
return " ".repeat(11) + "(cp >= 0x" + toHexString(r.start) +
|
.dropWhile(r -> r.last < 0x2000)
|
||||||
" && cp <= 0x" + toHexString(r.last) + ")";
|
.map(r -> rangeToString(r))
|
||||||
}
|
.collect(Collectors.joining(" ||\n", "", ";\n"));
|
||||||
})
|
|
||||||
.collect(Collectors.joining(" ||\n")) + ";\n";
|
|
||||||
|
|
||||||
// Generate EmojiData.java file
|
// Generate EmojiData.java file
|
||||||
Files.write(Paths.get(args[2]),
|
Files.write(Paths.get(args[2]),
|
||||||
Files.lines(Paths.get(args[0]))
|
Files.lines(Paths.get(args[0]))
|
||||||
.flatMap(l -> {
|
.flatMap(l -> {
|
||||||
if (l.equals("%%%EXTPICT%%%")) {
|
if (l.equals("%%%EXTPICT_LOW%%%")) {
|
||||||
return Stream.of(extPictCodePoints);
|
return Stream.of(lowExtPictCodePoints);
|
||||||
|
} else if (l.equals("%%%EXTPICT_HIGH%%%")) {
|
||||||
|
return Stream.of(highExtPictCodePoints);
|
||||||
} else {
|
} else {
|
||||||
return Stream.of(l);
|
return Stream.of(l);
|
||||||
}
|
}
|
||||||
@ -99,6 +100,18 @@ public class GenerateEmojiData {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static String rangeToString(Range r) {
|
||||||
|
if (r.start == r.last) {
|
||||||
|
return (" ".repeat(16) + "cp == 0x" + toHexString(r.start));
|
||||||
|
} else if (r.start == r.last - 1) {
|
||||||
|
return " ".repeat(16) + "cp == 0x" + toHexString(r.start) + " ||\n" +
|
||||||
|
" ".repeat(16) + "cp == 0x" + toHexString(r.last);
|
||||||
|
} else {
|
||||||
|
return " ".repeat(15) + "(cp >= 0x" + toHexString(r.start) +
|
||||||
|
" && cp <= 0x" + toHexString(r.last) + ")";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static int toInt(String hexStr) {
|
static int toInt(String hexStr) {
|
||||||
return Integer.parseUnsignedInt(hexStr, 16);
|
return Integer.parseUnsignedInt(hexStr, 16);
|
||||||
}
|
}
|
||||||
|
@ -40,7 +40,16 @@ final class EmojiData {
|
|||||||
* @return true if {@code cp} is an extended pictographic
|
* @return true if {@code cp} is an extended pictographic
|
||||||
*/
|
*/
|
||||||
static boolean isExtendedPictographic(int cp) {
|
static boolean isExtendedPictographic(int cp) {
|
||||||
|
if (cp < 0x2000) {
|
||||||
|
return
|
||||||
|
%%%EXTPICT_LOW%%%
|
||||||
|
} else {
|
||||||
|
return isHigh(cp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isHigh(int cp) {
|
||||||
return
|
return
|
||||||
%%%EXTPICT%%%
|
%%%EXTPICT_HIGH%%%
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -29,6 +29,19 @@ import java.util.Objects;
|
|||||||
|
|
||||||
final class Grapheme {
|
final class Grapheme {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines if there is an extended grapheme cluster boundary between two
|
||||||
|
* continuing characters {@code cp1} and {@code cp2}.
|
||||||
|
* <p>
|
||||||
|
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
|
||||||
|
* for the extended grapheme cluster boundary rules
|
||||||
|
* <p>
|
||||||
|
* Note: this method does not take care of stateful breaking.
|
||||||
|
*/
|
||||||
|
static boolean isBoundary(int cp1, int cp2) {
|
||||||
|
return rules[getType(cp1)][getType(cp2)];
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
|
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
|
||||||
* the start of the char sequence is a boundary.
|
* the start of the char sequence is a boundary.
|
||||||
@ -50,12 +63,12 @@ final class Grapheme {
|
|||||||
int ret = Character.charCount(ch0);
|
int ret = Character.charCount(ch0);
|
||||||
int ch1;
|
int ch1;
|
||||||
// indicates whether gb11 or gb12 is underway
|
// indicates whether gb11 or gb12 is underway
|
||||||
boolean gb11 = EmojiData.isExtendedPictographic(ch0);
|
int t0 = getGraphemeType(ch0);
|
||||||
int riCount = getType(ch0) == RI ? 1 : 0;
|
int riCount = t0 == RI ? 1 : 0;
|
||||||
|
boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
|
||||||
while (ret < limit) {
|
while (ret < limit) {
|
||||||
ch1 = Character.codePointAt(src, ret);
|
ch1 = Character.codePointAt(src, ret);
|
||||||
int t0 = getType(ch0);
|
int t1 = getGraphemeType(ch1);
|
||||||
int t1 = getType(ch1);
|
|
||||||
|
|
||||||
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
|
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
|
||||||
gb11 = false;
|
gb11 = false;
|
||||||
@ -65,13 +78,14 @@ final class Grapheme {
|
|||||||
if (ret > off) {
|
if (ret > off) {
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
gb11 = EmojiData.isExtendedPictographic(ch1);
|
gb11 = t1 == EXTENDED_PICTOGRAPHIC;
|
||||||
riCount = 0;
|
riCount = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
riCount += getType(ch1) == RI ? 1 : 0;
|
riCount += (t1 == RI) ? 1 : 0;
|
||||||
ch0 = ch1;
|
t0 = t1;
|
||||||
|
|
||||||
ret += Character.charCount(ch1);
|
ret += Character.charCount(ch1);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
@ -163,6 +177,20 @@ final class Grapheme {
|
|||||||
cp == 0xAA7B || cp == 0xAA7D;
|
cp == 0xAA7B || cp == 0xAA7D;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static int getGraphemeType(int cp) {
|
||||||
|
if (cp < 0x007F) { // ASCII
|
||||||
|
if (cp < 32) { // Control characters
|
||||||
|
if (cp == 0x000D)
|
||||||
|
return CR;
|
||||||
|
if (cp == 0x000A)
|
||||||
|
return LF;
|
||||||
|
return CONTROL;
|
||||||
|
}
|
||||||
|
return OTHER;
|
||||||
|
}
|
||||||
|
return getType(cp);
|
||||||
|
}
|
||||||
|
|
||||||
@SuppressWarnings("fallthrough")
|
@SuppressWarnings("fallthrough")
|
||||||
private static int getType(int cp) {
|
private static int getType(int cp) {
|
||||||
if (EmojiData.isExtendedPictographic(cp)) {
|
if (EmojiData.isExtendedPictographic(cp)) {
|
||||||
@ -171,12 +199,6 @@ final class Grapheme {
|
|||||||
|
|
||||||
int type = Character.getType(cp);
|
int type = Character.getType(cp);
|
||||||
switch(type) {
|
switch(type) {
|
||||||
case Character.CONTROL:
|
|
||||||
if (cp == 0x000D)
|
|
||||||
return CR;
|
|
||||||
if (cp == 0x000A)
|
|
||||||
return LF;
|
|
||||||
return CONTROL;
|
|
||||||
case Character.UNASSIGNED:
|
case Character.UNASSIGNED:
|
||||||
// NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
|
// NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
|
||||||
// but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
|
// but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
|
||||||
@ -184,6 +206,7 @@ final class Grapheme {
|
|||||||
if (cp == 0x0378)
|
if (cp == 0x0378)
|
||||||
return OTHER;
|
return OTHER;
|
||||||
|
|
||||||
|
case Character.CONTROL:
|
||||||
case Character.LINE_SEPARATOR:
|
case Character.LINE_SEPARATOR:
|
||||||
case Character.PARAGRAPH_SEPARATOR:
|
case Character.PARAGRAPH_SEPARATOR:
|
||||||
case Character.SURROGATE:
|
case Character.SURROGATE:
|
||||||
|
@ -3973,7 +3973,16 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
if (i < matcher.to) {
|
if (i < matcher.to) {
|
||||||
int ch0 = Character.codePointAt(seq, i);
|
int ch0 = Character.codePointAt(seq, i);
|
||||||
int n = Character.charCount(ch0);
|
int n = Character.charCount(ch0);
|
||||||
int j = Grapheme.nextBoundary(seq, i, matcher.to);
|
int j = i + n;
|
||||||
|
// Fast check if it's necessary to call Normalizer;
|
||||||
|
// testing Grapheme.isBoundary is enough for this case
|
||||||
|
while (j < matcher.to) {
|
||||||
|
int ch1 = Character.codePointAt(seq, j);
|
||||||
|
if (Grapheme.isBoundary(ch0, ch1))
|
||||||
|
break;
|
||||||
|
ch0 = ch1;
|
||||||
|
j += Character.charCount(ch1);
|
||||||
|
}
|
||||||
if (i + n == j) { // single, assume nfc cp
|
if (i + n == j) { // single, assume nfc cp
|
||||||
if (predicate.is(ch0))
|
if (predicate.is(ch0))
|
||||||
return next.match(matcher, j, seq);
|
return next.match(matcher, j, seq);
|
||||||
|
@ -0,0 +1,86 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
package org.openjdk.bench.java.util.regex;
|
||||||
|
|
||||||
|
import org.openjdk.jmh.annotations.Benchmark;
|
||||||
|
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||||
|
import org.openjdk.jmh.annotations.Measurement;
|
||||||
|
import org.openjdk.jmh.annotations.Mode;
|
||||||
|
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||||
|
import org.openjdk.jmh.annotations.Param;
|
||||||
|
import org.openjdk.jmh.annotations.Scope;
|
||||||
|
import org.openjdk.jmh.annotations.Setup;
|
||||||
|
import org.openjdk.jmh.annotations.State;
|
||||||
|
import org.openjdk.jmh.annotations.Warmup;
|
||||||
|
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.regex.PatternSyntaxException;
|
||||||
|
|
||||||
|
@BenchmarkMode(Mode.AverageTime)
|
||||||
|
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||||
|
@State(Scope.Thread)
|
||||||
|
public class PatternBench {
|
||||||
|
|
||||||
|
public String fileTestString;
|
||||||
|
public String flagsString;
|
||||||
|
|
||||||
|
|
||||||
|
public Pattern graphemePattern;
|
||||||
|
public Pattern jmodPattern;
|
||||||
|
public Pattern jmodCanonicalPattern;
|
||||||
|
|
||||||
|
public Pattern pattern;
|
||||||
|
|
||||||
|
@Setup
|
||||||
|
public void setup() {
|
||||||
|
flagsString = "\ud83c\udde6\ud83c\uddec\ud83c\uddec\ud83c\udde6\ud83c\uddfa\ud83c\uddf8\ud83c\uddeb\ud83c\uddf7";
|
||||||
|
fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode";
|
||||||
|
graphemePattern = Pattern.compile("\\b{g}");
|
||||||
|
|
||||||
|
String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
|
||||||
|
jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
|
||||||
|
jmodPattern = Pattern.compile(jmodRegex);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
@Warmup(iterations = 3)
|
||||||
|
@Measurement(iterations = 3)
|
||||||
|
public int splitFlags() {
|
||||||
|
return graphemePattern.split(flagsString).length;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
@Warmup(iterations = 3)
|
||||||
|
@Measurement(iterations = 3)
|
||||||
|
public boolean canonicalJmodMatch() {
|
||||||
|
return jmodCanonicalPattern.matcher(fileTestString).matches();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
@Warmup(iterations = 3)
|
||||||
|
@Measurement(iterations = 3)
|
||||||
|
public boolean normalJmodMatch() {
|
||||||
|
return jmodPattern.matcher(fileTestString).matches();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user