8225061: Performance regression in Regex
Co-authored-by: Naoto Sato <naoto.sato@oracle.com> Reviewed-by: naoto, alanb
This commit is contained in:
parent
d2ad9dabdf
commit
1813ce706a
@ -67,27 +67,28 @@ public class GenerateEmojiData {
|
||||
},
|
||||
ArrayList<Range>::addAll);
|
||||
|
||||
|
||||
// make the code point conditions
|
||||
String extPictCodePoints = extPictRanges.stream()
|
||||
.map(r -> {
|
||||
if (r.start == r.last) {
|
||||
return (" ".repeat(12) + "cp == 0x" + toHexString(r.start));
|
||||
} else if (r.start == r.last - 1) {
|
||||
return " ".repeat(12) + "cp == 0x" + toHexString(r.start) + " ||\n" +
|
||||
" ".repeat(12) + "cp == 0x" + toHexString(r.last);
|
||||
} else {
|
||||
return " ".repeat(11) + "(cp >= 0x" + toHexString(r.start) +
|
||||
" && cp <= 0x" + toHexString(r.last) + ")";
|
||||
}
|
||||
})
|
||||
.collect(Collectors.joining(" ||\n")) + ";\n";
|
||||
// only very few codepoints below 0x2000 are "emojis", so separate them
|
||||
// out to generate a fast-path check that can be efficiently inlined
|
||||
String lowExtPictCodePoints = extPictRanges.stream()
|
||||
.takeWhile(r -> r.last < 0x2000)
|
||||
.map(r -> rangeToString(r))
|
||||
.collect(Collectors.joining(" ||\n", "", ";\n"));
|
||||
|
||||
String highExtPictCodePoints = extPictRanges.stream()
|
||||
.dropWhile(r -> r.last < 0x2000)
|
||||
.map(r -> rangeToString(r))
|
||||
.collect(Collectors.joining(" ||\n", "", ";\n"));
|
||||
|
||||
// Generate EmojiData.java file
|
||||
Files.write(Paths.get(args[2]),
|
||||
Files.lines(Paths.get(args[0]))
|
||||
.flatMap(l -> {
|
||||
if (l.equals("%%%EXTPICT%%%")) {
|
||||
return Stream.of(extPictCodePoints);
|
||||
if (l.equals("%%%EXTPICT_LOW%%%")) {
|
||||
return Stream.of(lowExtPictCodePoints);
|
||||
} else if (l.equals("%%%EXTPICT_HIGH%%%")) {
|
||||
return Stream.of(highExtPictCodePoints);
|
||||
} else {
|
||||
return Stream.of(l);
|
||||
}
|
||||
@ -99,6 +100,18 @@ public class GenerateEmojiData {
|
||||
}
|
||||
}
|
||||
|
||||
static String rangeToString(Range r) {
|
||||
if (r.start == r.last) {
|
||||
return (" ".repeat(16) + "cp == 0x" + toHexString(r.start));
|
||||
} else if (r.start == r.last - 1) {
|
||||
return " ".repeat(16) + "cp == 0x" + toHexString(r.start) + " ||\n" +
|
||||
" ".repeat(16) + "cp == 0x" + toHexString(r.last);
|
||||
} else {
|
||||
return " ".repeat(15) + "(cp >= 0x" + toHexString(r.start) +
|
||||
" && cp <= 0x" + toHexString(r.last) + ")";
|
||||
}
|
||||
}
|
||||
|
||||
static int toInt(String hexStr) {
|
||||
return Integer.parseUnsignedInt(hexStr, 16);
|
||||
}
|
||||
|
@ -40,7 +40,16 @@ final class EmojiData {
|
||||
* @return true if {@code cp} is an extended pictographic
|
||||
*/
|
||||
static boolean isExtendedPictographic(int cp) {
|
||||
if (cp < 0x2000) {
|
||||
return
|
||||
%%%EXTPICT_LOW%%%
|
||||
} else {
|
||||
return isHigh(cp);
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isHigh(int cp) {
|
||||
return
|
||||
%%%EXTPICT%%%
|
||||
%%%EXTPICT_HIGH%%%
|
||||
}
|
||||
}
|
||||
|
@ -29,6 +29,19 @@ import java.util.Objects;
|
||||
|
||||
final class Grapheme {
|
||||
|
||||
/**
|
||||
* Determines if there is an extended grapheme cluster boundary between two
|
||||
* continuing characters {@code cp1} and {@code cp2}.
|
||||
* <p>
|
||||
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
|
||||
* for the extended grapheme cluster boundary rules
|
||||
* <p>
|
||||
* Note: this method does not take care of stateful breaking.
|
||||
*/
|
||||
static boolean isBoundary(int cp1, int cp2) {
|
||||
return rules[getType(cp1)][getType(cp2)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
|
||||
* the start of the char sequence is a boundary.
|
||||
@ -50,12 +63,12 @@ final class Grapheme {
|
||||
int ret = Character.charCount(ch0);
|
||||
int ch1;
|
||||
// indicates whether gb11 or gb12 is underway
|
||||
boolean gb11 = EmojiData.isExtendedPictographic(ch0);
|
||||
int riCount = getType(ch0) == RI ? 1 : 0;
|
||||
int t0 = getGraphemeType(ch0);
|
||||
int riCount = t0 == RI ? 1 : 0;
|
||||
boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
|
||||
while (ret < limit) {
|
||||
ch1 = Character.codePointAt(src, ret);
|
||||
int t0 = getType(ch0);
|
||||
int t1 = getType(ch1);
|
||||
int t1 = getGraphemeType(ch1);
|
||||
|
||||
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
|
||||
gb11 = false;
|
||||
@ -65,13 +78,14 @@ final class Grapheme {
|
||||
if (ret > off) {
|
||||
break;
|
||||
} else {
|
||||
gb11 = EmojiData.isExtendedPictographic(ch1);
|
||||
gb11 = t1 == EXTENDED_PICTOGRAPHIC;
|
||||
riCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
riCount += getType(ch1) == RI ? 1 : 0;
|
||||
ch0 = ch1;
|
||||
riCount += (t1 == RI) ? 1 : 0;
|
||||
t0 = t1;
|
||||
|
||||
ret += Character.charCount(ch1);
|
||||
}
|
||||
return ret;
|
||||
@ -163,6 +177,20 @@ final class Grapheme {
|
||||
cp == 0xAA7B || cp == 0xAA7D;
|
||||
}
|
||||
|
||||
private static int getGraphemeType(int cp) {
|
||||
if (cp < 0x007F) { // ASCII
|
||||
if (cp < 32) { // Control characters
|
||||
if (cp == 0x000D)
|
||||
return CR;
|
||||
if (cp == 0x000A)
|
||||
return LF;
|
||||
return CONTROL;
|
||||
}
|
||||
return OTHER;
|
||||
}
|
||||
return getType(cp);
|
||||
}
|
||||
|
||||
@SuppressWarnings("fallthrough")
|
||||
private static int getType(int cp) {
|
||||
if (EmojiData.isExtendedPictographic(cp)) {
|
||||
@ -171,12 +199,6 @@ final class Grapheme {
|
||||
|
||||
int type = Character.getType(cp);
|
||||
switch(type) {
|
||||
case Character.CONTROL:
|
||||
if (cp == 0x000D)
|
||||
return CR;
|
||||
if (cp == 0x000A)
|
||||
return LF;
|
||||
return CONTROL;
|
||||
case Character.UNASSIGNED:
|
||||
// NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
|
||||
// but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
|
||||
@ -184,6 +206,7 @@ final class Grapheme {
|
||||
if (cp == 0x0378)
|
||||
return OTHER;
|
||||
|
||||
case Character.CONTROL:
|
||||
case Character.LINE_SEPARATOR:
|
||||
case Character.PARAGRAPH_SEPARATOR:
|
||||
case Character.SURROGATE:
|
||||
|
@ -3973,7 +3973,16 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
if (i < matcher.to) {
|
||||
int ch0 = Character.codePointAt(seq, i);
|
||||
int n = Character.charCount(ch0);
|
||||
int j = Grapheme.nextBoundary(seq, i, matcher.to);
|
||||
int j = i + n;
|
||||
// Fast check if it's necessary to call Normalizer;
|
||||
// testing Grapheme.isBoundary is enough for this case
|
||||
while (j < matcher.to) {
|
||||
int ch1 = Character.codePointAt(seq, j);
|
||||
if (Grapheme.isBoundary(ch0, ch1))
|
||||
break;
|
||||
ch0 = ch1;
|
||||
j += Character.charCount(ch1);
|
||||
}
|
||||
if (i + n == j) { // single, assume nfc cp
|
||||
if (predicate.is(ch0))
|
||||
return next.match(matcher, j, seq);
|
||||
|
@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
package org.openjdk.bench.java.util.regex;
|
||||
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||
@State(Scope.Thread)
|
||||
public class PatternBench {
|
||||
|
||||
public String fileTestString;
|
||||
public String flagsString;
|
||||
|
||||
|
||||
public Pattern graphemePattern;
|
||||
public Pattern jmodPattern;
|
||||
public Pattern jmodCanonicalPattern;
|
||||
|
||||
public Pattern pattern;
|
||||
|
||||
@Setup
|
||||
public void setup() {
|
||||
flagsString = "\ud83c\udde6\ud83c\uddec\ud83c\uddec\ud83c\udde6\ud83c\uddfa\ud83c\uddf8\ud83c\uddeb\ud83c\uddf7";
|
||||
fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode";
|
||||
graphemePattern = Pattern.compile("\\b{g}");
|
||||
|
||||
String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
|
||||
jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
|
||||
jmodPattern = Pattern.compile(jmodRegex);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@Warmup(iterations = 3)
|
||||
@Measurement(iterations = 3)
|
||||
public int splitFlags() {
|
||||
return graphemePattern.split(flagsString).length;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@Warmup(iterations = 3)
|
||||
@Measurement(iterations = 3)
|
||||
public boolean canonicalJmodMatch() {
|
||||
return jmodCanonicalPattern.matcher(fileTestString).matches();
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@Warmup(iterations = 3)
|
||||
@Measurement(iterations = 3)
|
||||
public boolean normalJmodMatch() {
|
||||
return jmodPattern.matcher(fileTestString).matches();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user