8225061: Performance regression in Regex

Co-authored-by: Naoto Sato <naoto.sato@oracle.com>
Reviewed-by: naoto, alanb
This commit is contained in:
Claes Redestad 2019-06-01 03:18:23 +02:00
parent d2ad9dabdf
commit 1813ce706a
5 changed files with 170 additions and 30 deletions

View File

@ -67,27 +67,28 @@ public class GenerateEmojiData {
}, },
ArrayList<Range>::addAll); ArrayList<Range>::addAll);
// make the code point conditions // make the code point conditions
String extPictCodePoints = extPictRanges.stream() // only very few codepoints below 0x2000 are "emojis", so separate them
.map(r -> { // out to generate a fast-path check that can be efficiently inlined
if (r.start == r.last) { String lowExtPictCodePoints = extPictRanges.stream()
return (" ".repeat(12) + "cp == 0x" + toHexString(r.start)); .takeWhile(r -> r.last < 0x2000)
} else if (r.start == r.last - 1) { .map(r -> rangeToString(r))
return " ".repeat(12) + "cp == 0x" + toHexString(r.start) + " ||\n" + .collect(Collectors.joining(" ||\n", "", ";\n"));
" ".repeat(12) + "cp == 0x" + toHexString(r.last);
} else { String highExtPictCodePoints = extPictRanges.stream()
return " ".repeat(11) + "(cp >= 0x" + toHexString(r.start) + .dropWhile(r -> r.last < 0x2000)
" && cp <= 0x" + toHexString(r.last) + ")"; .map(r -> rangeToString(r))
} .collect(Collectors.joining(" ||\n", "", ";\n"));
})
.collect(Collectors.joining(" ||\n")) + ";\n";
// Generate EmojiData.java file // Generate EmojiData.java file
Files.write(Paths.get(args[2]), Files.write(Paths.get(args[2]),
Files.lines(Paths.get(args[0])) Files.lines(Paths.get(args[0]))
.flatMap(l -> { .flatMap(l -> {
if (l.equals("%%%EXTPICT%%%")) { if (l.equals("%%%EXTPICT_LOW%%%")) {
return Stream.of(extPictCodePoints); return Stream.of(lowExtPictCodePoints);
} else if (l.equals("%%%EXTPICT_HIGH%%%")) {
return Stream.of(highExtPictCodePoints);
} else { } else {
return Stream.of(l); return Stream.of(l);
} }
@ -99,6 +100,18 @@ public class GenerateEmojiData {
} }
} }
/**
 * Renders one code point range as a fragment of a Java boolean expression
 * that tests the variable {@code cp}.
 * <ul>
 * <li>A single code point becomes one {@code cp == 0x...} test.</li>
 * <li>Two adjacent code points become two {@code cp == 0x...} tests joined
 *     by {@code " ||\n"}.</li>
 * <li>Anything else becomes a parenthesized
 *     {@code (cp >= 0x... && cp <= 0x...)} test.</li>
 * </ul>
 * Equality tests are indented by 16 spaces and the parenthesized form by 15,
 * so that {@code cp} starts in the same column in every emitted line.
 *
 * @param r the range to render; its {@code start} and {@code last} fields are read
 * @return the expression fragment, without a trailing operator or terminator
 */
static String rangeToString(Range r) {
    final String pad = " ".repeat(16);
    final String startTest = "cp == 0x" + toHexString(r.start);

    // Single code point.
    if (r.start == r.last) {
        return pad + startTest;
    }

    // Exactly two code points: two equality tests, one per line.
    if (r.start == r.last - 1) {
        return pad + startTest + " ||\n" + pad + "cp == 0x" + toHexString(r.last);
    }

    // General case: inclusive range test. One space less of indentation
    // compensates for the opening parenthesis.
    return " ".repeat(15) + "(cp >= 0x" + toHexString(r.start)
            + " && cp <= 0x" + toHexString(r.last) + ")";
}
static int toInt(String hexStr) { static int toInt(String hexStr) {
return Integer.parseUnsignedInt(hexStr, 16); return Integer.parseUnsignedInt(hexStr, 16);
} }

View File

@ -40,7 +40,16 @@ final class EmojiData {
* @return true if {@code cp} is an extended pictographic * @return true if {@code cp} is an extended pictographic
*/ */
static boolean isExtendedPictographic(int cp) { static boolean isExtendedPictographic(int cp) {
if (cp < 0x2000) {
return
%%%EXTPICT_LOW%%%
} else {
return isHigh(cp);
}
}
private static boolean isHigh(int cp) {
return return
%%%EXTPICT%%% %%%EXTPICT_HIGH%%%
} }
} }

View File

@ -29,6 +29,19 @@ import java.util.Objects;
final class Grapheme { final class Grapheme {
/**
 * Determines if there is an extended grapheme cluster boundary between two
 * continuing characters {@code cp1} and {@code cp2}.
 * <p>
 * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
 * for the extended grapheme cluster boundary rules
 * <p>
 * Note: this method does not take care of stateful breaking.
 *
 * @param cp1 the code point preceding the candidate boundary
 * @param cp2 the code point following the candidate boundary
 * @return the entry of the {@code rules} table for the grapheme types of
 *         {@code cp1} and {@code cp2}
 */
static boolean isBoundary(int cp1, int cp2) {
    // Classify through getGraphemeType, not getType: the CR (U+000D) and
    // LF (U+000A) special cases live in getGraphemeType, so going through
    // getType directly would look a CR/LF pair up under the generic control
    // type instead of the CR and LF types.
    return rules[getGraphemeType(cp1)][getGraphemeType(cp2)];
}
/** /**
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
* the start of the char sequence is a boundary. * the start of the char sequence is a boundary.
@ -50,12 +63,12 @@ final class Grapheme {
int ret = Character.charCount(ch0); int ret = Character.charCount(ch0);
int ch1; int ch1;
// indicates whether gb11 or gb12 is underway // indicates whether gb11 or gb12 is underway
boolean gb11 = EmojiData.isExtendedPictographic(ch0); int t0 = getGraphemeType(ch0);
int riCount = getType(ch0) == RI ? 1 : 0; int riCount = t0 == RI ? 1 : 0;
boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
while (ret < limit) { while (ret < limit) {
ch1 = Character.codePointAt(src, ret); ch1 = Character.codePointAt(src, ret);
int t0 = getType(ch0); int t1 = getGraphemeType(ch1);
int t1 = getType(ch1);
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
gb11 = false; gb11 = false;
@ -65,13 +78,14 @@ final class Grapheme {
if (ret > off) { if (ret > off) {
break; break;
} else { } else {
gb11 = EmojiData.isExtendedPictographic(ch1); gb11 = t1 == EXTENDED_PICTOGRAPHIC;
riCount = 0; riCount = 0;
} }
} }
riCount += getType(ch1) == RI ? 1 : 0; riCount += (t1 == RI) ? 1 : 0;
ch0 = ch1; t0 = t1;
ret += Character.charCount(ch1); ret += Character.charCount(ch1);
} }
return ret; return ret;
@ -163,6 +177,20 @@ final class Grapheme {
cp == 0xAA7B || cp == 0xAA7D; cp == 0xAA7B || cp == 0xAA7D;
} }
/**
 * Returns the grapheme type of {@code cp}, answering code points below
 * U+007F directly and delegating everything else to {@link #getType(int)}.
 *
 * @param cp the code point to classify
 * @return {@code CR} for U+000D, {@code LF} for U+000A, {@code CONTROL} for
 *         any other value below 32, {@code OTHER} for values from 32 up to
 *         but excluding 0x7F, otherwise the result of {@code getType(cp)}
 */
private static int getGraphemeType(int cp) {
    // Everything from U+007F upwards takes the general classification path.
    if (cp >= 0x007F) {
        return getType(cp);
    }
    // Printable ASCII.
    if (cp >= 32) {
        return OTHER;
    }
    // Control characters: CR and LF have their own types.
    switch (cp) {
        case 0x000D:
            return CR;
        case 0x000A:
            return LF;
        default:
            return CONTROL;
    }
}
@SuppressWarnings("fallthrough") @SuppressWarnings("fallthrough")
private static int getType(int cp) { private static int getType(int cp) {
if (EmojiData.isExtendedPictographic(cp)) { if (EmojiData.isExtendedPictographic(cp)) {
@ -171,12 +199,6 @@ final class Grapheme {
int type = Character.getType(cp); int type = Character.getType(cp);
switch(type) { switch(type) {
case Character.CONTROL:
if (cp == 0x000D)
return CR;
if (cp == 0x000A)
return LF;
return CONTROL;
case Character.UNASSIGNED: case Character.UNASSIGNED:
// NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
// but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
@ -184,6 +206,7 @@ final class Grapheme {
if (cp == 0x0378) if (cp == 0x0378)
return OTHER; return OTHER;
case Character.CONTROL:
case Character.LINE_SEPARATOR: case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR: case Character.PARAGRAPH_SEPARATOR:
case Character.SURROGATE: case Character.SURROGATE:

View File

@ -3973,7 +3973,16 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (i < matcher.to) { if (i < matcher.to) {
int ch0 = Character.codePointAt(seq, i); int ch0 = Character.codePointAt(seq, i);
int n = Character.charCount(ch0); int n = Character.charCount(ch0);
int j = Grapheme.nextBoundary(seq, i, matcher.to); int j = i + n;
// Fast check if it's necessary to call Normalizer;
// testing Grapheme.isBoundary is enough for this case
while (j < matcher.to) {
int ch1 = Character.codePointAt(seq, j);
if (Grapheme.isBoundary(ch0, ch1))
break;
ch0 = ch1;
j += Character.charCount(ch1);
}
if (i + n == j) { // single, assume nfc cp if (i + n == j) { // single, assume nfc cp
if (predicate.is(ch0)) if (predicate.is(ch0))
return next.match(matcher, j, seq); return next.match(matcher, j, seq);

View File

@ -0,0 +1,86 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.util.regex;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
 * JMH microbenchmarks for {@link Pattern}: splitting on grapheme cluster
 * boundaries and matching a long alternation regex with and without
 * {@link Pattern#CANON_EQ}.
 * <p>
 * Every benchmark reports average time in nanoseconds and runs with three
 * warmup and three measurement iterations.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
@Warmup(iterations = 3)
@Measurement(iterations = 3)
public class PatternBench {

    /** Input matched by the two jmod benchmarks. */
    public String fileTestString;

    /** Input split by {@link #splitFlags()}: eight regional indicator symbols. */
    public String flagsString;

    /** Compiled form of the grapheme cluster boundary matcher {@code \b{g}}. */
    public Pattern graphemePattern;

    /** The jmod regex compiled without flags. */
    public Pattern jmodPattern;

    /** The jmod regex compiled with {@link Pattern#CANON_EQ}. */
    public Pattern jmodCanonicalPattern;

    /** Not assigned by {@link #setup()} and not read by any benchmark in this class. */
    public Pattern pattern;

    /**
     * Builds the input strings and compiles the patterns used by the benchmarks.
     */
    @Setup
    public void setup() {
        // Four pairs of regional indicator symbols (U+1F1E6..U+1F1FF), written
        // as surrogate pairs: A G, G A, U S, F R.
        flagsString = "\ud83c\udde6\ud83c\uddec"
                + "\ud83c\uddec\ud83c\udde6"
                + "\ud83c\uddfa\ud83c\uddf8"
                + "\ud83c\uddeb\ud83c\uddf7";
        fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode";

        graphemePattern = Pattern.compile("\\b{g}");

        // One alternative per line; the concatenation is a compile-time constant.
        final String exclusionRegex = "^.*(?:"
                + "(?:_the\\.[^/]*)"
                + "|(?:_[^/]*\\.marker)"
                + "|(?:[^/]*\\.diz)"
                + "|(?:[^/]*\\.debuginfo)"
                + "|(?:[^/]*\\.dSYM/.*)"
                + "|(?:[^/]*\\.dSYM)"
                + "|(?:[^/]*\\.pdb)"
                + "|(?:[^/]*\\.map)"
                + ")$";
        jmodPattern = Pattern.compile(exclusionRegex);
        jmodCanonicalPattern = Pattern.compile(exclusionRegex, Pattern.CANON_EQ);
    }

    /**
     * Splits {@link #flagsString} on grapheme cluster boundaries.
     *
     * @return the number of pieces produced by the split
     */
    @Benchmark
    public int splitFlags() {
        String[] pieces = graphemePattern.split(flagsString);
        return pieces.length;
    }

    /**
     * Matches {@link #fileTestString} against the {@code CANON_EQ} variant of
     * the jmod regex.
     *
     * @return whether the entire input matched
     */
    @Benchmark
    public boolean canonicalJmodMatch() {
        return jmodCanonicalPattern.matcher(fileTestString).matches();
    }

    /**
     * Matches {@link #fileTestString} against the jmod regex compiled without flags.
     *
     * @return whether the entire input matched
     */
    @Benchmark
    public boolean normalJmodMatch() {
        return jmodPattern.matcher(fileTestString).matches();
    }
}