8259074: regex benchmarks and tests

Reviewed-by: redestad
This commit is contained in:
Martin Buchholz 2021-02-08 18:09:59 +00:00
parent d6d5d9bf2f
commit 351d788809
5 changed files with 476 additions and 10 deletions

View File

@ -25,7 +25,7 @@
// A test case consists of three lines:
// The first line is a pattern used in the test
// The second line is the input to search for the pattern in
// The third line is a concatentation of the match, the number of groups,
// The third line is a concatenation of the match, the number of groups,
// and the contents of the first four subexpressions.
// Empty lines and lines beginning with comment slashes are ignored.
//
@ -1231,3 +1231,45 @@ true 1
(|f){0,1}+
foo
true 1
//----------------------------------------------------------------
// Unary numeral primality testing
//----------------------------------------------------------------
// Input is 7 (a prime), in unary; reluctant quantifier
^(11+?)\1+$
1111111
false 1
^(1{2,}?)\1+$
1111111
false 1
// Input is 8 (a power of two), in unary; reluctant quantifier
// group is shortest possible (2)
^(11+?)\1+$
11111111
true 11111111 1 11
^(1{2,}?)\1+$
11111111
true 11111111 1 11
// Input is 7 (a prime), in unary; greedy quantifier
^(11+)\1+$
1111111
false 1
^(1{2,})\1+$
1111111
false 1
// Input is 8 (a power of two), in unary; greedy quantifier
// group is longest possible (4)
^(11+)\1+$
11111111
true 11111111 1 1111
^(1{2,})\1+$
11111111
true 11111111 1 1111

View File

@ -22,14 +22,7 @@
*/
package org.openjdk.bench.java.lang;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.*;
import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;
@ -60,9 +53,12 @@ import java.util.concurrent.TimeUnit;
* This benchmark is great for measuring cache effects, e.g. size=10^6 has 5x
* the per-element cost of size=10^3 (See "The Myth of RAM".)
*
* (cd $(hg root) && for size in 3 16 999 999999; do make test TEST="micro:java.lang.ArrayFiddle" MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
* (cd $(git rev-parse --show-toplevel) && for size in 3 16 999 999999; do make test TEST='micro:java.lang.ArrayFiddle' MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
*/
@BenchmarkMode(Mode.AverageTime)
@Fork(2)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
public class ArrayFiddle {

View File

@ -0,0 +1,140 @@
/*
* Copyright 2020 Google Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.util.regex;
import org.openjdk.jmh.annotations.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Benchmarks of Patterns that exhibit O(2^N) performance due to catastrophic
* backtracking, **when implemented naively**.
*
* See: jdk/test/java/util/regex/RegExTest.java#expoBacktracking
* commit b45ea8903ec290ab194d9ebe040bc43edd5dd0a3
* Author: Xueming Shen <sherman@openjdk.org>
* Date: Tue May 10 21:19:25 2016 -0700
*
* Here's a way to compare the per-char cost:
*
* (cd $(git rev-parse --show-toplevel) && for size in 16 128 1024; do make test TEST='micro:java.util.regex.Exponential' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
*
*/
@State(Scope.Benchmark)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
public class Exponential {
    /** Number of consecutive 'X' characters in the benchmark inputs. */
    @Param({"16", "128", "1024"})
    // 2048+ runs into StackOverflowError; see JDK-8260866
    int size;

    /** Input consisting of {@code size} copies of 'X'. */
    public String justXs;

    /** Same as {@link #justXs} but with a trailing '!' appended. */
    public String notJustXs;

    // Every pattern below must match justXs and must not match notJustXs;
    // compile() enforces this when the patterns are built.
    public Pattern pat1;
    public Pattern pat2;
    public Pattern pat3;
    public Pattern pat4;

    /**
     * Compiles {@code regex} and sanity-checks it against both inputs.
     *
     * @param regex the regular expression to compile
     * @return the compiled pattern
     * @throws AssertionError if the pattern fails to match {@link #justXs}
     *         or does match {@link #notJustXs}
     */
    Pattern compile(String regex) {
        final Pattern compiled = Pattern.compile(regex);
        // ad hoc correctness checking; the second match is only attempted
        // when the first one succeeded
        final boolean behavesAsExpected =
            compiled.matcher(justXs).matches()
            && !compiled.matcher(notJustXs).matches();
        if (!behavesAsExpected) {
            throw new AssertionError("unexpected matching: " + regex);
        }
        return compiled;
    }

    /** Builds the two inputs from {@code size}, then compiles all patterns. */
    @Setup(Level.Trial)
    public void setup() {
        final String allXs = "X".repeat(size);
        justXs = allXs;
        notJustXs = allXs + "!";

        // Will (or should) the engine optimize (?:X|X) to X ?
        pat1 = compile("(?:X|X)*");

        // Tougher to optimize than pat1
        pat2 = compile("(?:[XY]|[XZ])*");

        pat3 = compile("(X+)+");
        pat4 = compile("^(X+)+$");
    }

    /** pat1 against the all-X input. O(N) */
    @Benchmark
    public boolean pat1_justXs() {
        final Matcher matcher = pat1.matcher(justXs);
        return matcher.matches();
    }

    /** pat1 against the input ending in '!'. O(N) */
    @Benchmark
    public boolean pat1_notJustXs() {
        final Matcher matcher = pat1.matcher(notJustXs);
        return matcher.matches();
    }

    /** pat2 against the all-X input. O(N) */
    @Benchmark
    public boolean pat2_justXs() {
        final Matcher matcher = pat2.matcher(justXs);
        return matcher.matches();
    }

    /** pat2 against the input ending in '!'. O(N) */
    @Benchmark
    public boolean pat2_notJustXs() {
        final Matcher matcher = pat2.matcher(notJustXs);
        return matcher.matches();
    }

    /** pat3 against the all-X input. O(1) - very surprising! */
    @Benchmark
    public boolean pat3_justXs() {
        final Matcher matcher = pat3.matcher(justXs);
        return matcher.matches();
    }

    /**
     * pat3 against the input ending in '!'.
     * O(N^2) - surprising! O(N) seems very achievable.
     */
    @Benchmark
    public boolean pat3_notJustXs() {
        final Matcher matcher = pat3.matcher(notJustXs);
        return matcher.matches();
    }

    /** pat4 against the all-X input. O(1) - very surprising! */
    @Benchmark
    public boolean pat4_justXs() {
        final Matcher matcher = pat4.matcher(justXs);
        return matcher.matches();
    }

    /**
     * pat4 against the input ending in '!'.
     * O(N^2) - surprising! O(N) seems very achievable.
     */
    @Benchmark
    public boolean pat4_notJustXs() {
        final Matcher matcher = pat4.matcher(notJustXs);
        return matcher.matches();
    }
}

View File

@ -0,0 +1,106 @@
/*
* Copyright 2020 Google Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.util.regex;
import org.openjdk.jmh.annotations.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
/**
* Abusing regexes for fun primality testing.
* Famous among regex enthusiasts.
* https://stackoverflow.com/q/3296050/625403
*
* Prime numbers exhibit O(N^2) performance with all variants, due to exhaustive
* backtracking.
*
* Powers of two exhibit O(N) performance with all variants, with reluctant
* quantifiers doing somewhat better.
*
* Here's a way to compare the per-input-char cost:
*
* (cd $(git rev-parse --show-toplevel) && for n in 16 17 256 257 4096 4099; do make test TEST='micro:java.util.regex.Primality' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $n -p n=$n" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
*/
@State(Scope.Benchmark)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
public class Primality {
    /** Number to be primality tested. */
    @Param({"16", "17", "256", "257", "4096", "4099"})
    // Further candidate values, currently disabled:
    // "64", "67", "1024", "1031", "16384", "16411"})
    int n;

    /** The number {@code n} written as a unary numeral: {@code n} copies of '1'. */
    public String unary;

    // Each pattern matches exactly those unary numerals that denote a
    // composite number; compile() cross-checks this for the current n.
    public Pattern reluctant1;
    public Pattern reluctant2;
    public Pattern greedy1;
    public Pattern greedy2;

    /**
     * Compiles {@code regex} and verifies its verdict on {@link #unary}
     * against {@code BigInteger.isProbablePrime}.
     *
     * @param regex a pattern expected to match composite unary numerals
     * @return the compiled pattern
     * @throws AssertionError if the regex and BigInteger disagree about
     *         whether {@code n} is prime
     */
    Pattern compile(String regex) {
        final Pattern compiled = Pattern.compile(regex);
        // ad hoc correctness checking: a full match means "composite", so
        // the two answers below must always be opposite
        final boolean regexSaysComposite = compiled.matcher(unary).matches();
        final boolean bigIntegerSaysPrime =
            java.math.BigInteger.valueOf(n).isProbablePrime(100);
        if (regexSaysComposite == bigIntegerSaysPrime) {
            throw new AssertionError("regex=" + regex + ", n=" + n);
        }
        return compiled;
    }

    /** Builds the unary input for {@code n}, then compiles all four variants. */
    @Setup(Level.Trial)
    public void setup() {
        final String ones = "1".repeat(n);
        unary = ones;

        // reluctant group: shortest candidate divisor is tried first
        reluctant1 = compile("^(11+?)\\1+$");
        reluctant2 = compile("^(1{2,}?)\\1+$");

        // greedy group: longest candidate divisor is tried first
        greedy1 = compile("^(11+)\\1+$");
        greedy2 = compile("^(1{2,})\\1+$");
    }

    /** Full match of {@link #reluctant1} against the unary input. */
    @Benchmark
    public boolean reluctant1() {
        final Pattern pattern = reluctant1;
        return pattern.matcher(unary).matches();
    }

    /** Full match of {@link #reluctant2} against the unary input. */
    @Benchmark
    public boolean reluctant2() {
        final Pattern pattern = reluctant2;
        return pattern.matcher(unary).matches();
    }

    /** Full match of {@link #greedy1} against the unary input. */
    @Benchmark
    public boolean greedy1() {
        final Pattern pattern = greedy1;
        return pattern.matcher(unary).matches();
    }

    /** Full match of {@link #greedy2} against the unary input. */
    @Benchmark
    public boolean greedy2() {
        final Pattern pattern = greedy2;
        return pattern.matcher(unary).matches();
    }
}

View File

@ -0,0 +1,182 @@
/*
* Copyright 2020 Google Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.util.regex;
import org.openjdk.jmh.annotations.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Detecting trailing whitespace is a very common problem that many programmers
* have solved, but it's surprisingly difficult to avoid O(N^2) performance
* when the input contains a long run of consecutive whitespace. For
* example, attempts to trim such whitespace caused a Stack Exchange outage.
* https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016
*
* We use "[ \t]" as our definition of whitespace (easy, but not too easy!).
*
* The use of Matcher#find (instead of Matcher#matches) is very convenient, but
* introduces an implicit O(N) loop over the input, or alternatively, a
* non-possessive "^.*?" prefix in the regex. In order for the entire search
* operation to not be O(N^2), most of the regex match operations while
* scanning the input need to be O(1), which may require the use of less-obvious
* constructs like lookbehind. The use of possessive quantifiers in the regex
* itself is sadly **insufficient**.
*
* When the subpattern following a possessive quantifier is as cheap as the
* subpattern governed by the quantifier (e.g. \s++$), the possessive quantifier
* gives you at most 2x speedup, reducing two linear scans to one.
*
* An explicit loop with find() using two matchers and possessive quantifiers is
* the most efficient, since there is no backtracking. But that cannot work with
* simple APIs that take a regex as an argument, like grep(1) does.
*
* Here's a way to compare the per-char cost:
*
* (cd $(git rev-parse --show-toplevel) && for size in 16 256 4096; do make test TEST='micro:java.util.regex.Trim' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
*
* some jdk17 numbers:
*
* Benchmark (size) Mode Cnt Score Error Units
* Trim.find_loop_two_matchers 1024 avgt 8 2.252 ? 0.013 ns/op
* Trim.find_loop_usePattern 1024 avgt 8 2.328 ? 0.116 ns/op
* Trim.lookBehind_find 1024 avgt 8 21.740 ? 0.040 ns/op
* Trim.possessive2_find 1024 avgt 8 7151.592 ? 17.860 ns/op
* Trim.possessive2_matches 1024 avgt 8 2.625 ? 0.008 ns/op
* Trim.possessive3_find 1024 avgt 8 28.532 ? 1.889 ns/op
* Trim.possessive_find 1024 avgt 8 3113.776 ? 9.996 ns/op
* Trim.simple_find 1024 avgt 8 4199.480 ? 13.410 ns/op
*
* TODO: why is simple_find faster than possessive_find, for size below 512 ?
*
* (cd $(git rev-parse --show-toplevel) && for size in 128 256 512 1024 2048; do make test TEST='micro:java.util.regex.Trim.\\\(simple_find\\\|possessive_find\\\)' MICRO="FORK=2;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
*/
@State(Scope.Benchmark)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
public class Trim {
    /**
     * Repeat count of the two-character unit (space, tab) that forms the
     * non-matching whitespace run inside {@link #noMatch}.
     */
    @Param({"16", "256", "4096"})
    int size;

    /** String containing long interior run of whitespace */
    public String noMatch;

    // Building blocks for the explicit find() loops
    public Pattern whitespaceRunPattern;
    public Pattern eolPattern;

    // Single-regex attempts at detecting trailing whitespace
    public Pattern simplePattern;
    public Pattern possessivePattern;
    public Pattern possessivePattern2;
    public Pattern possessivePattern3;
    public Pattern lookBehindPattern;

    /**
     * Compiles {@code regex} and verifies that it finds nothing in
     * {@link #noMatch}.
     *
     * @param regex the regular expression to compile
     * @return the compiled pattern
     * @throws AssertionError if the pattern finds a match in {@link #noMatch}
     */
    Pattern compile(String regex) {
        final Pattern compiled = Pattern.compile(regex);
        // ad hoc correctness checking
        final Matcher probe = compiled.matcher(noMatch);
        if (!probe.find()) {
            return compiled;
        }
        throw new AssertionError("unexpected matching: " + regex);
    }

    /**
     * Builds the input string and all patterns, then checks that the
     * benchmarks not covered by {@link #compile} also report no match.
     */
    @Setup(Level.Trial)
    public void setup() {
        final String whitespaceRun = " \t".repeat(size);
        noMatch = "xx" + whitespaceRun + "yy";

        simplePattern = compile("[ \t]+$");
        possessivePattern = compile("[ \t]++$");
        possessivePattern2 = compile("(.*+[^ \t]|^)([ \t]++)$");
        possessivePattern3 = compile("(?:[^ \t]|^)([ \t]++)$");
        lookBehindPattern = compile("(?<![ \t])[ \t]++$");

        // These two are only meaningful in combination, so they bypass the
        // no-match check done by compile().
        whitespaceRunPattern = Pattern.compile("[ \t]++");
        eolPattern = Pattern.compile("$", Pattern.MULTILINE);

        // more ad hoc correctness checking
        if (possessive2_matches()
            || find_loop_two_matchers()
            || find_loop_usePattern()) {
            throw new AssertionError();
        }
    }

    /** {@code find()} with {@link #simplePattern}. */
    @Benchmark
    public boolean simple_find() {
        final Matcher matcher = simplePattern.matcher(noMatch);
        return matcher.find();
    }

    /** {@code find()} with {@link #possessivePattern}. */
    @Benchmark
    public boolean possessive_find() {
        final Matcher matcher = possessivePattern.matcher(noMatch);
        return matcher.find();
    }

    /** {@code find()} with {@link #possessivePattern2}. */
    @Benchmark
    public boolean possessive2_find() {
        final Matcher matcher = possessivePattern2.matcher(noMatch);
        return matcher.find();
    }

    /** {@code matches()} (whole-input match) with {@link #possessivePattern2}. */
    @Benchmark
    public boolean possessive2_matches() {
        final Matcher matcher = possessivePattern2.matcher(noMatch);
        return matcher.matches();
    }

    /** {@code find()} with {@link #possessivePattern3}. */
    @Benchmark
    public boolean possessive3_find() {
        final Matcher matcher = possessivePattern3.matcher(noMatch);
        return matcher.find();
    }

    /** {@code find()} with {@link #lookBehindPattern}. */
    @Benchmark
    public boolean lookBehind_find() {
        final Matcher matcher = lookBehindPattern.matcher(noMatch);
        return matcher.find();
    }

    /**
     * Explicit loop: one matcher walks the whitespace runs, and for each run
     * a fresh second matcher tests for end-of-line right after it.
     *
     * @return true if some whitespace run is immediately followed by a
     *         position where {@link #eolPattern} matches
     */
    @Benchmark
    public boolean find_loop_two_matchers() {
        final Matcher runs = whitespaceRunPattern.matcher(noMatch);
        final int inputEnd = runs.regionEnd();
        for (boolean more = runs.find(); more; more = runs.find()) {
            final Matcher eol = eolPattern.matcher(noMatch);
            // restrict the end-of-line probe to the text after this run
            eol.region(runs.end(), inputEnd);
            if (eol.lookingAt()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Explicit loop using a single matcher that is switched between the
     * whitespace-run pattern and the end-of-line pattern via usePattern.
     *
     * @return true if some whitespace run is immediately followed by a
     *         position where {@link #eolPattern} matches
     */
    @Benchmark
    public boolean find_loop_usePattern() {
        final Matcher scanner = whitespaceRunPattern.matcher(noMatch);
        final int inputEnd = scanner.regionEnd();
        for (boolean more = scanner.find(); more; more = scanner.find()) {
            // narrow the region to what follows the run, then probe for
            // end-of-line with the same matcher
            final boolean atEol = scanner.region(scanner.end(), inputEnd)
                                         .usePattern(eolPattern)
                                         .lookingAt();
            if (atEol) {
                return true;
            }
            // switch back so the next find() looks for whitespace again
            scanner.usePattern(whitespaceRunPattern);
        }
        return false;
    }
}