From 351d788809ba73cb8a43cd6ae4619031eb0ce2f7 Mon Sep 17 00:00:00 2001 From: Martin Buchholz Date: Mon, 8 Feb 2021 18:09:59 +0000 Subject: [PATCH] 8259074: regex benchmarks and tests Reviewed-by: redestad --- test/jdk/java/util/regex/TestCases.txt | 44 ++++- .../openjdk/bench/java/lang/ArrayFiddle.java | 14 +- .../bench/java/util/regex/Exponential.java | 140 ++++++++++++++ .../bench/java/util/regex/Primality.java | 106 ++++++++++ .../openjdk/bench/java/util/regex/Trim.java | 182 ++++++++++++++++++ 5 files changed, 476 insertions(+), 10 deletions(-) create mode 100644 test/micro/org/openjdk/bench/java/util/regex/Exponential.java create mode 100644 test/micro/org/openjdk/bench/java/util/regex/Primality.java create mode 100644 test/micro/org/openjdk/bench/java/util/regex/Trim.java diff --git a/test/jdk/java/util/regex/TestCases.txt b/test/jdk/java/util/regex/TestCases.txt index 9461f57dc2d..3b367782643 100644 --- a/test/jdk/java/util/regex/TestCases.txt +++ b/test/jdk/java/util/regex/TestCases.txt @@ -25,7 +25,7 @@ // A test case consists of three lines: // The first line is a pattern used in the test // The second line is the input to search for the pattern in -// The third line is a concatentation of the match, the number of groups, +// The third line is a concatenation of the match, the number of groups, // and the contents of the first four subexpressions. // Empty lines and lines beginning with comment slashes are ignored. 
// @@ -1231,3 +1231,45 @@ true 1 (|f){0,1}+ foo true 1 + +//---------------------------------------------------------------- +// Unary numeral primality testing +//---------------------------------------------------------------- + +// Input is 7 (a prime), in unary; reluctant quantifier +^(11+?)\1+$ +1111111 +false 1 + +^(1{2,}?)\1+$ +1111111 +false 1 + +// Input is 8 (a power of two), in unary; reluctant quantifier +// group is shortest possible (2) +^(11+?)\1+$ +11111111 +true 11111111 1 11 + +^(1{2,}?)\1+$ +11111111 +true 11111111 1 11 + +// Input is 7 (a prime), in unary; greedy quantifier +^(11+)\1+$ +1111111 +false 1 + +^(1{2,})\1+$ +1111111 +false 1 + +// Input is 8 (a power of two), in unary; greedy quantifier +// group is longest possible (4) +^(11+)\1+$ +11111111 +true 11111111 1 1111 + +^(1{2,})\1+$ +11111111 +true 11111111 1 1111 diff --git a/test/micro/org/openjdk/bench/java/lang/ArrayFiddle.java b/test/micro/org/openjdk/bench/java/lang/ArrayFiddle.java index 25cfa200d34..7686e7ab706 100644 --- a/test/micro/org/openjdk/bench/java/lang/ArrayFiddle.java +++ b/test/micro/org/openjdk/bench/java/lang/ArrayFiddle.java @@ -22,14 +22,7 @@ */ package org.openjdk.bench.java.lang; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.OutputTimeUnit; -import org.openjdk.jmh.annotations.Param; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.*; import java.util.Arrays; import java.util.concurrent.ThreadLocalRandom; @@ -60,9 +53,12 @@ import java.util.concurrent.TimeUnit; * This benchmark is great for measuring cache effects, e.g. size=10^6 has 5x * the per-element cost of size=10^3 (See "The Myth of RAM".) 
* - * (cd $(hg root) && for size in 3 16 999 999999; do make test TEST="micro:java.lang.ArrayFiddle" MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) + * (cd $(git rev-parse --show-toplevel) && for size in 3 16 999 999999; do make test TEST='micro:java.lang.ArrayFiddle' MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) */ @BenchmarkMode(Mode.AverageTime) +@Fork(2) +@Warmup(iterations = 1) +@Measurement(iterations = 4) @OutputTimeUnit(TimeUnit.NANOSECONDS) @State(Scope.Benchmark) public class ArrayFiddle { diff --git a/test/micro/org/openjdk/bench/java/util/regex/Exponential.java b/test/micro/org/openjdk/bench/java/util/regex/Exponential.java new file mode 100644 index 00000000000..997768a0592 --- /dev/null +++ b/test/micro/org/openjdk/bench/java/util/regex/Exponential.java @@ -0,0 +1,140 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.util.regex; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Benchmarks of Patterns that exhibit O(2^N) performance due to catastrophic + * backtracking, **when implemented naively**. + * + * See: jdk/test/java/util/regex/RegExTest.java#expoBacktracking + * commit b45ea8903ec290ab194d9ebe040bc43edd5dd0a3 + * Author: Xueming Shen + * Date: Tue May 10 21:19:25 2016 -0700 + * + * Here's a way to compare the per-char cost: + * + * (cd $(git rev-parse --show-toplevel) && for size in 16 128 1024; do make test TEST='micro:java.util.regex.Exponential' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) + * + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Fork(1) +@Warmup(iterations = 1) +@Measurement(iterations = 4) +@State(Scope.Benchmark) +public class Exponential { + /** Run length of consecutive 'X' chars in the input strings. */ + @Param({"16", "128", "1024"}) + // 2048+ runs into StackOverflowError; see JDK-8260866 + int size; + + public String justXs; + public String notJustXs; + + // Patterns that match justXs but not notJustXs + public Pattern pat1; + public Pattern pat2; + public Pattern pat3; + public Pattern pat4; + + Pattern compile(String regex) { + Pattern pat = Pattern.compile(regex); + // ad hoc correctness checking + if (! 
pat.matcher(justXs).matches() + || pat.matcher(notJustXs).matches()) { + throw new AssertionError("unexpected matching: " + regex); + } + return pat; + } + + @Setup(Level.Trial) + public void setup() { + justXs = "X".repeat(size); + notJustXs = justXs + "!"; + + // Will (or should) the engine optimize (?:X|X) to X ? + pat1 = compile("(?:X|X)*"); + + // Tougher to optimize than pat1 + pat2 = compile("(?:[XY]|[XZ])*"); + + pat3 = compile("(X+)+"); + + pat4 = compile("^(X+)+$"); + } + + /** O(N) */ + @Benchmark + public boolean pat1_justXs() { + return pat1.matcher(justXs).matches(); + } + + /** O(N) */ + @Benchmark + public boolean pat1_notJustXs() { + return pat1.matcher(notJustXs).matches(); + } + + /** O(N) */ + @Benchmark + public boolean pat2_justXs() { + return pat2.matcher(justXs).matches(); + } + + /** O(N) */ + @Benchmark + public boolean pat2_notJustXs() { + return pat2.matcher(notJustXs).matches(); + } + + /** O(1) - very surprising! */ + @Benchmark + public boolean pat3_justXs() { + return pat3.matcher(justXs).matches(); + } + + /** O(N^2) - surprising! O(N) seems very achievable. */ + @Benchmark + public boolean pat3_notJustXs() { + return pat3.matcher(notJustXs).matches(); + } + + /** O(1) - very surprising! */ + @Benchmark + public boolean pat4_justXs() { + return pat4.matcher(justXs).matches(); + } + + /** O(N^2) - surprising! O(N) seems very achievable. */ + @Benchmark + public boolean pat4_notJustXs() { + return pat4.matcher(notJustXs).matches(); + } + +} diff --git a/test/micro/org/openjdk/bench/java/util/regex/Primality.java b/test/micro/org/openjdk/bench/java/util/regex/Primality.java new file mode 100644 index 00000000000..1959c8afe92 --- /dev/null +++ b/test/micro/org/openjdk/bench/java/util/regex/Primality.java @@ -0,0 +1,106 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.util.regex; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; + +/** + * Abusing regexes for fun primality testing. + * Famous among regex enthusiasts. + * https://stackoverflow.com/q/3296050/625403 + * + * Prime numbers exhibit O(N^2) performance with all variants, due to exhaustive + * backtracking. + * + * Powers of two exhibit O(N) performance with all variants, with reluctant + * quantifiers doing somewhat better. + * + * Here's a way to compare the per-input-char cost: + * + * (cd $(git rev-parse --show-toplevel) && for n in 16 17 256 257 4096 4099; do make test TEST='micro:java.util.regex.Primality' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $n -p n=$n" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Fork(1) +@Warmup(iterations = 1) +@Measurement(iterations = 4) +@State(Scope.Benchmark) +public class Primality { + /** Number to be primality tested. 
*/ + @Param({"16", "17", "256", "257", "4096", "4099"}) + // "64", "67", "1024", "1031", "16384", "16411"}) + int n; + + /** Unary numeral representation of int n */ + public String unary; + + // Patterns that match composite numbers represented as unary numerals. + public Pattern reluctant1; + public Pattern reluctant2; + public Pattern greedy1; + public Pattern greedy2; + + Pattern compile(String regex) { + Pattern pat = Pattern.compile(regex); + // ad hoc correctness checking + boolean isPrime1 = ! pat.matcher(unary).matches(); + boolean isPrime2 = java.math.BigInteger.valueOf(n).isProbablePrime(100); + if (isPrime1 != isPrime2) { + throw new AssertionError("regex=" + regex + ", n=" + n); + } + return pat; + } + + @Setup(Level.Trial) + public void setup() { + unary = "1".repeat(n); + + reluctant1 = compile("^(11+?)\\1+$"); + reluctant2 = compile("^(1{2,}?)\\1+$"); + greedy1 = compile("^(11+)\\1+$"); + greedy2 = compile("^(1{2,})\\1+$"); + } + + @Benchmark + public boolean reluctant1() { + return reluctant1.matcher(unary).matches(); + } + + @Benchmark + public boolean reluctant2() { + return reluctant2.matcher(unary).matches(); + } + + @Benchmark + public boolean greedy1() { + return greedy1.matcher(unary).matches(); + } + + @Benchmark + public boolean greedy2() { + return greedy2.matcher(unary).matches(); + } +} diff --git a/test/micro/org/openjdk/bench/java/util/regex/Trim.java b/test/micro/org/openjdk/bench/java/util/regex/Trim.java new file mode 100644 index 00000000000..87a1d7b77fc --- /dev/null +++ b/test/micro/org/openjdk/bench/java/util/regex/Trim.java @@ -0,0 +1,182 @@ +/* + * Copyright 2020 Google Inc. All Rights Reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.util.regex; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Detecting trailing whitespace is a very common problem that many programmers + * have solved, but it's surprisingly difficult to avoid O(N^2) performance + * when the input contains a long run of consecutive whitespace. For + * example, attempts to trim such whitespace caused a Stack Exchange outage. + * https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016 + * + * We use "[ \t]" as our definition of whitespace (easy, but not too easy!). + * + * The use of Matcher#find (instead of Matcher#matches) is very convenient, but + * introduces an implicit O(N) loop over the input, or alternatively, a + * non-possessive "^.*?" prefix in the regex. In order for the entire search + * operation to not be O(N^2), most of the regex match operations while + * scanning the input need to be O(1), which may require the use of less-obvious + * constructs like lookbehind. The use of possessive quantifiers in the regex + * itself is sadly **insufficient**. 
+ * + * When the subpattern following a possessive quantifier is as cheap as the + * subpattern governed by the quantifier (e.g. \s++$), the possessive quantifier + * gives you at most 2x speedup, reducing two linear scans to one. + * + * An explicit loop with find() using two matchers and possessive quantifiers is + * the most efficient, since there is no backtracking. But that cannot work with + * simple APIs that take a regex as an argument, like grep(1) does. + * + * Here's a way to compare the per-char cost: + * + * (cd $(git rev-parse --show-toplevel) && for size in 16 256 4096; do make test TEST='micro:java.util.regex.Trim' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) + * + * some jdk17 numbers: + * + * Benchmark (size) Mode Cnt Score Error Units + * Trim.find_loop_two_matchers 1024 avgt 8 2.252 +/- 0.013 ns/op + * Trim.find_loop_usePattern 1024 avgt 8 2.328 +/- 0.116 ns/op + * Trim.lookBehind_find 1024 avgt 8 21.740 +/- 0.040 ns/op + * Trim.possessive2_find 1024 avgt 8 7151.592 +/- 17.860 ns/op + * Trim.possessive2_matches 1024 avgt 8 2.625 +/- 0.008 ns/op + * Trim.possessive3_find 1024 avgt 8 28.532 +/- 1.889 ns/op + * Trim.possessive_find 1024 avgt 8 3113.776 +/- 9.996 ns/op + * Trim.simple_find 1024 avgt 8 4199.480 +/- 13.410 ns/op + * + * TODO: why is simple_find faster than possessive_find, for size below 512 ? + * + * (cd $(git rev-parse --show-toplevel) && for size in 128 256 512 1024 2048; do make test TEST='micro:java.util.regex.Trim.\\\(simple_find\\\|possessive_find\\\)' MICRO="FORK=2;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Fork(1) +@Warmup(iterations = 1) +@Measurement(iterations = 4) +@State(Scope.Benchmark) +public class Trim { + /** Run length of non-matching consecutive whitespace chars. 
*/ + @Param({"16", "256", "4096"}) + int size; + + /** String containing long interior run of whitespace */ + public String noMatch; + + public Pattern whitespaceRunPattern; + public Pattern eolPattern; + + public Pattern simplePattern; + public Pattern possessivePattern; + public Pattern possessivePattern2; + public Pattern possessivePattern3; + public Pattern lookBehindPattern; + + Pattern compile(String regex) { + Pattern pat = Pattern.compile(regex); + // ad hoc correctness checking + if (pat.matcher(noMatch).find()) { + throw new AssertionError("unexpected matching: " + regex); + } + return pat; + } + + @Setup(Level.Trial) + public void setup() { + noMatch = "xx" + " \t".repeat(size) + "yy"; + + simplePattern = compile("[ \t]+$"); + possessivePattern = compile("[ \t]++$"); + possessivePattern2 = compile("(.*+[^ \t]|^)([ \t]++)$"); + possessivePattern3 = compile("(?:[^ \t]|^)([ \t]++)$"); + lookBehindPattern = compile("(?