8225179: (regex) Minor Pattern cleanup

Reviewed-by: igerasim
This commit is contained in:
Claes Redestad 2019-06-05 10:07:22 +02:00
parent bb4d8b504a
commit a2f40ec3e1
2 changed files with 60 additions and 55 deletions

View File

@ -2100,7 +2100,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
private Node sequence(Node end) { private Node sequence(Node end) {
Node head = null; Node head = null;
Node tail = null; Node tail = null;
Node node = null; Node node;
LOOP: LOOP:
for (;;) { for (;;) {
int ch = peek(); int ch = peek();
@ -2617,7 +2617,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
CharPredicate prev = null; CharPredicate prev = null;
CharPredicate curr = null; CharPredicate curr = null;
BitClass bits = new BitClass(); BitClass bits = new BitClass();
BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
boolean isNeg = false; boolean isNeg = false;
boolean hasBits = false; boolean hasBits = false;
@ -2658,9 +2657,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (hasBits) { if (hasBits) {
// bits used, union has high precedence // bits used, union has high precedence
if (prev == null) { if (prev == null) {
prev = curr = bitsP; prev = curr = bits;
} else { } else {
prev = prev.union(bitsP); prev = prev.union(bits);
} }
hasBits = false; hasBits = false;
} }
@ -2689,9 +2688,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (consume) if (consume)
next(); next();
if (prev == null) if (prev == null)
prev = bitsP; prev = bits;
else if (hasBits) else if (hasBits)
prev = prev.union(bitsP); prev = prev.union(bits);
if (isNeg) if (isNeg)
return prev.negate(); return prev.negate();
return prev; return prev;
@ -2947,8 +2946,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
*/ */
private Node group0() { private Node group0() {
boolean capturingGroup = false; boolean capturingGroup = false;
Node head = null; Node head;
Node tail = null; Node tail;
int save = flags0; int save = flags0;
int saveTCNCount = topClosureNodes.size(); int saveTCNCount = topClosureNodes.size();
root = null; root = null;
@ -2997,7 +2996,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
head = createGroup(true); head = createGroup(true);
tail = root; tail = root;
head.next = expr(tail); head.next = expr(tail);
tail.next = lookbehindEnd; tail.next = LookBehindEndNode.INSTANCE;
TreeInfo info = new TreeInfo(); TreeInfo info = new TreeInfo();
head.study(info); head.study(info);
if (info.maxValid == false) { if (info.maxValid == false) {
@ -3253,7 +3252,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* Prev could be a single or a group, so it could be a chain of nodes. * Prev could be a single or a group, so it could be a chain of nodes.
*/ */
private Node closure(Node prev) { private Node closure(Node prev) {
Node atom;
int ch = peek(); int ch = peek();
switch (ch) { switch (ch) {
case '?': case '?':
@ -3486,14 +3484,10 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* never matches values above Latin-1, and a complemented BitClass always * never matches values above Latin-1, and a complemented BitClass always
* matches values above Latin-1. * matches values above Latin-1.
*/ */
static final class BitClass extends BmpCharProperty { static final class BitClass implements BmpCharPredicate {
final boolean[] bits; final boolean[] bits;
BitClass() { BitClass() {
this(new boolean[256]); bits = new boolean[256];
}
private BitClass(boolean[] bits) {
super( ch -> ch < 256 && bits[ch]);
this.bits = bits;
} }
BitClass add(int c, int flags) { BitClass add(int c, int flags) {
assert c >= 0 && c <= 255; assert c >= 0 && c <= 255;
@ -3509,8 +3503,12 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
bits[c] = true; bits[c] = true;
return this; return this;
} }
/**
 * Tests whether {@code ch} is a member of this bit class.
 *
 * @param ch the value to test
 * @return {@code true} only if {@code ch} is below 256 and its slot in
 *         {@code bits} is set; values of 256 and above never match
 */
public boolean is(int ch) {
    // The backing array only covers indices below 256.
    if (ch >= 256) {
        return false;
    }
    return bits[ch];
}
} }
/** /**
* Utility method for creating a string slice matcher. * Utility method for creating a string slice matcher.
*/ */
@ -3923,7 +3921,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* boolean property. * boolean property.
*/ */
static class CharProperty extends Node { static class CharProperty extends Node {
CharPredicate predicate; final CharPredicate predicate;
CharProperty (CharPredicate predicate) { CharProperty (CharPredicate predicate) {
this.predicate = predicate; this.predicate = predicate;
@ -4698,7 +4696,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* "next". * "next".
*/ */
static final class BranchConn extends Node { static final class BranchConn extends Node {
BranchConn() {}; BranchConn() {}
boolean match(Matcher matcher, int i, CharSequence seq) { boolean match(Matcher matcher, int i, CharSequence seq) {
return next.match(matcher, i, seq); return next.match(matcher, i, seq);
} }
@ -4795,34 +4793,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
matcher.locals[localIndex] = save; matcher.locals[localIndex] = save;
return ret; return ret;
} }
boolean matchRef(Matcher matcher, int i, CharSequence seq) {
int save = matcher.locals[localIndex];
matcher.locals[localIndex] = ~i; // HACK
boolean ret = next.match(matcher, i, seq);
matcher.locals[localIndex] = save;
return ret;
}
}
/**
* Recursive reference to a group in the regular expression. It calls
* matchRef because if the reference fails to match we would not unset
* the group.
*/
static final class GroupRef extends Node {
GroupHead head;
GroupRef(GroupHead head) {
this.head = head;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
return head.matchRef(matcher, i, seq)
&& next.match(matcher, matcher.last, seq);
}
boolean study(TreeInfo info) {
info.maxValid = false;
info.deterministic = false;
return next.study(info);
}
} }
/** /**
@ -4944,7 +4914,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} }
boolean matchInit(Matcher matcher, int i, CharSequence seq) { boolean matchInit(Matcher matcher, int i, CharSequence seq) {
int save = matcher.locals[countIndex]; int save = matcher.locals[countIndex];
boolean ret = false; boolean ret;
if (posIndex != -1 && matcher.localsPos[posIndex] == null) { if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
matcher.localsPos[posIndex] = new IntHashSet(); matcher.localsPos[posIndex] = new IntHashSet();
} }
@ -5168,7 +5138,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} }
boolean match(Matcher matcher, int i, CharSequence seq) { boolean match(Matcher matcher, int i, CharSequence seq) {
int savedTo = matcher.to; int savedTo = matcher.to;
boolean conditionMatched = false; boolean conditionMatched;
// Relax transparent region boundaries for lookahead // Relax transparent region boundaries for lookahead
if (matcher.transparentBounds) if (matcher.transparentBounds)
@ -5193,7 +5163,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} }
boolean match(Matcher matcher, int i, CharSequence seq) { boolean match(Matcher matcher, int i, CharSequence seq) {
int savedTo = matcher.to; int savedTo = matcher.to;
boolean conditionMatched = false; boolean conditionMatched;
// Relax transparent region boundaries for lookahead // Relax transparent region boundaries for lookahead
if (matcher.transparentBounds) if (matcher.transparentBounds)
@ -5219,11 +5189,15 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* For use with lookbehinds; matches the position where the lookbehind * For use with lookbehinds; matches the position where the lookbehind
* was encountered. * was encountered.
*/ */
static Node lookbehindEnd = new Node() { static class LookBehindEndNode extends Node {
private LookBehindEndNode() {} // Singleton
static LookBehindEndNode INSTANCE = new LookBehindEndNode();
boolean match(Matcher matcher, int i, CharSequence seq) { boolean match(Matcher matcher, int i, CharSequence seq) {
return i == matcher.lookbehindTo; return i == matcher.lookbehindTo;
} }
}; }
/** /**
* Zero width positive lookbehind. * Zero width positive lookbehind.
@ -5491,7 +5465,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (patternLength < 4) { if (patternLength < 4) {
return node; return node;
} }
int i, j, k; int i, j;
int[] lastOcc = new int[128]; int[] lastOcc = new int[128];
int[] optoSft = new int[patternLength]; int[] optoSft = new int[patternLength];
// Precalculate part of the bad character shift // Precalculate part of the bad character shift
@ -5646,7 +5620,7 @@ NEXT: while (i <= last) {
static interface BmpCharPredicate extends CharPredicate { static interface BmpCharPredicate extends CharPredicate {
default CharPredicate and(CharPredicate p) { default CharPredicate and(CharPredicate p) {
if(p instanceof BmpCharPredicate) if (p instanceof BmpCharPredicate)
return (BmpCharPredicate)(ch -> is(ch) && p.is(ch)); return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
return ch -> is(ch) && p.is(ch); return ch -> is(ch) && p.is(ch);
} }

View File

@ -45,12 +45,13 @@ public class PatternBench {
public String fileTestString; public String fileTestString;
public String flagsString; public String flagsString;
public Pattern graphemePattern; public Pattern graphemePattern;
public Pattern jmodPattern; public Pattern jmodPattern;
public Pattern jmodCanonicalPattern; public Pattern jmodCanonicalPattern;
public Pattern pattern; public String charPatternRegex;
public String[] charPatternStrings;
public Pattern charPattern;
@Setup @Setup
public void setup() { public void setup() {
@ -61,6 +62,10 @@ public class PatternBench {
String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$"; String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ); jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
jmodPattern = Pattern.compile(jmodRegex); jmodPattern = Pattern.compile(jmodRegex);
charPatternRegex = "[ a-zA-Z]*foo[ a-zA-Z0-9]*bar[ a-z]*";
charPatternStrings = new String[] {"avaaafooddddddbariiii", "lorem ipsum dolor foo bar", "fpp brr lorem ipsum dolor foo bar %", "lorem ipsum dolor foo bar lorem ipsum dolor foo bar lorem ipsum dolor foo bar /"};
charPattern = Pattern.compile(charPatternRegex);
} }
@Benchmark @Benchmark
@ -83,4 +88,30 @@ public class PatternBench {
public boolean normalJmodMatch() { public boolean normalJmodMatch() {
return jmodPattern.matcher(fileTestString).matches(); return jmodPattern.matcher(fileTestString).matches();
} }
/**
 * Matches the pattern held in {@code charPattern} against the first three
 * entries of {@code charPatternStrings}, in order, stopping at the first
 * entry that does not match in full.
 *
 * @return {@code true} only if all three inputs match
 */
@Benchmark
@Warmup(iterations = 3)
@Measurement(iterations = 3)
public boolean charPatternMatch() {
    if (!charPattern.matcher(charPatternStrings[0]).matches()) {
        return false;
    }
    if (!charPattern.matcher(charPatternStrings[1]).matches()) {
        return false;
    }
    return charPattern.matcher(charPatternStrings[2]).matches();
}
/**
 * Compiles {@code charPatternRegex} on every invocation and then matches
 * the resulting pattern against the first three entries of
 * {@code charPatternStrings}, in order, stopping at the first entry that
 * does not match in full.
 *
 * @return {@code true} only if all three inputs match
 */
@Benchmark
@Warmup(iterations = 3)
@Measurement(iterations = 3)
public boolean charPatternMatchWithCompile() {
    Pattern compiled = Pattern.compile(charPatternRegex);
    if (!compiled.matcher(charPatternStrings[0]).matches()) {
        return false;
    }
    if (!compiled.matcher(charPatternStrings[1]).matches()) {
        return false;
    }
    return compiled.matcher(charPatternStrings[2]).matches();
}
/**
 * Compiles {@code charPatternRegex} and hands the result back to the
 * caller, so only pattern compilation is exercised.
 *
 * @return the freshly compiled pattern
 */
@Benchmark
@Warmup(iterations = 3)
@Measurement(iterations = 3)
public Pattern charPatternCompile() {
    Pattern compiled = Pattern.compile(charPatternRegex);
    return compiled;
}
} }