8225179: (regex) Minor Pattern cleanup

Reviewed-by: igerasim
This commit is contained in:
Claes Redestad 2019-06-05 10:07:22 +02:00
parent bb4d8b504a
commit a2f40ec3e1
2 changed files with 60 additions and 55 deletions

View File

@ -2100,7 +2100,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
private Node sequence(Node end) {
Node head = null;
Node tail = null;
Node node = null;
Node node;
LOOP:
for (;;) {
int ch = peek();
@ -2617,7 +2617,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
CharPredicate prev = null;
CharPredicate curr = null;
BitClass bits = new BitClass();
BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
boolean isNeg = false;
boolean hasBits = false;
@ -2658,9 +2657,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (hasBits) {
// bits used, union has high precedence
if (prev == null) {
prev = curr = bitsP;
prev = curr = bits;
} else {
prev = prev.union(bitsP);
prev = prev.union(bits);
}
hasBits = false;
}
@ -2689,9 +2688,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (consume)
next();
if (prev == null)
prev = bitsP;
prev = bits;
else if (hasBits)
prev = prev.union(bitsP);
prev = prev.union(bits);
if (isNeg)
return prev.negate();
return prev;
@ -2947,8 +2946,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
*/
private Node group0() {
boolean capturingGroup = false;
Node head = null;
Node tail = null;
Node head;
Node tail;
int save = flags0;
int saveTCNCount = topClosureNodes.size();
root = null;
@ -2997,7 +2996,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
head = createGroup(true);
tail = root;
head.next = expr(tail);
tail.next = lookbehindEnd;
tail.next = LookBehindEndNode.INSTANCE;
TreeInfo info = new TreeInfo();
head.study(info);
if (info.maxValid == false) {
@ -3253,7 +3252,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* Prev could be a single or a group, so it could be a chain of nodes.
*/
private Node closure(Node prev) {
Node atom;
int ch = peek();
switch (ch) {
case '?':
@ -3486,14 +3484,10 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* never matches values above Latin-1, and a complemented BitClass always
* matches values above Latin-1.
*/
static final class BitClass extends BmpCharProperty {
static final class BitClass implements BmpCharPredicate {
final boolean[] bits;
BitClass() {
this(new boolean[256]);
}
private BitClass(boolean[] bits) {
super( ch -> ch < 256 && bits[ch]);
this.bits = bits;
bits = new boolean[256];
}
BitClass add(int c, int flags) {
assert c >= 0 && c <= 255;
@ -3509,8 +3503,12 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
bits[c] = true;
return this;
}
public boolean is(int ch) {
return ch < 256 && bits[ch];
}
}
/**
* Utility method for creating a string slice matcher.
*/
@ -3923,7 +3921,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* boolean property.
*/
static class CharProperty extends Node {
CharPredicate predicate;
final CharPredicate predicate;
CharProperty (CharPredicate predicate) {
this.predicate = predicate;
@ -4698,7 +4696,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* "next".
*/
static final class BranchConn extends Node {
BranchConn() {};
BranchConn() {}
boolean match(Matcher matcher, int i, CharSequence seq) {
return next.match(matcher, i, seq);
}
@ -4795,34 +4793,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
matcher.locals[localIndex] = save;
return ret;
}
boolean matchRef(Matcher matcher, int i, CharSequence seq) {
int save = matcher.locals[localIndex];
matcher.locals[localIndex] = ~i; // HACK
boolean ret = next.match(matcher, i, seq);
matcher.locals[localIndex] = save;
return ret;
}
}
/**
* Recursive reference to a group in the regular expression. It calls
* matchRef because if the reference fails to match we would not unset
* the group.
*/
static final class GroupRef extends Node {
GroupHead head;
GroupRef(GroupHead head) {
this.head = head;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
return head.matchRef(matcher, i, seq)
&& next.match(matcher, matcher.last, seq);
}
boolean study(TreeInfo info) {
info.maxValid = false;
info.deterministic = false;
return next.study(info);
}
}
/**
@ -4944,7 +4914,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
boolean matchInit(Matcher matcher, int i, CharSequence seq) {
int save = matcher.locals[countIndex];
boolean ret = false;
boolean ret;
if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
matcher.localsPos[posIndex] = new IntHashSet();
}
@ -5168,7 +5138,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int savedTo = matcher.to;
boolean conditionMatched = false;
boolean conditionMatched;
// Relax transparent region boundaries for lookahead
if (matcher.transparentBounds)
@ -5193,7 +5163,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int savedTo = matcher.to;
boolean conditionMatched = false;
boolean conditionMatched;
// Relax transparent region boundaries for lookahead
if (matcher.transparentBounds)
@ -5219,11 +5189,15 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* For use with lookbehinds; matches the position where the lookbehind
* was encountered.
*/
static Node lookbehindEnd = new Node() {
static class LookBehindEndNode extends Node {
private LookBehindEndNode() {} // Singleton
static LookBehindEndNode INSTANCE = new LookBehindEndNode();
boolean match(Matcher matcher, int i, CharSequence seq) {
return i == matcher.lookbehindTo;
}
};
}
/**
* Zero width positive lookbehind.
@ -5491,7 +5465,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (patternLength < 4) {
return node;
}
int i, j, k;
int i, j;
int[] lastOcc = new int[128];
int[] optoSft = new int[patternLength];
// Precalculate part of the bad character shift
@ -5646,7 +5620,7 @@ NEXT: while (i <= last) {
static interface BmpCharPredicate extends CharPredicate {
default CharPredicate and(CharPredicate p) {
if(p instanceof BmpCharPredicate)
if (p instanceof BmpCharPredicate)
return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
return ch -> is(ch) && p.is(ch);
}

View File

@ -45,12 +45,13 @@ public class PatternBench {
public String fileTestString;
public String flagsString;
public Pattern graphemePattern;
public Pattern jmodPattern;
public Pattern jmodCanonicalPattern;
public Pattern pattern;
public String charPatternRegex;
public String[] charPatternStrings;
public Pattern charPattern;
@Setup
public void setup() {
@ -61,6 +62,10 @@ public class PatternBench {
String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
jmodPattern = Pattern.compile(jmodRegex);
charPatternRegex = "[ a-zA-Z]*foo[ a-zA-Z0-9]*bar[ a-z]*";
charPatternStrings = new String[] {"avaaafooddddddbariiii", "lorem ipsum dolor foo bar", "fpp brr lorem ipsum dolor foo bar %", "lorem ipsum dolor foo bar lorem ipsum dolor foo bar lorem ipsum dolor foo bar /"};
charPattern = Pattern.compile(charPatternRegex);
}
@Benchmark
@ -83,4 +88,30 @@ public class PatternBench {
public boolean normalJmodMatch() {
return jmodPattern.matcher(fileTestString).matches();
}
@Benchmark
@Warmup(iterations = 3)
@Measurement(iterations = 3)
public boolean charPatternMatch() {
return charPattern.matcher(charPatternStrings[0]).matches()
&& charPattern.matcher(charPatternStrings[1]).matches()
&& charPattern.matcher(charPatternStrings[2]).matches();
}
@Benchmark
@Warmup(iterations = 3)
@Measurement(iterations = 3)
public boolean charPatternMatchWithCompile() {
Pattern p = Pattern.compile(charPatternRegex);
return p.matcher(charPatternStrings[0]).matches()
&& p.matcher(charPatternStrings[1]).matches()
&& p.matcher(charPatternStrings[2]).matches();
}
@Benchmark
@Warmup(iterations = 3)
@Measurement(iterations = 3)
public Pattern charPatternCompile() {
return Pattern.compile(charPatternRegex);
}
}