8225179: (regex) Minor Pattern cleanup
Reviewed-by: igerasim
This commit is contained in:
parent
bb4d8b504a
commit
a2f40ec3e1
@ -2100,7 +2100,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
private Node sequence(Node end) {
|
||||
Node head = null;
|
||||
Node tail = null;
|
||||
Node node = null;
|
||||
Node node;
|
||||
LOOP:
|
||||
for (;;) {
|
||||
int ch = peek();
|
||||
@ -2617,7 +2617,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
CharPredicate prev = null;
|
||||
CharPredicate curr = null;
|
||||
BitClass bits = new BitClass();
|
||||
BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
|
||||
|
||||
boolean isNeg = false;
|
||||
boolean hasBits = false;
|
||||
@ -2658,9 +2657,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
if (hasBits) {
|
||||
// bits used, union has high precedence
|
||||
if (prev == null) {
|
||||
prev = curr = bitsP;
|
||||
prev = curr = bits;
|
||||
} else {
|
||||
prev = prev.union(bitsP);
|
||||
prev = prev.union(bits);
|
||||
}
|
||||
hasBits = false;
|
||||
}
|
||||
@ -2689,9 +2688,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
if (consume)
|
||||
next();
|
||||
if (prev == null)
|
||||
prev = bitsP;
|
||||
prev = bits;
|
||||
else if (hasBits)
|
||||
prev = prev.union(bitsP);
|
||||
prev = prev.union(bits);
|
||||
if (isNeg)
|
||||
return prev.negate();
|
||||
return prev;
|
||||
@ -2947,8 +2946,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
*/
|
||||
private Node group0() {
|
||||
boolean capturingGroup = false;
|
||||
Node head = null;
|
||||
Node tail = null;
|
||||
Node head;
|
||||
Node tail;
|
||||
int save = flags0;
|
||||
int saveTCNCount = topClosureNodes.size();
|
||||
root = null;
|
||||
@ -2997,7 +2996,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
head = createGroup(true);
|
||||
tail = root;
|
||||
head.next = expr(tail);
|
||||
tail.next = lookbehindEnd;
|
||||
tail.next = LookBehindEndNode.INSTANCE;
|
||||
TreeInfo info = new TreeInfo();
|
||||
head.study(info);
|
||||
if (info.maxValid == false) {
|
||||
@ -3253,7 +3252,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
* Prev could be a single or a group, so it could be a chain of nodes.
|
||||
*/
|
||||
private Node closure(Node prev) {
|
||||
Node atom;
|
||||
int ch = peek();
|
||||
switch (ch) {
|
||||
case '?':
|
||||
@ -3486,14 +3484,10 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
* never matches values above Latin-1, and a complemented BitClass always
|
||||
* matches values above Latin-1.
|
||||
*/
|
||||
static final class BitClass extends BmpCharProperty {
|
||||
static final class BitClass implements BmpCharPredicate {
|
||||
final boolean[] bits;
|
||||
BitClass() {
|
||||
this(new boolean[256]);
|
||||
}
|
||||
private BitClass(boolean[] bits) {
|
||||
super( ch -> ch < 256 && bits[ch]);
|
||||
this.bits = bits;
|
||||
bits = new boolean[256];
|
||||
}
|
||||
BitClass add(int c, int flags) {
|
||||
assert c >= 0 && c <= 255;
|
||||
@ -3509,8 +3503,12 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
bits[c] = true;
|
||||
return this;
|
||||
}
|
||||
public boolean is(int ch) {
|
||||
return ch < 256 && bits[ch];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Utility method for creating a string slice matcher.
|
||||
*/
|
||||
@ -3923,7 +3921,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
* boolean property.
|
||||
*/
|
||||
static class CharProperty extends Node {
|
||||
CharPredicate predicate;
|
||||
final CharPredicate predicate;
|
||||
|
||||
CharProperty (CharPredicate predicate) {
|
||||
this.predicate = predicate;
|
||||
@ -4698,7 +4696,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
* "next".
|
||||
*/
|
||||
static final class BranchConn extends Node {
|
||||
BranchConn() {};
|
||||
BranchConn() {}
|
||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||
return next.match(matcher, i, seq);
|
||||
}
|
||||
@ -4795,34 +4793,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
matcher.locals[localIndex] = save;
|
||||
return ret;
|
||||
}
|
||||
boolean matchRef(Matcher matcher, int i, CharSequence seq) {
|
||||
int save = matcher.locals[localIndex];
|
||||
matcher.locals[localIndex] = ~i; // HACK
|
||||
boolean ret = next.match(matcher, i, seq);
|
||||
matcher.locals[localIndex] = save;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursive reference to a group in the regular expression. It calls
|
||||
* matchRef because if the reference fails to match we would not unset
|
||||
* the group.
|
||||
*/
|
||||
static final class GroupRef extends Node {
|
||||
GroupHead head;
|
||||
GroupRef(GroupHead head) {
|
||||
this.head = head;
|
||||
}
|
||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||
return head.matchRef(matcher, i, seq)
|
||||
&& next.match(matcher, matcher.last, seq);
|
||||
}
|
||||
boolean study(TreeInfo info) {
|
||||
info.maxValid = false;
|
||||
info.deterministic = false;
|
||||
return next.study(info);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4944,7 +4914,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
}
|
||||
boolean matchInit(Matcher matcher, int i, CharSequence seq) {
|
||||
int save = matcher.locals[countIndex];
|
||||
boolean ret = false;
|
||||
boolean ret;
|
||||
if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
|
||||
matcher.localsPos[posIndex] = new IntHashSet();
|
||||
}
|
||||
@ -5168,7 +5138,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
}
|
||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||
int savedTo = matcher.to;
|
||||
boolean conditionMatched = false;
|
||||
boolean conditionMatched;
|
||||
|
||||
// Relax transparent region boundaries for lookahead
|
||||
if (matcher.transparentBounds)
|
||||
@ -5193,7 +5163,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
}
|
||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||
int savedTo = matcher.to;
|
||||
boolean conditionMatched = false;
|
||||
boolean conditionMatched;
|
||||
|
||||
// Relax transparent region boundaries for lookahead
|
||||
if (matcher.transparentBounds)
|
||||
@ -5219,11 +5189,15 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
* For use with lookbehinds; matches the position where the lookbehind
|
||||
* was encountered.
|
||||
*/
|
||||
static Node lookbehindEnd = new Node() {
|
||||
static class LookBehindEndNode extends Node {
|
||||
private LookBehindEndNode() {} // Singleton
|
||||
|
||||
static LookBehindEndNode INSTANCE = new LookBehindEndNode();
|
||||
|
||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||
return i == matcher.lookbehindTo;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Zero width positive lookbehind.
|
||||
@ -5491,7 +5465,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
if (patternLength < 4) {
|
||||
return node;
|
||||
}
|
||||
int i, j, k;
|
||||
int i, j;
|
||||
int[] lastOcc = new int[128];
|
||||
int[] optoSft = new int[patternLength];
|
||||
// Precalculate part of the bad character shift
|
||||
@ -5646,7 +5620,7 @@ NEXT: while (i <= last) {
|
||||
static interface BmpCharPredicate extends CharPredicate {
|
||||
|
||||
default CharPredicate and(CharPredicate p) {
|
||||
if(p instanceof BmpCharPredicate)
|
||||
if (p instanceof BmpCharPredicate)
|
||||
return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
|
||||
return ch -> is(ch) && p.is(ch);
|
||||
}
|
||||
|
@ -45,12 +45,13 @@ public class PatternBench {
|
||||
public String fileTestString;
|
||||
public String flagsString;
|
||||
|
||||
|
||||
public Pattern graphemePattern;
|
||||
public Pattern jmodPattern;
|
||||
public Pattern jmodCanonicalPattern;
|
||||
|
||||
public Pattern pattern;
|
||||
public String charPatternRegex;
|
||||
public String[] charPatternStrings;
|
||||
public Pattern charPattern;
|
||||
|
||||
@Setup
|
||||
public void setup() {
|
||||
@ -61,6 +62,10 @@ public class PatternBench {
|
||||
String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
|
||||
jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
|
||||
jmodPattern = Pattern.compile(jmodRegex);
|
||||
|
||||
charPatternRegex = "[ a-zA-Z]*foo[ a-zA-Z0-9]*bar[ a-z]*";
|
||||
charPatternStrings = new String[] {"avaaafooddddddbariiii", "lorem ipsum dolor foo bar", "fpp brr lorem ipsum dolor foo bar %", "lorem ipsum dolor foo bar lorem ipsum dolor foo bar lorem ipsum dolor foo bar /"};
|
||||
charPattern = Pattern.compile(charPatternRegex);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@ -83,4 +88,30 @@ public class PatternBench {
|
||||
public boolean normalJmodMatch() {
|
||||
return jmodPattern.matcher(fileTestString).matches();
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@Warmup(iterations = 3)
|
||||
@Measurement(iterations = 3)
|
||||
public boolean charPatternMatch() {
|
||||
return charPattern.matcher(charPatternStrings[0]).matches()
|
||||
&& charPattern.matcher(charPatternStrings[1]).matches()
|
||||
&& charPattern.matcher(charPatternStrings[2]).matches();
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@Warmup(iterations = 3)
|
||||
@Measurement(iterations = 3)
|
||||
public boolean charPatternMatchWithCompile() {
|
||||
Pattern p = Pattern.compile(charPatternRegex);
|
||||
return p.matcher(charPatternStrings[0]).matches()
|
||||
&& p.matcher(charPatternStrings[1]).matches()
|
||||
&& p.matcher(charPatternStrings[2]).matches();
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@Warmup(iterations = 3)
|
||||
@Measurement(iterations = 3)
|
||||
public Pattern charPatternCompile() {
|
||||
return Pattern.compile(charPatternRegex);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user