8225179: (regex) Minor Pattern cleanup
Reviewed-by: igerasim
This commit is contained in:
parent
bb4d8b504a
commit
a2f40ec3e1
@ -2100,7 +2100,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
private Node sequence(Node end) {
|
private Node sequence(Node end) {
|
||||||
Node head = null;
|
Node head = null;
|
||||||
Node tail = null;
|
Node tail = null;
|
||||||
Node node = null;
|
Node node;
|
||||||
LOOP:
|
LOOP:
|
||||||
for (;;) {
|
for (;;) {
|
||||||
int ch = peek();
|
int ch = peek();
|
||||||
@ -2617,7 +2617,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
CharPredicate prev = null;
|
CharPredicate prev = null;
|
||||||
CharPredicate curr = null;
|
CharPredicate curr = null;
|
||||||
BitClass bits = new BitClass();
|
BitClass bits = new BitClass();
|
||||||
BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
|
|
||||||
|
|
||||||
boolean isNeg = false;
|
boolean isNeg = false;
|
||||||
boolean hasBits = false;
|
boolean hasBits = false;
|
||||||
@ -2658,9 +2657,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
if (hasBits) {
|
if (hasBits) {
|
||||||
// bits used, union has high precedence
|
// bits used, union has high precedence
|
||||||
if (prev == null) {
|
if (prev == null) {
|
||||||
prev = curr = bitsP;
|
prev = curr = bits;
|
||||||
} else {
|
} else {
|
||||||
prev = prev.union(bitsP);
|
prev = prev.union(bits);
|
||||||
}
|
}
|
||||||
hasBits = false;
|
hasBits = false;
|
||||||
}
|
}
|
||||||
@ -2689,9 +2688,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
if (consume)
|
if (consume)
|
||||||
next();
|
next();
|
||||||
if (prev == null)
|
if (prev == null)
|
||||||
prev = bitsP;
|
prev = bits;
|
||||||
else if (hasBits)
|
else if (hasBits)
|
||||||
prev = prev.union(bitsP);
|
prev = prev.union(bits);
|
||||||
if (isNeg)
|
if (isNeg)
|
||||||
return prev.negate();
|
return prev.negate();
|
||||||
return prev;
|
return prev;
|
||||||
@ -2947,8 +2946,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
*/
|
*/
|
||||||
private Node group0() {
|
private Node group0() {
|
||||||
boolean capturingGroup = false;
|
boolean capturingGroup = false;
|
||||||
Node head = null;
|
Node head;
|
||||||
Node tail = null;
|
Node tail;
|
||||||
int save = flags0;
|
int save = flags0;
|
||||||
int saveTCNCount = topClosureNodes.size();
|
int saveTCNCount = topClosureNodes.size();
|
||||||
root = null;
|
root = null;
|
||||||
@ -2997,7 +2996,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
head = createGroup(true);
|
head = createGroup(true);
|
||||||
tail = root;
|
tail = root;
|
||||||
head.next = expr(tail);
|
head.next = expr(tail);
|
||||||
tail.next = lookbehindEnd;
|
tail.next = LookBehindEndNode.INSTANCE;
|
||||||
TreeInfo info = new TreeInfo();
|
TreeInfo info = new TreeInfo();
|
||||||
head.study(info);
|
head.study(info);
|
||||||
if (info.maxValid == false) {
|
if (info.maxValid == false) {
|
||||||
@ -3253,7 +3252,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
* Prev could be a single or a group, so it could be a chain of nodes.
|
* Prev could be a single or a group, so it could be a chain of nodes.
|
||||||
*/
|
*/
|
||||||
private Node closure(Node prev) {
|
private Node closure(Node prev) {
|
||||||
Node atom;
|
|
||||||
int ch = peek();
|
int ch = peek();
|
||||||
switch (ch) {
|
switch (ch) {
|
||||||
case '?':
|
case '?':
|
||||||
@ -3486,14 +3484,10 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
* never matches values above Latin-1, and a complemented BitClass always
|
* never matches values above Latin-1, and a complemented BitClass always
|
||||||
* matches values above Latin-1.
|
* matches values above Latin-1.
|
||||||
*/
|
*/
|
||||||
static final class BitClass extends BmpCharProperty {
|
static final class BitClass implements BmpCharPredicate {
|
||||||
final boolean[] bits;
|
final boolean[] bits;
|
||||||
BitClass() {
|
BitClass() {
|
||||||
this(new boolean[256]);
|
bits = new boolean[256];
|
||||||
}
|
|
||||||
private BitClass(boolean[] bits) {
|
|
||||||
super( ch -> ch < 256 && bits[ch]);
|
|
||||||
this.bits = bits;
|
|
||||||
}
|
}
|
||||||
BitClass add(int c, int flags) {
|
BitClass add(int c, int flags) {
|
||||||
assert c >= 0 && c <= 255;
|
assert c >= 0 && c <= 255;
|
||||||
@ -3509,8 +3503,12 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
bits[c] = true;
|
bits[c] = true;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
public boolean is(int ch) {
|
||||||
|
return ch < 256 && bits[ch];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility method for creating a string slice matcher.
|
* Utility method for creating a string slice matcher.
|
||||||
*/
|
*/
|
||||||
@ -3923,7 +3921,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
* boolean property.
|
* boolean property.
|
||||||
*/
|
*/
|
||||||
static class CharProperty extends Node {
|
static class CharProperty extends Node {
|
||||||
CharPredicate predicate;
|
final CharPredicate predicate;
|
||||||
|
|
||||||
CharProperty (CharPredicate predicate) {
|
CharProperty (CharPredicate predicate) {
|
||||||
this.predicate = predicate;
|
this.predicate = predicate;
|
||||||
@ -4698,7 +4696,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
* "next".
|
* "next".
|
||||||
*/
|
*/
|
||||||
static final class BranchConn extends Node {
|
static final class BranchConn extends Node {
|
||||||
BranchConn() {};
|
BranchConn() {}
|
||||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||||
return next.match(matcher, i, seq);
|
return next.match(matcher, i, seq);
|
||||||
}
|
}
|
||||||
@ -4795,34 +4793,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
matcher.locals[localIndex] = save;
|
matcher.locals[localIndex] = save;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
boolean matchRef(Matcher matcher, int i, CharSequence seq) {
|
|
||||||
int save = matcher.locals[localIndex];
|
|
||||||
matcher.locals[localIndex] = ~i; // HACK
|
|
||||||
boolean ret = next.match(matcher, i, seq);
|
|
||||||
matcher.locals[localIndex] = save;
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Recursive reference to a group in the regular expression. It calls
|
|
||||||
* matchRef because if the reference fails to match we would not unset
|
|
||||||
* the group.
|
|
||||||
*/
|
|
||||||
static final class GroupRef extends Node {
|
|
||||||
GroupHead head;
|
|
||||||
GroupRef(GroupHead head) {
|
|
||||||
this.head = head;
|
|
||||||
}
|
|
||||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
|
||||||
return head.matchRef(matcher, i, seq)
|
|
||||||
&& next.match(matcher, matcher.last, seq);
|
|
||||||
}
|
|
||||||
boolean study(TreeInfo info) {
|
|
||||||
info.maxValid = false;
|
|
||||||
info.deterministic = false;
|
|
||||||
return next.study(info);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -4944,7 +4914,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
}
|
}
|
||||||
boolean matchInit(Matcher matcher, int i, CharSequence seq) {
|
boolean matchInit(Matcher matcher, int i, CharSequence seq) {
|
||||||
int save = matcher.locals[countIndex];
|
int save = matcher.locals[countIndex];
|
||||||
boolean ret = false;
|
boolean ret;
|
||||||
if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
|
if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
|
||||||
matcher.localsPos[posIndex] = new IntHashSet();
|
matcher.localsPos[posIndex] = new IntHashSet();
|
||||||
}
|
}
|
||||||
@ -5168,7 +5138,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
}
|
}
|
||||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||||
int savedTo = matcher.to;
|
int savedTo = matcher.to;
|
||||||
boolean conditionMatched = false;
|
boolean conditionMatched;
|
||||||
|
|
||||||
// Relax transparent region boundaries for lookahead
|
// Relax transparent region boundaries for lookahead
|
||||||
if (matcher.transparentBounds)
|
if (matcher.transparentBounds)
|
||||||
@ -5193,7 +5163,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
}
|
}
|
||||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||||
int savedTo = matcher.to;
|
int savedTo = matcher.to;
|
||||||
boolean conditionMatched = false;
|
boolean conditionMatched;
|
||||||
|
|
||||||
// Relax transparent region boundaries for lookahead
|
// Relax transparent region boundaries for lookahead
|
||||||
if (matcher.transparentBounds)
|
if (matcher.transparentBounds)
|
||||||
@ -5219,11 +5189,15 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
* For use with lookbehinds; matches the position where the lookbehind
|
* For use with lookbehinds; matches the position where the lookbehind
|
||||||
* was encountered.
|
* was encountered.
|
||||||
*/
|
*/
|
||||||
static Node lookbehindEnd = new Node() {
|
static class LookBehindEndNode extends Node {
|
||||||
|
private LookBehindEndNode() {} // Singleton
|
||||||
|
|
||||||
|
static LookBehindEndNode INSTANCE = new LookBehindEndNode();
|
||||||
|
|
||||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||||
return i == matcher.lookbehindTo;
|
return i == matcher.lookbehindTo;
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Zero width positive lookbehind.
|
* Zero width positive lookbehind.
|
||||||
@ -5491,7 +5465,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||||||
if (patternLength < 4) {
|
if (patternLength < 4) {
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
int i, j, k;
|
int i, j;
|
||||||
int[] lastOcc = new int[128];
|
int[] lastOcc = new int[128];
|
||||||
int[] optoSft = new int[patternLength];
|
int[] optoSft = new int[patternLength];
|
||||||
// Precalculate part of the bad character shift
|
// Precalculate part of the bad character shift
|
||||||
@ -5646,7 +5620,7 @@ NEXT: while (i <= last) {
|
|||||||
static interface BmpCharPredicate extends CharPredicate {
|
static interface BmpCharPredicate extends CharPredicate {
|
||||||
|
|
||||||
default CharPredicate and(CharPredicate p) {
|
default CharPredicate and(CharPredicate p) {
|
||||||
if(p instanceof BmpCharPredicate)
|
if (p instanceof BmpCharPredicate)
|
||||||
return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
|
return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
|
||||||
return ch -> is(ch) && p.is(ch);
|
return ch -> is(ch) && p.is(ch);
|
||||||
}
|
}
|
||||||
|
@ -45,12 +45,13 @@ public class PatternBench {
|
|||||||
public String fileTestString;
|
public String fileTestString;
|
||||||
public String flagsString;
|
public String flagsString;
|
||||||
|
|
||||||
|
|
||||||
public Pattern graphemePattern;
|
public Pattern graphemePattern;
|
||||||
public Pattern jmodPattern;
|
public Pattern jmodPattern;
|
||||||
public Pattern jmodCanonicalPattern;
|
public Pattern jmodCanonicalPattern;
|
||||||
|
|
||||||
public Pattern pattern;
|
public String charPatternRegex;
|
||||||
|
public String[] charPatternStrings;
|
||||||
|
public Pattern charPattern;
|
||||||
|
|
||||||
@Setup
|
@Setup
|
||||||
public void setup() {
|
public void setup() {
|
||||||
@ -61,6 +62,10 @@ public class PatternBench {
|
|||||||
String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
|
String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
|
||||||
jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
|
jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
|
||||||
jmodPattern = Pattern.compile(jmodRegex);
|
jmodPattern = Pattern.compile(jmodRegex);
|
||||||
|
|
||||||
|
charPatternRegex = "[ a-zA-Z]*foo[ a-zA-Z0-9]*bar[ a-z]*";
|
||||||
|
charPatternStrings = new String[] {"avaaafooddddddbariiii", "lorem ipsum dolor foo bar", "fpp brr lorem ipsum dolor foo bar %", "lorem ipsum dolor foo bar lorem ipsum dolor foo bar lorem ipsum dolor foo bar /"};
|
||||||
|
charPattern = Pattern.compile(charPatternRegex);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Benchmark
|
@Benchmark
|
||||||
@ -83,4 +88,30 @@ public class PatternBench {
|
|||||||
public boolean normalJmodMatch() {
|
public boolean normalJmodMatch() {
|
||||||
return jmodPattern.matcher(fileTestString).matches();
|
return jmodPattern.matcher(fileTestString).matches();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
@Warmup(iterations = 3)
|
||||||
|
@Measurement(iterations = 3)
|
||||||
|
public boolean charPatternMatch() {
|
||||||
|
return charPattern.matcher(charPatternStrings[0]).matches()
|
||||||
|
&& charPattern.matcher(charPatternStrings[1]).matches()
|
||||||
|
&& charPattern.matcher(charPatternStrings[2]).matches();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
@Warmup(iterations = 3)
|
||||||
|
@Measurement(iterations = 3)
|
||||||
|
public boolean charPatternMatchWithCompile() {
|
||||||
|
Pattern p = Pattern.compile(charPatternRegex);
|
||||||
|
return p.matcher(charPatternStrings[0]).matches()
|
||||||
|
&& p.matcher(charPatternStrings[1]).matches()
|
||||||
|
&& p.matcher(charPatternStrings[2]).matches();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
@Warmup(iterations = 3)
|
||||||
|
@Measurement(iterations = 3)
|
||||||
|
public Pattern charPatternCompile() {
|
||||||
|
return Pattern.compile(charPatternRegex);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user