7014640: To add a metachar \R for line ending and character classes for vertical/horizontal ws \v \V \h \H

Added propsoed constructs

Reviewed-by: alanb
This commit is contained in:
Xueming Shen 2012-05-08 10:57:13 -07:00
parent 828158fb8d
commit c8da8f2595
2 changed files with 189 additions and 35 deletions

View File

@ -152,15 +152,24 @@ import java.util.Arrays;
* <td headers="matches">A digit: <tt>[0-9]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\D</tt></td>
* <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\h</tt></td>
* <td headers="matches">A horizontal whitespace character:
* <tt>[ \t\xA0&#92;u1680&#92;u180e&#92;u2000-&#92;u200a&#92;u202f&#92;u205f&#92;u3000]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\H</tt></td>
* <td headers="matches">A non-horizontal whitespace character: <tt>[^\h]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\s</tt></td>
* <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\S</tt></td>
* <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\v</tt></td>
* <td headers="matches">A vertical whitespace character: <tt>[\n\x0B\f\r\x85&#92;u2028&#92;u2029]</tt>
* </td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\V</tt></td>
* <td headers="matches">A non-vertical whitespace character: <tt>[^\v]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\w</tt></td>
* <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\W</tt></td>
* <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr>
*
* <tr><th>&nbsp;</th></tr>
* <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr>
*
@ -244,6 +253,13 @@ import java.util.Arrays;
* <td headers="matches">The end of the input</td></tr>
*
* <tr><th>&nbsp;</th></tr>
* <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
* <tr><td valign="top" headers="construct lineending"><tt>\R</tt></td>
* <td headers="matches">Any Unicode linebreak sequence, is equivalent to
* <tt>&#92;u000D&#92;u000A|[&#92;u000A&#92;u000B&#92;u000C&#92;u000D&#92;u0085&#92;u2028&#92;u2029]
* </tt></td></tr>
*
* <tr><th>&nbsp;</th></tr>
* <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
*
* <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td>
@ -599,11 +615,9 @@ import java.util.Arrays;
* <li> Noncharacter_Code_Point
* <li> Assigned
* </ul>
* <p>
* <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
* conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
* The following <b>Predefined Character classes</b> and <b>POSIX character classes</b>
* are in conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
* of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression
* </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified.
* <p>
@ -668,12 +682,6 @@ import java.util.Arrays;
*
* <ul>
* <li><p> Predefined character classes (Unicode character)
* <p><tt>\h&nbsp;&nbsp;&nbsp;&nbsp;</tt>A horizontal whitespace
* <p><tt>\H&nbsp;&nbsp;&nbsp;&nbsp;</tt>A non horizontal whitespace
* <p><tt>\v&nbsp;&nbsp;&nbsp;&nbsp;</tt>A vertical whitespace
* <p><tt>\V&nbsp;&nbsp;&nbsp;&nbsp;</tt>A non vertical whitespace
* <p><tt>\R&nbsp;&nbsp;&nbsp;&nbsp;</tt>Any Unicode linebreak sequence
* <tt>\u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]</tt>
* <p><tt>\X&nbsp;&nbsp;&nbsp;&nbsp;</tt>Match Unicode
* <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
* <i>extended grapheme cluster</i></a>
@ -2178,7 +2186,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
unread();
prev = cursor;
ch = escape(false, first == 0);
ch = escape(false, first == 0, false);
if (ch >= 0) {
append(ch, first);
first++;
@ -2276,7 +2284,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* If the returned value is greater than zero, it is the value that
* matches the escape sequence.
*/
private int escape(boolean inclass, boolean create) {
private int escape(boolean inclass, boolean create, boolean isrange) {
int ch = skip();
switch (ch) {
case '0':
@ -2318,6 +2326,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (create) root = new LastMatch();
return -1;
case 'H':
if (create) root = new HorizWS().complement();
return -1;
case 'I':
case 'J':
case 'K':
@ -2327,8 +2337,11 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
case 'O':
case 'P':
case 'Q':
case 'R':
break;
case 'R':
if (inclass) break;
if (create) root = new LineEnding();
return -1;
case 'S':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WHITE_SPACE).complement()
@ -2336,8 +2349,10 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
return -1;
case 'T':
case 'U':
case 'V':
break;
case 'V':
if (create) root = new VertWS().complement();
return -1;
case 'W':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WORD).complement()
@ -2373,7 +2388,10 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
case 'f':
return '\f';
case 'g':
break;
case 'h':
if (create) root = new HorizWS();
return -1;
case 'i':
case 'j':
break;
@ -2413,7 +2431,18 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
case 'u':
return u();
case 'v':
return '\013';
// '\v' was implemented as VT/0x0B in releases < 1.8 (though
// undocumented). In JDK8 '\v' is specified as a predefined
// character class for all vertical whitespace characters.
// So [-1, root=VertWS node] pair is returned (instead of a
// single 0x0B). This breaks the range if '\v' is used as
// the start or end value, such as [\v-...] or [...-\v], in
// which a single definite value (0x0B) is expected. For
// compatiblity concern '\013'/0x0B is returned if isrange.
if (isrange)
return '\013';
if (create) root = new VertWS();
return -1;
case 'w':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WORD)
@ -2590,13 +2619,14 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
oneLetter = false;
return family(oneLetter, comp);
} else { // ordinary escape
boolean isrange = temp[cursor+1] == '-';
unread();
ch = escape(true, true);
ch = escape(true, true, isrange);
if (ch == -1)
return (CharProperty) root;
}
} else {
ch = single();
next();
}
if (ch >= 0) {
if (peek() == '-') {
@ -2606,9 +2636,15 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
if (endRange != ']') {
next();
int m = single();
if (m < ch)
int m = peek();
if (m == '\\') {
m = escape(true, false, true);
} else {
next();
}
if (m < ch) {
throw error("Illegal character range");
}
if (has(CASE_INSENSITIVE))
return caseInsensitiveRangeFor(ch, m);
else
@ -2620,17 +2656,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
throw error("Unexpected character '"+((char)ch)+"'");
}
private int single() {
int ch = peek();
switch (ch) {
case '\\':
return escape(true, false);
default:
next();
return ch;
}
}
/**
* Parses a Unicode character family and returns its representative node.
*/
@ -3694,6 +3719,35 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
}
/**
* Node class that matches a Unicode line ending '\R'
*/
static final class LineEnding extends Node {
boolean match(Matcher matcher, int i, CharSequence seq) {
// (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029])
if (i < matcher.to) {
int ch = seq.charAt(i);
if (ch == 0x0A || ch == 0x0B || ch == 0x0C ||
ch == 0x85 || ch == 0x2028 || ch == 0x2029)
return next.match(matcher, i + 1, seq);
if (ch == 0x0D) {
i++;
if (i < matcher.to && seq.charAt(i) == 0x0A)
i++;
return next.match(matcher, i, seq);
}
} else {
matcher.hitEnd = true;
}
return false;
}
boolean study(TreeInfo info) {
info.minLength++;
info.maxLength += 2;
return next.study(info);
}
}
/**
* Abstract node class to match one character satisfying some
* boolean property.
@ -3789,7 +3843,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
}
/**
* Node class that matches a Unicode block.
*/
@ -3838,7 +3891,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
}
/**
* Node class that matches a POSIX type.
*/
@ -3850,6 +3902,28 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
}
/**
* Node class that matches a Perl vertical whitespace
*/
static final class VertWS extends BmpCharProperty {
boolean isSatisfiedBy(int cp) {
return (cp >= 0x0A && cp <= 0x0D) ||
cp == 0x85 || cp == 0x2028 || cp == 0x2029;
}
}
/**
* Node class that matches a Perl horizontal whitespace
*/
static final class HorizWS extends BmpCharProperty {
boolean isSatisfiedBy(int cp) {
return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
cp == 0x1680 || cp == 0x180e ||
cp >= 0x2000 && cp <= 0x200a ||
cp == 0x202f || cp == 0x205f || cp == 0x3000;
}
}
/**
* Base class for all Slice nodes
*/

View File

@ -33,7 +33,7 @@
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
* 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
* 7067045
* 7067045 7014640
*/
import java.util.regex.*;
@ -141,6 +141,8 @@ public class RegExTest {
unicodePropertiesTest();
unicodeHexNotationTest();
unicodeClassesTest();
horizontalAndVerticalWSTest();
linebreakTest();
if (failure) {
throw new
RuntimeException("RegExTest failed, 1st failure: " +
@ -857,13 +859,18 @@ public class RegExTest {
// in replacement string
try {
"\uac00".replaceAll("\uac00", "$");
failCount++;
} catch (IllegalArgumentException iie) {
} catch (Exception e) {
failCount++;
}
try {
"\uac00".replaceAll("\uac00", "\\");
failCount++;
} catch (IllegalArgumentException iie) {
} catch (Exception e) {
failCount++;
}
report("Literal replacement");
}
@ -3838,4 +3845,77 @@ public class RegExTest {
failCount++;
report("unicodePredefinedClasses");
}
private static void horizontalAndVerticalWSTest() throws Exception {
String hws = new String (new char[] {
0x09, 0x20, 0xa0, 0x1680, 0x180e,
0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
0x2006, 0x2007, 0x2008, 0x2009, 0x200a,
0x202f, 0x205f, 0x3000 });
String vws = new String (new char[] {
0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 0x2029 });
if (!Pattern.compile("\\h+").matcher(hws).matches() ||
!Pattern.compile("[\\h]+").matcher(hws).matches())
failCount++;
if (Pattern.compile("\\H").matcher(hws).find() ||
Pattern.compile("[\\H]").matcher(hws).find())
failCount++;
if (!Pattern.compile("\\v+").matcher(vws).matches() ||
!Pattern.compile("[\\v]+").matcher(vws).matches())
failCount++;
if (Pattern.compile("\\V").matcher(vws).find() ||
Pattern.compile("[\\V]").matcher(vws).find())
failCount++;
String prefix = "abcd";
String suffix = "efgh";
String ng = "A";
for (int i = 0; i < hws.length(); i++) {
String c = String.valueOf(hws.charAt(i));
Matcher m = Pattern.compile("\\h").matcher(prefix + c + suffix);
if (!m.find() || !c.equals(m.group()))
failCount++;
m = Pattern.compile("[\\h]").matcher(prefix + c + suffix);
if (!m.find() || !c.equals(m.group()))
failCount++;
m = Pattern.compile("\\H").matcher(hws.substring(0, i) + ng + hws.substring(i));
if (!m.find() || !ng.equals(m.group()))
failCount++;
m = Pattern.compile("[\\H]").matcher(hws.substring(0, i) + ng + hws.substring(i));
if (!m.find() || !ng.equals(m.group()))
failCount++;
}
for (int i = 0; i < vws.length(); i++) {
String c = String.valueOf(vws.charAt(i));
Matcher m = Pattern.compile("\\v").matcher(prefix + c + suffix);
if (!m.find() || !c.equals(m.group()))
failCount++;
m = Pattern.compile("[\\v]").matcher(prefix + c + suffix);
if (!m.find() || !c.equals(m.group()))
failCount++;
m = Pattern.compile("\\V").matcher(vws.substring(0, i) + ng + vws.substring(i));
if (!m.find() || !ng.equals(m.group()))
failCount++;
m = Pattern.compile("[\\V]").matcher(vws.substring(0, i) + ng + vws.substring(i));
if (!m.find() || !ng.equals(m.group()))
failCount++;
}
// \v in range is interpreted as 0x0B. This is the undocumented behavior
if (!Pattern.compile("[\\v-\\v]").matcher(String.valueOf((char)0x0B)).matches())
failCount++;
report("horizontalAndVerticalWSTest");
}
private static void linebreakTest() throws Exception {
String linebreaks = new String (new char[] {
0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0x2028, 0x2029 });
String crnl = "\r\n";
if (!Pattern.compile("\\R+").matcher(linebreaks).matches() ||
!Pattern.compile("\\R").matcher(crnl).matches() ||
Pattern.compile("\\R\\R").matcher(crnl).matches())
failCount++;
report("linebreakTest");
}
}