7039066: j.u.regex does not match TR18 RL1.4 Simple Word Boundaries and RL1.2 Properties
Updated the regex Unicode property support Reviewed-by: alanb
This commit is contained in:
parent
4463efaf61
commit
df7a03a4c3
@ -206,13 +206,15 @@ import java.util.Arrays;
|
||||
* <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
|
||||
*
|
||||
* <tr><th> </th></tr>
|
||||
* <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr>
|
||||
* <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>
|
||||
* <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
|
||||
* <td headers="matches">A Latin script character (simple <a href="#ubc">script</a>)</td></tr>
|
||||
* <td headers="matches">A Latin script character (<a href="#usc">script</a>)</td></tr>
|
||||
* <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
|
||||
* <td headers="matches">A character in the Greek block (simple <a href="#ubc">block</a>)</td></tr>
|
||||
* <td headers="matches">A character in the Greek block (<a href="#ubc">block</a>)</td></tr>
|
||||
* <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
|
||||
* <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
|
||||
* <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
|
||||
* <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td>
|
||||
* <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
|
||||
* <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
|
||||
* <td headers="matches">A currency symbol</td></tr>
|
||||
* <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
|
||||
@ -328,10 +330,11 @@ import java.util.Arrays;
|
||||
* <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
|
||||
* <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
|
||||
* <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
|
||||
* <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
|
||||
* <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU) </tt></td>
|
||||
* <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
|
||||
* <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
|
||||
* <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
|
||||
* <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>
|
||||
* on - off</td></tr>
|
||||
* <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td>
|
||||
* <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
|
||||
* given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
|
||||
@ -518,61 +521,140 @@ import java.util.Arrays;
|
||||
*
|
||||
* <p> This class is in conformance with Level 1 of <a
|
||||
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
|
||||
* Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
|
||||
* Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
|
||||
* Canonical Equivalents.
|
||||
*
|
||||
* <p> Unicode escape sequences such as <tt>\u2014</tt> in Java source code
|
||||
* <p>
|
||||
* <b>Unicode escape sequences</b> such as <tt>\u2014</tt> in Java source code
|
||||
* are processed as described in section 3.3 of
|
||||
* <cite>The Java™ Language Specification</cite>.
|
||||
* Such escape sequences are also
|
||||
* implemented directly by the regular-expression parser so that Unicode
|
||||
* escapes can be used in expressions that are read from files or from the
|
||||
* keyboard. Thus the strings <tt>"\u2014"</tt> and <tt>"\\u2014"</tt>,
|
||||
* while not equal, compile into the same pattern, which matches the character
|
||||
* with hexadecimal value <tt>0x2014</tt>.
|
||||
*
|
||||
* <p> A Unicode character can also be represented in a regular-expression by
|
||||
* using its hexadecimal code point value directly as described in construct
|
||||
* Such escape sequences are also implemented directly by the regular-expression
|
||||
* parser so that Unicode escapes can be used in expressions that are read from
|
||||
* files or from the keyboard. Thus the strings <tt>"\u2014"</tt> and
|
||||
* <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which
|
||||
* matches the character with hexadecimal value <tt>0x2014</tt>.
|
||||
* <p>
|
||||
* A Unicode character can also be represented in a regular-expression by
|
||||
* using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
|
||||
* <tt>\x{...}</tt>, for example a supplementary character U+2011F
|
||||
* can be specified as <tt>\x{2011F}</tt>, instead of two consecutive
|
||||
* Unicode escape sequences of the surrogate pair
|
||||
* <tt>\uD840</tt><tt>\uDD1F</tt>.
|
||||
*
|
||||
* <a name="ubc">
|
||||
* <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
|
||||
* <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
|
||||
* <p>
|
||||
* Unicode scripts, blocks, categories and binary properties are written with
|
||||
* the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
|
||||
* <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
|
||||
* the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
|
||||
* does not match if the input has that property.
|
||||
* <p>
|
||||
* Scripts are specified either with the prefix {@code Is}, as in
|
||||
* Scripts, blocks, categories and binary properties can be used both inside
|
||||
* and outside of a character class.
|
||||
* <a name="usc">
|
||||
* <p>
|
||||
* <b>Scripts</b> are specified either with the prefix {@code Is}, as in
|
||||
* {@code IsHiragana}, or by using the {@code script} keyword (or its short
|
||||
* form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
|
||||
* <p>
|
||||
* Blocks are specified with the prefix {@code In}, as in
|
||||
* The script names supported by <code>Pattern</code> are the valid script names
|
||||
* accepted and defined by
|
||||
* {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
|
||||
* <a name="ubc">
|
||||
* <p>
|
||||
* <b>Blocks</b> are specified with the prefix {@code In}, as in
|
||||
* {@code InMongolian}, or by using the keyword {@code block} (or its short
|
||||
* form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
|
||||
* <p>
|
||||
* Categories may be specified with the optional prefix {@code Is}:
|
||||
* The block names supported by <code>Pattern</code> are the valid block names
|
||||
* accepted and defined by
|
||||
* {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
|
||||
* <p>
|
||||
* <a name="ucc">
|
||||
* <b>Categories</b> may be specified with the optional prefix {@code Is}:
|
||||
* Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
|
||||
* letters. Same as scripts and blocks, categories can also be specified
|
||||
* by using the keyword {@code general_category} (or its short form
|
||||
* {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
|
||||
* <p>
|
||||
* Scripts, blocks and categories can be used both inside and outside of a
|
||||
* character class.
|
||||
* <p> The supported categories are those of
|
||||
* The supported categories are those of
|
||||
* <a href="http://www.unicode.org/unicode/standard/standard.html">
|
||||
* <i>The Unicode Standard</i></a> in the version specified by the
|
||||
* {@link java.lang.Character Character} class. The category names are those
|
||||
* defined in the Standard, both normative and informative.
|
||||
* The script names supported by <code>Pattern</code> are the valid script names
|
||||
* accepted and defined by
|
||||
* {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
|
||||
* The block names supported by <code>Pattern</code> are the valid block names
|
||||
* accepted and defined by
|
||||
* {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
|
||||
* <p>
|
||||
* <a name="jcc"> <p>Categories that behave like the java.lang.Character
|
||||
* <a name="ubpc">
|
||||
* <b>Binary properties</b> are specified with the prefix {@code Is}, as in
|
||||
* {@code IsAlphabetic}. The binary properties supported by <code>Pattern</code>
|
||||
* are
|
||||
* <ul>
|
||||
* <li> Alphabetic
|
||||
* <li> Ideographic
|
||||
* <li> Letter
|
||||
* <li> Lowercase
|
||||
* <li> Uppercase
|
||||
* <li> Titlecase
|
||||
* <li> Punctuation
|
||||
* <li> Control
|
||||
* <li> White_Space
|
||||
* <li> Digit
|
||||
* <li> Hex_Digit
|
||||
* <li> Noncharacter_Code_Point
|
||||
* <li> Assigned
|
||||
* </ul>
|
||||
|
||||
|
||||
* <p>
|
||||
* <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
|
||||
* conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
|
||||
* of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expressions
|
||||
* </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified.
|
||||
* <p>
|
||||
* <table border="0" cellpadding="1" cellspacing="0"
|
||||
* summary="predefined and posix character classes in Unicode mode">
|
||||
* <tr align="left">
|
||||
* <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th>
|
||||
* <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
|
||||
*</tr>
|
||||
* <tr><td><tt>\p{Lower}</tt></td>
|
||||
* <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr>
|
||||
* <tr><td><tt>\p{Upper}</tt></td>
|
||||
* <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr>
|
||||
* <tr><td><tt>\p{ASCII}</tt></td>
|
||||
* <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
|
||||
* <tr><td><tt>\p{Alpha}</tt></td>
|
||||
* <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr>
|
||||
* <tr><td><tt>\p{Digit}</tt></td>
|
||||
* <td>A decimal digit character:<tt>\p{IsDigit}</tt></td></tr>
|
||||
* <tr><td><tt>\p{Alnum}</tt></td>
|
||||
* <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr>
|
||||
* <tr><td><tt>\p{Punct}</tt></td>
|
||||
* <td>A punctuation character:<tt>\p{IsPunctuation}</tt></td></tr>
|
||||
* <tr><td><tt>\p{Graph}</tt></td>
|
||||
* <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr>
|
||||
* <tr><td><tt>\p{Print}</tt></td>
|
||||
* <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr>
|
||||
* <tr><td><tt>\p{Blank}</tt></td>
|
||||
* <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr>
|
||||
* <tr><td><tt>\p{Cntrl}</tt></td>
|
||||
* <td>A control character: <tt>\p{gc=Cc}</tt></td></tr>
|
||||
* <tr><td><tt>\p{XDigit}</tt></td>
|
||||
* <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr>
|
||||
* <tr><td><tt>\p{Space}</tt></td>
|
||||
* <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr>
|
||||
* <tr><td><tt>\d</tt></td>
|
||||
* <td>A digit: <tt>\p{IsDigit}</tt></td></tr>
|
||||
* <tr><td><tt>\D</tt></td>
|
||||
* <td>A non-digit: <tt>[^\d]</tt></td></tr>
|
||||
* <tr><td><tt>\s</tt></td>
|
||||
* <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr>
|
||||
* <tr><td><tt>\S</tt></td>
|
||||
* <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>
|
||||
* <tr><td><tt>\w</tt></td>
|
||||
* <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>
|
||||
* <tr><td><tt>\W</tt></td>
|
||||
* <td>A non-word character: <tt>[^\w]</tt></td></tr>
|
||||
* </table>
|
||||
* <p>
|
||||
* <a name="jcc">
|
||||
* Categories that behave like the java.lang.Character
|
||||
* boolean is<i>methodname</i> methods (except for the deprecated ones) are
|
||||
* available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
|
||||
* the specified property has the name <tt>java<i>methodname</i></tt>.
|
||||
@ -796,6 +878,28 @@ public final class Pattern
|
||||
*/
|
||||
public static final int CANON_EQ = 0x80;
|
||||
|
||||
/**
|
||||
* Enables the Unicode version of <i>Predefined character classes</i> and
|
||||
* <i>POSIX character classes</i>.
|
||||
*
|
||||
* <p> When this flag is specified then the (US-ASCII only)
|
||||
* <i>Predefined character classes</i> and <i>POSIX character classes</i>
|
||||
* are in conformance with
|
||||
* <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
|
||||
* Standard #18: Unicode Regular Expressions</i></a>
|
||||
* <i>Annex C: Compatibility Properties</i>.
|
||||
* <p>
|
||||
* The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
|
||||
* flag expression <tt>(?U)</tt>.
|
||||
* <p>
|
||||
* The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
|
||||
* folding.
|
||||
* <p>
|
||||
* Specifying this flag may impose a performance penalty. </p>
|
||||
* @since 1.7
|
||||
*/
|
||||
public static final int UNICODE_CHARACTER_CLASS = 0x100;
|
||||
|
||||
/* Pattern has only two serialized components: The pattern string
|
||||
* and the flags, which are all that is needed to recompile the pattern
|
||||
* when it is deserialized.
|
||||
@ -918,7 +1022,8 @@ public final class Pattern
|
||||
* Match flags, a bit mask that may include
|
||||
* {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
|
||||
* {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
|
||||
* {@link #LITERAL} and {@link #COMMENTS}
|
||||
* {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
|
||||
* and {@link #COMMENTS}
|
||||
*
|
||||
* @throws IllegalArgumentException
|
||||
* If bit values other than those corresponding to the defined
|
||||
@ -1209,6 +1314,10 @@ public final class Pattern
|
||||
pattern = p;
|
||||
flags = f;
|
||||
|
||||
// to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
|
||||
if ((flags & UNICODE_CHARACTER_CLASS) != 0)
|
||||
flags |= UNICODE_CASE;
|
||||
|
||||
// Reset group index count
|
||||
capturingGroupCount = 1;
|
||||
localCount = 0;
|
||||
@ -2164,12 +2273,14 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
return -1;
|
||||
case 'B':
|
||||
if (inclass) break;
|
||||
if (create) root = new Bound(Bound.NONE);
|
||||
if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
|
||||
return -1;
|
||||
case 'C':
|
||||
break;
|
||||
case 'D':
|
||||
if (create) root = new Ctype(ASCII.DIGIT).complement();
|
||||
if (create) root = has(UNICODE_CHARACTER_CLASS)
|
||||
? new Utype(UnicodeProp.DIGIT).complement()
|
||||
: new Ctype(ASCII.DIGIT).complement();
|
||||
return -1;
|
||||
case 'E':
|
||||
case 'F':
|
||||
@ -2191,14 +2302,18 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'R':
|
||||
break;
|
||||
case 'S':
|
||||
if (create) root = new Ctype(ASCII.SPACE).complement();
|
||||
if (create) root = has(UNICODE_CHARACTER_CLASS)
|
||||
? new Utype(UnicodeProp.WHITE_SPACE).complement()
|
||||
: new Ctype(ASCII.SPACE).complement();
|
||||
return -1;
|
||||
case 'T':
|
||||
case 'U':
|
||||
case 'V':
|
||||
break;
|
||||
case 'W':
|
||||
if (create) root = new Ctype(ASCII.WORD).complement();
|
||||
if (create) root = has(UNICODE_CHARACTER_CLASS)
|
||||
? new Utype(UnicodeProp.WORD).complement()
|
||||
: new Ctype(ASCII.WORD).complement();
|
||||
return -1;
|
||||
case 'X':
|
||||
case 'Y':
|
||||
@ -2216,12 +2331,14 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
return '\007';
|
||||
case 'b':
|
||||
if (inclass) break;
|
||||
if (create) root = new Bound(Bound.BOTH);
|
||||
if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
|
||||
return -1;
|
||||
case 'c':
|
||||
return c();
|
||||
case 'd':
|
||||
if (create) root = new Ctype(ASCII.DIGIT);
|
||||
if (create) root = has(UNICODE_CHARACTER_CLASS)
|
||||
? new Utype(UnicodeProp.DIGIT)
|
||||
: new Ctype(ASCII.DIGIT);
|
||||
return -1;
|
||||
case 'e':
|
||||
return '\033';
|
||||
@ -2259,7 +2376,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'r':
|
||||
return '\r';
|
||||
case 's':
|
||||
if (create) root = new Ctype(ASCII.SPACE);
|
||||
if (create) root = has(UNICODE_CHARACTER_CLASS)
|
||||
? new Utype(UnicodeProp.WHITE_SPACE)
|
||||
: new Ctype(ASCII.SPACE);
|
||||
return -1;
|
||||
case 't':
|
||||
return '\t';
|
||||
@ -2268,7 +2387,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'v':
|
||||
return '\013';
|
||||
case 'w':
|
||||
if (create) root = new Ctype(ASCII.WORD);
|
||||
if (create) root = has(UNICODE_CHARACTER_CLASS)
|
||||
? new Utype(UnicodeProp.WORD)
|
||||
: new Ctype(ASCII.WORD);
|
||||
return -1;
|
||||
case 'x':
|
||||
return x();
|
||||
@ -2490,7 +2611,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
{
|
||||
next();
|
||||
String name;
|
||||
CharProperty node;
|
||||
CharProperty node = null;
|
||||
|
||||
if (singleLetter) {
|
||||
int c = temp[cursor];
|
||||
@ -2536,11 +2657,21 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
} else if (name.startsWith("Is")) {
|
||||
// \p{isGeneralCategory} and \p{isScriptName}
|
||||
name = name.substring(2);
|
||||
node = CharPropertyNames.charPropertyFor(name);
|
||||
UnicodeProp uprop = UnicodeProp.forName(name);
|
||||
if (uprop != null)
|
||||
node = new Utype(uprop);
|
||||
if (node == null)
|
||||
node = CharPropertyNames.charPropertyFor(name);
|
||||
if (node == null)
|
||||
node = unicodeScriptPropertyFor(name);
|
||||
} else {
|
||||
node = charPropertyNodeFor(name);
|
||||
if (has(UNICODE_CHARACTER_CLASS)) {
|
||||
UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
|
||||
if (uprop != null)
|
||||
node = new Utype(uprop);
|
||||
}
|
||||
if (node == null)
|
||||
node = charPropertyNodeFor(name);
|
||||
}
|
||||
}
|
||||
if (maybeComplement) {
|
||||
@ -2822,6 +2953,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'x':
|
||||
flags |= COMMENTS;
|
||||
break;
|
||||
case 'U':
|
||||
flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
|
||||
break;
|
||||
case '-': // subFlag then fall through
|
||||
ch = next();
|
||||
subFlag();
|
||||
@ -2861,6 +2995,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'x':
|
||||
flags &= ~COMMENTS;
|
||||
break;
|
||||
case 'U':
|
||||
flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
|
||||
default:
|
||||
return;
|
||||
}
|
||||
@ -3663,6 +3799,18 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Node class that matches a Unicode "type"
|
||||
*/
|
||||
static final class Utype extends CharProperty {
|
||||
final UnicodeProp uprop;
|
||||
Utype(UnicodeProp uprop) { this.uprop = uprop; }
|
||||
boolean isSatisfiedBy(int ch) {
|
||||
return uprop.is(ch);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Node class that matches a POSIX type.
|
||||
*/
|
||||
@ -5025,9 +5173,17 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
static int BOTH = 0x3;
|
||||
static int NONE = 0x4;
|
||||
int type;
|
||||
Bound(int n) {
|
||||
boolean useUWORD;
|
||||
Bound(int n, boolean useUWORD) {
|
||||
type = n;
|
||||
this.useUWORD = useUWORD;
|
||||
}
|
||||
|
||||
boolean isWord(int ch) {
|
||||
return useUWORD ? UnicodeProp.WORD.is(ch)
|
||||
: (ch == '_' || Character.isLetterOrDigit(ch));
|
||||
}
|
||||
|
||||
int check(Matcher matcher, int i, CharSequence seq) {
|
||||
int ch;
|
||||
boolean left = false;
|
||||
@ -5039,14 +5195,14 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
}
|
||||
if (i > startIndex) {
|
||||
ch = Character.codePointBefore(seq, i);
|
||||
left = (ch == '_' || Character.isLetterOrDigit(ch) ||
|
||||
left = (isWord(ch) ||
|
||||
((Character.getType(ch) == Character.NON_SPACING_MARK)
|
||||
&& hasBaseCharacter(matcher, i-1, seq)));
|
||||
}
|
||||
boolean right = false;
|
||||
if (i < endIndex) {
|
||||
ch = Character.codePointAt(seq, i);
|
||||
right = (ch == '_' || Character.isLetterOrDigit(ch) ||
|
||||
right = (isWord(ch) ||
|
||||
((Character.getType(ch) == Character.NON_SPACING_MARK)
|
||||
&& hasBaseCharacter(matcher, i, seq)));
|
||||
} else {
|
||||
@ -5428,6 +5584,12 @@ NEXT: while (i <= last) {
|
||||
defClone("javaUpperCase", new CloneableProperty() {
|
||||
boolean isSatisfiedBy(int ch) {
|
||||
return Character.isUpperCase(ch);}});
|
||||
defClone("javaAlphabetic", new CloneableProperty() {
|
||||
boolean isSatisfiedBy(int ch) {
|
||||
return Character.isAlphabetic(ch);}});
|
||||
defClone("javaIdeographic", new CloneableProperty() {
|
||||
boolean isSatisfiedBy(int ch) {
|
||||
return Character.isIdeographic(ch);}});
|
||||
defClone("javaTitleCase", new CloneableProperty() {
|
||||
boolean isSatisfiedBy(int ch) {
|
||||
return Character.isTitleCase(ch);}});
|
||||
|
236
jdk/src/share/classes/java/util/regex/UnicodeProp.java
Normal file
236
jdk/src/share/classes/java/util/regex/UnicodeProp.java
Normal file
@ -0,0 +1,236 @@
|
||||
/*
|
||||
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package java.util.regex;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * Unicode properties and POSIX-compatible character classes that can be
 * tested against a single code point.
 *
 * <p>Each constant implements {@link #is(int)}. Constants are looked up
 * case-insensitively by property name through {@link #forName(String)} and
 * by POSIX class name through {@link #forPOSIXName(String)}; both return
 * {@code null} for an unknown name.
 */
enum UnicodeProp {

    ALPHABETIC {
        @Override
        public boolean is(int ch) {
            return Character.isAlphabetic(ch);
        }
    },

    LETTER {
        @Override
        public boolean is(int ch) {
            return Character.isLetter(ch);
        }
    },

    IDEOGRAPHIC {
        @Override
        public boolean is(int ch) {
            return Character.isIdeographic(ch);
        }
    },

    LOWERCASE {
        @Override
        public boolean is(int ch) {
            return Character.isLowerCase(ch);
        }
    },

    UPPERCASE {
        @Override
        public boolean is(int ch) {
            return Character.isUpperCase(ch);
        }
    },

    TITLECASE {
        @Override
        public boolean is(int ch) {
            return Character.isTitleCase(ch);
        }
    },

    WHITE_SPACE {
        // \p{Whitespace}
        // Any space, line or paragraph separator, plus U+0009..U+000D
        // and U+0085.
        @Override
        public boolean is(int ch) {
            // The general-category value selects one bit out of the mask.
            return ((((1 << Character.SPACE_SEPARATOR) |
                      (1 << Character.LINE_SEPARATOR) |
                      (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
                   != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
        }
    },

    CONTROL {
        // \p{gc=Control}
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) == Character.CONTROL;
        }
    },

    PUNCTUATION {
        // \p{gc=Punctuation}
        @Override
        public boolean is(int ch) {
            return ((((1 << Character.CONNECTOR_PUNCTUATION) |
                      (1 << Character.DASH_PUNCTUATION) |
                      (1 << Character.START_PUNCTUATION) |
                      (1 << Character.END_PUNCTUATION) |
                      (1 << Character.OTHER_PUNCTUATION) |
                      (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
                      (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
                   != 0;
        }
    },

    HEX_DIGIT {
        // \p{gc=Decimal_Number}
        // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
        // Everything DIGIT accepts, plus the explicit code point ranges
        // listed below.
        @Override
        public boolean is(int ch) {
            return DIGIT.is(ch) ||
                   (ch >= 0x0030 && ch <= 0x0039) ||
                   (ch >= 0x0041 && ch <= 0x0046) ||
                   (ch >= 0x0061 && ch <= 0x0066) ||
                   (ch >= 0xFF10 && ch <= 0xFF19) ||
                   (ch >= 0xFF21 && ch <= 0xFF26) ||
                   (ch >= 0xFF41 && ch <= 0xFF46);
        }
    },

    ASSIGNED {
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) != Character.UNASSIGNED;
        }
    },

    NONCHARACTER_CODE_POINT {
        // PropList.txt:Noncharacter_Code_Point
        // Low 16 bits equal to FFFE or FFFF, plus U+FDD0..U+FDEF.
        @Override
        public boolean is(int ch) {
            return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
        }
    },

    DIGIT {
        // \p{gc=Decimal_Number}
        @Override
        public boolean is(int ch) {
            return Character.isDigit(ch);
        }
    },

    ALNUM {
        // \p{alpha}
        // \p{digit}
        @Override
        public boolean is(int ch) {
            return ALPHABETIC.is(ch) || DIGIT.is(ch);
        }
    },

    BLANK {
        // \p{Whitespace} --
        //    [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
        //     \p{gc=Line_Separator}
        //     \p{gc=Paragraph_Separator}]
        // which leaves the space separators and the horizontal tab.
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) == Character.SPACE_SEPARATOR ||
                   ch == 0x9; // \N{HT}
        }
    },

    GRAPH {
        // [^
        //  \p{space}
        //  \p{gc=Control}
        //  \p{gc=Surrogate}
        //  \p{gc=Unassigned}]
        @Override
        public boolean is(int ch) {
            return ((((1 << Character.SPACE_SEPARATOR) |
                      (1 << Character.LINE_SEPARATOR) |
                      (1 << Character.PARAGRAPH_SEPARATOR) |
                      (1 << Character.CONTROL) |
                      (1 << Character.SURROGATE) |
                      (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
                   == 0;
        }
    },

    PRINT {
        // \p{graph}
        // \p{blank}
        // -- \p{cntrl}
        @Override
        public boolean is(int ch) {
            return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch);
        }
    },

    WORD {
        // \p{alpha}
        // \p{gc=Mark}
        // \p{digit}
        // \p{gc=Connector_Punctuation}
        @Override
        public boolean is(int ch) {
            return ALPHABETIC.is(ch) ||
                   ((((1 << Character.NON_SPACING_MARK) |
                      (1 << Character.ENCLOSING_MARK) |
                      (1 << Character.COMBINING_SPACING_MARK) |
                      (1 << Character.DECIMAL_DIGIT_NUMBER) |
                      (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
                   != 0;
        }
    };

    /** Upper-cased POSIX class names mapped to the constant implementing them. */
    private static final HashMap<String, UnicodeProp> posix = new HashMap<>();

    /**
     * Upper-cased property names mapped to constants: every constant's own
     * name plus the underscore-free aliases. Used by {@link #forName} so an
     * unknown name is a plain map miss rather than a caught exception.
     */
    private static final HashMap<String, UnicodeProp> names = new HashMap<>();

    static {
        posix.put("ALPHA", ALPHABETIC);
        posix.put("LOWER", LOWERCASE);
        posix.put("UPPER", UPPERCASE);
        posix.put("SPACE", WHITE_SPACE);
        posix.put("PUNCT", PUNCTUATION);
        posix.put("XDIGIT", HEX_DIGIT);
        posix.put("ALNUM", ALNUM);
        posix.put("CNTRL", CONTROL);
        posix.put("DIGIT", DIGIT);
        posix.put("BLANK", BLANK);
        posix.put("GRAPH", GRAPH);
        posix.put("PRINT", PRINT);

        for (UnicodeProp prop : values()) {
            names.put(prop.name(), prop);
        }
        names.put("WHITESPACE", WHITE_SPACE);
        names.put("HEXDIGIT", HEX_DIGIT);
        names.put("NONCHARACTERCODEPOINT", NONCHARACTER_CODE_POINT);
    }

    /**
     * Looks up a property by name, ignoring case. Accepts every constant
     * name as well as the aliases {@code WhiteSpace}, {@code HexDigit} and
     * {@code NoncharacterCodePoint}.
     *
     * @param propName the property name; must not be {@code null}
     * @return the matching constant, or {@code null} if the name is unknown
     */
    public static UnicodeProp forName(String propName) {
        return names.get(propName.toUpperCase(Locale.ENGLISH));
    }

    /**
     * Looks up the constant that implements a POSIX character class such as
     * {@code Alpha} or {@code XDigit}, ignoring case.
     *
     * @param propName the POSIX class name; must not be {@code null}
     * @return the matching constant, or {@code null} if the name is not a
     *         supported POSIX class
     */
    public static UnicodeProp forPOSIXName(String propName) {
        return posix.get(propName.toUpperCase(Locale.ENGLISH));
    }

    /**
     * Tells whether the given code point has this property.
     *
     * @param ch the code point to test
     * @return {@code true} if {@code ch} has the property
     */
    public abstract boolean is(int ch);
}
|
247
jdk/test/java/util/regex/POSIX_ASCII.java
Normal file
247
jdk/test/java/util/regex/POSIX_ASCII.java
Normal file
@ -0,0 +1,247 @@
|
||||
/*
|
||||
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
|
||||
/**
 * Table-driven classification of 7-bit ASCII code points into the POSIX
 * character classes.
 *
 * <p>Every code point in {@code [0x00, 0x7F]} has an entry in a lookup table.
 * The entry is a combination of the classification bits declared below; for
 * digits and letters the low bits of the entry additionally carry the
 * character's digit value (0-9 for {@code '0'-'9'}, 10-35 for
 * {@code 'A'-'Z'} and {@code 'a'-'z'}). Any code point outside the 7-bit
 * range has the type {@code 0} and therefore belongs to no class.
 */
final class POSIX_ASCII {

    // ---- single classification bits -------------------------------------

    static final int UPPER = 1 << 8;
    static final int LOWER = 1 << 9;
    static final int DIGIT = 1 << 10;
    static final int SPACE = 1 << 11;
    static final int PUNCT = 1 << 12;
    static final int CNTRL = 1 << 13;
    static final int BLANK = 1 << 14;
    static final int HEX   = 1 << 15;
    static final int UNDER = 1 << 16;

    static final int ASCII = 0xFF00;

    // ---- composite classes ----------------------------------------------

    static final int ALPHA  = UPPER | LOWER;
    static final int ALNUM  = ALPHA | DIGIT;
    static final int GRAPH  = PUNCT | ALNUM;
    static final int WORD   = ALNUM | UNDER;
    static final int XDIGIT = HEX;

    /** Type word for each of the 128 ASCII code points, indexed by code point. */
    private static final int[] ctype = buildTable();

    /**
     * Fills the classification table range by range instead of listing all
     * 128 entries literally.
     */
    private static int[] buildTable() {
        final int[] table = new int[0x80];

        // C0 controls and DEL.
        for (int c = 0x00; c < 0x20; c++) {
            table[c] = CNTRL;
        }
        table[0x7F] = CNTRL;

        // HT, LF, VT, FF and CR are also white space; only HT is a blank.
        for (int c = 0x09; c <= 0x0D; c++) {
            table[c] |= SPACE;
        }
        table[0x09] |= BLANK;

        // The space character is white space and blank, but not a control.
        table[0x20] = SPACE | BLANK;

        // Everything printable starts out as punctuation; digits and letters
        // are overwritten below.
        for (int c = 0x21; c <= 0x7E; c++) {
            table[c] = PUNCT;
        }

        // '0'-'9': decimal and hex digits, carrying their value 0-9.
        for (int c = '0'; c <= '9'; c++) {
            table[c] = DIGIT | HEX | (c - '0');
        }

        // Letters carry the values 10-35; only the first six are hex digits.
        for (int i = 0; i < 26; i++) {
            final int value = 10 + i;
            final int hexBit = (value < 16) ? HEX : 0;
            table['A' + i] = UPPER | hexBit | value;
            table['a' + i] = LOWER | hexBit | value;
        }

        // The underscore stays punctuation and is additionally a word character.
        table['_'] |= UNDER;

        return table;
    }

    /**
     * Returns the raw type word of {@code ch}: its classification bits plus,
     * for digits and letters, the digit value in the low bits.
     *
     * @param ch the code point to classify
     * @return the table entry for {@code ch}, or {@code 0} if {@code ch} is
     *         not a 7-bit ASCII code point
     */
    static int getType(int ch) {
        if (!isAscii(ch)) {
            return 0;
        }
        return ctype[ch];
    }

    /**
     * Tells whether {@code ch} has at least one of the bits in {@code type}.
     *
     * @param ch   the code point to classify
     * @param type one classification constant, or several OR-ed together
     * @return {@code true} if the type word of {@code ch} intersects {@code type}
     */
    static boolean isType(int ch, int type) {
        final int bits = getType(ch) & type;
        return bits != 0;
    }

    /** Returns {@code true} if {@code ch} lies in {@code [0x00, 0x7F]}. */
    static boolean isAscii(int ch) {
        // No bit above bit 6 may be set; this also rejects negative values.
        return (ch >>> 7) == 0;
    }

    /** Returns {@code true} for {@code 'A'-'Z'} and {@code 'a'-'z'}. */
    static boolean isAlpha(int ch) {
        return isType(ch, ALPHA);
    }

    /** Returns {@code true} for {@code '0'-'9'}. */
    static boolean isDigit(int ch) {
        return ch >= '0' && ch <= '9';
    }

    /** Returns {@code true} for ASCII letters and decimal digits. */
    static boolean isAlnum(int ch) {
        return isType(ch, ALNUM);
    }

    /** Returns {@code true} for ASCII letters, decimal digits and punctuation. */
    static boolean isGraph(int ch) {
        return isType(ch, GRAPH);
    }

    /** Returns {@code true} for the range {@code [0x20, 0x7E]}. */
    static boolean isPrint(int ch) {
        return ch >= 0x20 && ch <= 0x7E;
    }

    /** Returns {@code true} for ASCII punctuation, which includes {@code '_'}. */
    static boolean isPunct(int ch) {
        return isType(ch, PUNCT);
    }

    /** Returns {@code true} for space, HT, LF, VT, FF and CR. */
    static boolean isSpace(int ch) {
        return isType(ch, SPACE);
    }

    /** Returns {@code true} for {@code '0'-'9'}, {@code 'A'-'F'} and {@code 'a'-'f'}. */
    static boolean isHexDigit(int ch) {
        return isType(ch, HEX);
    }

    /** Returns {@code true} for {@code [0x00, 0x1F]} and {@code 0x7F}. */
    static boolean isCntrl(int ch) {
        return isType(ch, CNTRL);
    }

    /** Returns {@code true} for {@code 'a'-'z'}. */
    static boolean isLower(int ch) {
        return ch >= 'a' && ch <= 'z';
    }

    /** Returns {@code true} for {@code 'A'-'Z'}. */
    static boolean isUpper(int ch) {
        return ch >= 'A' && ch <= 'Z';
    }

    /** Returns {@code true} for ASCII letters, decimal digits and {@code '_'}. */
    static boolean isWord(int ch) {
        return isType(ch, WORD);
    }
}
|
141
jdk/test/java/util/regex/POSIX_Unicode.java
Normal file
141
jdk/test/java/util/regex/POSIX_Unicode.java
Normal file
@ -0,0 +1,141 @@
|
||||
/*
|
||||
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * Reference predicates for the POSIX character classes as they apply to the
 * full Unicode range. Each predicate is expressed in terms of
 * {@link Character} general categories and a few explicit code point ranges,
 * so that the results of a regular expression engine can be compared
 * against them.
 */
public final class POSIX_Unicode {

    /** General categories Zs, Zl and Zp. */
    private static final int SEPARATOR_MASK =
        (1 << Character.SPACE_SEPARATOR) |
        (1 << Character.LINE_SEPARATOR) |
        (1 << Character.PARAGRAPH_SEPARATOR);

    /** General categories Pc, Pd, Ps, Pe, Po, Pi and Pf. */
    private static final int PUNCTUATION_MASK =
        (1 << Character.CONNECTOR_PUNCTUATION) |
        (1 << Character.DASH_PUNCTUATION) |
        (1 << Character.START_PUNCTUATION) |
        (1 << Character.END_PUNCTUATION) |
        (1 << Character.OTHER_PUNCTUATION) |
        (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
        (1 << Character.FINAL_QUOTE_PUNCTUATION);

    /** General categories Mn, Me and Mc, plus Pc. */
    private static final int MARK_OR_CONNECTOR_MASK =
        (1 << Character.NON_SPACING_MARK) |
        (1 << Character.ENCLOSING_MARK) |
        (1 << Character.COMBINING_SPACING_MARK) |
        (1 << Character.CONNECTOR_PUNCTUATION);

    /**
     * Tests whether the general category of {@code ch} is one of the
     * categories whose bit is set in {@code categoryMask}.
     */
    private static boolean hasCategory(int ch, int categoryMask) {
        return ((categoryMask >> Character.getType(ch)) & 1) != 0;
    }

    /** {@code \p{Alphabetic}}: delegates to {@link Character#isAlphabetic(int)}. */
    public static boolean isAlpha(int ch) {
        return Character.isAlphabetic(ch);
    }

    /** Delegates to {@link Character#isLowerCase(int)}. */
    public static boolean isLower(int ch) {
        return Character.isLowerCase(ch);
    }

    /** Delegates to {@link Character#isUpperCase(int)}. */
    public static boolean isUpper(int ch) {
        return Character.isUpperCase(ch);
    }

    /**
     * {@code \p{Whitespace}}: any space, line or paragraph separator, the
     * controls U+0009..U+000D, and U+0085 (NEL).
     */
    public static boolean isSpace(int ch) {
        return hasCategory(ch, SEPARATOR_MASK) ||
               (ch >= 0x9 && ch <= 0xd) ||
               (ch == 0x85);
    }

    /** {@code \p{gc=Control}}. */
    public static boolean isCntrl(int ch) {
        return Character.getType(ch) == Character.CONTROL;
    }

    /** {@code \p{gc=Punctuation}}: any of the seven punctuation categories. */
    public static boolean isPunct(int ch) {
        return hasCategory(ch, PUNCTUATION_MASK);
    }

    /**
     * {@code \p{gc=Decimal_Number}} or {@code \p{Hex_Digit}}. The explicit
     * ranges are the ASCII and fullwidth forms of {@code 0-9}, {@code A-F}
     * and {@code a-f}, as listed for Hex_Digit in PropList.txt.
     */
    public static boolean isHexDigit(int ch) {
        return Character.isDigit(ch) ||
               (ch >= 0x0030 && ch <= 0x0039) ||
               (ch >= 0x0041 && ch <= 0x0046) ||
               (ch >= 0x0061 && ch <= 0x0066) ||
               (ch >= 0xFF10 && ch <= 0xFF19) ||
               (ch >= 0xFF21 && ch <= 0xFF26) ||
               (ch >= 0xFF41 && ch <= 0xFF46);
    }

    /** {@code \p{gc=Decimal_Number}}: delegates to {@link Character#isDigit(int)}. */
    public static boolean isDigit(int ch) {
        return Character.isDigit(ch);
    }

    /** {@code \p{alpha}} or {@code \p{digit}}. */
    public static boolean isAlnum(int ch) {
        return isAlpha(ch) || isDigit(ch);
    }

    /**
     * {@code \p{Whitespace}} minus LF, VT, FF, CR, NEL and the line and
     * paragraph separators; that is, horizontal white space only.
     */
    public static boolean isBlank(int ch) {
        int type = Character.getType(ch);
        return isSpace(ch) &&
               ch != 0xa && ch != 0xb && ch != 0xc && ch != 0xd && ch != 0x85 &&
               type != Character.LINE_SEPARATOR &&
               type != Character.PARAGRAPH_SEPARATOR;
    }

    /**
     * Everything that is not {@code \p{space}}, {@code \p{gc=Control}},
     * {@code \p{gc=Surrogate}} or {@code \p{gc=Unassigned}}.
     */
    public static boolean isGraph(int ch) {
        int type = Character.getType(ch);
        return !(isSpace(ch) ||
                 Character.CONTROL == type ||
                 Character.SURROGATE == type ||
                 Character.UNASSIGNED == type);
    }

    /** {@code \p{graph}} or {@code \p{blank}}, excluding {@code \p{cntrl}}. */
    public static boolean isPrint(int ch) {
        return (isGraph(ch) || isBlank(ch)) && !isCntrl(ch);
    }

    /**
     * PropList.txt Noncharacter_Code_Point: the last two code points of
     * every plane (low 16 bits {@code FFFE} or {@code FFFF}) and the range
     * U+FDD0..U+FDEF.
     */
    public static boolean isNoncharacterCodePoint(int ch) {
        return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
    }

    /**
     * {@code \p{alpha}}, {@code \p{gc=Mark}}, {@code \p{digit}} or
     * {@code \p{gc=Connector_Punctuation}}.
     */
    public static boolean isWord(int ch) {
        return isAlpha(ch) ||
               hasCategory(ch, MARK_OR_CONNECTOR_MASK) ||
               isDigit(ch);
    }
}
|
@ -32,7 +32,7 @@
|
||||
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
|
||||
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
|
||||
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
|
||||
* 6350801 6676425 6878475 6919132 6931676 6948903 7014645
|
||||
* 6350801 6676425 6878475 6919132 6931676 6948903 7014645 7039066
|
||||
*/
|
||||
|
||||
import java.util.regex.*;
|
||||
@ -137,6 +137,7 @@ public class RegExTest {
|
||||
nonBmpClassComplementTest();
|
||||
unicodePropertiesTest();
|
||||
unicodeHexNotationTest();
|
||||
unicodeClassesTest();
|
||||
if (failure)
|
||||
throw new RuntimeException("Failure in the RE handling.");
|
||||
else
|
||||
@ -3656,5 +3657,146 @@ public class RegExTest {
|
||||
failCount++;
|
||||
}
|
||||
report("unicodeHexNotation");
|
||||
}
|
||||
}
|
||||
|
||||
private static void unicodeClassesTest() throws Exception {
|
||||
|
||||
Matcher lower = Pattern.compile("\\p{Lower}").matcher("");
|
||||
Matcher upper = Pattern.compile("\\p{Upper}").matcher("");
|
||||
Matcher ASCII = Pattern.compile("\\p{ASCII}").matcher("");
|
||||
Matcher alpha = Pattern.compile("\\p{Alpha}").matcher("");
|
||||
Matcher digit = Pattern.compile("\\p{Digit}").matcher("");
|
||||
Matcher alnum = Pattern.compile("\\p{Alnum}").matcher("");
|
||||
Matcher punct = Pattern.compile("\\p{Punct}").matcher("");
|
||||
Matcher graph = Pattern.compile("\\p{Graph}").matcher("");
|
||||
Matcher print = Pattern.compile("\\p{Print}").matcher("");
|
||||
Matcher blank = Pattern.compile("\\p{Blank}").matcher("");
|
||||
Matcher cntrl = Pattern.compile("\\p{Cntrl}").matcher("");
|
||||
Matcher xdigit = Pattern.compile("\\p{XDigit}").matcher("");
|
||||
Matcher space = Pattern.compile("\\p{Space}").matcher("");
|
||||
Matcher bound = Pattern.compile("\\b").matcher("");
|
||||
Matcher word = Pattern.compile("\\w++").matcher("");
|
||||
// UNICODE_CHARACTER_CLASS
|
||||
Matcher lowerU = Pattern.compile("\\p{Lower}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher upperU = Pattern.compile("\\p{Upper}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher ASCIIU = Pattern.compile("\\p{ASCII}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher alphaU = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher digitU = Pattern.compile("\\p{Digit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher alnumU = Pattern.compile("\\p{Alnum}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher punctU = Pattern.compile("\\p{Punct}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher graphU = Pattern.compile("\\p{Graph}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher printU = Pattern.compile("\\p{Print}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher blankU = Pattern.compile("\\p{Blank}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher cntrlU = Pattern.compile("\\p{Cntrl}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher xdigitU = Pattern.compile("\\p{XDigit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher spaceU = Pattern.compile("\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher boundU = Pattern.compile("\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher wordU = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
// embedded flag (?U)
|
||||
Matcher lowerEU = Pattern.compile("(?U)\\p{Lower}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher graphEU = Pattern.compile("(?U)\\p{Graph}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher wordEU = Pattern.compile("(?U)\\w", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
|
||||
Matcher bwb = Pattern.compile("\\b\\w\\b").matcher("");
|
||||
Matcher bwbU = Pattern.compile("\\b\\w++\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
Matcher bwbEU = Pattern.compile("(?U)\\b\\w++\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
|
||||
// properties
|
||||
Matcher lowerP = Pattern.compile("\\p{IsLowerCase}").matcher("");
|
||||
Matcher upperP = Pattern.compile("\\p{IsUpperCase}").matcher("");
|
||||
Matcher titleP = Pattern.compile("\\p{IsTitleCase}").matcher("");
|
||||
Matcher letterP = Pattern.compile("\\p{IsLetter}").matcher("");
|
||||
Matcher alphaP = Pattern.compile("\\p{IsAlphabetic}").matcher("");
|
||||
Matcher ideogP = Pattern.compile("\\p{IsIdeographic}").matcher("");
|
||||
Matcher cntrlP = Pattern.compile("\\p{IsControl}").matcher("");
|
||||
Matcher spaceP = Pattern.compile("\\p{IsWhiteSpace}").matcher("");
|
||||
Matcher definedP = Pattern.compile("\\p{IsAssigned}").matcher("");
|
||||
Matcher nonCCPP = Pattern.compile("\\p{IsNoncharacterCodePoint}").matcher("");
|
||||
|
||||
// javaMethod
|
||||
Matcher lowerJ = Pattern.compile("\\p{javaLowerCase}").matcher("");
|
||||
Matcher upperJ = Pattern.compile("\\p{javaUpperCase}").matcher("");
|
||||
Matcher alphaJ = Pattern.compile("\\p{javaAlphabetic}").matcher("");
|
||||
Matcher ideogJ = Pattern.compile("\\p{javaIdeographic}").matcher("");
|
||||
|
||||
for (int cp = 1; cp < 0x30000; cp++) {
|
||||
String str = new String(Character.toChars(cp));
|
||||
int type = Character.getType(cp);
|
||||
if (// lower
|
||||
POSIX_ASCII.isLower(cp) != lower.reset(str).matches() ||
|
||||
Character.isLowerCase(cp) != lowerU.reset(str).matches() ||
|
||||
Character.isLowerCase(cp) != lowerP.reset(str).matches() ||
|
||||
Character.isLowerCase(cp) != lowerEU.reset(str).matches()||
|
||||
Character.isLowerCase(cp) != lowerJ.reset(str).matches()||
|
||||
// upper
|
||||
POSIX_ASCII.isUpper(cp) != upper.reset(str).matches() ||
|
||||
POSIX_Unicode.isUpper(cp) != upperU.reset(str).matches() ||
|
||||
Character.isUpperCase(cp) != upperP.reset(str).matches() ||
|
||||
Character.isUpperCase(cp) != upperJ.reset(str).matches() ||
|
||||
// alpha
|
||||
POSIX_ASCII.isAlpha(cp) != alpha.reset(str).matches() ||
|
||||
POSIX_Unicode.isAlpha(cp) != alphaU.reset(str).matches() ||
|
||||
Character.isAlphabetic(cp)!= alphaP.reset(str).matches() ||
|
||||
Character.isAlphabetic(cp)!= alphaJ.reset(str).matches() ||
|
||||
// digit
|
||||
POSIX_ASCII.isDigit(cp) != digit.reset(str).matches() ||
|
||||
Character.isDigit(cp) != digitU.reset(str).matches() ||
|
||||
// alnum
|
||||
POSIX_ASCII.isAlnum(cp) != alnum.reset(str).matches() ||
|
||||
POSIX_Unicode.isAlnum(cp) != alnumU.reset(str).matches() ||
|
||||
// punct
|
||||
POSIX_ASCII.isPunct(cp) != punct.reset(str).matches() ||
|
||||
POSIX_Unicode.isPunct(cp) != punctU.reset(str).matches() ||
|
||||
// graph
|
||||
POSIX_ASCII.isGraph(cp) != graph.reset(str).matches() ||
|
||||
POSIX_Unicode.isGraph(cp) != graphU.reset(str).matches() ||
|
||||
POSIX_Unicode.isGraph(cp) != graphEU.reset(str).matches()||
|
||||
// blank
|
||||
POSIX_ASCII.isType(cp, POSIX_ASCII.BLANK)
|
||||
!= blank.reset(str).matches() ||
|
||||
POSIX_Unicode.isBlank(cp) != blankU.reset(str).matches() ||
|
||||
// print
|
||||
POSIX_ASCII.isPrint(cp) != print.reset(str).matches() ||
|
||||
POSIX_Unicode.isPrint(cp) != printU.reset(str).matches() ||
|
||||
// cntrl
|
||||
POSIX_ASCII.isCntrl(cp) != cntrl.reset(str).matches() ||
|
||||
POSIX_Unicode.isCntrl(cp) != cntrlU.reset(str).matches() ||
|
||||
(Character.CONTROL == type) != cntrlP.reset(str).matches() ||
|
||||
// hexdigit
|
||||
POSIX_ASCII.isHexDigit(cp) != xdigit.reset(str).matches() ||
|
||||
POSIX_Unicode.isHexDigit(cp) != xdigitU.reset(str).matches() ||
|
||||
// space
|
||||
POSIX_ASCII.isSpace(cp) != space.reset(str).matches() ||
|
||||
POSIX_Unicode.isSpace(cp) != spaceU.reset(str).matches() ||
|
||||
POSIX_Unicode.isSpace(cp) != spaceP.reset(str).matches() ||
|
||||
// word
|
||||
POSIX_ASCII.isWord(cp) != word.reset(str).matches() ||
|
||||
POSIX_Unicode.isWord(cp) != wordU.reset(str).matches() ||
|
||||
POSIX_Unicode.isWord(cp) != wordEU.reset(str).matches()||
|
||||
// bwordb
|
||||
POSIX_ASCII.isWord(cp) != bwb.reset(str).matches() ||
|
||||
POSIX_Unicode.isWord(cp) != bwbU.reset(str).matches() ||
|
||||
// properties
|
||||
Character.isTitleCase(cp) != titleP.reset(str).matches() ||
|
||||
Character.isLetter(cp) != letterP.reset(str).matches()||
|
||||
Character.isIdeographic(cp) != ideogP.reset(str).matches() ||
|
||||
Character.isIdeographic(cp) != ideogJ.reset(str).matches() ||
|
||||
(Character.UNASSIGNED == type) == definedP.reset(str).matches() ||
|
||||
POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches())
|
||||
failCount++;
|
||||
}
|
||||
|
||||
// bounds/word align
|
||||
twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10);
|
||||
if (!bwbU.reset("\u0180sherman\u0400").matches())
|
||||
failCount++;
|
||||
twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11);
|
||||
if (!bwbU.reset("\u0180sh\u0345erman\u0400").matches())
|
||||
failCount++;
|
||||
twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4);
|
||||
if (!bwbU.reset("\u0724\u0739\u0724").matches())
|
||||
failCount++;
|
||||
if (!bwbEU.reset("\u0724\u0739\u0724").matches())
|
||||
failCount++;
|
||||
report("unicodePredefinedClasses");
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user