7039066: j.u.regex does not match TR18 RL1.4 Simple Word Boundaries and RL1.2 Properties

Updated the regex Unicode property support

Reviewed-by: alanb
This commit is contained in:
Xueming Shen 2011-04-28 20:48:36 -07:00
parent 4463efaf61
commit df7a03a4c3
5 changed files with 980 additions and 52 deletions

View File

@ -206,13 +206,15 @@ import java.util.Arrays;
* <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
*
* <tr><th>&nbsp;</th></tr>
* <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr>
* <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
* <td headers="matches">A Latin&nbsp;script character (simple <a href="#ubc">script</a>)</td></tr>
* <td headers="matches">A Latin&nbsp;script character (<a href="#usc">script</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
* <td headers="matches">A character in the Greek&nbsp;block (simple <a href="#ubc">block</a>)</td></tr>
* <td headers="matches">A character in the Greek&nbsp;block (<a href="#ubc">block</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
* <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
* <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td>
* <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
* <td headers="matches">A currency symbol</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
@ -328,10 +330,11 @@ import java.util.Arrays;
* <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
* <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td>
* <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU)&nbsp;</tt></td>
* <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
* <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
* <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
* <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>
* on - off</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td>
* <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
* given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
@ -518,61 +521,140 @@ import java.util.Arrays;
*
* <p> This class is in conformance with Level 1 of <a
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
* Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
* Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
* Canonical Equivalents.
*
* <p> Unicode escape sequences such as <tt>&#92;u2014</tt> in Java source code
* <p>
* <b>Unicode escape sequences</b> such as <tt>&#92;u2014</tt> in Java source code
* are processed as described in section 3.3 of
* <cite>The Java&trade; Language Specification</cite>.
* Such escape sequences are also
* implemented directly by the regular-expression parser so that Unicode
* escapes can be used in expressions that are read from files or from the
* keyboard. Thus the strings <tt>"&#92;u2014"</tt> and <tt>"\\u2014"</tt>,
* while not equal, compile into the same pattern, which matches the character
* with hexadecimal value <tt>0x2014</tt>.
*
* <p> A Unicode character can also be represented in a regular-expression by
* using its hexadecimal code point value directly as described in construct
* Such escape sequences are also implemented directly by the regular-expression
* parser so that Unicode escapes can be used in expressions that are read from
* files or from the keyboard. Thus the strings <tt>"&#92;u2014"</tt> and
* <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which
* matches the character with hexadecimal value <tt>0x2014</tt>.
* <p>
* A Unicode character can also be represented in a regular-expression by
* using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
* <tt>&#92;x{...}</tt>, for example a supplementary character U+2011F
* can be specified as <tt>&#92;x{2011F}</tt>, instead of two consecutive
* Unicode escape sequences of the surrogate pair
* <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.
*
* <a name="ubc">
* <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
* <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
* <p>
* Unicode scripts, blocks, categories and binary properties are written with
* the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
* <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
* the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
* does not match if the input has that property.
* <p>
* Scripts are specified either with the prefix {@code Is}, as in
* Scripts, blocks, categories and binary properties can be used both inside
* and outside of a character class.
* <a name="usc">
* <p>
* <b>Scripts</b> are specified either with the prefix {@code Is}, as in
* {@code IsHiragana}, or by using the {@code script} keyword (or its short
* form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
* <p>
* Blocks are specified with the prefix {@code In}, as in
* The script names supported by <code>Pattern</code> are the valid script names
* accepted and defined by
* {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
* <a name="ubc">
* <p>
* <b>Blocks</b> are specified with the prefix {@code In}, as in
* {@code InMongolian}, or by using the keyword {@code block} (or its short
* form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
* <p>
* Categories may be specified with the optional prefix {@code Is}:
* The block names supported by <code>Pattern</code> are the valid block names
* accepted and defined by
* {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
* <p>
* <a name="ucc">
* <b>Categories</b> may be specified with the optional prefix {@code Is}:
* Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
* letters. Same as scripts and blocks, categories can also be specified
* by using the keyword {@code general_category} (or its short form
* {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
* <p>
* Scripts, blocks and categories can be used both inside and outside of a
* character class.
* <p> The supported categories are those of
* The supported categories are those of
* <a href="http://www.unicode.org/unicode/standard/standard.html">
* <i>The Unicode Standard</i></a> in the version specified by the
* {@link java.lang.Character Character} class. The category names are those
* defined in the Standard, both normative and informative.
* The script names supported by <code>Pattern</code> are the valid script names
* accepted and defined by
* {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
* The block names supported by <code>Pattern</code> are the valid block names
* accepted and defined by
* {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
* <p>
* <a name="jcc"> <p>Categories that behave like the java.lang.Character
* <a name="ubpc">
* <b>Binary properties</b> are specified with the prefix {@code Is}, as in
* {@code IsAlphabetic}. The binary properties supported by <code>Pattern</code>
* are
* <ul>
* <li> Alphabetic
* <li> Ideographic
* <li> Letter
* <li> Lowercase
* <li> Uppercase
* <li> Titlecase
* <li> Punctuation
* <li> Control
* <li> White_Space
* <li> Digit
* <li> Hex_Digit
* <li> Noncharacter_Code_Point
* <li> Assigned
* </ul>
* <p>
* <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
* conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
* of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expressions
* </i></a>, when the {@link #UNICODE_CHARACTER_CLASS} flag is specified.
* <p>
* <table border="0" cellpadding="1" cellspacing="0"
* summary="predefined and posix character classes in Unicode mode">
* <tr align="left">
* <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th>
* <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
*</tr>
* <tr><td><tt>\p{Lower}</tt></td>
* <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr>
* <tr><td><tt>\p{Upper}</tt></td>
* <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr>
* <tr><td><tt>\p{ASCII}</tt></td>
* <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
* <tr><td><tt>\p{Alpha}</tt></td>
* <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr>
* <tr><td><tt>\p{Digit}</tt></td>
* <td>A decimal digit character:<tt>\p{IsDigit}</tt></td></tr>
* <tr><td><tt>\p{Alnum}</tt></td>
* <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr>
* <tr><td><tt>\p{Punct}</tt></td>
* <td>A punctuation character:<tt>\p{IsPunctuation}</tt></td></tr>
* <tr><td><tt>\p{Graph}</tt></td>
* <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr>
* <tr><td><tt>\p{Print}</tt></td>
* <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr>
* <tr><td><tt>\p{Blank}</tt></td>
* <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr>
* <tr><td><tt>\p{Cntrl}</tt></td>
* <td>A control character: <tt>\p{gc=Cc}</tt></td></tr>
* <tr><td><tt>\p{XDigit}</tt></td>
* <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr>
* <tr><td><tt>\p{Space}</tt></td>
* <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr>
* <tr><td><tt>\d</tt></td>
* <td>A digit: <tt>\p{IsDigit}</tt></td></tr>
* <tr><td><tt>\D</tt></td>
* <td>A non-digit: <tt>[^\d]</tt></td></tr>
* <tr><td><tt>\s</tt></td>
* <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr>
* <tr><td><tt>\S</tt></td>
* <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>
* <tr><td><tt>\w</tt></td>
* <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>
* <tr><td><tt>\W</tt></td>
* <td>A non-word character: <tt>[^\w]</tt></td></tr>
* </table>
* <p>
* <a name="jcc">
* Categories that behave like the java.lang.Character
* boolean is<i>methodname</i> methods (except for the deprecated ones) are
* available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
* the specified property has the name <tt>java<i>methodname</i></tt>.
@ -796,6 +878,28 @@ public final class Pattern
*/
public static final int CANON_EQ = 0x80;
/**
 * Enables the Unicode version of <i>Predefined character classes</i> and
 * <i>POSIX character classes</i>.
 *
 * <p> When this flag is specified, the <i>Predefined character classes</i>
 * and <i>POSIX character classes</i>, which are otherwise US-ASCII only,
 * are in conformance with
 * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
 * Standard #18: Unicode Regular Expressions</i></a>
 * <i>Annex C: Compatibility Properties</i>.
 * <p>
 * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
 * flag expression&nbsp;<tt>(?U)</tt>.
 * <p>
 * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
 * folding.
 * <p>
 * Specifying this flag may impose a performance penalty. </p>
 * @since 1.7
 */
public static final int UNICODE_CHARACTER_CLASS = 0x100;
/* Pattern has only two serialized components: The pattern string
* and the flags, which are all that is needed to recompile the pattern
* when it is deserialized.
@ -918,7 +1022,8 @@ public final class Pattern
* Match flags, a bit mask that may include
* {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
* {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
* {@link #LITERAL} and {@link #COMMENTS}
* {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
* and {@link #COMMENTS}
*
* @throws IllegalArgumentException
* If bit values other than those corresponding to the defined
@ -1209,6 +1314,10 @@ public final class Pattern
pattern = p;
flags = f;
// to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
if ((flags & UNICODE_CHARACTER_CLASS) != 0)
flags |= UNICODE_CASE;
// Reset group index count
capturingGroupCount = 1;
localCount = 0;
@ -2164,12 +2273,14 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
return -1;
case 'B':
if (inclass) break;
if (create) root = new Bound(Bound.NONE);
if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'C':
break;
case 'D':
if (create) root = new Ctype(ASCII.DIGIT).complement();
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.DIGIT).complement()
: new Ctype(ASCII.DIGIT).complement();
return -1;
case 'E':
case 'F':
@ -2191,14 +2302,18 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
case 'R':
break;
case 'S':
if (create) root = new Ctype(ASCII.SPACE).complement();
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WHITE_SPACE).complement()
: new Ctype(ASCII.SPACE).complement();
return -1;
case 'T':
case 'U':
case 'V':
break;
case 'W':
if (create) root = new Ctype(ASCII.WORD).complement();
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WORD).complement()
: new Ctype(ASCII.WORD).complement();
return -1;
case 'X':
case 'Y':
@ -2216,12 +2331,14 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
return '\007';
case 'b':
if (inclass) break;
if (create) root = new Bound(Bound.BOTH);
if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'c':
return c();
case 'd':
if (create) root = new Ctype(ASCII.DIGIT);
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.DIGIT)
: new Ctype(ASCII.DIGIT);
return -1;
case 'e':
return '\033';
@ -2259,7 +2376,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
case 'r':
return '\r';
case 's':
if (create) root = new Ctype(ASCII.SPACE);
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WHITE_SPACE)
: new Ctype(ASCII.SPACE);
return -1;
case 't':
return '\t';
@ -2268,7 +2387,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
case 'v':
return '\013';
case 'w':
if (create) root = new Ctype(ASCII.WORD);
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WORD)
: new Ctype(ASCII.WORD);
return -1;
case 'x':
return x();
@ -2490,7 +2611,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
{
next();
String name;
CharProperty node;
CharProperty node = null;
if (singleLetter) {
int c = temp[cursor];
@ -2536,11 +2657,21 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} else if (name.startsWith("Is")) {
// \p{isGeneralCategory} and \p{isScriptName}
name = name.substring(2);
node = CharPropertyNames.charPropertyFor(name);
UnicodeProp uprop = UnicodeProp.forName(name);
if (uprop != null)
node = new Utype(uprop);
if (node == null)
node = CharPropertyNames.charPropertyFor(name);
if (node == null)
node = unicodeScriptPropertyFor(name);
} else {
node = charPropertyNodeFor(name);
if (has(UNICODE_CHARACTER_CLASS)) {
UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
if (uprop != null)
node = new Utype(uprop);
}
if (node == null)
node = charPropertyNodeFor(name);
}
}
if (maybeComplement) {
@ -2822,6 +2953,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
case 'x':
flags |= COMMENTS;
break;
case 'U':
flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
break;
case '-': // subFlag then fall through
ch = next();
subFlag();
@ -2861,6 +2995,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
case 'x':
flags &= ~COMMENTS;
break;
case 'U':
    // Turning off U also clears UNICODE_CASE, mirroring the "on" arm
    // which sets both bits together.
    flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
    // Without this break the arm fell through into default's return,
    // unlike every other flag arm in this switch.
    break;
default:
    return;
}
@ -3663,6 +3799,18 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
}
/**
 * Character-property node backed by a {@link UnicodeProp}: a code point
 * satisfies this node exactly when the wrapped property accepts it.
 */
static final class Utype extends CharProperty {
    /** The Unicode property this node delegates to. */
    final UnicodeProp uprop;

    Utype(UnicodeProp prop) {
        uprop = prop;
    }

    boolean isSatisfiedBy(int ch) {
        boolean accepted = uprop.is(ch);
        return accepted;
    }
}
/**
* Node class that matches a POSIX type.
*/
@ -5025,9 +5173,17 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
static int BOTH = 0x3;
static int NONE = 0x4;
int type;
Bound(int n) {
boolean useUWORD;
Bound(int n, boolean useUWORD) {
type = n;
this.useUWORD = useUWORD;
}
boolean isWord(int ch) {
return useUWORD ? UnicodeProp.WORD.is(ch)
: (ch == '_' || Character.isLetterOrDigit(ch));
}
int check(Matcher matcher, int i, CharSequence seq) {
int ch;
boolean left = false;
@ -5039,14 +5195,14 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
if (i > startIndex) {
ch = Character.codePointBefore(seq, i);
left = (ch == '_' || Character.isLetterOrDigit(ch) ||
left = (isWord(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i-1, seq)));
}
boolean right = false;
if (i < endIndex) {
ch = Character.codePointAt(seq, i);
right = (ch == '_' || Character.isLetterOrDigit(ch) ||
right = (isWord(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i, seq)));
} else {
@ -5428,6 +5584,12 @@ NEXT: while (i <= last) {
defClone("javaUpperCase", new CloneableProperty() {
boolean isSatisfiedBy(int ch) {
return Character.isUpperCase(ch);}});
defClone("javaAlphabetic", new CloneableProperty() {
boolean isSatisfiedBy(int ch) {
return Character.isAlphabetic(ch);}});
defClone("javaIdeographic", new CloneableProperty() {
boolean isSatisfiedBy(int ch) {
return Character.isIdeographic(ch);}});
defClone("javaTitleCase", new CloneableProperty() {
boolean isSatisfiedBy(int ch) {
return Character.isTitleCase(ch);}});

View File

@ -0,0 +1,236 @@
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package java.util.regex;
import java.util.HashMap;
import java.util.Locale;
/**
 * Unicode properties usable from a regular expression, each expressed as a
 * predicate over a single code point via {@link #is(int)}.
 *
 * <p> The constants cover both plain properties (ALPHABETIC, WHITE_SPACE, ...)
 * and the POSIX-style compatibility classes (ALNUM, BLANK, GRAPH, PRINT,
 * WORD). The comments on each constant give the property expression the
 * implementation is meant to correspond to.
 *
 * <p> Lookups by name are case-insensitive and return {@code null} rather
 * than throwing when the name is unknown.
 */
enum UnicodeProp {

    ALPHABETIC {
        @Override
        public boolean is(int ch) {
            return Character.isAlphabetic(ch);
        }
    },

    LETTER {
        @Override
        public boolean is(int ch) {
            return Character.isLetter(ch);
        }
    },

    IDEOGRAPHIC {
        @Override
        public boolean is(int ch) {
            return Character.isIdeographic(ch);
        }
    },

    LOWERCASE {
        @Override
        public boolean is(int ch) {
            return Character.isLowerCase(ch);
        }
    },

    UPPERCASE {
        @Override
        public boolean is(int ch) {
            return Character.isUpperCase(ch);
        }
    },

    TITLECASE {
        @Override
        public boolean is(int ch) {
            return Character.isTitleCase(ch);
        }
    },

    WHITE_SPACE {
        // \p{Whitespace}
        @Override
        public boolean is(int ch) {
            // Bit set of the three separator categories, indexed by
            // Character.getType(ch); plus U+0009..U+000D and U+0085.
            return ((((1 << Character.SPACE_SEPARATOR) |
                      (1 << Character.LINE_SEPARATOR) |
                      (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
                   != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
        }
    },

    CONTROL {
        // \p{gc=Control}
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) == Character.CONTROL;
        }
    },

    PUNCTUATION {
        // \p{gc=Punctuation}
        @Override
        public boolean is(int ch) {
            return ((((1 << Character.CONNECTOR_PUNCTUATION) |
                      (1 << Character.DASH_PUNCTUATION) |
                      (1 << Character.START_PUNCTUATION) |
                      (1 << Character.END_PUNCTUATION) |
                      (1 << Character.OTHER_PUNCTUATION) |
                      (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
                      (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
                   != 0;
        }
    },

    HEX_DIGIT {
        // \p{gc=Decimal_Number}
        // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
        @Override
        public boolean is(int ch) {
            // Any decimal digit, or an ASCII / fullwidth 0-9, A-F, a-f.
            return DIGIT.is(ch) ||
                   (ch >= 0x0030 && ch <= 0x0039) ||
                   (ch >= 0x0041 && ch <= 0x0046) ||
                   (ch >= 0x0061 && ch <= 0x0066) ||
                   (ch >= 0xFF10 && ch <= 0xFF19) ||
                   (ch >= 0xFF21 && ch <= 0xFF26) ||
                   (ch >= 0xFF41 && ch <= 0xFF46);
        }
    },

    ASSIGNED {
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) != Character.UNASSIGNED;
        }
    },

    NONCHARACTER_CODE_POINT {
        // PropList.txt:Noncharacter_Code_Point
        @Override
        public boolean is(int ch) {
            // Low 16 bits are FFFE or FFFF, or the code point is in
            // U+FDD0..U+FDEF.
            return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
        }
    },

    DIGIT {
        // \p{gc=Decimal_Number}
        @Override
        public boolean is(int ch) {
            return Character.isDigit(ch);
        }
    },

    ALNUM {
        // \p{alpha}
        // \p{digit}
        @Override
        public boolean is(int ch) {
            return ALPHABETIC.is(ch) || DIGIT.is(ch);
        }
    },

    BLANK {
        // \p{Whitespace} --
        //    [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
        //     \p{gc=Line_Separator}
        //     \p{gc=Paragraph_Separator}]
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) == Character.SPACE_SEPARATOR ||
                   ch == 0x9; // \N{HT}
        }
    },

    GRAPH {
        // [^
        //  \p{space}
        //  \p{gc=Control}
        //  \p{gc=Surrogate}
        //  \p{gc=Unassigned}]
        @Override
        public boolean is(int ch) {
            // True when the general category is none of the excluded ones.
            return ((((1 << Character.SPACE_SEPARATOR) |
                      (1 << Character.LINE_SEPARATOR) |
                      (1 << Character.PARAGRAPH_SEPARATOR) |
                      (1 << Character.CONTROL) |
                      (1 << Character.SURROGATE) |
                      (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
                   == 0;
        }
    },

    PRINT {
        // \p{graph}
        // \p{blank}
        // -- \p{cntrl}
        @Override
        public boolean is(int ch) {
            return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch);
        }
    },

    WORD {
        // \p{alpha}
        // \p{gc=Mark}
        // \p{digit}
        // \p{gc=Connector_Punctuation}
        @Override
        public boolean is(int ch) {
            return ALPHABETIC.is(ch) ||
                   ((((1 << Character.NON_SPACING_MARK) |
                      (1 << Character.ENCLOSING_MARK) |
                      (1 << Character.COMBINING_SPACING_MARK) |
                      (1 << Character.DECIMAL_DIGIT_NUMBER) |
                      (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
                   != 0;
        }
    };

    /** Upper-cased POSIX class name to the property implementing it. */
    private static final HashMap<String, UnicodeProp> posix = new HashMap<>();

    /**
     * Upper-cased constant name, or underscore-free alias, to property.
     * Used instead of {@code valueOf} so that an unknown name is a plain
     * map miss rather than a thrown and swallowed exception.
     */
    private static final HashMap<String, UnicodeProp> names = new HashMap<>();

    static {
        posix.put("ALPHA", ALPHABETIC);
        posix.put("LOWER", LOWERCASE);
        posix.put("UPPER", UPPERCASE);
        posix.put("SPACE", WHITE_SPACE);
        posix.put("PUNCT", PUNCTUATION);
        posix.put("XDIGIT", HEX_DIGIT);
        posix.put("ALNUM", ALNUM);
        posix.put("CNTRL", CONTROL);
        posix.put("DIGIT", DIGIT);
        posix.put("BLANK", BLANK);
        posix.put("GRAPH", GRAPH);
        posix.put("PRINT", PRINT);

        for (UnicodeProp prop : values()) {
            names.put(prop.name(), prop);
        }
        names.put("WHITESPACE", WHITE_SPACE);
        names.put("HEXDIGIT", HEX_DIGIT);
        names.put("NONCHARACTERCODEPOINT", NONCHARACTER_CODE_POINT);
    }

    /**
     * Looks up a property by constant name or alias, ignoring case.
     *
     * @param propName the property name, e.g. {@code "Alphabetic"},
     *                 {@code "White_Space"} or {@code "WhiteSpace"}
     * @return the matching property, or {@code null} if there is none
     * @throws NullPointerException if {@code propName} is {@code null}
     */
    public static UnicodeProp forName(String propName) {
        return names.get(propName.toUpperCase(Locale.ENGLISH));
    }

    /**
     * Looks up a property by POSIX class name, ignoring case.
     *
     * @param propName the POSIX class name, e.g. {@code "Alpha"} or
     *                 {@code "XDigit"}
     * @return the matching property, or {@code null} if the name is not a
     *         known POSIX class name
     * @throws NullPointerException if {@code propName} is {@code null}
     */
    public static UnicodeProp forPOSIXName(String propName) {
        return posix.get(propName.toUpperCase(Locale.ENGLISH));
    }

    /**
     * Tests whether the given code point has this property.
     *
     * @param ch the code point to test
     * @return {@code true} if {@code ch} has this property
     */
    public abstract boolean is(int ch);
}

View File

@ -0,0 +1,247 @@
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
 * ASCII-only character classification used as a reference by the regex
 * tests. Each code point in 0x00..0x7F maps to a bit set of class flags
 * (upper byte and above) combined with a small numeric value in the low
 * byte for digits and letters; every other code point maps to 0.
 */
final class POSIX_ASCII {

    // Class flag bits.
    static final int UPPER = 1 << 8;
    static final int LOWER = 1 << 9;
    static final int DIGIT = 1 << 10;
    static final int SPACE = 1 << 11;
    static final int PUNCT = 1 << 12;
    static final int CNTRL = 1 << 13;
    static final int BLANK = 1 << 14;
    static final int HEX   = 1 << 15;
    static final int UNDER = 1 << 16;
    static final int ASCII = 0x0000FF00;

    // Composite classes.
    static final int ALPHA  = UPPER | LOWER;
    static final int ALNUM  = ALPHA | DIGIT;
    static final int GRAPH  = PUNCT | ALNUM;
    static final int WORD   = ALNUM | UNDER;
    static final int XDIGIT = HEX;

    /** Type word for each of the 128 ASCII code points. */
    private static final int[] ctype = buildTable();

    /** Fills the 128-entry table by classifying each code point in turn. */
    private static int[] buildTable() {
        int[] table = new int[128];
        for (int c = 0; c < table.length; c++) {
            table[c] = classify(c);
        }
        return table;
    }

    /** Computes the type word of one ASCII code point (0x00..0x7F). */
    private static int classify(int c) {
        if (c < 0x20 || c == 0x7F) {
            if (c == 0x09) {                    // HT
                return SPACE | CNTRL | BLANK;
            }
            if (c >= 0x0A && c <= 0x0D) {       // LF, VT, FF, CR
                return SPACE | CNTRL;
            }
            return CNTRL;
        }
        if (c == 0x20) {
            return SPACE | BLANK;
        }
        if (c >= '0' && c <= '9') {
            return DIGIT | HEX | (c - '0');
        }
        if (c >= 'A' && c <= 'Z') {
            // Low byte carries the letter's value counted from A = 10.
            int value = c - 'A' + 10;
            return (c <= 'F') ? (UPPER | HEX | value) : (UPPER | value);
        }
        if (c >= 'a' && c <= 'z') {
            int value = c - 'a' + 10;
            return (c <= 'f') ? (LOWER | HEX | value) : (LOWER | value);
        }
        if (c == '_') {
            return PUNCT | UNDER;
        }
        // Everything left in 0x21..0x7E is plain punctuation.
        return PUNCT;
    }

    /** Returns the type word of {@code ch}, or 0 if it is not ASCII. */
    static int getType(int ch) {
        if (!isAscii(ch)) {
            return 0;
        }
        return ctype[ch];
    }

    /** Tests whether {@code ch} has at least one of the bits in {@code type}. */
    static boolean isType(int ch, int type) {
        int bits = getType(ch) & type;
        return bits != 0;
    }

    static boolean isAscii(int ch) {
        return ch >= 0 && ch <= 0x7F;
    }

    static boolean isAlpha(int ch) {
        return isType(ch, ALPHA);
    }

    static boolean isDigit(int ch) {
        return ch >= '0' && ch <= '9';
    }

    static boolean isAlnum(int ch) {
        return isType(ch, ALNUM);
    }

    static boolean isGraph(int ch) {
        return isType(ch, GRAPH);
    }

    static boolean isPrint(int ch) {
        return ch >= 0x20 && ch <= 0x7E;
    }

    static boolean isPunct(int ch) {
        return isType(ch, PUNCT);
    }

    static boolean isSpace(int ch) {
        return isType(ch, SPACE);
    }

    static boolean isHexDigit(int ch) {
        return isType(ch, HEX);
    }

    static boolean isCntrl(int ch) {
        return isType(ch, CNTRL);
    }

    static boolean isLower(int ch) {
        return ch >= 'a' && ch <= 'z';
    }

    static boolean isUpper(int ch) {
        return ch >= 'A' && ch <= 'Z';
    }

    static boolean isWord(int ch) {
        return isType(ch, WORD);
    }
}

View File

@ -0,0 +1,141 @@
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
import java.util.HashMap;
import java.util.Locale;
/**
 * Unicode-aware counterparts of the POSIX character classes, written
 * directly against {@link Character}. The comments on each method give the
 * property expression the implementation is meant to correspond to.
 */
public final class POSIX_Unicode {

    public static boolean isAlpha(int ch) {
        return Character.isAlphabetic(ch);
    }

    public static boolean isLower(int ch) {
        return Character.isLowerCase(ch);
    }

    public static boolean isUpper(int ch) {
        return Character.isUpperCase(ch);
    }

    // \p{Whitespace}
    public static boolean isSpace(int ch) {
        // Bit set of the three separator categories, indexed by
        // Character.getType(ch); plus U+0009..U+000D and U+0085.
        return ((((1 << Character.SPACE_SEPARATOR) |
                  (1 << Character.LINE_SEPARATOR) |
                  (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
                   != 0 ||
               (ch >= 0x9 && ch <= 0xd) ||
               (ch == 0x85);
    }

    // \p{gc=Control}
    public static boolean isCntrl(int ch) {
        return Character.getType(ch) == Character.CONTROL;
    }

    // \p{gc=Punctuation}
    public static boolean isPunct(int ch) {
        return ((((1 << Character.CONNECTOR_PUNCTUATION) |
                  (1 << Character.DASH_PUNCTUATION) |
                  (1 << Character.START_PUNCTUATION) |
                  (1 << Character.END_PUNCTUATION) |
                  (1 << Character.OTHER_PUNCTUATION) |
                  (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
                  (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
               != 0;
    }

    // \p{gc=Decimal_Number}
    // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
    public static boolean isHexDigit(int ch) {
        // Any decimal digit, or an ASCII / fullwidth 0-9, A-F, a-f.
        return Character.isDigit(ch) ||
               (ch >= 0x0030 && ch <= 0x0039) ||
               (ch >= 0x0041 && ch <= 0x0046) ||
               (ch >= 0x0061 && ch <= 0x0066) ||
               (ch >= 0xFF10 && ch <= 0xFF19) ||
               (ch >= 0xFF21 && ch <= 0xFF26) ||
               (ch >= 0xFF41 && ch <= 0xFF46);
    }

    // \p{gc=Decimal_Number}
    public static boolean isDigit(int ch) {
        return Character.isDigit(ch);
    }

    // \p{alpha}
    // \p{digit}
    public static boolean isAlnum(int ch) {
        return Character.isAlphabetic(ch) || Character.isDigit(ch);
    }

    // \p{Whitespace} --
    //    [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
    //     \p{gc=Line_Separator}
    //     \p{gc=Paragraph_Separator}]
    public static boolean isBlank(int ch) {
        int type = Character.getType(ch);
        // Whitespace minus the line-breaking characters and separators.
        return isSpace(ch) &&
               ch != 0xa && ch != 0xb && ch != 0xc && ch != 0xd && ch != 0x85 &&
               type != Character.LINE_SEPARATOR &&
               type != Character.PARAGRAPH_SEPARATOR;
    }

    // [^
    //  \p{space}
    //  \p{gc=Control}
    //  \p{gc=Surrogate}
    //  \p{gc=Unassigned}]
    public static boolean isGraph(int ch) {
        int type = Character.getType(ch);
        return !(isSpace(ch) ||
                 Character.CONTROL == type ||
                 Character.SURROGATE == type ||
                 Character.UNASSIGNED == type);
    }

    // \p{graph}
    // \p{blank}
    // -- \p{cntrl}
    public static boolean isPrint(int ch) {
        return (isGraph(ch) || isBlank(ch)) && !isCntrl(ch);
    }

    // PropList.txt:Noncharacter_Code_Point
    public static boolean isNoncharacterCodePoint(int ch) {
        // Low 16 bits are FFFE or FFFF, or the code point is in
        // U+FDD0..U+FDEF.
        return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
    }

    // \p{alpha}
    // \p{gc=Mark}
    // \p{digit}
    // \p{gc=Connector_Punctuation}
    public static boolean isWord(int ch) {
        return isAlpha(ch) ||
               ((((1 << Character.NON_SPACING_MARK) |
                  (1 << Character.ENCLOSING_MARK) |
                  (1 << Character.COMBINING_SPACING_MARK) |
                  (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
               != 0 ||
               isDigit(ch);
    }
}

View File

@ -32,7 +32,7 @@
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
* 6350801 6676425 6878475 6919132 6931676 6948903 7014645
* 6350801 6676425 6878475 6919132 6931676 6948903 7014645 7039066
*/
import java.util.regex.*;
@ -137,6 +137,7 @@ public class RegExTest {
nonBmpClassComplementTest();
unicodePropertiesTest();
unicodeHexNotationTest();
unicodeClassesTest();
if (failure)
throw new RuntimeException("Failure in the RE handling.");
else
@ -3656,5 +3657,146 @@ public class RegExTest {
failCount++;
}
report("unicodeHexNotation");
}
}
/*
 * Checks the predefined character classes, \b and \w against reference
 * predicates for every code point from 1 up to (but excluding) 0x30000:
 *   - patterns compiled without flags are compared with POSIX_ASCII,
 *   - patterns compiled with UNICODE_CHARACTER_CLASS, or with the embedded
 *     flag (?U), are compared with POSIX_Unicode / java.lang.Character,
 *   - \p{IsXxx} and \p{javaXxx} properties are compared with the matching
 *     java.lang.Character method.
 * Any disagreement for a code point bumps failCount once. A few fixed
 * strings then check that word boundaries line up around non-ASCII words.
 */
private static void unicodeClassesTest() throws Exception {

    // No flags. ASCII is compiled here but not matched against below;
    // compiling it still verifies that the pattern parses.
    Matcher lower  = Pattern.compile("\\p{Lower}").matcher("");
    Matcher upper  = Pattern.compile("\\p{Upper}").matcher("");
    Matcher ASCII  = Pattern.compile("\\p{ASCII}").matcher("");
    Matcher alpha  = Pattern.compile("\\p{Alpha}").matcher("");
    Matcher digit  = Pattern.compile("\\p{Digit}").matcher("");
    Matcher alnum  = Pattern.compile("\\p{Alnum}").matcher("");
    Matcher punct  = Pattern.compile("\\p{Punct}").matcher("");
    Matcher graph  = Pattern.compile("\\p{Graph}").matcher("");
    Matcher print  = Pattern.compile("\\p{Print}").matcher("");
    Matcher blank  = Pattern.compile("\\p{Blank}").matcher("");
    Matcher cntrl  = Pattern.compile("\\p{Cntrl}").matcher("");
    Matcher xdigit = Pattern.compile("\\p{XDigit}").matcher("");
    Matcher space  = Pattern.compile("\\p{Space}").matcher("");
    Matcher bound  = Pattern.compile("\\b").matcher("");
    Matcher word   = Pattern.compile("\\w++").matcher("");

    // UNICODE_CHARACTER_CLASS. ASCIIU and boundU are likewise compile-only.
    Matcher lowerU  = Pattern.compile("\\p{Lower}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher upperU  = Pattern.compile("\\p{Upper}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher ASCIIU  = Pattern.compile("\\p{ASCII}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher alphaU  = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher digitU  = Pattern.compile("\\p{Digit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher alnumU  = Pattern.compile("\\p{Alnum}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher punctU  = Pattern.compile("\\p{Punct}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher graphU  = Pattern.compile("\\p{Graph}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher printU  = Pattern.compile("\\p{Print}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher blankU  = Pattern.compile("\\p{Blank}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher cntrlU  = Pattern.compile("\\p{Cntrl}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher xdigitU = Pattern.compile("\\p{XDigit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher spaceU  = Pattern.compile("\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher boundU  = Pattern.compile("\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher wordU   = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS).matcher("");

    // embedded flag (?U)
    // Deliberately compiled WITHOUT Pattern.UNICODE_CHARACTER_CLASS: if the
    // flag were also passed to compile(), these matchers would behave
    // correctly even when the embedded flag is ignored, and the embedded
    // form would never really be tested.
    Matcher lowerEU = Pattern.compile("(?U)\\p{Lower}").matcher("");
    Matcher graphEU = Pattern.compile("(?U)\\p{Graph}").matcher("");
    Matcher wordEU  = Pattern.compile("(?U)\\w").matcher("");

    Matcher bwb   = Pattern.compile("\\b\\w\\b").matcher("");
    Matcher bwbU  = Pattern.compile("\\b\\w++\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher bwbEU = Pattern.compile("(?U)\\b\\w++\\b").matcher("");

    // properties
    Matcher lowerP   = Pattern.compile("\\p{IsLowerCase}").matcher("");
    Matcher upperP   = Pattern.compile("\\p{IsUpperCase}").matcher("");
    Matcher titleP   = Pattern.compile("\\p{IsTitleCase}").matcher("");
    Matcher letterP  = Pattern.compile("\\p{IsLetter}").matcher("");
    Matcher alphaP   = Pattern.compile("\\p{IsAlphabetic}").matcher("");
    Matcher ideogP   = Pattern.compile("\\p{IsIdeographic}").matcher("");
    Matcher cntrlP   = Pattern.compile("\\p{IsControl}").matcher("");
    Matcher spaceP   = Pattern.compile("\\p{IsWhiteSpace}").matcher("");
    Matcher definedP = Pattern.compile("\\p{IsAssigned}").matcher("");
    Matcher nonCCPP  = Pattern.compile("\\p{IsNoncharacterCodePoint}").matcher("");

    // javaMethod
    Matcher lowerJ = Pattern.compile("\\p{javaLowerCase}").matcher("");
    Matcher upperJ = Pattern.compile("\\p{javaUpperCase}").matcher("");
    Matcher alphaJ = Pattern.compile("\\p{javaAlphabetic}").matcher("");
    Matcher ideogJ = Pattern.compile("\\p{javaIdeographic}").matcher("");

    for (int cp = 1; cp < 0x30000; cp++) {
        String str = new String(Character.toChars(cp));
        int type = Character.getType(cp);
        if (// lower
            POSIX_ASCII.isLower(cp)   != lower.reset(str).matches()  ||
            POSIX_Unicode.isLower(cp) != lowerU.reset(str).matches() ||
            Character.isLowerCase(cp) != lowerP.reset(str).matches() ||
            Character.isLowerCase(cp) != lowerEU.reset(str).matches()||
            Character.isLowerCase(cp) != lowerJ.reset(str).matches() ||
            // upper
            POSIX_ASCII.isUpper(cp)   != upper.reset(str).matches()  ||
            POSIX_Unicode.isUpper(cp) != upperU.reset(str).matches() ||
            Character.isUpperCase(cp) != upperP.reset(str).matches() ||
            Character.isUpperCase(cp) != upperJ.reset(str).matches() ||
            // alpha
            POSIX_ASCII.isAlpha(cp)   != alpha.reset(str).matches()  ||
            POSIX_Unicode.isAlpha(cp) != alphaU.reset(str).matches() ||
            Character.isAlphabetic(cp)!= alphaP.reset(str).matches() ||
            Character.isAlphabetic(cp)!= alphaJ.reset(str).matches() ||
            // digit
            POSIX_ASCII.isDigit(cp)   != digit.reset(str).matches()  ||
            Character.isDigit(cp)     != digitU.reset(str).matches() ||
            // alnum
            POSIX_ASCII.isAlnum(cp)   != alnum.reset(str).matches()  ||
            POSIX_Unicode.isAlnum(cp) != alnumU.reset(str).matches() ||
            // punct
            POSIX_ASCII.isPunct(cp)   != punct.reset(str).matches()  ||
            POSIX_Unicode.isPunct(cp) != punctU.reset(str).matches() ||
            // graph
            POSIX_ASCII.isGraph(cp)   != graph.reset(str).matches()  ||
            POSIX_Unicode.isGraph(cp) != graphU.reset(str).matches() ||
            POSIX_Unicode.isGraph(cp) != graphEU.reset(str).matches()||
            // blank
            POSIX_ASCII.isType(cp, POSIX_ASCII.BLANK)
                                      != blank.reset(str).matches()  ||
            POSIX_Unicode.isBlank(cp) != blankU.reset(str).matches() ||
            // print
            POSIX_ASCII.isPrint(cp)   != print.reset(str).matches()  ||
            POSIX_Unicode.isPrint(cp) != printU.reset(str).matches() ||
            // cntrl
            POSIX_ASCII.isCntrl(cp)   != cntrl.reset(str).matches()  ||
            POSIX_Unicode.isCntrl(cp) != cntrlU.reset(str).matches() ||
            (Character.CONTROL == type) != cntrlP.reset(str).matches() ||
            // hexdigit
            POSIX_ASCII.isHexDigit(cp)   != xdigit.reset(str).matches()  ||
            POSIX_Unicode.isHexDigit(cp) != xdigitU.reset(str).matches() ||
            // space
            POSIX_ASCII.isSpace(cp)   != space.reset(str).matches()  ||
            POSIX_Unicode.isSpace(cp) != spaceU.reset(str).matches() ||
            POSIX_Unicode.isSpace(cp) != spaceP.reset(str).matches() ||
            // word
            POSIX_ASCII.isWord(cp)   != word.reset(str).matches()  ||
            POSIX_Unicode.isWord(cp) != wordU.reset(str).matches() ||
            POSIX_Unicode.isWord(cp) != wordEU.reset(str).matches()||
            // bwordb
            POSIX_ASCII.isWord(cp)   != bwb.reset(str).matches()  ||
            POSIX_Unicode.isWord(cp) != bwbU.reset(str).matches() ||
            // properties
            Character.isTitleCase(cp)   != titleP.reset(str).matches()  ||
            Character.isLetter(cp)      != letterP.reset(str).matches() ||
            Character.isIdeographic(cp) != ideogP.reset(str).matches()  ||
            Character.isIdeographic(cp) != ideogJ.reset(str).matches()  ||
            // IsAssigned must be the exact opposite of "type is UNASSIGNED",
            // so the two sides being EQUAL is the failure case here.
            (Character.UNASSIGNED == type) == definedP.reset(str).matches() ||
            POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches())
            failCount++;
    }

    // bounds/word align
    twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10);
    if (!bwbU.reset("\u0180sherman\u0400").matches())
        failCount++;
    twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11);
    if (!bwbU.reset("\u0180sh\u0345erman\u0400").matches())
        failCount++;
    twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4);
    if (!bwbU.reset("\u0724\u0739\u0724").matches())
        failCount++;
    if (!bwbEU.reset("\u0724\u0739\u0724").matches())
        failCount++;
    report("unicodePredefinedClasses");
}
}