From c8da8f2595f948524a869ff5928303a029ad9e98 Mon Sep 17 00:00:00 2001 From: Xueming Shen Date: Tue, 8 May 2012 10:57:13 -0700 Subject: [PATCH] 7014640: To add a metachar \R for line ending and character classes for vertical/horizontal ws \v \V \h \H Added proposed constructs Reviewed-by: alanb --- .../classes/java/util/regex/Pattern.java | 140 +++++++++++++----- jdk/test/java/util/regex/RegExTest.java | 84 ++++++++++- 2 files changed, 189 insertions(+), 35 deletions(-) diff --git a/jdk/src/share/classes/java/util/regex/Pattern.java b/jdk/src/share/classes/java/util/regex/Pattern.java index 626d5b08657..6c32f6a30bb 100644 --- a/jdk/src/share/classes/java/util/regex/Pattern.java +++ b/jdk/src/share/classes/java/util/regex/Pattern.java @@ -152,15 +152,24 @@ import java.util.Arrays; * A digit: [0-9] * \D * A non-digit: [^0-9] + * \h + * A horizontal whitespace character: + * [ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000] + * \H + * A non-horizontal whitespace character: [^\h] * \s * A whitespace character: [ \t\n\x0B\f\r] * \S * A non-whitespace character: [^\s] + * \v + * A vertical whitespace character: [\n\x0B\f\r\x85\u2028\u2029] + * + * \V + * A non-vertical whitespace character: [^\v] * \w * A word character: [a-zA-Z_0-9] * \W * A non-word character: [^\w] - * *   * POSIX character classes (US-ASCII only) * @@ -244,6 +253,13 @@ import java.util.Arrays; * The end of the input * *   + * Linebreak matcher + * \R + * Any Unicode linebreak sequence, is equivalent to + * \u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029] + * + * + *   * Greedy quantifiers * * X? @@ -599,11 +615,9 @@ import java.util.Arrays; *
  • Noncharacter_Code_Point *
  • Assigned * - - *

    - * Predefined Character classes and POSIX character classes are in - * conformance with the recommendation of Annex C: Compatibility Properties + * The following Predefined Character classes and POSIX character classes + * are in conformance with the recommendation of Annex C: Compatibility Properties * of Unicode Regular Expression * , when {@link #UNICODE_CHARACTER_CLASS} flag is specified. *

    @@ -668,12 +682,6 @@ import java.util.Arrays; * *

      *
    • Predefined character classes (Unicode character) - *

      \h    A horizontal whitespace - *

      \H    A non horizontal whitespace - *

      \v    A vertical whitespace - *

      \V    A non vertical whitespace - *

      \R    Any Unicode linebreak sequence - * \u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029] *

      \X    Match Unicode * * extended grapheme cluster @@ -2178,7 +2186,7 @@ loop: for(int x=0, offset=0; x= 0) { append(ch, first); first++; @@ -2276,7 +2284,7 @@ loop: for(int x=0, offset=0; x= 0) { if (peek() == '-') { @@ -2606,9 +2636,15 @@ loop: for(int x=0, offset=0; x= 0x0A && cp <= 0x0D) || + cp == 0x85 || cp == 0x2028 || cp == 0x2029; + } + } + + /** + * Node class that matches a Perl horizontal whitespace + */ + static final class HorizWS extends BmpCharProperty { + boolean isSatisfiedBy(int cp) { + return cp == 0x09 || cp == 0x20 || cp == 0xa0 || + cp == 0x1680 || cp == 0x180e || + cp >= 0x2000 && cp <= 0x200a || + cp == 0x202f || cp == 0x205f || cp == 0x3000; + } + } + /** * Base class for all Slice nodes */ diff --git a/jdk/test/java/util/regex/RegExTest.java b/jdk/test/java/util/regex/RegExTest.java index f583769ecb2..7846f5c05e3 100644 --- a/jdk/test/java/util/regex/RegExTest.java +++ b/jdk/test/java/util/regex/RegExTest.java @@ -33,7 +33,7 @@ * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133 * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066 - * 7067045 + * 7067045 7014640 */ import java.util.regex.*; @@ -141,6 +141,8 @@ public class RegExTest { unicodePropertiesTest(); unicodeHexNotationTest(); unicodeClassesTest(); + horizontalAndVerticalWSTest(); + linebreakTest(); if (failure) { throw new RuntimeException("RegExTest failed, 1st failure: " + @@ -857,13 +859,18 @@ public class RegExTest { // in replacement string try { "\uac00".replaceAll("\uac00", "$"); + failCount++; + } catch (IllegalArgumentException iie) { + } catch (Exception e) { + failCount++; + } + try { "\uac00".replaceAll("\uac00", "\\"); failCount++; } catch (IllegalArgumentException iie) { } catch (Exception e) { failCount++; } - report("Literal replacement"); } @@ -3838,4 +3845,77 @@ public class RegExTest { failCount++; report("unicodePredefinedClasses"); } + + 
+    private static void horizontalAndVerticalWSTest() throws Exception {
+        String hws = new String (new char[] {
+                                     0x09, 0x20, 0xa0, 0x1680, 0x180e,
+                                     0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
+                                     0x2006, 0x2007, 0x2008, 0x2009, 0x200a,
+                                     0x202f, 0x205f, 0x3000 });
+        String vws = new String (new char[] {
+                                     0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 0x2029 });
+        if (!Pattern.compile("\\h+").matcher(hws).matches() ||
+            !Pattern.compile("[\\h]+").matcher(hws).matches())
+            failCount++;
+        if (Pattern.compile("\\H").matcher(hws).find() ||
+            Pattern.compile("[\\H]").matcher(hws).find())
+            failCount++;
+        if (!Pattern.compile("\\v+").matcher(vws).matches() ||
+            !Pattern.compile("[\\v]+").matcher(vws).matches())
+            failCount++;
+        if (Pattern.compile("\\V").matcher(vws).find() ||
+            Pattern.compile("[\\V]").matcher(vws).find())
+            failCount++;
+        String prefix = "abcd";
+        String suffix = "efgh";
+        String ng = "A";
+        for (int i = 0; i < hws.length(); i++) {
+            String c = String.valueOf(hws.charAt(i));
+            Matcher m = Pattern.compile("\\h").matcher(prefix + c + suffix);
+            if (!m.find() || !c.equals(m.group()))
+                failCount++;
+            m = Pattern.compile("[\\h]").matcher(prefix + c + suffix);
+            if (!m.find() || !c.equals(m.group()))
+                failCount++;
+
+            m = Pattern.compile("\\H").matcher(hws.substring(0, i) + ng + hws.substring(i));
+            if (!m.find() || !ng.equals(m.group()))
+                failCount++;
+            m = Pattern.compile("[\\H]").matcher(hws.substring(0, i) + ng + hws.substring(i));
+            if (!m.find() || !ng.equals(m.group()))
+                failCount++;
+        }
+        for (int i = 0; i < vws.length(); i++) {
+            String c = String.valueOf(vws.charAt(i));
+            Matcher m = Pattern.compile("\\v").matcher(prefix + c + suffix);
+            if (!m.find() || !c.equals(m.group()))
+                failCount++;
+            m = Pattern.compile("[\\v]").matcher(prefix + c + suffix);
+            if (!m.find() || !c.equals(m.group()))
+                failCount++;
+
+            m = Pattern.compile("\\V").matcher(vws.substring(0, i) + ng + vws.substring(i));
+            if (!m.find() || !ng.equals(m.group()))
+                failCount++;
+            m = Pattern.compile("[\\V]").matcher(vws.substring(0, i) + ng + vws.substring(i));
+            if (!m.find() || !ng.equals(m.group()))
+                failCount++;
+        }
+        // \v in range is interpreted as 0x0B. This is the undocumented behavior
+        if (!Pattern.compile("[\\v-\\v]").matcher(String.valueOf((char)0x0B)).matches())
+            failCount++;
+        report("horizontalAndVerticalWSTest");
+    }
+
+    private static void linebreakTest() throws Exception {
+        String linebreaks = new String (new char[] {
+            0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0x2028, 0x2029 });
+        String crnl = "\r\n";
+        if (!Pattern.compile("\\R+").matcher(linebreaks).matches() ||
+            !Pattern.compile("\\R").matcher(crnl).matches() ||
+            Pattern.compile("\\R\\R").matcher(crnl).matches())
+            failCount++;
+        report("linebreakTest");
+    }
+
 }