Speed up extended-pictographic and grapheme-type lookups for low code points.

GenerateEmojiData now emits two condition lists instead of one: ranges whose
last code point is below 0x2000 replace %%%EXTPICT_LOW%%%, all remaining
ranges replace %%%EXTPICT_HIGH%%%. The per-range formatting moves into
rangeToString(), indented four more columns to match the deeper nesting in
the template. EmojiData.isExtendedPictographic() tests the short low list
inline and delegates everything else to the new private isHigh().

Grapheme gains getGraphemeType(), which answers CR, LF, the other code
points below 32 (CONTROL) and 0x20..0x7E (OTHER) directly and falls back to
getType() for the rest. nextBoundary() now carries the previous type (t0)
from one iteration to the next instead of recomputing it.

Review notes:
 * The CR/LF branch is removed from the getType() switch by this patch, so
   isBoundary() is routed through getGraphemeType() as well; otherwise a
   CR/LF pair would be classified CONTROL/CONTROL and rule GB3 of UAX #29
   (no break between CR and LF) would not be applied by isBoundary().
 * takeWhile/dropWhile split on r.last < 0x2000, so a range that starts
   below 0x2000 and ends at or above it would land in the high list and its
   low part would never be consulted. This presumably cannot happen with
   the current emoji data; verify when the data file is updated.

diff --git a/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java b/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java
index cbf06f9168e..80091ec47b6 100644
--- a/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java
+++ b/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java
@@ -67,27 +67,28 @@ public class GenerateEmojiData {
                     },
                     ArrayList::addAll);
 
+
             // make the code point conditions
-            String extPictCodePoints = extPictRanges.stream()
-                .map(r -> {
-                    if (r.start == r.last) {
-                        return (" ".repeat(12) + "cp == 0x" + toHexString(r.start));
-                    } else if (r.start == r.last - 1) {
-                        return " ".repeat(12) + "cp == 0x" + toHexString(r.start) + " ||\n" +
-                               " ".repeat(12) + "cp == 0x" + toHexString(r.last);
-                    } else {
-                        return " ".repeat(11) + "(cp >= 0x" + toHexString(r.start) +
-                               " && cp <= 0x" + toHexString(r.last) + ")";
-                    }
-                })
-                .collect(Collectors.joining(" ||\n")) + ";\n";
+            // only very few codepoints below 0x2000 are "emojis", so separate them
+            // out to generate a fast-path check that can be efficiently inlined
+            String lowExtPictCodePoints = extPictRanges.stream()
+                    .takeWhile(r -> r.last < 0x2000)
+                    .map(r -> rangeToString(r))
+                    .collect(Collectors.joining(" ||\n", "", ";\n"));
+
+            String highExtPictCodePoints = extPictRanges.stream()
+                    .dropWhile(r -> r.last < 0x2000)
+                    .map(r -> rangeToString(r))
+                    .collect(Collectors.joining(" ||\n", "", ";\n"));
 
             // Generate EmojiData.java file
             Files.write(Paths.get(args[2]),
                 Files.lines(Paths.get(args[0]))
                     .flatMap(l -> {
-                        if (l.equals("%%%EXTPICT%%%")) {
-                            return Stream.of(extPictCodePoints);
+                        if (l.equals("%%%EXTPICT_LOW%%%")) {
+                            return Stream.of(lowExtPictCodePoints);
+                        } else if (l.equals("%%%EXTPICT_HIGH%%%")) {
+                            return Stream.of(highExtPictCodePoints);
                         } else {
                             return Stream.of(l);
                         }
@@ -99,6 +100,18 @@ public class GenerateEmojiData {
         }
     }
 
+    static String rangeToString(Range r) {
+        if (r.start == r.last) {
+            return (" ".repeat(16) + "cp == 0x" + toHexString(r.start));
+        } else if (r.start == r.last - 1) {
+            return " ".repeat(16) + "cp == 0x" + toHexString(r.start) + " ||\n" +
+                   " ".repeat(16) + "cp == 0x" + toHexString(r.last);
+        } else {
+            return " ".repeat(15) + "(cp >= 0x" + toHexString(r.start) +
+                   " && cp <= 0x" + toHexString(r.last) + ")";
+        }
+    }
+
     static int toInt(String hexStr) {
         return Integer.parseUnsignedInt(hexStr, 16);
     }
diff --git a/src/java.base/share/classes/java/util/regex/EmojiData.java.template b/src/java.base/share/classes/java/util/regex/EmojiData.java.template
index c6b28625d02..ccc417fa6a1 100644
--- a/src/java.base/share/classes/java/util/regex/EmojiData.java.template
+++ b/src/java.base/share/classes/java/util/regex/EmojiData.java.template
@@ -40,7 +40,16 @@ final class EmojiData {
      * @return true if {@code cp} is an extended pictographic
      */
     static boolean isExtendedPictographic(int cp) {
+        if (cp < 0x2000) {
+            return
+%%%EXTPICT_LOW%%%
+        } else {
+            return isHigh(cp);
+        }
+    }
+
+    private static boolean isHigh(int cp) {
         return
-%%%EXTPICT%%%
+%%%EXTPICT_HIGH%%%
     }
 }
diff --git a/src/java.base/share/classes/java/util/regex/Grapheme.java b/src/java.base/share/classes/java/util/regex/Grapheme.java
index 68cc46c31a6..2e4bf0fddac 100644
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java
@@ -29,6 +29,19 @@ import java.util.Objects;
 
 final class Grapheme {
 
+    /**
+     * Determines if there is an extended grapheme cluster boundary between two
+     * continuing characters {@code cp1} and {@code cp2}.
+     * <p>
+     * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
+     * for the extended grapheme cluster boundary rules
+     * <p>
+     * Note: this method does not take care of stateful breaking.
+     */
+    static boolean isBoundary(int cp1, int cp2) {
+        return rules[getGraphemeType(cp1)][getGraphemeType(cp2)];
+    }
+
     /**
      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
      * the start of the char sequence is a boundary.
@@ -50,12 +63,12 @@ final class Grapheme {
         int ret = Character.charCount(ch0);
         int ch1;
         // indicates whether gb11 or gb12 is underway
-        boolean gb11 = EmojiData.isExtendedPictographic(ch0);
-        int riCount = getType(ch0) == RI ? 1 : 0;
+        int t0 = getGraphemeType(ch0);
+        int riCount = t0 == RI ? 1 : 0;
+        boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
         while (ret < limit) {
             ch1 = Character.codePointAt(src, ret);
-            int t0 = getType(ch0);
-            int t1 = getType(ch1);
+            int t1 = getGraphemeType(ch1);
 
             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
                 gb11 = false;
@@ -65,13 +78,14 @@ final class Grapheme {
                 if (ret > off) {
                     break;
                 } else {
-                    gb11 = EmojiData.isExtendedPictographic(ch1);
+                    gb11 = t1 == EXTENDED_PICTOGRAPHIC;
                     riCount = 0;
                 }
             }
 
-            riCount += getType(ch1) == RI ? 1 : 0;
-            ch0 = ch1;
+            riCount += (t1 == RI) ? 1 : 0;
+            t0 = t1;
+
             ret += Character.charCount(ch1);
         }
         return ret;
@@ -163,6 +177,20 @@ final class Grapheme {
                cp == 0xAA7B || cp == 0xAA7D;
     }
 
+    private static int getGraphemeType(int cp) {
+        if (cp < 0x007F) { // ASCII, except DEL (0x7F) which is left to getType
+            if (cp < 32) { // Control characters
+                if (cp == 0x000D)
+                    return CR;
+                if (cp == 0x000A)
+                    return LF;
+                return CONTROL;
+            }
+            return OTHER;
+        }
+        return getType(cp);
+    }
+
     @SuppressWarnings("fallthrough")
     private static int getType(int cp) {
         if (EmojiData.isExtendedPictographic(cp)) {
@@ -171,12 +199,6 @@ final class Grapheme {
 
         int type = Character.getType(cp);
         switch(type) {
-        case Character.CONTROL:
-            if (cp == 0x000D)
-                return CR;
-            if (cp == 0x000A)
-                return LF;
-            return CONTROL;
         case Character.UNASSIGNED:
             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
@@ -184,6 +206,7 @@ final class Grapheme {
             if (cp == 0x0378)
                 return OTHER;
 
+        case Character.CONTROL:
         case Character.LINE_SEPARATOR:
         case Character.PARAGRAPH_SEPARATOR:
         case Character.SURROGATE:
diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java
index a1541bd4434..af0d47bf396 100644
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@@ -3973,7 +3973,16 @@ loop: for(int x=0, offset=0; x