8303018: Unicode Emoji Properties

Reviewed-by: prr, erikj, rriggs
This commit is contained in:
Naoto Sato 2023-03-20 20:20:19 +00:00
parent bc0ed730f2
commit f593a6b52e
19 changed files with 643 additions and 278 deletions

View File

@ -0,0 +1,93 @@
/*
* Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package build.tools.generatecharacter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.AbstractMap;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
 * A class holding emoji character properties
 * https://unicode.org/reports/tr51/#Emoji_Properties_and_Data_Files
 *
 * <p>The data is read from a file in the {@code emoji-data.txt} format: text
 * after {@code #} is a comment, and each remaining non-blank line is
 * {@code <code point or range> ; <property name>}, with code points written
 * in hexadecimal and ranges written as {@code start..end}.
 */
class EmojiData {
    // Emoji properties map: code point -> OR-ed GenerateCharacter emoji mask bits.
    // Unmodifiable once built.
    private final Map<Integer, Long> emojiProps;

    /**
     * Reads the emoji properties for one Unicode plane from the given file.
     *
     * @param file  path to the emoji data file
     * @param plane plane number; only entries whose (first) code point lies
     *              in this plane are retained
     * @return the parsed emoji data
     * @throws IOException if the file cannot be read
     */
    static EmojiData readSpecFile(Path file, int plane) throws IOException {
        return new EmojiData(file, plane);
    }

    /**
     * Parses the given emoji data file, keeping only the entries of {@code plane}.
     * A code point listed under several properties ends up with all of the
     * corresponding mask bits OR-ed together.
     *
     * @param file  path to the emoji data file
     * @param plane plane number to retain
     * @throws IOException if the file cannot be read
     * @throws InternalError if a retained line has no property field or names
     *         an unknown property
     */
    EmojiData(Path file, int plane) throws IOException {
        emojiProps = Files.readAllLines(file).stream()
            .map(line -> line.split("#", 2)[0])             // strip trailing comment
            .filter(Predicate.not(String::isBlank))
            .map(line -> line.split("[ \t]*;[ \t]*", 2))    // [range, property]
            .flatMap(fields -> entries(fields, plane))
            .collect(Collectors.toUnmodifiableMap(Map.Entry::getKey,
                                                  Map.Entry::getValue,
                                                  (v1, v2) -> v1 | v2));
    }

    /**
     * Expands one split data line into (code point, property mask) entries.
     * Returns an empty stream if the line starts outside {@code plane}; the
     * plane test looks at the first code point of a range only.
     */
    private static Stream<Map.Entry<Integer, Long>> entries(String[] fields, int plane) {
        String[] range = fields[0].split("\\.\\.", 2);
        int start = Integer.parseInt(range[0], 16);
        if ((start >> 16) != plane) {
            return Stream.empty();
        }
        if (fields.length < 2) {
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException
            throw new InternalError("Missing Emoji type for: " + fields[0]);
        }
        int end = (range.length == 1) ? start : Integer.parseInt(range[1], 16);
        // The property is the same for the whole range; convert it only once.
        long props = convertType(fields[1].trim());
        return IntStream.rangeClosed(start, end)
            .mapToObj(cp -> new AbstractMap.SimpleEntry<Integer, Long>(cp, props));
    }

    /**
     * Returns the OR-ed emoji property mask bits of the given code point.
     *
     * @param cp code point to look up
     * @return the property bits, or {@code 0} if the code point has no
     *         emoji property recorded in this data
     */
    long properties(int cp) {
        // getOrDefault avoids a NullPointerException from unboxing a missing entry
        return emojiProps.getOrDefault(cp, 0L);
    }

    /**
     * Returns the code points that have at least one emoji property recorded.
     *
     * @return an unmodifiable set of code points
     */
    Set<Integer> codepoints() {
        return emojiProps.keySet();
    }

    /**
     * Maps an emoji property name, as spelled in the data file, to the
     * corresponding {@code GenerateCharacter} mask constant.
     *
     * @throws InternalError if the name is not one of the known properties
     */
    private static long convertType(String type) {
        return switch (type) {
            case "Emoji" -> GenerateCharacter.maskEmoji;
            case "Emoji_Presentation" -> GenerateCharacter.maskEmojiPresentation;
            case "Emoji_Modifier" -> GenerateCharacter.maskEmojiModifier;
            case "Emoji_Modifier_Base" -> GenerateCharacter.maskEmojiModifierBase;
            case "Emoji_Component" -> GenerateCharacter.maskEmojiComponent;
            case "Extended_Pictographic" -> GenerateCharacter.maskExtendedPictographic;
            default -> throw new InternalError("Unrecognizable Emoji type: " + type);
        };
    }
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2002, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -33,6 +33,7 @@ import java.io.PrintWriter;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.File;
import java.nio.file.Paths;
import java.util.List;
import build.tools.generatecharacter.CharacterName;
@ -74,6 +75,7 @@ public class GenerateCharacter {
static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
static String DefaultPropListFileName = ROOT + "PropList.txt";
static String DefaultDerivedPropsFileName = ROOT + "DerivedCoreProperties.txt";
static String DefaultEmojiDataFileName = ROOT + "emoji-data.txt";
static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
static String DefaultJavaOutputFileName = ROOT + "Character.java";
static String DefaultCTemplateFileName = ROOT + "Character.c.template";
@ -105,7 +107,7 @@ public class GenerateCharacter {
entries are short rather than byte).
*/
/* The character properties are currently encoded into A (32 bits) and B (8 bits)
/* The character properties are currently encoded into A (32 bits) and B (16 bits)
two parts.
A: the low 32 bits are defined in the following manner:
@ -160,6 +162,13 @@ public class GenerateCharacter {
1 bit Ideographic property
1 bit ID_Start property
1 bit ID_Continue property
6 bits for Emoji properties :-
1 bit for Emoji
1 bit for Emoji_Presentation
1 bit for Emoji_Modifier
1 bit for Emoji_Modifier_Base
1 bit for Emoji_Component
1 bit for Extended_Pictographic
*/
@ -188,15 +197,21 @@ public class GenerateCharacter {
// maskMirrored needs to be long, if up 16-bit
private static final long maskMirrored = 0x80000000L;
// bit masks identify the 8-bit property field described above, in B
// bit masks identify the 16-bit property field described above, in B
// table
private static final long
maskOtherLowercase = 0x0100000000L,
maskOtherUppercase = 0x0200000000L,
maskOtherAlphabetic = 0x0400000000L,
maskIdeographic = 0x0800000000L,
maskIDStart = 0x1000000000L,
maskIDContinue = 0x2000000000L;
static final long
maskOtherLowercase = 1L << 32,
maskOtherUppercase = 1L << 33,
maskOtherAlphabetic = 1L << 34,
maskIdeographic = 1L << 35,
maskIDStart = 1L << 36,
maskIDContinue = 1L << 37,
maskEmoji = 1L << 38,
maskEmojiPresentation = 1L << 39,
maskEmojiModifier = 1L << 40,
maskEmojiModifierBase = 1L << 41,
maskEmojiComponent = 1L << 42,
maskExtendedPictographic = 1L << 43;
// Can compare masked values with these to determine
// numeric or lexical types.
@ -304,7 +319,7 @@ public class GenerateCharacter {
* @see GenerateCharacter#buildOne
*/
static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)
static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList, EmojiData emojiData)
{
long[] result = new long[bLatin1 ? 256 : 1 << 16];
int k = 0;
@ -361,6 +376,9 @@ public class GenerateCharacter {
addExProp(result, propList, "ID_Start", maskIDStart);
addExProp(result, propList, "ID_Continue", maskIDContinue);
// add Emoji properties to the upper 16 bits
addEmojiProps(result, emojiData);
return result;
}
@ -583,6 +601,14 @@ public class GenerateCharacter {
}
}
/**
 * Merges the emoji property bits held by {@code emojiData} into {@code map}.
 * Each code point is reduced to its low 16 bits to obtain the slot in
 * {@code map}; code points whose slot lies beyond the end of the array are
 * ignored.
 *
 * @param map       property map to update in place
 * @param emojiData source of the emoji property bits
 */
static void addEmojiProps(long[] map, EmojiData emojiData) {
    final int limit = map.length;
    for (int codePoint : emojiData.codepoints()) {
        int slot = codePoint & 0xFFFF;
        if (slot >= limit) {
            // no entry for this code point in the (possibly shorter) map
            continue;
        }
        long emojiBits = emojiData.properties(codePoint);
        map[slot] |= emojiBits;
    }
}
/**
* This is the heart of the table compression strategy. The inputs are a map
* and a number of bits (size). The map is simply an array of long integer values;
@ -776,6 +802,12 @@ OUTER: for (int i = 0; i < n; i += m) {
if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
if (x.equals("maskIDStart")) return "0x" + hex4(maskIDStart >> 32);
if (x.equals("maskIDContinue")) return "0x" + hex4(maskIDContinue >> 32);
if (x.equals("maskEmoji")) return "0x" + hex4(maskEmoji >> 32);
if (x.equals("maskEmojiPresentation")) return "0x" + hex4(maskEmojiPresentation >> 32);
if (x.equals("maskEmojiModifier")) return "0x" + hex4(maskEmojiModifier >> 32);
if (x.equals("maskEmojiModifierBase")) return "0x" + hex4(maskEmojiModifierBase >> 32);
if (x.equals("maskEmojiComponent")) return "0x" + hex4(maskEmojiComponent >> 32);
if (x.equals("maskExtendedPictographic")) return "0x" + hex4(maskExtendedPictographic >> 32);
if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
@ -952,7 +984,7 @@ OUTER: for (int i = 0; i < n; i += m) {
// If we ever need more than 32 bits to represent the character properties,
// then a table "B" may be needed as well.
genTable(result, "B", tables[n - 1], 32, 8, sizes[n - 1], false, 0, true, true, false);
genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
result.append(commentStart);
@ -1434,6 +1466,42 @@ OUTER: for (int i = 0; i < n; i += m) {
result.append(", supradecimal ");
result.append((val & maskDigitOffset) >> shiftDigitOffset);
}
if ((val & maskOtherLowercase) == maskOtherLowercase) {
result.append(", otherLowercase");
}
if ((val & maskOtherUppercase) == maskOtherUppercase) {
result.append(", otherUppercase");
}
if ((val & maskOtherAlphabetic) == maskOtherAlphabetic) {
result.append(", otherAlphabetic");
}
if ((val & maskIdeographic) == maskIdeographic) {
result.append(", ideographic");
}
if ((val & maskIDStart) == maskIDStart) {
result.append(", IDStart");
}
if ((val & maskIDContinue) == maskIDContinue) {
result.append(", IDContinue");
}
if ((val & maskEmoji) == maskEmoji) {
result.append(", emoji");
}
if ((val & maskEmojiPresentation) == maskEmojiPresentation) {
result.append(", emojiPresentation");
}
if ((val & maskEmojiModifier) == maskEmojiModifier) {
result.append(", emojiModifier");
}
if ((val & maskEmojiModifierBase) == maskEmojiModifierBase) {
result.append(", emojiModifierBase");
}
if ((val & maskEmojiComponent) == maskEmojiComponent) {
result.append(", emojiComponent");
}
if ((val & maskExtendedPictographic) == maskExtendedPictographic) {
result.append(", extendedPictographic");
}
}
static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };
@ -1512,6 +1580,7 @@ OUTER: for (int i = 0; i < n; i += m) {
static String SpecialCasingFileName = null;
static String PropListFileName = null;
static String DerivedPropsFileName = null;
static String EmojiDataFileName = null;
static boolean useCharForByte = false;
static int[] sizes;
static int bins = 0; // liu; if > 0, then perform search
@ -1649,6 +1718,14 @@ OUTER: for (int i = 0; i < n; i += m) {
DerivedPropsFileName = args[++j];
}
}
else if (args[j].equals("-emojidata")) {
if (j == args.length -1) {
FAIL("File name missing after -emojidata");
}
else {
EmojiDataFileName = args[++j];
}
}
else if (args[j].equals("-plane")) {
if (j == args.length -1) {
FAIL("Plane number missing after -plane");
@ -1717,6 +1794,10 @@ OUTER: for (int i = 0; i < n; i += m) {
DerivedPropsFileName = DefaultDerivedPropsFileName;
desc.append(" [-derivedprops " + DerivedPropsFileName + ']');
}
if (EmojiDataFileName == null) {
EmojiDataFileName = DefaultEmojiDataFileName;
desc.append(" [-emojidata " + EmojiDataFileName + ']');
}
if (TemplateFileName == null) {
TemplateFileName = (Csyntax ? DefaultCTemplateFileName
: DefaultJavaTemplateFileName);
@ -1871,11 +1952,12 @@ OUTER: for (int i = 0; i < n; i += m) {
specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
propList.putAll(PropList.readSpecFile(new File(DerivedPropsFileName), plane));
EmojiData emojiData = EmojiData.readSpecFile(Paths.get(EmojiDataFileName), plane);
if (verbose) {
System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
}
long[] map = buildMap(data, specialCaseMaps, propList);
long[] map = buildMap(data, specialCaseMaps, propList, emojiData);
if (verbose) {
System.err.println("Completed building of initial map");
}

View File

@ -1,155 +0,0 @@
/*
* Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package build.tools.generateemojidata;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Generate EmojiData.java
 *    args[0]: Full path string to the template file
 *    args[1]: Full path string to the directory that contains "emoji/emoji-data.txt"
 *    args[2]: Full path string to the generated .java file
 */
public class GenerateEmojiData {
    /**
     * Reads the Extended_Pictographic ranges from the emoji data file, turns
     * them into Java boolean expressions on {@code cp}, and writes the
     * template to the output file with the {@code %%%EXTPICT_LOW%%%} and
     * {@code %%%EXTPICT_HIGH%%%} lines replaced by those expressions.
     * An {@link IOException} is reported on standard error.
     *
     * @param args see the class description
     */
    public static void main(String[] args) {
        try {
            List<Range> extPictRanges;
            // Files.lines keeps the file open until the stream is closed
            try (Stream<String> lines =
                     Files.lines(Paths.get(args[1], "emoji", "emoji-data.txt"))) {
                extPictRanges = lines
                    .filter(Predicate.not(l -> l.startsWith("#") || l.isBlank()))
                    .filter(l -> l.contains("; Extended_Pictograph"))
                    .map(l -> new Range(l.replaceFirst(" .*", "")))
                    .sorted()
                    .collect(ArrayList<Range>::new,
                             GenerateEmojiData::addCollapsing,
                             ArrayList<Range>::addAll);
            }

            // make the code point conditions
            // only very few codepoints below 0x2000 are "emojis", so separate them
            // out to generate a fast-path check that can be efficiently inlined
            String lowExtPictCodePoints = extPictRanges.stream()
                .takeWhile(r -> r.last < 0x2000)
                .map(GenerateEmojiData::rangeToString)
                .collect(Collectors.joining(" ||\n", "", ";\n"));

            String highExtPictCodePoints = extPictRanges.stream()
                .dropWhile(r -> r.last < 0x2000)
                .map(GenerateEmojiData::rangeToString)
                .collect(Collectors.joining(" ||\n", "", ";\n"));

            // Generate EmojiData.java file
            List<String> generated;
            try (Stream<String> template = Files.lines(Paths.get(args[0]))) {
                generated = template
                    .map(l -> {
                        if (l.equals("%%%EXTPICT_LOW%%%")) {
                            return lowExtPictCodePoints;
                        } else if (l.equals("%%%EXTPICT_HIGH%%%")) {
                            return highExtPictCodePoints;
                        } else {
                            return l;
                        }
                    })
                    .collect(Collectors.toList());
            }
            Files.write(Paths.get(args[2]), generated,
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Appends {@code r} to {@code list}, collapsing it into the last element
     * when it starts right after that element ends.
     */
    private static void addCollapsing(List<Range> list, Range r) {
        int lastIndex = list.size() - 1;
        if (lastIndex >= 0) {
            Range lastRange = list.get(lastIndex);
            if (lastRange.last + 1 == r.start) {
                list.set(lastIndex, new Range(lastRange.start, r.last));
                return;
            }
        }
        list.add(r);
    }

    /**
     * Renders a range as an indented Java condition on {@code cp}: a single
     * equality test, two equality tests joined by {@code ||} for a range of
     * two code points, or a parenthesized bounds check otherwise.
     */
    static String rangeToString(Range r) {
        if (r.start == r.last) {
            return (" ".repeat(16) + "cp == 0x" + toHexString(r.start));
        } else if (r.start == r.last - 1) {
            return " ".repeat(16) + "cp == 0x" + toHexString(r.start) + " ||\n" +
                   " ".repeat(16) + "cp == 0x" + toHexString(r.last);
        } else {
            return " ".repeat(15) + "(cp >= 0x" + toHexString(r.start) +
                   " && cp <= 0x" + toHexString(r.last) + ")";
        }
    }

    /** Parses an unsigned hexadecimal string into an int. */
    static int toInt(String hexStr) {
        return Integer.parseUnsignedInt(hexStr, 16);
    }

    /** Formats a code point as upper-case hex, zero-padded to at least 4 digits. */
    static String toHexString(int cp) {
        String ret = Integer.toUnsignedString(cp, 16).toUpperCase();
        if (ret.length() < 4) {
            ret = "0".repeat(4 - ret.length()) + ret;
        }
        return ret;
    }

    /**
     * An inclusive range of code points, ordered by its start.
     */
    static class Range implements Comparable<Range> {
        final int start;
        final int last;

        Range (int start, int last) {
            this.start = start;
            this.last = last;
        }

        /**
         * Parses a range from the leading field of a data line, either a
         * single hex code point ("00A9") or "start..last" in hex.
         */
        Range (String input) {
            input = input.replaceFirst("\\s#.*", "");
            start = toInt(input.replaceFirst("[\\s\\.].*", ""));
            last = input.contains("..") ?
                toInt(input.replaceFirst(".*\\.\\.", "")
                           .replaceFirst(";.*", "").trim())
                : start;
        }

        @Override
        public String toString() {
            return "Start: " + toHexString(start) + ", Last: " + toHexString(last);
        }

        @Override
        public int compareTo(Range other) {
            return Integer.compare(start, other.start);
        }
    }
}

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
@ -34,7 +34,6 @@ include gensrc/GensrcBuffer.gmk
include gensrc/GensrcExceptions.gmk
include gensrc/GensrcVarHandles.gmk
include gensrc/GensrcModuleLoaderMap.gmk
include gensrc/GensrcEmojiData.gmk
include gensrc/GensrcScopedMemoryAccess.gmk
# GensrcLocaleData.gmk does not set TARGETS, so we must choose which targets

View File

@ -49,6 +49,7 @@ define SetupCharacterData
-specialcasing $(UNICODEDATA)/SpecialCasing.txt \
-proplist $(UNICODEDATA)/PropList.txt \
-derivedprops $(UNICODEDATA)/DerivedCoreProperties.txt \
-emojidata $(UNICODEDATA)/emoji/emoji-data.txt \
-o $(SUPPORT_OUTPUTDIR)/gensrc/java.base/java/lang/$1.java \
-usecharforbyte $3

View File

@ -1,43 +0,0 @@
#
# Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation. Oracle designates this
# particular file as subject to the "Classpath" exception as provided
# by Oracle in the LICENSE file that accompanied this code.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
#
# Rules to create $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/EmojiData.java
#
GENSRC_EMOJIDATA := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/EmojiData.java
EMOJIDATATEMP = $(MODULE_SRC)/share/classes/jdk/internal/util/regex/EmojiData.java.template
UNICODEDATA = $(MODULE_SRC)/share/data/unicodedata
$(GENSRC_EMOJIDATA): $(BUILD_TOOLS_JDK) $(EMOJIDATATEMP) $(UNICODEDATA)/emoji/emoji-data.txt
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATEEMOJIDATA) \
$(EMOJIDATATEMP) \
$(UNICODEDATA) \
$(GENSRC_EMOJIDATA)
TARGETS += $(GENSRC_EMOJIDATA)

View File

@ -10781,6 +10781,113 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
return CharacterData.of(codePoint).isIdentifierIgnorable(codePoint);
}
/**
* Determines if the specified character (Unicode code point) is an Emoji.
* <p>
* A character is considered to be an Emoji if and only if it has the {@code Emoji}
* property, defined in
* <a href="https://unicode.org/reports/tr51/#Emoji_Properties_and_Data_Files">
* Unicode Emoji (Technical Standard #51)</a>.
*
* @param codePoint the character (Unicode code point) to be tested.
* @return {@code true} if the character is an Emoji;
* {@code false} otherwise.
* @since 21
*/
public static boolean isEmoji(int codePoint) {
return CharacterData.of(codePoint).isEmoji(codePoint);
}
/**
* Determines if the specified character (Unicode code point) has the
* Emoji Presentation property by default.
* <p>
* A character is considered to have the Emoji Presentation property if and
* only if it has the {@code Emoji_Presentation} property, defined in
* <a href="https://unicode.org/reports/tr51/#Emoji_Properties_and_Data_Files">
* Unicode Emoji (Technical Standard #51)</a>.
*
* @param codePoint the character (Unicode code point) to be tested.
* @return {@code true} if the character has the Emoji Presentation
* property; {@code false} otherwise.
* @since 21
*/
public static boolean isEmojiPresentation(int codePoint) {
return CharacterData.of(codePoint).isEmojiPresentation(codePoint);
}
/**
* Determines if the specified character (Unicode code point) is an
* Emoji Modifier.
* <p>
* A character is considered to be an Emoji Modifier if and only if it has
* the {@code Emoji_Modifier} property, defined in
* <a href="https://unicode.org/reports/tr51/#Emoji_Properties_and_Data_Files">
* Unicode Emoji (Technical Standard #51)</a>.
*
* @param codePoint the character (Unicode code point) to be tested.
* @return {@code true} if the character is an Emoji Modifier;
* {@code false} otherwise.
* @since 21
*/
public static boolean isEmojiModifier(int codePoint) {
return CharacterData.of(codePoint).isEmojiModifier(codePoint);
}
/**
* Determines if the specified character (Unicode code point) is an
* Emoji Modifier Base.
* <p>
* A character is considered to be an Emoji Modifier Base if and only if it has
* the {@code Emoji_Modifier_Base} property, defined in
* <a href="https://unicode.org/reports/tr51/#Emoji_Properties_and_Data_Files">
* Unicode Emoji (Technical Standard #51)</a>.
*
* @param codePoint the character (Unicode code point) to be tested.
* @return {@code true} if the character is an Emoji Modifier Base;
* {@code false} otherwise.
* @since 21
*/
public static boolean isEmojiModifierBase(int codePoint) {
return CharacterData.of(codePoint).isEmojiModifierBase(codePoint);
}
/**
* Determines if the specified character (Unicode code point) is an
* Emoji Component.
* <p>
* A character is considered to be an Emoji Component if and only if it has
* the {@code Emoji_Component} property, defined in
* <a href="https://unicode.org/reports/tr51/#Emoji_Properties_and_Data_Files">
* Unicode Emoji (Technical Standard #51)</a>.
*
* @param codePoint the character (Unicode code point) to be tested.
* @return {@code true} if the character is an Emoji Component;
* {@code false} otherwise.
* @since 21
*/
public static boolean isEmojiComponent(int codePoint) {
return CharacterData.of(codePoint).isEmojiComponent(codePoint);
}
/**
* Determines if the specified character (Unicode code point) is
* an Extended Pictographic.
* <p>
* A character is considered to be an Extended Pictographic if and only if it has
* the {@code Extended_Pictographic} property, defined in
* <a href="https://unicode.org/reports/tr51/#Emoji_Properties_and_Data_Files">
* Unicode Emoji (Technical Standard #51)</a>.
*
* @param codePoint the character (Unicode code point) to be tested.
* @return {@code true} if the character is an Extended Pictographic;
* {@code false} otherwise.
* @since 21
*/
public static boolean isExtendedPictographic(int codePoint) {
return CharacterData.of(codePoint).isExtendedPictographic(codePoint);
}
/**
* Converts the character argument to lowercase using case
* mapping information from the UnicodeData file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2006, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2006, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -38,6 +38,12 @@ abstract class CharacterData {
abstract boolean isUnicodeIdentifierStart(int ch);
abstract boolean isUnicodeIdentifierPart(int ch);
abstract boolean isIdentifierIgnorable(int ch);
abstract boolean isEmoji(int ch);
abstract boolean isEmojiPresentation(int ch);
abstract boolean isEmojiModifier(int ch);
abstract boolean isEmojiModifierBase(int ch);
abstract boolean isEmojiComponent(int ch);
abstract boolean isExtendedPictographic(int ch);
abstract int toLowerCase(int ch);
abstract int toUpperCase(int ch);
abstract int toTitleCase(int ch);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -120,6 +120,30 @@ class CharacterData00 extends CharacterData {
return ((props & $$maskIdentifierInfo) == $$valueIgnorable);
}
boolean isEmoji(int ch) {
return (getPropertiesEx(ch) & $$maskEmoji) != 0;
}
boolean isEmojiPresentation(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiPresentation) != 0;
}
boolean isEmojiModifier(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifier) != 0;
}
boolean isEmojiModifierBase(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifierBase) != 0;
}
boolean isEmojiComponent(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiComponent) != 0;
}
boolean isExtendedPictographic(int ch) {
return (getPropertiesEx(ch) & $$maskExtendedPictographic) != 0;
}
int toLowerCase(int ch) {
int mapChar = ch;
int val = getProperties(ch);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -117,6 +117,30 @@ class CharacterData01 extends CharacterData {
return ((props & $$maskIdentifierInfo) == $$valueIgnorable);
}
boolean isEmoji(int ch) {
return (getPropertiesEx(ch) & $$maskEmoji) != 0;
}
boolean isEmojiPresentation(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiPresentation) != 0;
}
boolean isEmojiModifier(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifier) != 0;
}
boolean isEmojiModifierBase(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifierBase) != 0;
}
boolean isEmojiComponent(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiComponent) != 0;
}
boolean isExtendedPictographic(int ch) {
return (getPropertiesEx(ch) & $$maskExtendedPictographic) != 0;
}
int toLowerCase(int ch) {
int mapChar = ch;
int val = getProperties(ch);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -116,6 +116,30 @@ class CharacterData02 extends CharacterData {
return ((props & $$maskIdentifierInfo) == $$valueIgnorable);
}
boolean isEmoji(int ch) {
return (getPropertiesEx(ch) & $$maskEmoji) != 0;
}
boolean isEmojiPresentation(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiPresentation) != 0;
}
boolean isEmojiModifier(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifier) != 0;
}
boolean isEmojiModifierBase(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifierBase) != 0;
}
boolean isEmojiComponent(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiComponent) != 0;
}
boolean isExtendedPictographic(int ch) {
return (getPropertiesEx(ch) & $$maskExtendedPictographic) != 0;
}
int toLowerCase(int ch) {
int mapChar = ch;
int val = getProperties(ch);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -116,6 +116,30 @@ class CharacterData03 extends CharacterData {
return ((props & $$maskIdentifierInfo) == $$valueIgnorable);
}
boolean isEmoji(int ch) {
return (getPropertiesEx(ch) & $$maskEmoji) != 0;
}
boolean isEmojiPresentation(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiPresentation) != 0;
}
boolean isEmojiModifier(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifier) != 0;
}
boolean isEmojiModifierBase(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifierBase) != 0;
}
boolean isEmojiComponent(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiComponent) != 0;
}
boolean isExtendedPictographic(int ch) {
return (getPropertiesEx(ch) & $$maskExtendedPictographic) != 0;
}
int toLowerCase(int ch) {
int mapChar = ch;
int val = getProperties(ch);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -116,6 +116,30 @@ class CharacterData0E extends CharacterData {
return ((props & $$maskIdentifierInfo) == $$valueIgnorable);
}
boolean isEmoji(int ch) {
return (getPropertiesEx(ch) & $$maskEmoji) != 0;
}
boolean isEmojiPresentation(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiPresentation) != 0;
}
boolean isEmojiModifier(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifier) != 0;
}
boolean isEmojiModifierBase(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifierBase) != 0;
}
boolean isEmojiComponent(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiComponent) != 0;
}
boolean isExtendedPictographic(int ch) {
return (getPropertiesEx(ch) & $$maskExtendedPictographic) != 0;
}
int toLowerCase(int ch) {
int mapChar = ch;
int val = getProperties(ch);

View File

@ -134,6 +134,30 @@ class CharacterDataLatin1 extends CharacterData {
return ((props & $$maskIdentifierInfo) == $$valueIgnorable);
}
boolean isEmoji(int ch) {
return (getPropertiesEx(ch) & $$maskEmoji) != 0;
}
boolean isEmojiPresentation(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiPresentation) != 0;
}
boolean isEmojiModifier(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifier) != 0;
}
boolean isEmojiModifierBase(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiModifierBase) != 0;
}
boolean isEmojiComponent(int ch) {
return (getPropertiesEx(ch) & $$maskEmojiComponent) != 0;
}
boolean isExtendedPictographic(int ch) {
return (getPropertiesEx(ch) & $$maskExtendedPictographic) != 0;
}
int toLowerCase(int ch) {
if (ch < 'A') { // Fast path for low code points
return ch;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -60,6 +60,30 @@ class CharacterDataPrivateUse extends CharacterData {
return false;
}
boolean isEmoji(int ch) {
return false;
}
boolean isEmojiPresentation(int ch) {
return false;
}
boolean isEmojiModifier(int ch) {
return false;
}
boolean isEmojiModifierBase(int ch) {
return false;
}
boolean isEmojiComponent(int ch) {
return false;
}
boolean isExtendedPictographic(int ch) {
return false;
}
int toLowerCase(int ch) {
return ch;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -58,6 +58,30 @@ class CharacterDataUndefined extends CharacterData {
return false;
}
// Always returns false; the argument is not inspected.
boolean isEmoji(int ch) {
return false;
}
// Always returns false; the argument is not inspected.
boolean isEmojiPresentation(int ch) {
return false;
}
// Always returns false; the argument is not inspected.
boolean isEmojiModifier(int ch) {
return false;
}
// Always returns false; the argument is not inspected.
boolean isEmojiModifierBase(int ch) {
return false;
}
// Always returns false; the argument is not inspected.
boolean isEmojiComponent(int ch) {
return false;
}
// Always returns false; the argument is not inspected.
boolean isExtendedPictographic(int ch) {
return false;
}
// Returns the argument unchanged; no case mapping is applied here.
int toLowerCase(int ch) {
return ch;
}

View File

@ -1,55 +0,0 @@
/*
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package jdk.internal.util.regex;
/**
* Holds data contained in the Unicode Technical Standard #51: Unicode
* Emoji.
*
* Currently it is only used for the rule "GB11" in UAX #29 Unicode Text
* Segmentation.
*/
final class EmojiData {
/**
* Returns whether the code point is an extended pictographic or not.
*
* @param cp code point to examine
* @return true if {@code cp} is an extended pictographic
*/
static boolean isExtendedPictographic(int cp) {
// Code points below 0x2000 are answered by the expression that follows
// "return" in the first branch; all other code points are delegated
// to isHigh().
// NOTE(review): %%%EXTPICT_LOW%%% is not Java; it looks like a template
// placeholder that is substituted with a boolean expression at build
// time -- confirm against the generator.
if (cp < 0x2000) {
return
%%%EXTPICT_LOW%%%
} else {
return isHigh(cp);
}
}
/**
* Answers the extended pictographic query for the code points that
* {@link #isExtendedPictographic(int)} forwards here, i.e. those that
* are not below 0x2000.
*
* NOTE(review): the body is the %%%EXTPICT_HIGH%%% placeholder, presumably
* replaced with a boolean expression at build time -- confirm.
*
* @param cp code point to examine
* @return the value of the substituted expression for {@code cp}
*/
private static boolean isHigh(int cp) {
return
%%%EXTPICT_HIGH%%%
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -176,7 +176,7 @@ public final class Grapheme {
return OTHER;
}
if (EmojiData.isExtendedPictographic(cp)) {
if (Character.isExtendedPictographic(cp)) {
return EXTENDED_PICTOGRAPHIC;
}

View File

@ -0,0 +1,138 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8303018
* @summary Check j.l.Character.isEmoji/isEmojiPresentation/isEmojiModifier
* isEmojiModifierBase/isEmojiComponent/isExtendedPictographic
* @library /lib/testlibrary/java/lang
*/
import java.io.IOException;
import java.nio.file.Files;
import java.util.AbstractMap;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import static java.lang.Character.MAX_CODE_POINT;
import static java.lang.Character.MIN_CODE_POINT;
import static java.lang.Character.isEmoji;
import static java.lang.Character.isEmojiPresentation;
import static java.lang.Character.isEmojiModifier;
import static java.lang.Character.isEmojiModifierBase;
import static java.lang.Character.isEmojiComponent;
import static java.lang.Character.isExtendedPictographic;
/**
 * Compares the six emoji predicates of {@code java.lang.Character} with the
 * properties listed in the emoji data file referenced by
 * {@code UCDFiles.EMOJI_DATA}, for every code point from
 * {@code MIN_CODE_POINT} to {@code MAX_CODE_POINT}.
 */
public class TestEmojiProperties {
    // Masks representing Emoji properties (16-bit `B` table masks in
    // CharacterData.java)
    private static final int EMOJI = 0x0040;
    private static final int EMOJI_PRESENTATION = 0x0080;
    private static final int EMOJI_MODIFIER = 0x0100;
    private static final int EMOJI_MODIFIER_BASE = 0x0200;
    private static final int EMOJI_COMPONENT = 0x0400;
    private static final int EXTENDED_PICTOGRAPHIC = 0x0800;

    /**
     * Runs the test: builds a code point to property-bits map from the data
     * file, then checks each predicate for each code point.
     *
     * @param args not used
     * @throws IOException if the data file cannot be read
     * @throws RuntimeException if at least one predicate disagrees with the
     *         data file; the message carries the number of mismatches
     */
    public static void main(String[] args) throws IOException {
        var emojiProps = Files.readAllLines(UCDFiles.EMOJI_DATA).stream()
            // drop the trailing comment, then skip lines with no data left
            .map(line -> line.split("#", 2)[0])
            .filter(Predicate.not(String::isBlank))
            // "<code point or range> ; <property name>"
            .map(line -> line.split("[ \t]*;[ \t]*", 2))
            .flatMap(map -> {
                var range = map[0].split("\\.\\.", 2);
                var start = Integer.valueOf(range[0], 16);
                // the property is the same for the whole range, so it is
                // converted once instead of once per code point
                var type = convertType(map[1].trim());
                return range.length == 1 ?
                    Stream.of(new AbstractMap.SimpleEntry<>(start, type)) :
                    IntStream.rangeClosed(start,
                            Integer.valueOf(range[1], 16))
                        .mapToObj(cp -> new AbstractMap.SimpleEntry<>(cp, type));
            })
            // a code point may appear under several properties: OR the bits
            .collect(Collectors.toMap(AbstractMap.SimpleEntry::getKey,
                AbstractMap.SimpleEntry::getValue, (v1, v2) -> v1 | v2));

        // Each mismatch contributes 1; the sequential stream keeps the
        // diagnostics in code point order.
        int fails = IntStream.rangeClosed(MIN_CODE_POINT, MAX_CODE_POINT)
            .map(cp -> {
                long props = emojiProps.getOrDefault(cp, 0L);
                return check("isEmoji", cp,
                        (props & EMOJI) != 0, isEmoji(cp))
                    + check("isEmojiPresentation", cp,
                        (props & EMOJI_PRESENTATION) != 0, isEmojiPresentation(cp))
                    + check("isEmojiModifier", cp,
                        (props & EMOJI_MODIFIER) != 0, isEmojiModifier(cp))
                    + check("isEmojiModifierBase", cp,
                        (props & EMOJI_MODIFIER_BASE) != 0, isEmojiModifierBase(cp))
                    + check("isEmojiComponent", cp,
                        (props & EMOJI_COMPONENT) != 0, isEmojiComponent(cp))
                    + check("isExtendedPictographic", cp,
                        (props & EXTENDED_PICTOGRAPHIC) != 0, isExtendedPictographic(cp));
            })
            .sum();

        if (fails != 0) {
            // report the count itself, not the identity string of a holder array
            throw new RuntimeException("TestEmojiProperties failed=" + fails);
        }
    }

    /**
     * Reports a single mismatch on {@code System.err}.
     *
     * @param method name of the {@code Character} predicate being checked
     * @param cp code point that was examined
     * @param expected value derived from the data file
     * @param actual value returned by the predicate
     * @return 1 if the values differ, 0 otherwise
     */
    private static int check(String method, int cp, boolean expected, boolean actual) {
        if (expected == actual) {
            return 0;
        }
        System.err.printf("%s(0x%x) failed. Returned: %b\n", method, cp, actual);
        return 1;
    }

    /**
     * Maps a property name from the data file to its mask.
     *
     * @param type property name, already trimmed
     * @return the mask constant for {@code type}
     * @throws InternalError if {@code type} is not one of the six known names
     */
    private static long convertType(String type) {
        return switch (type) {
            case "Emoji" -> EMOJI;
            case "Emoji_Presentation" -> EMOJI_PRESENTATION;
            case "Emoji_Modifier" -> EMOJI_MODIFIER;
            case "Emoji_Modifier_Base" -> EMOJI_MODIFIER_BASE;
            case "Emoji_Component" -> EMOJI_COMPONENT;
            case "Extended_Pictographic" -> EXTENDED_PICTOGRAPHIC;
            default -> throw new InternalError("Unknown emoji property: " + type);
        };
    }
}