/* * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ import java.io.BufferedReader; import java.io.FileReader; import java.io.FileNotFoundException; import java.io.IOException; import java.io.File; import java.util.regex.Pattern; import java.util.ArrayList; /** * The UnicodeSpec class provides a way to read in Unicode character * properties from a Unicode data file. One instance of class UnicodeSpec * holds a decoded version of one line of the data file. The file may * be obtained from www.unicode.org. The method readSpecFile returns an array * of UnicodeSpec objects. * * @author Guy Steele * @author John O'Conner */ public class UnicodeSpec { public UnicodeSpec() { this(0xffff); } public UnicodeSpec(int codePoint) { this.codePoint = codePoint; generalCategory = UNASSIGNED; bidiCategory = DIRECTIONALITY_UNDEFINED; mirrored = false; titleMap = 0xFFFF; upperMap = 0xFFFF; lowerMap = 0xFFFF; decimalValue = -1; digitValue = -1; numericValue = ""; oldName = null; comment = null; name = null; } public String toString() { StringBuffer result = new StringBuffer(hex6(codePoint)); if (getUpperMap() != 0xffff) { result.append(", upper=").append(hex6(upperMap)); } if (getLowerMap() != 0xffff) { result.append(", lower=").append(hex6(lowerMap)); } if (getTitleMap() != 0xffff) { result.append(", title=").append(hex6(titleMap)); } return result.toString(); } static String hex4(int n) { String q = Long.toHexString(n & 0xFFFF).toUpperCase(); return "0000".substring(Math.min(4, q.length())) + q; } static String hex6(int n) { String str = Integer.toHexString(n & 0xFFFFFF).toUpperCase(); return "000000".substring(Math.min(6, str.length())) + str; } /** * Given one line of a Unicode data file as a String, parse the line * and return a UnicodeSpec object that contains the same character information. * * @param s a line of the Unicode data file to be parsed * @return a UnicodeSpec object, or null if the parsing process failed for some reason */ public static UnicodeSpec parse(String s) { UnicodeSpec spec = null; String[] tokens = null; try { tokens = tokenSeparator.split(s, REQUIRED_FIELDS); spec = new UnicodeSpec(); spec.setCodePoint(parseCodePoint(tokens[FIELD_VALUE])); spec.setName(parseName(tokens[FIELD_NAME])); spec.setGeneralCategory(parseGeneralCategory(tokens[FIELD_CATEGORY])); spec.setBidiCategory(parseBidiCategory(tokens[FIELD_BIDI])); spec.setCombiningClass(parseCombiningClass(tokens[FIELD_CLASS])); spec.setDecomposition(parseDecomposition(tokens[FIELD_DECOMPOSITION])); spec.setDecimalValue(parseDecimalValue(tokens[FIELD_DECIMAL])); spec.setDigitValue(parseDigitValue(tokens[FIELD_DIGIT])); spec.setNumericValue(parseNumericValue(tokens[FIELD_NUMERIC])); spec.setMirrored(parseMirrored(tokens[FIELD_MIRRORED])); spec.setOldName(parseOldName(tokens[FIELD_OLDNAME])); spec.setComment(parseComment(tokens[FIELD_COMMENT])); spec.setUpperMap(parseUpperMap(tokens[FIELD_UPPERCASE])); spec.setLowerMap(parseLowerMap(tokens[FIELD_LOWERCASE])); spec.setTitleMap(parseTitleMap(tokens[FIELD_TITLECASE])); } catch(Exception e) { spec = null; System.out.println("Error parsing spec line."); } return spec; } /** * Parse the codePoint attribute for a Unicode character. If the parse succeeds, * the codePoint field of this UnicodeSpec object is updated and false is returned. * * The codePoint attribute should be a four-digit hexadecimal integer. * * @param s the codePoint attribute extracted from a line of the Unicode data file * @return code point if successful * @exception NumberFormatException if unable to parse argument */ public static int parseCodePoint(String s) throws NumberFormatException { return Integer.parseInt(s, 16); } public static String parseName(String s) throws Exception { if (s==null) throw new Exception("Cannot parse name."); return s; } public static byte parseGeneralCategory(String s) throws Exception { byte category = GENERAL_CATEGORY_COUNT; for (byte x=0; x= GENERAL_CATEGORY_COUNT) { throw new Exception("Could not parse general category."); } return category; } public static byte parseBidiCategory(String s) throws Exception { byte category = DIRECTIONALITY_CATEGORY_COUNT; for (byte x=0; x= DIRECTIONALITY_CATEGORY_COUNT) { throw new Exception("Could not parse bidi category."); } return category; } /** * Parse the combining attribute for a Unicode character. If there is a combining * attribute and the parse succeeds, then the hasCombining field is set to true, * the combining field of this UnicodeSpec object is updated, and false is returned. * If the combining attribute is an empty string, the parse succeeds but the * hasCombining field is set to false. (and false is returned). * * The combining attribute, if any, should be a nonnegative decimal integer. * * @param s the combining attribute extracted from a line of the Unicode data file * @return the combining class value if any, -1 if property not defined * @exception Exception if can't parse the combining class */ public static int parseCombiningClass(String s) throws Exception { int combining = -1; if (s.length()>0) { combining = Integer.parseInt(s, 10); } return combining; } /** * Parse the decomposition attribute for a Unicode character. If the parse succeeds, * the decomposition field of this UnicodeSpec object is updated and false is returned. * * The decomposition attribute is complicated; for now, it is treated as a string. * * @param s the decomposition attribute extracted from a line of the Unicode data file * @return true if the parse failed; otherwise false */ public static String parseDecomposition(String s) throws Exception { if (s==null) throw new Exception("Cannot parse decomposition."); return s; } /** * Parse the decimal value attribute for a Unicode character. If there is a decimal value * attribute and the parse succeeds, then the hasDecimalValue field is set to true, * the decimalValue field of this UnicodeSpec object is updated, and false is returned. * If the decimal value attribute is an empty string, the parse succeeds but the * hasDecimalValue field is set to false. (and false is returned). * * The decimal value attribute, if any, should be a nonnegative decimal integer. * * @param s the decimal value attribute extracted from a line of the Unicode data file * @return the decimal value as an int, -1 if no decimal value defined * @exception NumberFormatException if the parse fails */ public static int parseDecimalValue(String s) throws NumberFormatException { int value = -1; if (s.length() > 0) { value = Integer.parseInt(s, 10); } return value; } /** * Parse the digit value attribute for a Unicode character. If there is a digit value * attribute and the parse succeeds, then the hasDigitValue field is set to true, * the digitValue field of this UnicodeSpec object is updated, and false is returned. * If the digit value attribute is an empty string, the parse succeeds but the * hasDigitValue field is set to false. (and false is returned). * * The digit value attribute, if any, should be a nonnegative decimal integer. * * @param s the digit value attribute extracted from a line of the Unicode data file * @return the digit value as an non-negative int, or -1 if no digit property defined * @exception NumberFormatException if the parse fails */ public static int parseDigitValue(String s) throws NumberFormatException { int value = -1; if (s.length() > 0) { value = Integer.parseInt(s, 10); } return value; } public static String parseNumericValue(String s) throws Exception { if (s == null) throw new Exception("Cannot parse numeric value."); return s; } public static String parseComment(String s) throws Exception { if (s == null) throw new Exception("Cannot parse comment."); return s; } public static boolean parseMirrored(String s) throws Exception { boolean mirrored; if (s.length() == 1) { if (s.charAt(0) == 'Y') {mirrored = true;} else if (s.charAt(0) == 'N') {mirrored = false;} else {throw new Exception("Cannot parse mirrored property.");} } else { throw new Exception("Cannot parse mirrored property.");} return mirrored; } public static String parseOldName(String s) throws Exception { if (s == null) throw new Exception("Cannot parse old name"); return s; } /** * Parse the uppercase mapping attribute for a Unicode character. If there is a uppercase * mapping attribute and the parse succeeds, then the hasUpperMap field is set to true, * the upperMap field of this UnicodeSpec object is updated, and false is returned. * If the uppercase mapping attribute is an empty string, the parse succeeds but the * hasUpperMap field is set to false. (and false is returned). * * The uppercase mapping attribute should be a four-digit hexadecimal integer. * * @param s the uppercase mapping attribute extracted from a line of the Unicode data file * @return uppercase char if defined, \uffff otherwise * @exception NumberFormatException if parse fails */ public static int parseUpperMap(String s) throws NumberFormatException { int upperCase = 0xFFFF; if (s.length() >= 4) { upperCase = Integer.parseInt(s, 16); } else if (s.length() != 0) { throw new NumberFormatException(); } return upperCase; } /** * Parse the lowercase mapping attribute for a Unicode character. If there is a lowercase * mapping attribute and the parse succeeds, then the hasLowerMap field is set to true, * the lowerMap field of this UnicodeSpec object is updated, and false is returned. * If the lowercase mapping attribute is an empty string, the parse succeeds but the * hasLowerMap field is set to false. (and false is returned). * * The lowercase mapping attribute should be a four-digit hexadecimal integer. * * @param s the lowercase mapping attribute extracted from a line of the Unicode data file * @return lowercase char mapping if defined, \uFFFF otherwise * @exception NumberFormatException if parse fails */ public static int parseLowerMap(String s) throws NumberFormatException { int lowerCase = 0xFFFF; if (s.length() >= 4) { lowerCase = Integer.parseInt(s, 16); } else if (s.length() != 0) { throw new NumberFormatException(); } return lowerCase; } /** * Parse the titlecase mapping attribute for a Unicode character. If there is a titlecase * mapping attribute and the parse succeeds, then the hasTitleMap field is set to true, * the titleMap field of this UnicodeSpec object is updated, and false is returned. * If the titlecase mapping attribute is an empty string, the parse succeeds but the * hasTitleMap field is set to false. (and false is returned). * * The titlecase mapping attribute should be a four-digit hexadecimal integer. * * @param s the titlecase mapping attribute extracted from a line of the Unicode data file * @return title case char mapping if defined, \uFFFF otherwise * @exception NumberFormatException if parse fails */ public static int parseTitleMap(String s) throws NumberFormatException { int titleCase = 0xFFFF; if (s.length() >= 4) { titleCase = Integer.parseInt(s, 16); } else if (s.length() != 0) { throw new NumberFormatException(); } return titleCase; } /** * Read and parse a Unicode data file. * * @param file a file specifying the Unicode data file to be read * @return an array of UnicodeSpec objects, one for each line of the * Unicode data file that could be successfully parsed as * specifying Unicode character attributes */ public static UnicodeSpec[] readSpecFile(File file, int plane) throws FileNotFoundException { ArrayList list = new ArrayList<>(3000); UnicodeSpec[] result = null; int count = 0; BufferedReader f = new BufferedReader(new FileReader(file)); String line = null; loop: while(true) { try { line = f.readLine(); } catch (IOException e) { break loop; } if (line == null) break loop; UnicodeSpec item = parse(line.trim()); int specPlane = item.getCodePoint() >>> 16; if (specPlane < plane) continue; if (specPlane > plane) break; if (item != null) { list.add(item); } } result = new UnicodeSpec[list.size()]; list.toArray(result); return result; } void setCodePoint(int value) { codePoint = value; } /** * Return the code point in this Unicode specification * @return the char code point representing by the specification */ public int getCodePoint() { return codePoint; } void setName(String name) { this.name = name; } public String getName() { return name; } void setGeneralCategory(byte category) { generalCategory = category; } public byte getGeneralCategory() { return generalCategory; } void setBidiCategory(byte category) { bidiCategory = category; } public byte getBidiCategory() { return bidiCategory; } void setCombiningClass(int combiningClass) { this.combiningClass = combiningClass; } public int getCombiningClass() { return combiningClass; } void setDecomposition(String decomposition) { this.decomposition = decomposition; } public String getDecomposition() { return decomposition; } void setDecimalValue(int value) { decimalValue = value; } public int getDecimalValue() { return decimalValue; } public boolean isDecimalValue() { return decimalValue != -1; } void setDigitValue(int value) { digitValue = value; } public int getDigitValue() { return digitValue; } public boolean isDigitValue() { return digitValue != -1; } void setNumericValue(String value) { numericValue = value; } public String getNumericValue() { return numericValue; } public boolean isNumericValue() { return numericValue.length() > 0; } void setMirrored(boolean value) { mirrored = value; } public boolean isMirrored() { return mirrored; } void setOldName(String name) { oldName = name; } public String getOldName() { return oldName; } void setComment(String comment) { this.comment = comment; } public String getComment() { return comment; } void setUpperMap(int ch) { upperMap = ch; }; public int getUpperMap() { return upperMap; } public boolean hasUpperMap() { return upperMap != 0xffff; } void setLowerMap(int ch) { lowerMap = ch; } public int getLowerMap() { return lowerMap; } public boolean hasLowerMap() { return lowerMap != 0xffff; } void setTitleMap(int ch) { titleMap = ch; } public int getTitleMap() { return titleMap; } public boolean hasTitleMap() { return titleMap != 0xffff; } int codePoint; // the characters UTF-32 code value String name; // the ASCII name byte generalCategory; // general category, available via Characte.getType() byte bidiCategory; // available via Character.getBidiType() int combiningClass; // not used in Character String decomposition; // not used in Character int decimalValue; // decimal digit value int digitValue; // not all digits are decimal String numericValue; // numeric value if digit or non-digit boolean mirrored; // String oldName; String comment; int upperMap; int lowerMap; int titleMap; // this is the number of fields in one line of the UnicodeData.txt file // each field is separated by a semicolon (a token) static final int REQUIRED_FIELDS = 15; /** * General category types * To preserve compatibility, these values cannot be changed */ public static final byte UNASSIGNED = 0, // Cn normative UPPERCASE_LETTER = 1, // Lu normative LOWERCASE_LETTER = 2, // Ll normative TITLECASE_LETTER = 3, // Lt normative MODIFIER_LETTER = 4, // Lm normative OTHER_LETTER = 5, // Lo normative NON_SPACING_MARK = 6, // Mn informative ENCLOSING_MARK = 7, // Me informative COMBINING_SPACING_MARK = 8, // Mc normative DECIMAL_DIGIT_NUMBER = 9, // Nd normative LETTER_NUMBER = 10, // Nl normative OTHER_NUMBER = 11, // No normative SPACE_SEPARATOR = 12, // Zs normative LINE_SEPARATOR = 13, // Zl normative PARAGRAPH_SEPARATOR = 14, // Zp normative CONTROL = 15, // Cc normative FORMAT = 16, // Cf normative // 17 is unused for no apparent reason, // but must preserve forward compatibility PRIVATE_USE = 18, // Co normative SURROGATE = 19, // Cs normative DASH_PUNCTUATION = 20, // Pd informative START_PUNCTUATION = 21, // Ps informative END_PUNCTUATION = 22, // Pe informative CONNECTOR_PUNCTUATION = 23, // Pc informative OTHER_PUNCTUATION = 24, // Po informative MATH_SYMBOL = 25, // Sm informative CURRENCY_SYMBOL = 26, // Sc informative MODIFIER_SYMBOL = 27, // Sk informative OTHER_SYMBOL = 28, // So informative INITIAL_QUOTE_PUNCTUATION = 29, // Pi informative FINAL_QUOTE_PUNCTUATION = 30, // Pf informative // this value is only used in the character generation tool // it can change to accommodate the addition of new categories. GENERAL_CATEGORY_COUNT = 31; // sentinel value static final byte SHORT = 0, LONG = 1; // general category type strings // NOTE: The order of this category array is dependent on the assignment of // category constants above. We want to access this array using constants above. // [][SHORT] is the SHORT name, [][LONG] is the LONG name static final String[][] generalCategoryList = { {"Cn", "UNASSIGNED"}, {"Lu", "UPPERCASE_LETTER"}, {"Ll", "LOWERCASE_LETTER"}, {"Lt", "TITLECASE_LETTER"}, {"Lm", "MODIFIER_LETTER"}, {"Lo", "OTHER_LETTER"}, {"Mn", "NON_SPACING_MARK"}, {"Me", "ENCLOSING_MARK"}, {"Mc", "COMBINING_SPACING_MARK"}, {"Nd", "DECIMAL_DIGIT_NUMBER"}, {"Nl", "LETTER_NUMBER"}, {"No", "OTHER_NUMBER"}, {"Zs", "SPACE_SEPARATOR"}, {"Zl", "LINE_SEPARATOR"}, {"Zp", "PARAGRAPH_SEPARATOR"}, {"Cc", "CONTROL"}, {"Cf", "FORMAT"}, {"xx", "unused"}, {"Co", "PRIVATE_USE"}, {"Cs", "SURROGATE"}, {"Pd", "DASH_PUNCTUATION"}, {"Ps", "START_PUNCTUATION"}, {"Pe", "END_PUNCTUATION"}, {"Pc", "CONNECTOR_PUNCTUATION"}, {"Po", "OTHER_PUNCTUATION"}, {"Sm", "MATH_SYMBOL"}, {"Sc", "CURRENCY_SYMBOL"}, {"Sk", "MODIFIER_SYMBOL"}, {"So", "OTHER_SYMBOL"}, {"Pi", "INITIAL_QUOTE_PUNCTUATION"}, {"Pf", "FINAL_QUOTE_PUNCTUATION"} }; /** * Bidirectional categories */ public static final byte DIRECTIONALITY_UNDEFINED = -1, // Strong category DIRECTIONALITY_LEFT_TO_RIGHT = 0, // L DIRECTIONALITY_RIGHT_TO_LEFT = 1, // R DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2, // AL // Weak category DIRECTIONALITY_EUROPEAN_NUMBER = 3, // EN DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4, // ES DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5, // ET DIRECTIONALITY_ARABIC_NUMBER = 6, // AN DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7, // CS DIRECTIONALITY_NONSPACING_MARK = 8, // NSM DIRECTIONALITY_BOUNDARY_NEUTRAL = 9, // BN // Neutral category DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10, // B DIRECTIONALITY_SEGMENT_SEPARATOR = 11, // S DIRECTIONALITY_WHITESPACE = 12, // WS DIRECTIONALITY_OTHER_NEUTRALS = 13, // ON DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14, // LRE DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15, // LRO DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16, // RLE DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17, // RLO DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18, // PDF DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE = 19, // LRI DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE = 20, // RLI DIRECTIONALITY_FIRST_STRONG_ISOLATE = 21, // FSI DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE = 22, // PDI DIRECTIONALITY_CATEGORY_COUNT = 23; // sentinel value // If changes are made to the above bidi category assignments, this // list of bidi category names must be changed to keep their order in synch. // Access this list using the bidi category constants above. static final String[][] bidiCategoryList = { {"L", "DIRECTIONALITY_LEFT_TO_RIGHT"}, {"R", "DIRECTIONALITY_RIGHT_TO_LEFT"}, {"AL", "DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC"}, {"EN", "DIRECTIONALITY_EUROPEAN_NUMBER"}, {"ES", "DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR"}, {"ET", "DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR"}, {"AN", "DIRECTIONALITY_ARABIC_NUMBER"}, {"CS", "DIRECTIONALITY_COMMON_NUMBER_SEPARATOR"}, {"NSM", "DIRECTIONALITY_NONSPACING_MARK"}, {"BN", "DIRECTIONALITY_BOUNDARY_NEUTRAL"}, {"B", "DIRECTIONALITY_PARAGRAPH_SEPARATOR"}, {"S", "DIRECTIONALITY_SEGMENT_SEPARATOR"}, {"WS", "DIRECTIONALITY_WHITESPACE"}, {"ON", "DIRECTIONALITY_OTHER_NEUTRALS"}, {"LRE", "DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING"}, {"LRO", "DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE"}, {"RLE", "DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING"}, {"RLO", "DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE"}, {"PDF", "DIRECTIONALITY_POP_DIRECTIONAL_FORMAT"}, {"LRI", "DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE"}, {"RLI", "DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE"}, {"FSI", "DIRECTIONALITY_FIRST_STRONG_ISOLATE"}, {"PDI", "DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE"}, }; // Unicode specification lines have fields in this order. static final byte FIELD_VALUE = 0, FIELD_NAME = 1, FIELD_CATEGORY = 2, FIELD_CLASS = 3, FIELD_BIDI = 4, FIELD_DECOMPOSITION = 5, FIELD_DECIMAL = 6, FIELD_DIGIT = 7, FIELD_NUMERIC = 8, FIELD_MIRRORED = 9, FIELD_OLDNAME = 10, FIELD_COMMENT = 11, FIELD_UPPERCASE = 12, FIELD_LOWERCASE = 13, FIELD_TITLECASE = 14; static final Pattern tokenSeparator = Pattern.compile(";"); public static void main(String[] args) { UnicodeSpec[] spec = null; if (args.length == 2 ) { try { File file = new File(args[0]); int plane = Integer.parseInt(args[1]); spec = UnicodeSpec.readSpecFile(file, plane); System.out.println("UnicodeSpec[" + spec.length + "]:"); for (int x=0; x