8191410: Unicode 10

Upgrade to Unicode 10

Reviewed-by: naoto, rriggs, igerasim
This commit is contained in:
Rachna Goel 2018-05-08 11:49:42 +05:30
parent 2dd9adbf24
commit 78bd242097
40 changed files with 39358 additions and 1049 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -139,6 +139,12 @@ class CharacterData00 extends CharacterData {
case 0x0130: mapChar = 0x0069; break;
case 0x023A: mapChar = 0x2C65; break;
case 0x023E: mapChar = 0x2C66; break;
case 0x0412: mapChar = 0x1C80; break;
case 0x0414: mapChar = 0x1C81; break;
case 0x041E: mapChar = 0x1C82; break;
case 0x0421: mapChar = 0x1C83; break;
case 0x042A: mapChar = 0x1C86; break;
case 0x0462: mapChar = 0x1C87; break;
case 0x10A0: mapChar = 0x2D00; break;
case 0x10A1: mapChar = 0x2D01; break;
case 0x10A2: mapChar = 0x2D02; break;
@ -299,12 +305,14 @@ class CharacterData00 extends CharacterData {
case 0x2C70: mapChar = 0x0252; break;
case 0x2C7E: mapChar = 0x023F; break;
case 0x2C7F: mapChar = 0x0240; break;
case 0xA64A: mapChar = 0x1C88; break;
case 0xA77D: mapChar = 0x1D79; break;
case 0xA78D: mapChar = 0x0265; break;
case 0xA7AA: mapChar = 0x0266; break;
case 0xA7AB: mapChar = 0x025C; break;
case 0xA7AC: mapChar = 0x0261; break;
case 0xA7AD: mapChar = 0x026C; break;
case 0xA7AE: mapChar = 0x026A; break;
case 0xA7B0: mapChar = 0x029E; break;
case 0xA7B1: mapChar = 0x0287; break;
case 0xA7B2: mapChar = 0x029D; break;
@ -339,6 +347,7 @@ class CharacterData00 extends CharacterData {
case 0x0261: mapChar = 0xA7AC; break;
case 0x0265: mapChar = 0xA78D; break;
case 0x0266: mapChar = 0xA7AA; break;
case 0x026A: mapChar = 0xA7AE; break;
case 0x026B: mapChar = 0x2C62; break;
case 0x026C: mapChar = 0xA7AD; break;
case 0x0271: mapChar = 0x2C6E; break;
@ -346,6 +355,15 @@ class CharacterData00 extends CharacterData {
case 0x0287: mapChar = 0xA7B1; break;
case 0x029D: mapChar = 0xA7B2; break;
case 0x029E: mapChar = 0xA7B0; break;
case 0x1C80: mapChar = 0x0412; break;
case 0x1C81: mapChar = 0x0414; break;
case 0x1C82: mapChar = 0x041E; break;
case 0x1C83: mapChar = 0x0421; break;
case 0x1C84: mapChar = 0x0422; break;
case 0x1C85: mapChar = 0x0422; break;
case 0x1C86: mapChar = 0x042A; break;
case 0x1C87: mapChar = 0x0462; break;
case 0x1C88: mapChar = 0xA64A; break;
case 0x1D79: mapChar = 0xA77D; break;
case 0x1D7D: mapChar = 0x2C63; break;
case 0x1F80: mapChar = 0x1F88; break;
@ -715,6 +733,7 @@ class CharacterData00 extends CharacterData {
case 0x0261: mapChar = 0xA7AC; break;
case 0x0265: mapChar = 0xA78D; break;
case 0x0266: mapChar = 0xA7AA; break;
case 0x026A: mapChar = 0xA7AE; break;
case 0x026B: mapChar = 0x2C62; break;
case 0x026C: mapChar = 0xA7AD; break;
case 0x0271: mapChar = 0x2C6E; break;
@ -722,6 +741,15 @@ class CharacterData00 extends CharacterData {
case 0x0287: mapChar = 0xA7B1; break;
case 0x029D: mapChar = 0xA7B2; break;
case 0x029E: mapChar = 0xA7B0; break;
case 0x1C80: mapChar = 0x0412; break;
case 0x1C81: mapChar = 0x0414; break;
case 0x1C82: mapChar = 0x041E; break;
case 0x1C83: mapChar = 0x0421; break;
case 0x1C84: mapChar = 0x0422; break;
case 0x1C85: mapChar = 0x0422; break;
case 0x1C86: mapChar = 0x042A; break;
case 0x1C87: mapChar = 0x0462; break;
case 0x1C88: mapChar = 0xA64A; break;
case 0x1D79: mapChar = 0xA77D; break;
case 0x1D7D: mapChar = 0x2C63; break;
case 0x1FBE: mapChar = 0x0399; break;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -372,6 +372,13 @@ class CharacterData01 extends CharacterData {
case 0x11063: retval = 90; break; // BRAHMI NUMBER NINETY
case 0x11064: retval = 100; break; // BRAHMI NUMBER ONE HUNDRED
case 0x11065: retval = 1000; break; // BRAHMI NUMBER ONE THOUSAND
case 0x11C66: retval = 40; break; // BHAIKSUKI NUMBER FORTY
case 0x11C67: retval = 50; break; // BHAIKSUKI NUMBER FIFTY
case 0x11C68: retval = 60; break; // BHAIKSUKI NUMBER SIXTY
case 0x11C69: retval = 70; break; // BHAIKSUKI NUMBER SEVENTY
case 0x11C6A: retval = 80; break; // BHAIKSUKI NUMBER EIGHTY
case 0x11C6B: retval = 90; break; // BHAIKSUKI NUMBER NINETY
case 0x11C6C: retval = 100; break; // BHAIKSUKI HUNDREDS UNIT MARK
case 0x111ED: retval = 40; break; // SINHALA ARCHAIC NUMBER FORTY
case 0x111EE: retval = 50; break; // SINHALA ARCHAIC NUMBER FIFTY
case 0x111EF: retval = 60; break; // SINHALA ARCHAIC NUMBER SIXTY

View File

@ -1,10 +1,11 @@
# PropList-8.0.0.txt
# Date: 2015-05-16, 17:50:38 GMT [MD]
# PropList-10.0.0.txt
# Date: 2017-03-10, 08:25:30 GMT
# Copyright (c) 2017 Unicode, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# For documentation, see http://www.unicode.org/reports/tr44/
# ================================================
@ -192,10 +193,17 @@ FF64 ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA
111DE..111DF ; Terminal_Punctuation # Po [2] SHARADA SECTION MARK-1..SHARADA SECTION MARK-2
11238..1123C ; Terminal_Punctuation # Po [5] KHOJKI DANDA..KHOJKI DOUBLE SECTION MARK
112A9 ; Terminal_Punctuation # Po MULTANI SECTION MARK
1144B..1144D ; Terminal_Punctuation # Po [3] NEWA DANDA..NEWA COMMA
1145B ; Terminal_Punctuation # Po NEWA PLACEHOLDER MARK
115C2..115C5 ; Terminal_Punctuation # Po [4] SIDDHAM DANDA..SIDDHAM SEPARATOR BAR
115C9..115D7 ; Terminal_Punctuation # Po [15] SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
11641..11642 ; Terminal_Punctuation # Po [2] MODI DANDA..MODI DOUBLE DANDA
1173C..1173E ; Terminal_Punctuation # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
11A42..11A43 ; Terminal_Punctuation # Po [2] ZANABAZAR SQUARE MARK SHAD..ZANABAZAR SQUARE MARK DOUBLE SHAD
11A9B..11A9C ; Terminal_Punctuation # Po [2] SOYOMBO MARK SHAD..SOYOMBO MARK DOUBLE SHAD
11AA1..11AA2 ; Terminal_Punctuation # Po [2] SOYOMBO TERMINAL MARK-1..SOYOMBO TERMINAL MARK-2
11C41..11C43 ; Terminal_Punctuation # Po [3] BHAIKSUKI DANDA..BHAIKSUKI WORD SEPARATOR
11C71 ; Terminal_Punctuation # Po MARCHEN MARK SHAD
12470..12474 ; Terminal_Punctuation # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
16A6E..16A6F ; Terminal_Punctuation # Po [2] MRO DANDA..MRO DOUBLE DANDA
16AF5 ; Terminal_Punctuation # Po BASSA VAH FULL STOP
@ -204,7 +212,7 @@ FF64 ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA
1BC9F ; Terminal_Punctuation # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA87..1DA8A ; Terminal_Punctuation # Po [4] SIGNWRITING COMMA..SIGNWRITING COLON
# Total code points: 238
# Total code points: 252
# ================================================
@ -429,6 +437,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
081B..0823 ; Other_Alphabetic # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A
0825..0827 ; Other_Alphabetic # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082C ; Other_Alphabetic # Mn [4] SAMARITAN VOWEL SIGN LONG I..SAMARITAN VOWEL SIGN SUKUN
08D4..08DF ; Other_Alphabetic # Mn [12] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH WORD WAQFA
08E3..08E9 ; Other_Alphabetic # Mn [7] ARABIC TURNED DAMMA BELOW..ARABIC CURLY KASRATAN
08F0..0902 ; Other_Alphabetic # Mn [19] ARABIC OPEN FATHATAN..DEVANAGARI SIGN ANUSVARA
0903 ; Other_Alphabetic # Mc DEVANAGARI SIGN VISARGA
@ -465,6 +474,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
0AC9 ; Other_Alphabetic # Mc GUJARATI VOWEL SIGN CANDRA O
0ACB..0ACC ; Other_Alphabetic # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU
0AE2..0AE3 ; Other_Alphabetic # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL
0AFA..0AFC ; Other_Alphabetic # Mn [3] GUJARATI SIGN SUKUN..GUJARATI SIGN MADDAH
0B01 ; Other_Alphabetic # Mn ORIYA SIGN CANDRABINDU
0B02..0B03 ; Other_Alphabetic # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA
0B3E ; Other_Alphabetic # Mc ORIYA VOWEL SIGN AA
@ -502,7 +512,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
0CCC ; Other_Alphabetic # Mn KANNADA VOWEL SIGN AU
0CD5..0CD6 ; Other_Alphabetic # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
0CE2..0CE3 ; Other_Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
0D01 ; Other_Alphabetic # Mn MALAYALAM SIGN CANDRABINDU
0D00..0D01 ; Other_Alphabetic # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Other_Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D3E..0D40 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
0D41..0D44 ; Other_Alphabetic # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
@ -556,6 +566,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
17BE..17C5 ; Other_Alphabetic # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
17C6 ; Other_Alphabetic # Mn KHMER SIGN NIKAHIT
17C7..17C8 ; Other_Alphabetic # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
1885..1886 ; Other_Alphabetic # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
18A9 ; Other_Alphabetic # Mn MONGOLIAN LETTER ALI GALI DAGALGA
1920..1922 ; Other_Alphabetic # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
1923..1926 ; Other_Alphabetic # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU
@ -613,6 +624,7 @@ A825..A826 ; Other_Alphabetic # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NA
A827 ; Other_Alphabetic # Mc SYLOTI NAGRI VOWEL SIGN OO
A880..A881 ; Other_Alphabetic # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA
A8B4..A8C3 ; Other_Alphabetic # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU
A8C5 ; Other_Alphabetic # Mn SAURASHTRA SIGN CANDRABINDU
A926..A92A ; Other_Alphabetic # Mn [5] KAYAH LI VOWEL UE..KAYAH LI VOWEL O
A947..A951 ; Other_Alphabetic # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R
A952 ; Other_Alphabetic # Mc REJANG CONSONANT SIGN H
@ -671,6 +683,7 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
11232..11233 ; Other_Alphabetic # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
11234 ; Other_Alphabetic # Mn KHOJKI SIGN ANUSVARA
11237 ; Other_Alphabetic # Mn KHOJKI SIGN SHADDA
1123E ; Other_Alphabetic # Mn KHOJKI SIGN SUKUN
112DF ; Other_Alphabetic # Mn KHUDAWADI SIGN ANUSVARA
112E0..112E2 ; Other_Alphabetic # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
112E3..112E8 ; Other_Alphabetic # Mn [6] KHUDAWADI VOWEL SIGN U..KHUDAWADI VOWEL SIGN AU
@ -683,6 +696,11 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
1134B..1134C ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU
11357 ; Other_Alphabetic # Mc GRANTHA AU LENGTH MARK
11362..11363 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
11435..11437 ; Other_Alphabetic # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II
11438..1143F ; Other_Alphabetic # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI
11440..11441 ; Other_Alphabetic # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU
11443..11444 ; Other_Alphabetic # Mn [2] NEWA SIGN CANDRABINDU..NEWA SIGN ANUSVARA
11445 ; Other_Alphabetic # Mc NEWA SIGN VISARGA
114B0..114B2 ; Other_Alphabetic # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
114B3..114B8 ; Other_Alphabetic # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
114B9 ; Other_Alphabetic # Mc TIRHUTA VOWEL SIGN E
@ -712,14 +730,48 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
11722..11725 ; Other_Alphabetic # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU
11726 ; Other_Alphabetic # Mc AHOM VOWEL SIGN E
11727..1172A ; Other_Alphabetic # Mn [4] AHOM VOWEL SIGN AW..AHOM VOWEL SIGN AM
11A01..11A06 ; Other_Alphabetic # Mn [6] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL SIGN O
11A07..11A08 ; Other_Alphabetic # Mc [2] ZANABAZAR SQUARE VOWEL SIGN AI..ZANABAZAR SQUARE VOWEL SIGN AU
11A09..11A0A ; Other_Alphabetic # Mn [2] ZANABAZAR SQUARE VOWEL SIGN REVERSED I..ZANABAZAR SQUARE VOWEL LENGTH MARK
11A35..11A38 ; Other_Alphabetic # Mn [4] ZANABAZAR SQUARE SIGN CANDRABINDU..ZANABAZAR SQUARE SIGN ANUSVARA
11A39 ; Other_Alphabetic # Mc ZANABAZAR SQUARE SIGN VISARGA
11A3B..11A3E ; Other_Alphabetic # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA
11A51..11A56 ; Other_Alphabetic # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE
11A57..11A58 ; Other_Alphabetic # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU
11A59..11A5B ; Other_Alphabetic # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK
11A8A..11A96 ; Other_Alphabetic # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA
11A97 ; Other_Alphabetic # Mc SOYOMBO SIGN VISARGA
11C2F ; Other_Alphabetic # Mc BHAIKSUKI VOWEL SIGN AA
11C30..11C36 ; Other_Alphabetic # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L
11C38..11C3D ; Other_Alphabetic # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA
11C3E ; Other_Alphabetic # Mc BHAIKSUKI SIGN VISARGA
11C92..11CA7 ; Other_Alphabetic # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA
11CA9 ; Other_Alphabetic # Mc MARCHEN SUBJOINED LETTER YA
11CAA..11CB0 ; Other_Alphabetic # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA
11CB1 ; Other_Alphabetic # Mc MARCHEN VOWEL SIGN I
11CB2..11CB3 ; Other_Alphabetic # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E
11CB4 ; Other_Alphabetic # Mc MARCHEN VOWEL SIGN O
11CB5..11CB6 ; Other_Alphabetic # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU
11D31..11D36 ; Other_Alphabetic # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R
11D3A ; Other_Alphabetic # Mn MASARAM GONDI VOWEL SIGN E
11D3C..11D3D ; Other_Alphabetic # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O
11D3F..11D41 ; Other_Alphabetic # Mn [3] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI SIGN VISARGA
11D43 ; Other_Alphabetic # Mn MASARAM GONDI SIGN CANDRA
11D47 ; Other_Alphabetic # Mn MASARAM GONDI RA-KARA
16B30..16B36 ; Other_Alphabetic # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16F51..16F7E ; Other_Alphabetic # Mc [46] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN NG
1BC9E ; Other_Alphabetic # Mn DUPLOYAN DOUBLE MARK
1E000..1E006 ; Other_Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
1E008..1E018 ; Other_Alphabetic # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
1E01B..1E021 ; Other_Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Other_Alphabetic # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Other_Alphabetic # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
1E947 ; Other_Alphabetic # Mn ADLAM HAMZA
1F130..1F149 ; Other_Alphabetic # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z
1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
# Total code points: 1116
# Total code points: 1300
# ================================================
@ -728,16 +780,20 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
3021..3029 ; Ideographic # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE
3038..303A ; Ideographic # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
3400..4DB5 ; Ideographic # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
4E00..9FD5 ; Ideographic # Lo [20950] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FD5
4E00..9FEA ; Ideographic # Lo [20971] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEA
F900..FA6D ; Ideographic # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
17000..187EC ; Ideographic # Lo [6125] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187EC
18800..18AF2 ; Ideographic # Lo [755] TANGUT COMPONENT-001..TANGUT COMPONENT-755
1B170..1B2FB ; Ideographic # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
20000..2A6D6 ; Ideographic # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
2A700..2B734 ; Ideographic # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; Ideographic # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; Ideographic # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; Ideographic # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; Ideographic # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
# Total code points: 81404
# Total code points: 96174
# ================================================
@ -793,12 +849,14 @@ FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COM
0A4D ; Diacritic # Mn GURMUKHI SIGN VIRAMA
0ABC ; Diacritic # Mn GUJARATI SIGN NUKTA
0ACD ; Diacritic # Mn GUJARATI SIGN VIRAMA
0AFD..0AFF ; Diacritic # Mn [3] GUJARATI SIGN THREE-DOT NUKTA ABOVE..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE
0B3C ; Diacritic # Mn ORIYA SIGN NUKTA
0B4D ; Diacritic # Mn ORIYA SIGN VIRAMA
0BCD ; Diacritic # Mn TAMIL SIGN VIRAMA
0C4D ; Diacritic # Mn TELUGU SIGN VIRAMA
0CBC ; Diacritic # Mn KANNADA SIGN NUKTA
0CCD ; Diacritic # Mn KANNADA SIGN VIRAMA
0D3B..0D3C ; Diacritic # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
0D4D ; Diacritic # Mn MALAYALAM SIGN VIRAMA
0DCA ; Diacritic # Mn SINHALA SIGN AL-LAKUNA
0E47..0E4C ; Diacritic # Mn [6] THAI CHARACTER MAITAIKHU..THAI CHARACTER THANTHAKHAT
@ -838,10 +896,11 @@ FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COM
1CE2..1CE8 ; Diacritic # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CED ; Diacritic # Mn VEDIC SIGN TIRYAK
1CF4 ; Diacritic # Mn VEDIC TONE CANDRA ABOVE
1CF7 ; Diacritic # Mc VEDIC SIGN ATIKRAMA
1CF8..1CF9 ; Diacritic # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
1D2C..1D6A ; Diacritic # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI
1DC4..1DCF ; Diacritic # Mn [12] COMBINING MACRON-ACUTE..COMBINING ZIGZAG BELOW
1DF5 ; Diacritic # Mn COMBINING UP TACK ABOVE
1DF5..1DF9 ; Diacritic # Mn [5] COMBINING UP TACK ABOVE..COMBINING WIDE INVERTED BRIDGE BELOW
1DFD..1DFF ; Diacritic # Mn [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
1FBD ; Diacritic # Sk GREEK KORONIS
1FBF..1FC1 ; Diacritic # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
@ -906,12 +965,20 @@ FFE3 ; Diacritic # Sk FULLWIDTH MACRON
1134D ; Diacritic # Mc GRANTHA SIGN VIRAMA
11366..1136C ; Diacritic # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
11370..11374 ; Diacritic # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
11442 ; Diacritic # Mn NEWA SIGN VIRAMA
11446 ; Diacritic # Mn NEWA SIGN NUKTA
114C2..114C3 ; Diacritic # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
115BF..115C0 ; Diacritic # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
1163F ; Diacritic # Mn MODI SIGN VIRAMA
116B6 ; Diacritic # Mc TAKRI SIGN VIRAMA
116B7 ; Diacritic # Mn TAKRI SIGN NUKTA
1172B ; Diacritic # Mn AHOM SIGN KILLER
11A34 ; Diacritic # Mn ZANABAZAR SQUARE SIGN VIRAMA
11A47 ; Diacritic # Mn ZANABAZAR SQUARE SUBJOINER
11A99 ; Diacritic # Mn SOYOMBO SUBJOINER
11C3F ; Diacritic # Mn BHAIKSUKI SIGN VIRAMA
11D42 ; Diacritic # Mn MASARAM GONDI SIGN NUKTA
11D44..11D45 ; Diacritic # Mn [2] MASARAM GONDI SIGN HALANTA..MASARAM GONDI VIRAMA
16AF0..16AF4 ; Diacritic # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
16F8F..16F92 ; Diacritic # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW
16F93..16F9F ; Diacritic # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
@ -921,8 +988,10 @@ FFE3 ; Diacritic # Sk FULLWIDTH MACRON
1D185..1D18B ; Diacritic # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
1D1AA..1D1AD ; Diacritic # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
1E8D0..1E8D6 ; Diacritic # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
1E944..1E946 ; Diacritic # Mn [3] ADLAM ALIF LENGTHENER..ADLAM GEMINATION MARK
1E948..1E94A ; Diacritic # Mn [3] ADLAM CONSONANT MODIFIER..ADLAM NUKTA
# Total code points: 773
# Total code points: 798
# ================================================
@ -951,9 +1020,12 @@ AAF3..AAF4 ; Extender # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETE
FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1135D ; Extender # Lo GRANTHA SIGN PLUTA
115C6..115C8 ; Extender # Po [3] SIDDHAM REPETITION MARK-1..SIDDHAM REPETITION MARK-3
11A98 ; Extender # Mn SOYOMBO GEMINATION MARK
16B42..16B43 ; Extender # Lm [2] PAHAWH HMONG SIGN VOS NRUA..PAHAWH HMONG SIGN IB YAM
16FE0..16FE1 ; Extender # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
1E944..1E946 ; Extender # Mn [3] ADLAM ALIF LENGTHENER..ADLAM GEMINATION MARK
# Total code points: 38
# Total code points: 44
# ================================================
@ -1027,7 +1099,7 @@ FFFFE..FFFFF ; Noncharacter_Code_Point # Cn [2] <noncharacter-FFFFE>..<noncha
0D57 ; Other_Grapheme_Extend # Mc MALAYALAM AU LENGTH MARK
0DCF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN AELA-PILLA
0DDF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA
200C..200D ; Other_Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
200C ; Other_Grapheme_Extend # Cf ZERO WIDTH NON-JOINER
302E..302F ; Other_Grapheme_Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
1133E ; Other_Grapheme_Extend # Mc GRANTHA VOWEL SIGN AA
@ -1037,8 +1109,9 @@ FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND
115AF ; Other_Grapheme_Extend # Mc SIDDHAM VOWEL SIGN AA
1D165 ; Other_Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM
1D16E..1D172 ; Other_Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5
E0020..E007F ; Other_Grapheme_Extend # Cf [96] TAG SPACE..CANCEL TAG
# Total code points: 30
# Total code points: 125
# ================================================
@ -1064,7 +1137,7 @@ FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND
# ================================================
3400..4DB5 ; Unified_Ideograph # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
4E00..9FD5 ; Unified_Ideograph # Lo [20950] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FD5
4E00..9FEA ; Unified_Ideograph # Lo [20971] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEA
FA0E..FA0F ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA0E..CJK COMPATIBILITY IDEOGRAPH-FA0F
FA11 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA11
FA13..FA14 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA13..CJK COMPATIBILITY IDEOGRAPH-FA14
@ -1076,8 +1149,9 @@ FA27..FA29 ; Unified_Ideograph # Lo [3] CJK COMPATIBILITY IDEOGRAPH-FA27..C
2A700..2B734 ; Unified_Ideograph # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; Unified_Ideograph # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; Unified_Ideograph # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; Unified_Ideograph # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
# Total code points: 80388
# Total code points: 87882
# ================================================
@ -1106,9 +1180,8 @@ E01F0..E0FFF ; Other_Default_Ignorable_Code_Point # Cn [3600] <reserved-E01F0>.
2329 ; Deprecated # Ps LEFT-POINTING ANGLE BRACKET
232A ; Deprecated # Pe RIGHT-POINTING ANGLE BRACKET
E0001 ; Deprecated # Cf LANGUAGE TAG
E007F ; Deprecated # Cf CANCEL TAG
# Total code points: 16
# Total code points: 15
# ================================================
@ -1160,11 +1233,12 @@ AABB..AABC ; Logical_Order_Exception # Lo [2] TAI VIET VOWEL AUE..TAI VIET
# ================================================
1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
# Total code points: 4
# Total code points: 6
# ================================================
@ -1177,72 +1251,76 @@ AABB..AABC ; Logical_Order_Exception # Lo [2] TAI VIET VOWEL AUE..TAI VIET
# ================================================
0021 ; STerm # Po EXCLAMATION MARK
002E ; STerm # Po FULL STOP
003F ; STerm # Po QUESTION MARK
0589 ; STerm # Po ARMENIAN FULL STOP
061F ; STerm # Po ARABIC QUESTION MARK
06D4 ; STerm # Po ARABIC FULL STOP
0700..0702 ; STerm # Po [3] SYRIAC END OF PARAGRAPH..SYRIAC SUBLINEAR FULL STOP
07F9 ; STerm # Po NKO EXCLAMATION MARK
0964..0965 ; STerm # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
104A..104B ; STerm # Po [2] MYANMAR SIGN LITTLE SECTION..MYANMAR SIGN SECTION
1362 ; STerm # Po ETHIOPIC FULL STOP
1367..1368 ; STerm # Po [2] ETHIOPIC QUESTION MARK..ETHIOPIC PARAGRAPH SEPARATOR
166E ; STerm # Po CANADIAN SYLLABICS FULL STOP
1735..1736 ; STerm # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
1803 ; STerm # Po MONGOLIAN FULL STOP
1809 ; STerm # Po MONGOLIAN MANCHU FULL STOP
1944..1945 ; STerm # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
1AA8..1AAB ; STerm # Po [4] TAI THAM SIGN KAAN..TAI THAM SIGN SATKAANKUU
1B5A..1B5B ; STerm # Po [2] BALINESE PANTI..BALINESE PAMADA
1B5E..1B5F ; STerm # Po [2] BALINESE CARIK SIKI..BALINESE CARIK PAREREN
1C3B..1C3C ; STerm # Po [2] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION NYET THYOOM TA-ROL
1C7E..1C7F ; STerm # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
203C..203D ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2E2E ; STerm # Po REVERSED QUESTION MARK
2E3C ; STerm # Po STENOGRAPHIC FULL STOP
3002 ; STerm # Po IDEOGRAPHIC FULL STOP
A4FF ; STerm # Po LISU PUNCTUATION FULL STOP
A60E..A60F ; STerm # Po [2] VAI FULL STOP..VAI QUESTION MARK
A6F3 ; STerm # Po BAMUM FULL STOP
A6F7 ; STerm # Po BAMUM QUESTION MARK
A876..A877 ; STerm # Po [2] PHAGS-PA MARK SHAD..PHAGS-PA MARK DOUBLE SHAD
A8CE..A8CF ; STerm # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA
A92F ; STerm # Po KAYAH LI SIGN SHYA
A9C8..A9C9 ; STerm # Po [2] JAVANESE PADA LINGSA..JAVANESE PADA LUNGSI
AA5D..AA5F ; STerm # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA
AAF0..AAF1 ; STerm # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM
ABEB ; STerm # Po MEETEI MAYEK CHEIKHEI
FE52 ; STerm # Po SMALL FULL STOP
FE56..FE57 ; STerm # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK
FF01 ; STerm # Po FULLWIDTH EXCLAMATION MARK
FF0E ; STerm # Po FULLWIDTH FULL STOP
FF1F ; STerm # Po FULLWIDTH QUESTION MARK
FF61 ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP
10A56..10A57 ; STerm # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA
11047..11048 ; STerm # Po [2] BRAHMI DANDA..BRAHMI DOUBLE DANDA
110BE..110C1 ; STerm # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
11141..11143 ; STerm # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK
111C5..111C6 ; STerm # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA
111CD ; STerm # Po SHARADA SUTRA MARK
111DE..111DF ; STerm # Po [2] SHARADA SECTION MARK-1..SHARADA SECTION MARK-2
11238..11239 ; STerm # Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA
1123B..1123C ; STerm # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
112A9 ; STerm # Po MULTANI SECTION MARK
115C2..115C3 ; STerm # Po [2] SIDDHAM DANDA..SIDDHAM DOUBLE DANDA
115C9..115D7 ; STerm # Po [15] SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
11641..11642 ; STerm # Po [2] MODI DANDA..MODI DOUBLE DANDA
1173C..1173E ; STerm # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
16A6E..16A6F ; STerm # Po [2] MRO DANDA..MRO DOUBLE DANDA
16AF5 ; STerm # Po BASSA VAH FULL STOP
16B37..16B38 ; STerm # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB
16B44 ; STerm # Po PAHAWH HMONG SIGN XAUS
1BC9F ; STerm # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA88 ; STerm # Po SIGNWRITING FULL STOP
0021 ; Sentence_Terminal # Po EXCLAMATION MARK
002E ; Sentence_Terminal # Po FULL STOP
003F ; Sentence_Terminal # Po QUESTION MARK
0589 ; Sentence_Terminal # Po ARMENIAN FULL STOP
061F ; Sentence_Terminal # Po ARABIC QUESTION MARK
06D4 ; Sentence_Terminal # Po ARABIC FULL STOP
0700..0702 ; Sentence_Terminal # Po [3] SYRIAC END OF PARAGRAPH..SYRIAC SUBLINEAR FULL STOP
07F9 ; Sentence_Terminal # Po NKO EXCLAMATION MARK
0964..0965 ; Sentence_Terminal # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
104A..104B ; Sentence_Terminal # Po [2] MYANMAR SIGN LITTLE SECTION..MYANMAR SIGN SECTION
1362 ; Sentence_Terminal # Po ETHIOPIC FULL STOP
1367..1368 ; Sentence_Terminal # Po [2] ETHIOPIC QUESTION MARK..ETHIOPIC PARAGRAPH SEPARATOR
166E ; Sentence_Terminal # Po CANADIAN SYLLABICS FULL STOP
1735..1736 ; Sentence_Terminal # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
1803 ; Sentence_Terminal # Po MONGOLIAN FULL STOP
1809 ; Sentence_Terminal # Po MONGOLIAN MANCHU FULL STOP
1944..1945 ; Sentence_Terminal # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
1AA8..1AAB ; Sentence_Terminal # Po [4] TAI THAM SIGN KAAN..TAI THAM SIGN SATKAANKUU
1B5A..1B5B ; Sentence_Terminal # Po [2] BALINESE PANTI..BALINESE PAMADA
1B5E..1B5F ; Sentence_Terminal # Po [2] BALINESE CARIK SIKI..BALINESE CARIK PAREREN
1C3B..1C3C ; Sentence_Terminal # Po [2] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION NYET THYOOM TA-ROL
1C7E..1C7F ; Sentence_Terminal # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
203C..203D ; Sentence_Terminal # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; Sentence_Terminal # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2E2E ; Sentence_Terminal # Po REVERSED QUESTION MARK
2E3C ; Sentence_Terminal # Po STENOGRAPHIC FULL STOP
3002 ; Sentence_Terminal # Po IDEOGRAPHIC FULL STOP
A4FF ; Sentence_Terminal # Po LISU PUNCTUATION FULL STOP
A60E..A60F ; Sentence_Terminal # Po [2] VAI FULL STOP..VAI QUESTION MARK
A6F3 ; Sentence_Terminal # Po BAMUM FULL STOP
A6F7 ; Sentence_Terminal # Po BAMUM QUESTION MARK
A876..A877 ; Sentence_Terminal # Po [2] PHAGS-PA MARK SHAD..PHAGS-PA MARK DOUBLE SHAD
A8CE..A8CF ; Sentence_Terminal # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA
A92F ; Sentence_Terminal # Po KAYAH LI SIGN SHYA
A9C8..A9C9 ; Sentence_Terminal # Po [2] JAVANESE PADA LINGSA..JAVANESE PADA LUNGSI
AA5D..AA5F ; Sentence_Terminal # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA
AAF0..AAF1 ; Sentence_Terminal # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM
ABEB ; Sentence_Terminal # Po MEETEI MAYEK CHEIKHEI
FE52 ; Sentence_Terminal # Po SMALL FULL STOP
FE56..FE57 ; Sentence_Terminal # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK
FF01 ; Sentence_Terminal # Po FULLWIDTH EXCLAMATION MARK
FF0E ; Sentence_Terminal # Po FULLWIDTH FULL STOP
FF1F ; Sentence_Terminal # Po FULLWIDTH QUESTION MARK
FF61 ; Sentence_Terminal # Po HALFWIDTH IDEOGRAPHIC FULL STOP
10A56..10A57 ; Sentence_Terminal # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA
11047..11048 ; Sentence_Terminal # Po [2] BRAHMI DANDA..BRAHMI DOUBLE DANDA
110BE..110C1 ; Sentence_Terminal # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
11141..11143 ; Sentence_Terminal # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK
111C5..111C6 ; Sentence_Terminal # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA
111CD ; Sentence_Terminal # Po SHARADA SUTRA MARK
111DE..111DF ; Sentence_Terminal # Po [2] SHARADA SECTION MARK-1..SHARADA SECTION MARK-2
11238..11239 ; Sentence_Terminal # Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA
1123B..1123C ; Sentence_Terminal # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
112A9 ; Sentence_Terminal # Po MULTANI SECTION MARK
1144B..1144C ; Sentence_Terminal # Po [2] NEWA DANDA..NEWA DOUBLE DANDA
115C2..115C3 ; Sentence_Terminal # Po [2] SIDDHAM DANDA..SIDDHAM DOUBLE DANDA
115C9..115D7 ; Sentence_Terminal # Po [15] SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
11641..11642 ; Sentence_Terminal # Po [2] MODI DANDA..MODI DOUBLE DANDA
1173C..1173E ; Sentence_Terminal # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
11A42..11A43 ; Sentence_Terminal # Po [2] ZANABAZAR SQUARE MARK SHAD..ZANABAZAR SQUARE MARK DOUBLE SHAD
11A9B..11A9C ; Sentence_Terminal # Po [2] SOYOMBO MARK SHAD..SOYOMBO MARK DOUBLE SHAD
11C41..11C42 ; Sentence_Terminal # Po [2] BHAIKSUKI DANDA..BHAIKSUKI DOUBLE DANDA
16A6E..16A6F ; Sentence_Terminal # Po [2] MRO DANDA..MRO DOUBLE DANDA
16AF5 ; Sentence_Terminal # Po BASSA VAH FULL STOP
16B37..16B38 ; Sentence_Terminal # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB
16B44 ; Sentence_Terminal # Po PAHAWH HMONG SIGN XAUS
1BC9F ; Sentence_Terminal # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA88 ; Sentence_Terminal # Po SIGNWRITING FULL STOP
# Total code points: 120
# Total code points: 128
# ================================================
@ -1359,9 +1437,7 @@ E0100..E01EF ; Variation_Selector # Mn [240] VARIATION SELECTOR-17..VARIATION S
239B..23B3 ; Pattern_Syntax # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; Pattern_Syntax # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; Pattern_Syntax # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
23E2..23FA ; Pattern_Syntax # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD
23FB..23FF ; Pattern_Syntax # Cn [5] <reserved-23FB>..<reserved-23FF>
2400..2426 ; Pattern_Syntax # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
23E2..2426 ; Pattern_Syntax # So [69] WHITE TRAPEZIUM..SYMBOL FOR SUBSTITUTE FORM TWO
2427..243F ; Pattern_Syntax # Cn [25] <reserved-2427>..<reserved-243F>
2440..244A ; Pattern_Syntax # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
244B..245F ; Pattern_Syntax # Cn [21] <reserved-244B>..<reserved-245F>
@ -1449,8 +1525,8 @@ E0100..E01EF ; Variation_Selector # Mn [240] VARIATION SELECTOR-17..VARIATION S
2BBA..2BBC ; Pattern_Syntax # Cn [3] <reserved-2BBA>..<reserved-2BBC>
2BBD..2BC8 ; Pattern_Syntax # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED
2BC9 ; Pattern_Syntax # Cn <reserved-2BC9>
2BCA..2BD1 ; Pattern_Syntax # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN
2BD2..2BEB ; Pattern_Syntax # Cn [26] <reserved-2BD2>..<reserved-2BEB>
2BCA..2BD2 ; Pattern_Syntax # So [9] TOP HALF BLACK CIRCLE..GROUP MARK
2BD3..2BEB ; Pattern_Syntax # Cn [25] <reserved-2BD3>..<reserved-2BEB>
2BEC..2BEF ; Pattern_Syntax # So [4] LEFTWARDS TWO-HEADED ARROW WITH TRIANGLE ARROWHEADS..DOWNWARDS TWO-HEADED ARROW WITH TRIANGLE ARROWHEADS
2BF0..2BFF ; Pattern_Syntax # Cn [16] <reserved-2BF0>..<reserved-2BFF>
2E00..2E01 ; Pattern_Syntax # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
@ -1490,7 +1566,8 @@ E0100..E01EF ; Variation_Selector # Mn [240] VARIATION SELECTOR-17..VARIATION S
2E40 ; Pattern_Syntax # Pd DOUBLE HYPHEN
2E41 ; Pattern_Syntax # Po REVERSED COMMA
2E42 ; Pattern_Syntax # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
2E43..2E7F ; Pattern_Syntax # Cn [61] <reserved-2E43>..<reserved-2E7F>
2E43..2E49 ; Pattern_Syntax # Po [7] DASH WITH LEFT UPTURN..DOUBLE STACKED COMMA
2E4A..2E7F ; Pattern_Syntax # Cn [54] <reserved-2E4A>..<reserved-2E7F>
3001..3003 ; Pattern_Syntax # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
3008 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET
3009 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET
@ -1522,4 +1599,20 @@ FE45..FE46 ; Pattern_Syntax # Po [2] SESAME DOT..WHITE SESAME DOT
# Total code points: 2760
# ================================================
0600..0605 ; Prepended_Concatenation_Mark # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE
06DD ; Prepended_Concatenation_Mark # Cf ARABIC END OF AYAH
070F ; Prepended_Concatenation_Mark # Cf SYRIAC ABBREVIATION MARK
08E2 ; Prepended_Concatenation_Mark # Cf ARABIC DISPUTED END OF AYAH
110BD ; Prepended_Concatenation_Mark # Cf KAITHI NUMBER SIGN
# Total code points: 10
# ================================================
1F1E6..1F1FF ; Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z
# Total code points: 26
# EOF

View File

@ -1,10 +1,11 @@
# Scripts-8.0.0.txt
# Date: 2015-03-11, 22:29:42 GMT [MD]
# Scripts-10.0.0.txt
# Date: 2017-03-11, 06:40:37 GMT
# Copyright (c) 2017 Unicode, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# For documentation, see http://www.unicode.org/reports/tr44/
# For more information, see:
# UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
# Especially the sections:
@ -92,10 +93,10 @@
0605 ; Common # Cf ARABIC NUMBER MARK ABOVE
060C ; Common # Po ARABIC COMMA
061B ; Common # Po ARABIC SEMICOLON
061C ; Common # Cf ARABIC LETTER MARK
061F ; Common # Po ARABIC QUESTION MARK
0640 ; Common # Lm ARABIC TATWEEL
06DD ; Common # Cf ARABIC END OF AYAH
08E2 ; Common # Cf ARABIC DISPUTED END OF AYAH
0964..0965 ; Common # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
0E3F ; Common # Sc THAI CURRENCY SYMBOL BAHT
0FD5..0FD8 ; Common # So [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS
@ -110,6 +111,7 @@
1CEE..1CF1 ; Common # Lo [4] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ANUSVARA UBHAYATO MUKHA
1CF2..1CF3 ; Common # Mc [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA
1CF5..1CF6 ; Common # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
1CF7 ; Common # Mc VEDIC SIGN ATIKRAMA
2000..200A ; Common # Zs [11] EN QUAD..HAIR SPACE
200B ; Common # Cf ZERO WIDTH SPACE
200E..200F ; Common # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK
@ -153,7 +155,7 @@
208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN
208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS
208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS
20A0..20BE ; Common # Sc [31] EURO-CURRENCY SIGN..LARI SIGN
20A0..20BF ; Common # Sc [32] EURO-CURRENCY SIGN..BITCOIN SIGN
2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT
2102 ; Common # L& DOUBLE-STRUCK CAPITAL C
2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA
@ -223,8 +225,7 @@
239B..23B3 ; Common # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; Common # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; Common # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
23E2..23FA ; Common # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD
2400..2426 ; Common # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
23E2..2426 ; Common # So [69] WHITE TRAPEZIUM..SYMBOL FOR SUBSTITUTE FORM TWO
2440..244A ; Common # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
2460..249B ; Common # No [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP
249C..24E9 ; Common # So [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
@ -309,7 +310,7 @@
2B76..2B95 ; Common # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW
2B98..2BB9 ; Common # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX
2BBD..2BC8 ; Common # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED
2BCA..2BD1 ; Common # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN
2BCA..2BD2 ; Common # So [9] TOP HALF BLACK CIRCLE..GROUP MARK
2BEC..2BEF ; Common # So [4] LEFTWARDS TWO-HEADED ARROW WITH TRIANGLE ARROWHEADS..DOWNWARDS TWO-HEADED ARROW WITH TRIANGLE ARROWHEADS
2E00..2E01 ; Common # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
2E02 ; Common # Pi LEFT SUBSTITUTION BRACKET
@ -348,6 +349,7 @@
2E40 ; Common # Pd DOUBLE HYPHEN
2E41 ; Common # Po REVERSED COMMA
2E42 ; Common # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
2E43..2E49 ; Common # Po [7] DASH WITH LEFT UPTURN..DOUBLE STACKED COMMA
2FF0..2FFB ; Common # So [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
3000 ; Common # Zs IDEOGRAPHIC SPACE
3001..3003 ; Common # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
@ -572,19 +574,18 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR
1F100..1F10C ; Common # No [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO
1F110..1F12E ; Common # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ
1F130..1F16B ; Common # So [60] SQUARED LATIN CAPITAL LETTER A..RAISED MD SIGN
1F170..1F19A ; Common # So [43] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VS
1F170..1F1AC ; Common # So [61] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VOD
1F1E6..1F1FF ; Common # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z
1F201..1F202 ; Common # So [2] SQUARED KATAKANA KOKO..SQUARED KATAKANA SA
1F210..1F23A ; Common # So [43] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-55B6
1F210..1F23B ; Common # So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D
1F240..1F248 ; Common # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
1F250..1F251 ; Common # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
1F260..1F265 ; Common # So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
1F300..1F3FA ; Common # So [251] CYCLONE..AMPHORA
1F3FB..1F3FF ; Common # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
1F400..1F579 ; Common # So [378] RAT..JOYSTICK
1F57B..1F5A3 ; Common # So [41] LEFT HAND TELEPHONE RECEIVER..BLACK DOWN POINTING BACKHAND INDEX
1F5A5..1F6D0 ; Common # So [300] DESKTOP COMPUTER..PLACE OF WORSHIP
1F400..1F6D4 ; Common # So [725] RAT..PAGODA
1F6E0..1F6EC ; Common # So [13] HAMMER AND WRENCH..AIRPLANE ARRIVING
1F6F0..1F6F3 ; Common # So [4] SATELLITE..PASSENGER SHIP
1F6F0..1F6F8 ; Common # So [9] SATELLITE..FLYING SAUCER
1F700..1F773 ; Common # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
1F780..1F7D4 ; Common # So [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR
1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
@ -592,13 +593,17 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR
1F850..1F859 ; Common # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
1F860..1F887 ; Common # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
1F910..1F918 ; Common # So [9] ZIPPER-MOUTH FACE..SIGN OF THE HORNS
1F980..1F984 ; Common # So [5] CRAB..UNICORN FACE
1F900..1F90B ; Common # So [12] CIRCLED CROSS FORMEE WITH FOUR DOTS..DOWNWARD FACING NOTCHED HOOK WITH DOT
1F910..1F93E ; Common # So [47] ZIPPER-MOUTH FACE..HANDBALL
1F940..1F94C ; Common # So [13] WILTED FLOWER..CURLING STONE
1F950..1F96B ; Common # So [28] CROISSANT..CANNED FOOD
1F980..1F997 ; Common # So [24] CRAB..CRICKET
1F9C0 ; Common # So CHEESE WEDGE
1F9D0..1F9E6 ; Common # So [23] FACE WITH MONOCLE..SOCKS
E0001 ; Common # Cf LANGUAGE TAG
E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG
# Total code points: 7179
# Total code points: 7363
# ================================================
@ -641,7 +646,7 @@ A770 ; Latin # Lm MODIFIER LETTER US
A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
A78F ; Latin # Lo LATIN LETTER SINOLOGICAL DOT
A790..A7AD ; Latin # L& [30] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER L WITH BELT
A790..A7AE ; Latin # L& [31] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER SMALL CAPITAL I
A7B0..A7B7 ; Latin # L& [8] LATIN CAPITAL LETTER TURNED K..LATIN SMALL LETTER OMEGA
A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
@ -654,7 +659,7 @@ FB00..FB06 ; Latin # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE S
FF21..FF3A ; Latin # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
# Total code points: 1349
# Total code points: 1350
# ================================================
@ -708,13 +713,13 @@ AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA
10175..10178 ; Greek # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN
10179..10189 ; Greek # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN
1018A..1018B ; Greek # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN
1018C ; Greek # So GREEK SINUSOID SIGN
1018C..1018E ; Greek # So [3] GREEK SINUSOID SIGN..NOMISMA SIGN
101A0 ; Greek # So GREEK SYMBOL TAU RHO
1D200..1D241 ; Greek # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
1D242..1D244 ; Greek # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
1D245 ; Greek # So GREEK MUSICAL LEIMMA
# Total code points: 516
# Total code points: 518
# ================================================
@ -724,6 +729,7 @@ AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA
0487 ; Cyrillic # Mn COMBINING CYRILLIC POKRYTIE
0488..0489 ; Cyrillic # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
048A..052F ; Cyrillic # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER
1C80..1C88 ; Cyrillic # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK
1D2B ; Cyrillic # L& CYRILLIC LETTER SMALL CAPITAL EL
1D78 ; Cyrillic # Lm MODIFIER LETTER CYRILLIC EN
2DE0..2DFF ; Cyrillic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
@ -740,7 +746,7 @@ A69C..A69D ; Cyrillic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER
A69E..A69F ; Cyrillic # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E
FE2E..FE2F ; Cyrillic # Mn [2] COMBINING CYRILLIC TITLO LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF
# Total code points: 434
# Total code points: 443
# ================================================
@ -791,6 +797,7 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU
060D ; Arabic # Po ARABIC DATE SEPARATOR
060E..060F ; Arabic # So [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA
0610..061A ; Arabic # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA
061C ; Arabic # Cf ARABIC LETTER MARK
061E ; Arabic # Po ARABIC TRIPLE DOT PUNCTUATION MARK
0620..063F ; Arabic # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
0641..064A ; Arabic # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH
@ -815,6 +822,8 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU
06FF ; Arabic # Lo ARABIC LETTER HEH WITH INVERTED V
0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE
08A0..08B4 ; Arabic # Lo [21] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER KAF WITH DOT BELOW
08B6..08BD ; Arabic # Lo [8] ARABIC LETTER BEH WITH SMALL MEEM ABOVE..ARABIC LETTER AFRICAN NOON
08D4..08E1 ; Arabic # Mn [14] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH SIGN SAFHA
08E3..08FF ; Arabic # Mn [29] ARABIC TURNED DAMMA BELOW..ARABIC MARK SIDEWAYS NOON GHUNNA
FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
FBB2..FBC1 ; Arabic # Sk [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
@ -862,7 +871,7 @@ FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LA
1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL
# Total code points: 1257
# Total code points: 1280
# ================================================
@ -873,8 +882,9 @@ FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LA
0712..072F ; Syriac # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH
0730..074A ; Syriac # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH
074D..074F ; Syriac # Lo [3] SYRIAC LETTER SOGDIAN ZHAIN..SYRIAC LETTER SOGDIAN FE
0860..086A ; Syriac # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
# Total code points: 77
# Total code points: 88
# ================================================
@ -944,8 +954,10 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
09F4..09F9 ; Bengali # No [6] BENGALI CURRENCY NUMERATOR ONE..BENGALI CURRENCY DENOMINATOR SIXTEEN
09FA ; Bengali # So BENGALI ISSHAR
09FB ; Bengali # Sc BENGALI GANDA MARK
09FC ; Bengali # Lo BENGALI LETTER VEDIC ANUSVARA
09FD ; Bengali # Po BENGALI ABBREVIATION SIGN
# Total code points: 93
# Total code points: 95
# ================================================
@ -998,8 +1010,9 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
0AF0 ; Gujarati # Po GUJARATI ABBREVIATION SIGN
0AF1 ; Gujarati # Sc GUJARATI RUPEE SIGN
0AF9 ; Gujarati # Lo GUJARATI LETTER ZHA
0AFA..0AFF ; Gujarati # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE
# Total code points: 85
# Total code points: 91
# ================================================
@ -1086,6 +1099,7 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
# ================================================
0C80 ; Kannada # Lo KANNADA SIGN SPACING CANDRABINDU
0C81 ; Kannada # Mn KANNADA SIGN CANDRABINDU
0C82..0C83 ; Kannada # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
0C85..0C8C ; Kannada # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
@ -1109,15 +1123,16 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
# Total code points: 87
# Total code points: 88
# ================================================
0D01 ; Malayalam # Mn MALAYALAM SIGN CANDRABINDU
0D00..0D01 ; Malayalam # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Malayalam # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D05..0D0C ; Malayalam # Lo [8] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L
0D0E..0D10 ; Malayalam # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
0D12..0D3A ; Malayalam # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA
0D3B..0D3C ; Malayalam # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
0D3D ; Malayalam # Lo MALAYALAM SIGN AVAGRAHA
0D3E..0D40 ; Malayalam # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
0D41..0D44 ; Malayalam # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
@ -1125,15 +1140,18 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
0D4A..0D4C ; Malayalam # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU
0D4D ; Malayalam # Mn MALAYALAM SIGN VIRAMA
0D4E ; Malayalam # Lo MALAYALAM LETTER DOT REPH
0D4F ; Malayalam # So MALAYALAM SIGN PARA
0D54..0D56 ; Malayalam # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL
0D57 ; Malayalam # Mc MALAYALAM AU LENGTH MARK
0D58..0D5E ; Malayalam # No [7] MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH..MALAYALAM FRACTION ONE FIFTH
0D5F..0D61 ; Malayalam # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL
0D62..0D63 ; Malayalam # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL
0D66..0D6F ; Malayalam # Nd [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE
0D70..0D75 ; Malayalam # No [6] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE QUARTERS
0D70..0D78 ; Malayalam # No [9] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE SIXTEENTHS
0D79 ; Malayalam # So MALAYALAM DATE MARK
0D7A..0D7F ; Malayalam # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K
# Total code points: 100
# Total code points: 117
# ================================================
@ -1436,21 +1454,24 @@ AB70..ABBF ; Cherokee # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETT
1820..1842 ; Mongolian # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
1843 ; Mongolian # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN
1844..1877 ; Mongolian # Lo [52] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER MANCHU ZHA
1880..18A8 ; Mongolian # Lo [41] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER MANCHU ALI GALI BHA
1880..1884 ; Mongolian # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA
1885..1886 ; Mongolian # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
1887..18A8 ; Mongolian # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA
18A9 ; Mongolian # Mn MONGOLIAN LETTER ALI GALI DAGALGA
18AA ; Mongolian # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA
11660..1166C ; Mongolian # Po [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT
# Total code points: 153
# Total code points: 166
# ================================================
3041..3096 ; Hiragana # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
309D..309E ; Hiragana # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK
309F ; Hiragana # Lo HIRAGANA DIGRAPH YORI
1B001 ; Hiragana # Lo HIRAGANA LETTER ARCHAIC YE
1B001..1B11E ; Hiragana # Lo [286] HIRAGANA LETTER ARCHAIC YE..HENTAIGANA LETTER N-MU-MO-2
1F200 ; Hiragana # So SQUARE HIRAGANA HOKA
# Total code points: 91
# Total code points: 376
# ================================================
@ -1469,10 +1490,10 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
# ================================================
02EA..02EB ; Bopomofo # Sk [2] MODIFIER LETTER YIN DEPARTING TONE MARK..MODIFIER LETTER YANG DEPARTING TONE MARK
3105..312D ; Bopomofo # Lo [41] BOPOMOFO LETTER B..BOPOMOFO LETTER IH
3105..312E ; Bopomofo # Lo [42] BOPOMOFO LETTER B..BOPOMOFO LETTER O WITH DOT ABOVE
31A0..31BA ; Bopomofo # Lo [27] BOPOMOFO LETTER BU..BOPOMOFO LETTER ZY
# Total code points: 70
# Total code points: 71
# ================================================
@ -1485,16 +1506,17 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
3038..303A ; Han # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
303B ; Han # Lm VERTICAL IDEOGRAPHIC ITERATION MARK
3400..4DB5 ; Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
4E00..9FD5 ; Han # Lo [20950] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FD5
4E00..9FEA ; Han # Lo [20971] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEA
F900..FA6D ; Han # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
20000..2A6D6 ; Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
2A700..2B734 ; Han # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; Han # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; Han # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
# Total code points: 81734
# Total code points: 89228
# ================================================
@ -1509,8 +1531,9 @@ A490..A4C6 ; Yi # So [55] YI RADICAL QOT..YI RADICAL KE
10300..1031F ; Old_Italic # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS
10320..10323 ; Old_Italic # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY
1032D..1032F ; Old_Italic # Lo [3] OLD ITALIC LETTER YE..OLD ITALIC LETTER SOUTHERN TSE
# Total code points: 36
# Total code points: 39
# ================================================
@ -1542,8 +1565,8 @@ A490..A4C6 ; Yi # So [55] YI RADICAL QOT..YI RADICAL KE
1CED ; Inherited # Mn VEDIC SIGN TIRYAK
1CF4 ; Inherited # Mn VEDIC TONE CANDRA ABOVE
1CF8..1CF9 ; Inherited # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
1DC0..1DF5 ; Inherited # Mn [54] COMBINING DOTTED GRAVE ACCENT..COMBINING UP TACK ABOVE
1DFC..1DFF ; Inherited # Mn [4] COMBINING DOUBLE INVERTED BREVE BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
1DC0..1DF9 ; Inherited # Mn [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
1DFB..1DFF ; Inherited # Mn [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
200C..200D ; Inherited # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
20D0..20DC ; Inherited # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
20DD..20E0 ; Inherited # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
@ -1562,7 +1585,7 @@ FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CON
1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
# Total code points: 563
# Total code points: 568
# ================================================
@ -1705,8 +1728,13 @@ E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-2
2C00..2C2E ; Glagolitic # L& [47] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
2C30..2C5E ; Glagolitic # L& [47] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER LATINATE MYSLITE
1E000..1E006 ; Glagolitic # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
1E008..1E018 ; Glagolitic # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
1E01B..1E021 ; Glagolitic # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Glagolitic # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Glagolitic # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
# Total code points: 94
# Total code points: 132
# ================================================
@ -1872,11 +1900,11 @@ A62A..A62B ; Vai # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO
A880..A881 ; Saurashtra # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA
A882..A8B3 ; Saurashtra # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA
A8B4..A8C3 ; Saurashtra # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU
A8C4 ; Saurashtra # Mn SAURASHTRA SIGN VIRAMA
A8C4..A8C5 ; Saurashtra # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU
A8CE..A8CF ; Saurashtra # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA
A8D0..A8D9 ; Saurashtra # Nd [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE
# Total code points: 81
# Total code points: 82
# ================================================
@ -2314,8 +2342,9 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI
11235 ; Khojki # Mc KHOJKI SIGN VIRAMA
11236..11237 ; Khojki # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
11238..1123D ; Khojki # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
1123E ; Khojki # Mn KHOJKI SIGN SUKUN
# Total code points: 61
# Total code points: 62
# ================================================
@ -2536,4 +2565,129 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI
# Total code points: 672
# ================================================
1E900..1E943 ; Adlam # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA
1E944..1E94A ; Adlam # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
1E950..1E959 ; Adlam # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
1E95E..1E95F ; Adlam # Po [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK
# Total code points: 87
# ================================================
11C00..11C08 ; Bhaiksuki # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
11C0A..11C2E ; Bhaiksuki # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
11C2F ; Bhaiksuki # Mc BHAIKSUKI VOWEL SIGN AA
11C30..11C36 ; Bhaiksuki # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L
11C38..11C3D ; Bhaiksuki # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA
11C3E ; Bhaiksuki # Mc BHAIKSUKI SIGN VISARGA
11C3F ; Bhaiksuki # Mn BHAIKSUKI SIGN VIRAMA
11C40 ; Bhaiksuki # Lo BHAIKSUKI SIGN AVAGRAHA
11C41..11C45 ; Bhaiksuki # Po [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2
11C50..11C59 ; Bhaiksuki # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE
11C5A..11C6C ; Bhaiksuki # No [19] BHAIKSUKI NUMBER ONE..BHAIKSUKI HUNDREDS UNIT MARK
# Total code points: 97
# ================================================
11C70..11C71 ; Marchen # Po [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD
11C72..11C8F ; Marchen # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A
11C92..11CA7 ; Marchen # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA
11CA9 ; Marchen # Mc MARCHEN SUBJOINED LETTER YA
11CAA..11CB0 ; Marchen # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA
11CB1 ; Marchen # Mc MARCHEN VOWEL SIGN I
11CB2..11CB3 ; Marchen # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E
11CB4 ; Marchen # Mc MARCHEN VOWEL SIGN O
11CB5..11CB6 ; Marchen # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU
# Total code points: 68
# ================================================
11400..11434 ; Newa # Lo [53] NEWA LETTER A..NEWA LETTER HA
11435..11437 ; Newa # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II
11438..1143F ; Newa # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI
11440..11441 ; Newa # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU
11442..11444 ; Newa # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA
11445 ; Newa # Mc NEWA SIGN VISARGA
11446 ; Newa # Mn NEWA SIGN NUKTA
11447..1144A ; Newa # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI
1144B..1144F ; Newa # Po [5] NEWA DANDA..NEWA ABBREVIATION SIGN
11450..11459 ; Newa # Nd [10] NEWA DIGIT ZERO..NEWA DIGIT NINE
1145B ; Newa # Po NEWA PLACEHOLDER MARK
1145D ; Newa # Po NEWA INSERTION SIGN
# Total code points: 92
# ================================================
104B0..104D3 ; Osage # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA
104D8..104FB ; Osage # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA
# Total code points: 72
# ================================================
16FE0 ; Tangut # Lm TANGUT ITERATION MARK
17000..187EC ; Tangut # Lo [6125] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187EC
18800..18AF2 ; Tangut # Lo [755] TANGUT COMPONENT-001..TANGUT COMPONENT-755
# Total code points: 6881
# ================================================
11D00..11D06 ; Masaram_Gondi # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E
11D08..11D09 ; Masaram_Gondi # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O
11D0B..11D30 ; Masaram_Gondi # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA
11D31..11D36 ; Masaram_Gondi # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R
11D3A ; Masaram_Gondi # Mn MASARAM GONDI VOWEL SIGN E
11D3C..11D3D ; Masaram_Gondi # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O
11D3F..11D45 ; Masaram_Gondi # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA
11D46 ; Masaram_Gondi # Lo MASARAM GONDI REPHA
11D47 ; Masaram_Gondi # Mn MASARAM GONDI RA-KARA
11D50..11D59 ; Masaram_Gondi # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
# Total code points: 75
# ================================================
16FE1 ; Nushu # Lm NUSHU ITERATION MARK
1B170..1B2FB ; Nushu # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
# Total code points: 397
# ================================================
11A50 ; Soyombo # Lo SOYOMBO LETTER A
11A51..11A56 ; Soyombo # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE
11A57..11A58 ; Soyombo # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU
11A59..11A5B ; Soyombo # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK
11A5C..11A83 ; Soyombo # Lo [40] SOYOMBO LETTER KA..SOYOMBO LETTER KSSA
11A86..11A89 ; Soyombo # Lo [4] SOYOMBO CLUSTER-INITIAL LETTER RA..SOYOMBO CLUSTER-INITIAL LETTER SA
11A8A..11A96 ; Soyombo # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA
11A97 ; Soyombo # Mc SOYOMBO SIGN VISARGA
11A98..11A99 ; Soyombo # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER
11A9A..11A9C ; Soyombo # Po [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD
11A9E..11AA2 ; Soyombo # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2
# Total code points: 80
# ================================================
11A00 ; Zanabazar_Square # Lo ZANABAZAR SQUARE LETTER A
11A01..11A06 ; Zanabazar_Square # Mn [6] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL SIGN O
11A07..11A08 ; Zanabazar_Square # Mc [2] ZANABAZAR SQUARE VOWEL SIGN AI..ZANABAZAR SQUARE VOWEL SIGN AU
11A09..11A0A ; Zanabazar_Square # Mn [2] ZANABAZAR SQUARE VOWEL SIGN REVERSED I..ZANABAZAR SQUARE VOWEL LENGTH MARK
11A0B..11A32 ; Zanabazar_Square # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA
11A33..11A38 ; Zanabazar_Square # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA
11A39 ; Zanabazar_Square # Mc ZANABAZAR SQUARE SIGN VISARGA
11A3A ; Zanabazar_Square # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA
11A3B..11A3E ; Zanabazar_Square # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA
11A3F..11A46 ; Zanabazar_Square # Po [8] ZANABAZAR SQUARE INITIAL HEAD MARK..ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK
11A47 ; Zanabazar_Square # Mn ZANABAZAR SQUARE SUBJOINER
# Total code points: 72
# EOF

View File

@ -1,10 +1,11 @@
# SpecialCasing-8.0.0.txt
# Date: 2014-12-16, 23:08:04 GMT [MD]
# SpecialCasing-10.0.0.txt
# Date: 2017-04-14, 05:40:43 GMT
# Copyright (c) 2017 Unicode, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# For documentation, see http://www.unicode.org/reports/tr44/
#
# Special Casing
#
@ -196,7 +197,7 @@ FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
# ================================================================================
# Conditional Mappings
# The remainder of this file provides conditional casing data used to produce
# The remainder of this file provides conditional casing data used to produce
# full case mappings.
# ================================================================================
# Language-Insensitive Mappings

File diff suppressed because it is too large Load Diff

View File

@ -1 +1 @@
8.0.0
10.0.0

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -34,16 +34,11 @@ package sun.text.normalizer;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.file.FileSystems;
import java.util.Arrays;
import java.security.AccessController;
import java.security.PrivilegedAction;
@ -51,7 +46,7 @@ import java.security.PrivilegedAction;
public final class ICUBinary {
private static final class IsAcceptable implements Authenticate {
// @Override when we switch to Java 6
@Override
public boolean isDataVersionAcceptable(byte version[]) {
return version[0] == 1;
}
@ -93,7 +88,7 @@ public final class ICUBinary {
BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */);
DataInputStream inputStream = new DataInputStream(b);
byte[] bb = new byte[120000];
byte[] bb = new byte[130000];
int n = inputStream.read(bb);
ByteBuffer bytes = ByteBuffer.wrap(bb, 0, n);
return bytes;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -194,7 +194,7 @@ final class Norm2AllModes {
}
@Override
public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundary(c, true); }
public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundaryBefore(c); }
}
public static final class ComposeNormalizer2 extends Normalizer2WithImpl {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -135,8 +135,10 @@ abstract class Normalizer2 {
if(spanLength==src.length()) {
return (String)src;
}
StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
if (spanLength != 0) {
StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
}
}
return normalize(src, new StringBuilder(src.length())).toString();
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -699,7 +699,8 @@ public final class NumericShaper implements java.io.Serializable {
0x09de, 0x09df,
0x09e2, 0x09e6,
0x09f2, 0x09f4,
0x09fb, 0x0a03,
0x09fb, 0x09fc,
0x09fe, 0x0a03,
0x0a04, 0x0a05,
0x0a0b, 0x0a0f,
0x0a11, 0x0a13,
@ -769,7 +770,7 @@ public final class NumericShaper implements java.io.Serializable {
0x0c5b, 0x0c60,
0x0c62, 0x0c66,
0x0c70, 0x0c7f,
0x0c80, 0x0c82,
0x0c81, 0x0c82,
0x0c84, 0x0c85,
0x0c8d, 0x0c8e,
0x0c91, 0x0c92,
@ -791,10 +792,7 @@ public final class NumericShaper implements java.io.Serializable {
0x0d41, 0x0d46,
0x0d49, 0x0d4a,
0x0d4d, 0x0d4e,
0x0d4f, 0x0d57,
0x0d58, 0x0d5f,
0x0d62, 0x0d66,
0x0d76, 0x0d79,
0x0d80, 0x0d82,
0x0d84, 0x0d85,
0x0d97, 0x0d9a,
@ -892,7 +890,8 @@ public final class NumericShaper implements java.io.Serializable {
0x17dd, 0x17e0,
0x17ea, 0x1810,
0x181a, 0x1820,
0x1878, 0x1880,
0x1878, 0x1884,
0x1885, 0x1887,
0x18a9, 0x18aa,
0x18ab, 0x18b0,
0x18f6, 0x1900,
@ -934,13 +933,12 @@ public final class NumericShaper implements java.io.Serializable {
0x1c2c, 0x1c34,
0x1c36, 0x1c3b,
0x1c4a, 0x1c4d,
0x1c80, 0x1cc0,
0x1cc8, 0x1cd3,
0x1cd4, 0x1ce1,
0x1ce2, 0x1ce9,
0x1ced, 0x1cee,
0x1cf4, 0x1cf5,
0x1cf7, 0x1d00,
0x1cf8, 0x1d00,
0x1dc0, 0x1e00,
0x1f16, 0x1f18,
0x1f1e, 0x1f20,
@ -1012,7 +1010,7 @@ public final class NumericShaper implements java.io.Serializable {
0x30a0, 0x30a1,
0x30fb, 0x30fc,
0x3100, 0x3105,
0x312e, 0x3131,
0x312f, 0x3131,
0x318f, 0x3190,
0x31bb, 0x31f0,
0x321d, 0x3220,
@ -1025,7 +1023,7 @@ public final class NumericShaper implements java.io.Serializable {
0x33de, 0x33e0,
0x33ff, 0x3400,
0x4db6, 0x4e00,
0x9fd6, 0xa000,
0x9feb, 0xa000,
0xa48d, 0xa4d0,
0xa60d, 0xa610,
0xa62c, 0xa640,
@ -1034,7 +1032,7 @@ public final class NumericShaper implements java.io.Serializable {
0xa6f0, 0xa6f2,
0xa6f8, 0xa722,
0xa788, 0xa789,
0xa7ae, 0xa7b0,
0xa7af, 0xa7b0,
0xa7b8, 0xa7f7,
0xa802, 0xa803,
0xa806, 0xa807,
@ -1114,18 +1112,21 @@ public final class NumericShaper implements java.io.Serializable {
0x10101, 0x10102,
0x10103, 0x10107,
0x10134, 0x10137,
0x10140, 0x101d0,
0x10140, 0x1018d,
0x1018f, 0x101d0,
0x101fd, 0x10280,
0x1029d, 0x102a0,
0x102d1, 0x10300,
0x10324, 0x10330,
0x10324, 0x1032d,
0x1034b, 0x10350,
0x10376, 0x10380,
0x1039e, 0x1039f,
0x103c4, 0x103c8,
0x103d6, 0x10400,
0x1049e, 0x104a0,
0x104aa, 0x10500,
0x104aa, 0x104d3,
0x104d4, 0x104d8,
0x104fc, 0x10500,
0x10528, 0x10530,
0x10564, 0x1056f,
0x10570, 0x10600,
@ -1186,7 +1187,13 @@ public final class NumericShaper implements java.io.Serializable {
0x1134e, 0x11350,
0x11351, 0x11357,
0x11358, 0x1135d,
0x11364, 0x11480,
0x11364, 0x11400,
0x11438, 0x11440,
0x11442, 0x11445,
0x11446, 0x11447,
0x1145a, 0x1145b,
0x1145c, 0x1145d,
0x1145e, 0x11480,
0x114b3, 0x114b9,
0x114ba, 0x114bb,
0x114bf, 0x114c1,
@ -1212,8 +1219,33 @@ public final class NumericShaper implements java.io.Serializable {
0x11727, 0x11730,
0x11740, 0x118a0,
0x118f3, 0x118ff,
0x11900, 0x11ac0,
0x11af9, 0x12000,
0x11900, 0x11a00,
0x11a01, 0x11a07,
0x11a09, 0x11a0b,
0x11a33, 0x11a3a,
0x11a3b, 0x11a3f,
0x11a47, 0x11a50,
0x11a51, 0x11a57,
0x11a59, 0x11a5c,
0x11a84, 0x11a86,
0x11a8a, 0x11a97,
0x11a98, 0x11a9a,
0x11a9d, 0x11a9e,
0x11aa3, 0x11ac0,
0x11af9, 0x11C00,
0x11C09, 0x11c0a,
0x11c30, 0x11c3e,
0x11c46, 0x11c50,
0x11c6d, 0x11c70,
0x11c90, 0x11ca9,
0x11caa, 0x11cb1,
0x11cb2, 0x11cb4,
0x11cb5, 0x11d00,
0x11d07, 0x11d08,
0x11d0a, 0x11d0b,
0x11d31, 0x11d46,
0x11d47, 0x11d50,
0x11d5a, 0x12000,
0x1239a, 0x12400,
0x1246f, 0x12470,
0x12475, 0x12480,
@ -1234,8 +1266,12 @@ public final class NumericShaper implements java.io.Serializable {
0x16b90, 0x16f00,
0x16f45, 0x16f50,
0x16f7f, 0x16f93,
0x16fa0, 0x1b000,
0x1b002, 0x1bc00,
0x16fa0, 0x16fe0,
0x16fe2, 0x17000,
0x187ed, 0x18800,
0x18af3, 0x1b000,
0x1b11f, 0x1b170,
0x1b2fc, 0x1bc00,
0x1bc6b, 0x1bc70,
0x1bc7d, 0x1bc80,
0x1bc89, 0x1bc90,
@ -1281,19 +1317,21 @@ public final class NumericShaper implements java.io.Serializable {
0x1da84, 0x1da85,
0x1da8c, 0x1e800,
0x1e8d0, 0x1e8d7,
0x1e944, 0x1e94b,
0x1eef0, 0x1eef2,
0x1f000, 0x1f110,
0x1f12f, 0x1f130,
0x1f16a, 0x1f170,
0x1f19b, 0x1f1e6,
0x1f1ad, 0x1f1e6,
0x1f203, 0x1f210,
0x1f23b, 0x1f240,
0x1f23c, 0x1f240,
0x1f249, 0x1f250,
0x1f252, 0x20000,
0x2a6d7, 0x2a700,
0x2b735, 0x2b740,
0x2b81e, 0x2b820,
0x2cea2, 0x2f800,
0x2cea2, 0x2ceb0,
0x2ebe1, 0x2f800,
0x2fa1e, 0xf0000,
0xffffe, 0x100000,
0x10fffe, 0x10ffff // sentinel

View File

@ -0,0 +1,316 @@
# Blocks-10.0.0.txt
# Date: 2017-04-12, 17:30:00 GMT [KW]
# Copyright (c) 2017 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
#
# Format:
# Start Code..End Code; Block Name
# ================================================
# Note: When comparing block names, casing, whitespace, hyphens,
# and underbars are ignored.
# For example, "Latin Extended-A" and "latin extended a" are equivalent.
# For more information on the comparison of property values,
# see UAX #44: http://www.unicode.org/reports/tr44/
#
# All block ranges start with a value where (cp MOD 16) = 0,
# and end with a value where (cp MOD 16) = 15. In other words,
# the last hexadecimal digit of the start of range is ...0
# and the last hexadecimal digit of the end of range is ...F.
# This constraint on block ranges guarantees that allocations
# are done in terms of whole columns, and that code chart display
# never involves splitting columns in the charts.
#
# All code points not explicitly listed for Block
# have the value No_Block.
# Property: Block
#
# @missing: 0000..10FFFF; No_Block
0000..007F; Basic Latin
0080..00FF; Latin-1 Supplement
0100..017F; Latin Extended-A
0180..024F; Latin Extended-B
0250..02AF; IPA Extensions
02B0..02FF; Spacing Modifier Letters
0300..036F; Combining Diacritical Marks
0370..03FF; Greek and Coptic
0400..04FF; Cyrillic
0500..052F; Cyrillic Supplement
0530..058F; Armenian
0590..05FF; Hebrew
0600..06FF; Arabic
0700..074F; Syriac
0750..077F; Arabic Supplement
0780..07BF; Thaana
07C0..07FF; NKo
0800..083F; Samaritan
0840..085F; Mandaic
0860..086F; Syriac Supplement
08A0..08FF; Arabic Extended-A
0900..097F; Devanagari
0980..09FF; Bengali
0A00..0A7F; Gurmukhi
0A80..0AFF; Gujarati
0B00..0B7F; Oriya
0B80..0BFF; Tamil
0C00..0C7F; Telugu
0C80..0CFF; Kannada
0D00..0D7F; Malayalam
0D80..0DFF; Sinhala
0E00..0E7F; Thai
0E80..0EFF; Lao
0F00..0FFF; Tibetan
1000..109F; Myanmar
10A0..10FF; Georgian
1100..11FF; Hangul Jamo
1200..137F; Ethiopic
1380..139F; Ethiopic Supplement
13A0..13FF; Cherokee
1400..167F; Unified Canadian Aboriginal Syllabics
1680..169F; Ogham
16A0..16FF; Runic
1700..171F; Tagalog
1720..173F; Hanunoo
1740..175F; Buhid
1760..177F; Tagbanwa
1780..17FF; Khmer
1800..18AF; Mongolian
18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
1900..194F; Limbu
1950..197F; Tai Le
1980..19DF; New Tai Lue
19E0..19FF; Khmer Symbols
1A00..1A1F; Buginese
1A20..1AAF; Tai Tham
1AB0..1AFF; Combining Diacritical Marks Extended
1B00..1B7F; Balinese
1B80..1BBF; Sundanese
1BC0..1BFF; Batak
1C00..1C4F; Lepcha
1C50..1C7F; Ol Chiki
1C80..1C8F; Cyrillic Extended-C
1CC0..1CCF; Sundanese Supplement
1CD0..1CFF; Vedic Extensions
1D00..1D7F; Phonetic Extensions
1D80..1DBF; Phonetic Extensions Supplement
1DC0..1DFF; Combining Diacritical Marks Supplement
1E00..1EFF; Latin Extended Additional
1F00..1FFF; Greek Extended
2000..206F; General Punctuation
2070..209F; Superscripts and Subscripts
20A0..20CF; Currency Symbols
20D0..20FF; Combining Diacritical Marks for Symbols
2100..214F; Letterlike Symbols
2150..218F; Number Forms
2190..21FF; Arrows
2200..22FF; Mathematical Operators
2300..23FF; Miscellaneous Technical
2400..243F; Control Pictures
2440..245F; Optical Character Recognition
2460..24FF; Enclosed Alphanumerics
2500..257F; Box Drawing
2580..259F; Block Elements
25A0..25FF; Geometric Shapes
2600..26FF; Miscellaneous Symbols
2700..27BF; Dingbats
27C0..27EF; Miscellaneous Mathematical Symbols-A
27F0..27FF; Supplemental Arrows-A
2800..28FF; Braille Patterns
2900..297F; Supplemental Arrows-B
2980..29FF; Miscellaneous Mathematical Symbols-B
2A00..2AFF; Supplemental Mathematical Operators
2B00..2BFF; Miscellaneous Symbols and Arrows
2C00..2C5F; Glagolitic
2C60..2C7F; Latin Extended-C
2C80..2CFF; Coptic
2D00..2D2F; Georgian Supplement
2D30..2D7F; Tifinagh
2D80..2DDF; Ethiopic Extended
2DE0..2DFF; Cyrillic Extended-A
2E00..2E7F; Supplemental Punctuation
2E80..2EFF; CJK Radicals Supplement
2F00..2FDF; Kangxi Radicals
2FF0..2FFF; Ideographic Description Characters
3000..303F; CJK Symbols and Punctuation
3040..309F; Hiragana
30A0..30FF; Katakana
3100..312F; Bopomofo
3130..318F; Hangul Compatibility Jamo
3190..319F; Kanbun
31A0..31BF; Bopomofo Extended
31C0..31EF; CJK Strokes
31F0..31FF; Katakana Phonetic Extensions
3200..32FF; Enclosed CJK Letters and Months
3300..33FF; CJK Compatibility
3400..4DBF; CJK Unified Ideographs Extension A
4DC0..4DFF; Yijing Hexagram Symbols
4E00..9FFF; CJK Unified Ideographs
A000..A48F; Yi Syllables
A490..A4CF; Yi Radicals
A4D0..A4FF; Lisu
A500..A63F; Vai
A640..A69F; Cyrillic Extended-B
A6A0..A6FF; Bamum
A700..A71F; Modifier Tone Letters
A720..A7FF; Latin Extended-D
A800..A82F; Syloti Nagri
A830..A83F; Common Indic Number Forms
A840..A87F; Phags-pa
A880..A8DF; Saurashtra
A8E0..A8FF; Devanagari Extended
A900..A92F; Kayah Li
A930..A95F; Rejang
A960..A97F; Hangul Jamo Extended-A
A980..A9DF; Javanese
A9E0..A9FF; Myanmar Extended-B
AA00..AA5F; Cham
AA60..AA7F; Myanmar Extended-A
AA80..AADF; Tai Viet
AAE0..AAFF; Meetei Mayek Extensions
AB00..AB2F; Ethiopic Extended-A
AB30..AB6F; Latin Extended-E
AB70..ABBF; Cherokee Supplement
ABC0..ABFF; Meetei Mayek
AC00..D7AF; Hangul Syllables
D7B0..D7FF; Hangul Jamo Extended-B
D800..DB7F; High Surrogates
DB80..DBFF; High Private Use Surrogates
DC00..DFFF; Low Surrogates
E000..F8FF; Private Use Area
F900..FAFF; CJK Compatibility Ideographs
FB00..FB4F; Alphabetic Presentation Forms
FB50..FDFF; Arabic Presentation Forms-A
FE00..FE0F; Variation Selectors
FE10..FE1F; Vertical Forms
FE20..FE2F; Combining Half Marks
FE30..FE4F; CJK Compatibility Forms
FE50..FE6F; Small Form Variants
FE70..FEFF; Arabic Presentation Forms-B
FF00..FFEF; Halfwidth and Fullwidth Forms
FFF0..FFFF; Specials
10000..1007F; Linear B Syllabary
10080..100FF; Linear B Ideograms
10100..1013F; Aegean Numbers
10140..1018F; Ancient Greek Numbers
10190..101CF; Ancient Symbols
101D0..101FF; Phaistos Disc
10280..1029F; Lycian
102A0..102DF; Carian
102E0..102FF; Coptic Epact Numbers
10300..1032F; Old Italic
10330..1034F; Gothic
10350..1037F; Old Permic
10380..1039F; Ugaritic
103A0..103DF; Old Persian
10400..1044F; Deseret
10450..1047F; Shavian
10480..104AF; Osmanya
104B0..104FF; Osage
10500..1052F; Elbasan
10530..1056F; Caucasian Albanian
10600..1077F; Linear A
10800..1083F; Cypriot Syllabary
10840..1085F; Imperial Aramaic
10860..1087F; Palmyrene
10880..108AF; Nabataean
108E0..108FF; Hatran
10900..1091F; Phoenician
10920..1093F; Lydian
10980..1099F; Meroitic Hieroglyphs
109A0..109FF; Meroitic Cursive
10A00..10A5F; Kharoshthi
10A60..10A7F; Old South Arabian
10A80..10A9F; Old North Arabian
10AC0..10AFF; Manichaean
10B00..10B3F; Avestan
10B40..10B5F; Inscriptional Parthian
10B60..10B7F; Inscriptional Pahlavi
10B80..10BAF; Psalter Pahlavi
10C00..10C4F; Old Turkic
10C80..10CFF; Old Hungarian
10E60..10E7F; Rumi Numeral Symbols
11000..1107F; Brahmi
11080..110CF; Kaithi
110D0..110FF; Sora Sompeng
11100..1114F; Chakma
11150..1117F; Mahajani
11180..111DF; Sharada
111E0..111FF; Sinhala Archaic Numbers
11200..1124F; Khojki
11280..112AF; Multani
112B0..112FF; Khudawadi
11300..1137F; Grantha
11400..1147F; Newa
11480..114DF; Tirhuta
11580..115FF; Siddham
11600..1165F; Modi
11660..1167F; Mongolian Supplement
11680..116CF; Takri
11700..1173F; Ahom
118A0..118FF; Warang Citi
11A00..11A4F; Zanabazar Square
11A50..11AAF; Soyombo
11AC0..11AFF; Pau Cin Hau
11C00..11C6F; Bhaiksuki
11C70..11CBF; Marchen
11D00..11D5F; Masaram Gondi
12000..123FF; Cuneiform
12400..1247F; Cuneiform Numbers and Punctuation
12480..1254F; Early Dynastic Cuneiform
13000..1342F; Egyptian Hieroglyphs
14400..1467F; Anatolian Hieroglyphs
16800..16A3F; Bamum Supplement
16A40..16A6F; Mro
16AD0..16AFF; Bassa Vah
16B00..16B8F; Pahawh Hmong
16F00..16F9F; Miao
16FE0..16FFF; Ideographic Symbols and Punctuation
17000..187FF; Tangut
18800..18AFF; Tangut Components
1B000..1B0FF; Kana Supplement
1B100..1B12F; Kana Extended-A
1B170..1B2FF; Nushu
1BC00..1BC9F; Duployan
1BCA0..1BCAF; Shorthand Format Controls
1D000..1D0FF; Byzantine Musical Symbols
1D100..1D1FF; Musical Symbols
1D200..1D24F; Ancient Greek Musical Notation
1D300..1D35F; Tai Xuan Jing Symbols
1D360..1D37F; Counting Rod Numerals
1D400..1D7FF; Mathematical Alphanumeric Symbols
1D800..1DAAF; Sutton SignWriting
1E000..1E02F; Glagolitic Supplement
1E800..1E8DF; Mende Kikakui
1E900..1E95F; Adlam
1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
1F000..1F02F; Mahjong Tiles
1F030..1F09F; Domino Tiles
1F0A0..1F0FF; Playing Cards
1F100..1F1FF; Enclosed Alphanumeric Supplement
1F200..1F2FF; Enclosed Ideographic Supplement
1F300..1F5FF; Miscellaneous Symbols and Pictographs
1F600..1F64F; Emoticons
1F650..1F67F; Ornamental Dingbats
1F680..1F6FF; Transport and Map Symbols
1F700..1F77F; Alchemical Symbols
1F780..1F7FF; Geometric Shapes Extended
1F800..1F8FF; Supplemental Arrows-C
1F900..1F9FF; Supplemental Symbols and Pictographs
20000..2A6DF; CJK Unified Ideographs Extension B
2A700..2B73F; CJK Unified Ideographs Extension C
2B740..2B81F; CJK Unified Ideographs Extension D
2B820..2CEAF; CJK Unified Ideographs Extension E
2CEB0..2EBEF; CJK Unified Ideographs Extension F
2F800..2FA1F; CJK Compatibility Ideographs Supplement
E0000..E007F; Tags
E0100..E01EF; Variation Selectors Supplement
F0000..FFFFF; Supplementary Private Use Area-A
100000..10FFFF; Supplementary Private Use Area-B
# EOF

View File

@ -0,0 +1,560 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
*
* @author Alan Liu
* @author John O'Conner
*/
import java.io.*;
/**
* This class either loads or dumps the character properties of all Unicode
* characters out to a file. When loading, it compares the loaded data with
* that obtained through the java.lang.Character API. This allows detection of
* changes to the character properties between versions of the Java VM. A
* typical usage would be to dump the properties under an early VM, and load
* them under a later VM.
*
* Also: Check the current VM's character properties against those in a
* Unicode database. The database should be of the format
* available on ftp.unicode.org/Public/UNIDATA.
*
*/
public class CharCheck {
// Running count of mismatches reported so far; incremented by showDifference
// and inspected by main to decide whether the run failed.
static int differences = 0;
/**
 * Command-line entry point; dispatches on the command named by {@code args[0]}
 * ({@code dump}, {@code load}, {@code check}, {@code char} or {@code fchar}).
 * Anything else, or a wrong argument count, prints the usage text.
 *
 * @param args the command followed by its two (for {@code char}: one) operands
 * @throws RuntimeException if any difference was recorded by the command
 * @throws Exception on I/O or parse failure
 */
public static void main(String args[]) throws Exception {
    if (args.length != 2 && args.length != 3) usage();
    final String command = args[0];
    // Every command except "char" reads args[2]; reject the short form up
    // front instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 3 && !command.equals("char")) usage();

    if (command.equals("dump")) {
        final int plane = Integer.parseInt(args[1], 16);
        // Closing the stream is what guarantees the serialized data reaches the file.
        try (FileOutputStream file = new FileOutputStream(args[2]);
             ObjectOutputStream out = new ObjectOutputStream(file)) {
            dump(plane, out);
        }
    } else if (command.equals("load")) {
        final int plane = Integer.parseInt(args[1], 16);
        try (FileInputStream file = new FileInputStream(args[2]);
             ObjectInputStream in = new ObjectInputStream(file)) {
            load(plane, in);
        }
    } else if (command.equals("check")) {
        check(Integer.parseInt(args[1], 16), new File(args[2]));
    } else if (command.equals("char")) {
        showChar(Integer.parseInt(args[1], 16));
    } else if (command.equals("fchar")) {
        showFileChar(args[1], Integer.parseInt(args[2], 16));
    } else {
        usage();
    }

    if (differences != 0) {
        throw new RuntimeException("There are differences between Character properties and the specification.");
    }
}
/**
 * Writes the command summary to standard error and terminates the VM
 * with exit status 0.
 */
static void usage() {
    final String[] helpLines = {
        "Usage: java CharCheck <command>",
        "where <command> is one of the following:",
        "dump <plane> <file> - dumps the character properties of the given plane,",
        " read from the current VM, to the given file.",
        "load <plane> <file> - loads the character properties from the given",
        " file and compares them to those of the given character plane",
        " in the current VM.",
        "check <plane> <file> - compare the current VM's character properties",
        " in the given plane to those listed in the given file, ",
        " which should be in the format available on ",
        " ftp.unicode.org/Public/2.0-Update.",
        "char <code> - show current VM properties of the given Unicode char.",
        "fchar <file> <code> - show file properties of the given Unicode char."
    };
    for (String line : helpLines) {
        System.err.println(line);
    }
    System.exit(0);
}
/**
 * Renders a general-category code as its long name from
 * {@code UnicodeSpec.generalCategoryList} followed by the code in parentheses.
 *
 * @param type a general-category code
 * @return the formatted name, or an "Illegal type value" marker when
 *         {@code type} is not a valid index into the category list
 */
static String getTypeName(int type) {
    if (type < 0 || type >= UnicodeSpec.generalCategoryList.length) {
        return "<Illegal type value " + type + ">";
    }
    return UnicodeSpec.generalCategoryList[type][UnicodeSpec.LONG] + '(' + type + ')';
}
/**
 * Compares the current VM's character properties for one plane against
 * the entries read from a Unicode data file.
 *
 * Entries are visited in file order. A "First>" entry opens a range that
 * the next entry must close with "Last>"; every code point of the range is
 * then checked against the closing entry. Code points after the last
 * checked one, up to the end of the plane, are expected to be undefined.
 *
 * @param plane    plane number (the code point's bits above the low 16)
 * @param specFile the Unicode data file to read
 * @return the value of {@code differences} after the run
 * @throws Exception if the spec file cannot be read
 */
static int check(int plane, File specFile) throws Exception {
    System.out.println("Current VM version " + System.getProperty("java.version"));
    final int planeEnd = (plane << 16) | 0xFFFF;
    final UnicodeSpec[] entries = UnicodeSpec.readSpecFile(specFile, plane);
    int firstOfRange = 0x0000;
    boolean awaitingLast = false;
    lastCheck = (plane << 16) - 1;

    for (UnicodeSpec entry : entries) {
        final int cp = entry.getCodePoint();
        if (!awaitingLast) {
            // A First/Last pair looks like:
            //   4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
            //   9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
            if (entry.getName().endsWith("First>")) {
                firstOfRange = cp;
                awaitingLast = true;
            } else {
                checkOneChar(cp, entry);
            }
            continue;
        }
        // The entry right after a First must be the matching Last.
        if (entry.getName().endsWith("Last>")) {
            for (int member = firstOfRange; member <= cp; member++) {
                checkOneChar(member, entry);
            }
        } else {
            System.out.println("BAD FILE: First without last at '" + escape(firstOfRange) + "'");
        }
        awaitingLast = false;
    }

    // Everything left in the plane should be undefined.
    while (lastCheck < planeEnd) {
        checkOneCharDefined(++lastCheck, "?", false);
    }
    System.out.println("Total differences: "+differences);
    return differences;
}
// Most recent code point handled by check/checkOneChar; used to find the
// gap of code points that the spec file does not list.
static int lastCheck = -1;
/**
 * Reports a difference when {@code Character.isDefined(c)} disagrees with
 * what the data file says.
 *
 * @param c           code point to test
 * @param name        character name used in the report
 * @param fileDefined whether the data file defines {@code c}
 */
static final void checkOneCharDefined(int c, String name, boolean fileDefined) {
    final boolean vmDefined = Character.isDefined(c);
    if (vmDefined == fileDefined) {
        return;
    }
    showDifference(c, name, "isDefined", String.valueOf(vmDefined), String.valueOf(fileDefined));
}
// In GenerateCharacter, the following ranges are handled specially.
// Each entry is the first character of a 26-character run (A-Z, a-z and
// their fullwidth forms at U+FF21 / U+FF41) for which checkOneChar expects
// numeric values 10..35 even though the data file lists no numeric value.
static final char NUMERIC_EXCEPTION[] = { '\u0041', '\u0061', '\uFF21', '\uFF41' };
/**
 * Compares what java.lang.Character reports for code point c with the
 * data-file entry charSpec, calling showDifference once per mismatching
 * property (defined-ness, case mappings, general category, mirroring,
 * directionality, decimal digit, numeric value).
 *
 * Before checking c itself, any code points skipped since the previous call
 * (tracked in lastCheck) are verified to be undefined; lastCheck is then
 * advanced to c. Callers are expected to pass increasing code points.
 *
 * @param c        the code point to check
 * @param charSpec the data-file entry describing c (for a First/Last range,
 *                 the entry shared by the whole range)
 */
static void checkOneChar(int c, UnicodeSpec charSpec) {
// Handle intervening ranges -- we assume that we will be called in monotonically
// increasing order. If the last char we checked is more than one before this
// char, then check the intervening range -- it should all be undefined.
// Masking off the low 16 bits gives the first code point of c's plane.
int lowerLimit = (c & 0xFF0000);
if (lastCheck >= lowerLimit && (lastCheck+1) != c) {
for (int i=lastCheck+1; i<c; ++i)
checkOneCharDefined(i, "?", false);
}
lastCheck = c;
// isDefined should be true
checkOneCharDefined(c, charSpec.getName(), true);
// Check lower, upper, and titlecase conversion
int upper = Character.toUpperCase(c);
int lower = Character.toLowerCase(c);
int title = Character.toTitleCase(c);
// A missing mapping in the data file means the character maps to itself.
int upperDB = charSpec.hasUpperMap() ? charSpec.getUpperMap() : c;
int lowerDB = charSpec.hasLowerMap() ? charSpec.getLowerMap() : c;
int titleDB = charSpec.hasTitleMap() ? charSpec.getTitleMap() : c;
if (upper != upperDB) showDifference(c, charSpec.getName(), "upper", hex6(upper), hex6(upperDB));
if (lower != lowerDB) showDifference(c, charSpec.getName(), "lower", hex6(lower), hex6(lowerDB));
if (title != titleDB) showDifference(c, charSpec.getName(), "title", hex6(title), hex6(titleDB));
// Check the character general category (type)
int type = Character.getType(c);
int typeDB = charSpec.getGeneralCategory();
if (type != typeDB) {
showDifference(c, charSpec.getName(), "type",
UnicodeSpec.generalCategoryList[type][UnicodeSpec.SHORT],
UnicodeSpec.generalCategoryList[typeDB][UnicodeSpec.SHORT]);
}
// Check the mirrored property
boolean isMirrored = Character.isMirrored(c);
boolean isMirroredDB = charSpec.isMirrored();
if (isMirrored != isMirroredDB) {
showDifference(c, charSpec.getName(), "isMirrored", ""+isMirrored, ""+isMirroredDB);
}
// Check the directionality property
byte directionality = Character.getDirectionality(c);
byte directionalityDB = charSpec.getBidiCategory();
if (directionality != directionalityDB) {
showDifference(c, charSpec.getName(), "directionality", ""+directionality, ""+directionalityDB);
}
// Check the decimal digit property
// Only characters whose category is DECIMAL_DIGIT_NUMBER are expected to
// yield a digit; everything else is expected to give -1.
int decimalDigit = Character.digit(c, 10);
int decimalDigitDB = -1;
if (charSpec.getGeneralCategory() == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
decimalDigitDB = charSpec.getDecimalValue();
}
if (decimalDigit != decimalDigitDB)
showDifference(c, charSpec.getName(), "decimal digit", ""+decimalDigit, ""+decimalDigitDB);
// Check the numeric property
int numericValue = Character.getNumericValue(c);
int numericValueDB;
if (charSpec.getNumericValue().length() == 0) {
// No numeric value in the data file: expect -1 ...
numericValueDB = -1;
// Handle exceptions where Character deviates from the UCS spec
for (int k=0; k<NUMERIC_EXCEPTION.length; ++k) {
if (c >= NUMERIC_EXCEPTION[k] && c < (char)(NUMERIC_EXCEPTION[k]+26)) {
numericValueDB = c - NUMERIC_EXCEPTION[k] + 10;
break;
}
}
}
else {
String strValue = charSpec.getNumericValue();
int parsedNumericValue;
// These two literals are special-cased rather than parsed (they do not
// fit in an int) and are treated like any other unrepresentable value.
if (strValue.equals("10000000000")
|| strValue.equals("1000000000000")) {
System.out.println("Skipping strValue: " + strValue
+ " for " + charSpec.getName()
+ "(0x" + Integer.toHexString(c) + ")");
parsedNumericValue = -2;
} else {
// Fractions (values containing '/') are not parsed either.
parsedNumericValue = strValue.indexOf('/') < 0 ?
Integer.parseInt(strValue) : -2;
}
// Any negative result collapses to -2, the expected VM answer for a
// value that cannot be expressed as a nonnegative int.
numericValueDB = parsedNumericValue < 0 ? -2 : parsedNumericValue;
}
if (numericValue != numericValueDB)
showDifference(c, charSpec.getName(), "numeric value", ""+numericValue, ""+numericValueDB);
}
/**
 * Prints one mismatch line to standard output and bumps {@code differences}.
 *
 * @param c        the code point concerned
 * @param name     the character's name
 * @param property the property that differs
 * @param vmValue  the value reported by the running VM
 * @param dbValue  the value taken from the data file
 */
static void showDifference(int c, String name, String property, String vmValue, String dbValue) {
    final StringBuilder message = new StringBuilder("Mismatch at '");
    message.append(hex6(c)).append("' (").append(name).append("): ");
    message.append(property).append("=").append(vmValue);
    message.append(", db=").append(dbValue);
    System.out.println(escape(message.toString()));
    ++differences;
}
/**
 * Given a record containing ';'-separated fields, return the fieldno-th
 * field. The first field is field 0. The last field of a record need not
 * be followed by a ';'.
 *
 * @param record  the ';'-separated record
 * @param fieldno zero-based index of the wanted field; values below zero
 *                are treated as 0
 * @return the text of the requested field (possibly empty)
 * @throws IllegalArgumentException if the record has fewer than
 *         {@code fieldno + 1} fields
 */
static String getField(String record, int fieldno) {
    int start = 0;
    int end = record.indexOf(';');
    // Advance one separator per remaining field; the loop counter is what
    // guarantees termination.
    for (int skipped = 0; skipped < fieldno; skipped++) {
        if (end < 0) {
            throw new IllegalArgumentException(
                "Record has no field " + fieldno + ": " + record);
        }
        start = end + 1;
        end = record.indexOf(';', start);
    }
    // No further ';' means the field runs to the end of the record.
    return (end < 0) ? record.substring(start) : record.substring(start, end);
}
// Number of ';'-separated fields that getFields extracts from each record.
static final int FIELD_COUNT = 15;
/**
 * Splits a ';'-separated record and stores the pieces into the first
 * FIELD_COUNT slots of {@code fields}. The record is assumed to hold
 * FIELD_COUNT fields; the last one need not end with a ';'.
 *
 * @param record the ';'-separated record
 * @param fields destination array, at least FIELD_COUNT long
 */
static void getFields(String record, String[] fields) {
    int from = 0;
    int semi = record.indexOf(';');
    fields[0] = record.substring(from, semi);
    int slot = 1;
    while (slot < FIELD_COUNT) {
        from = semi + 1;
        semi = record.indexOf(';', from);
        if (semi < 0) {
            fields[slot] = record.substring(from);
        } else {
            fields[slot] = record.substring(from, semi);
        }
        ++slot;
    }
}
/**
 * Splits a ';'-separated record into a freshly allocated array of
 * FIELD_COUNT strings. The record is assumed to hold FIELD_COUNT fields.
 *
 * @param record the ';'-separated record
 * @return a new array holding the record's fields
 */
static String[] getFields(String record) {
    final String[] parts = new String[FIELD_COUNT];
    getFields(record, parts);
    return parts;
}
/**
 * Serializes the current VM's character properties for one plane.
 *
 * Two objects are written: the VM's {@code java.version} string, then a
 * {@code long[0x20000]} holding two packed longs (see
 * getPackedCharacterData) per code point of the plane, in code point order.
 * The stream is flushed before returning but left open; closing it stays
 * the caller's job.
 *
 * @param plane plane number; code points {@code plane<<16} through
 *              {@code (plane<<16) + 0xFFFF} are dumped
 * @param out   destination stream
 * @throws Exception if writing fails
 */
static void dump(int plane, ObjectOutputStream out) throws Exception {
    String version = System.getProperty("java.version");
    System.out.println("Writing file version " + version);
    out.writeObject(version);
    long[] data = new long[0x20000];
    long[] onechar = new long[2];
    int j=0;
    int begin = plane<<16;
    int end = begin + 0xFFFF;
    for (int i = begin; i <= end; ++i) {
        getPackedCharacterData(i, onechar);
        data[j++] = onechar[0];
        data[j++] = onechar[1];
    }
    out.writeObject(data);
    // ObjectOutputStream buffers internally; without a flush the tail of the
    // array can be lost if the caller never closes the stream.
    out.flush();
}
/**
 * Reads a dump from the stream: first the version string it was written
 * with, then the packed property array. Both the file's version and the
 * running VM's version are printed. An array whose length is not 0x20000
 * is reported on standard output but still returned.
 *
 * @param in stream positioned at the start of a dump
 * @return the packed property array read from the stream
 * @throws Exception if reading or deserialization fails
 */
static long[] loadData(ObjectInputStream in) throws Exception {
    final String vmVersion = System.getProperty("java.version");
    final String fileVersion = (String) in.readObject();
    System.out.println("Reading file version " + fileVersion);
    System.out.println("Current version " + vmVersion);

    final long[] packed = (long[]) in.readObject();
    final boolean expectedSize = (packed.length == 0x20000);
    if (!expectedSize) {
        System.out.println("BAD ARRAY LENGTH: " + packed.length);
    }
    return packed;
}
/**
 * Reads a dump from the stream and compares it with the current VM's
 * properties for the given plane.
 *
 * @param plane plane number to compare
 * @param in    stream positioned at the start of a dump
 * @return the value of {@code differences} after the comparison
 * @throws Exception if the dump cannot be read
 */
static int load(int plane, ObjectInputStream in) throws Exception {
    CharCheck.checkData(CharCheck.loadData(in), plane);
    return differences;
}
/**
 * Compares a dumped property array with the current VM, code point by
 * code point, reporting every entry whose two packed longs differ.
 *
 * @param data  packed properties, two longs per code point of the plane
 * @param plane plane number the data belongs to
 * @return the value of {@code differences} after the comparison
 */
static int checkData(long[] data, int plane) {
    final long[] live = new long[2];
    for (int low = 0; low < 0x10000; low++) {
        final int cp = (plane << 16) | low;
        getPackedCharacterData(cp, live);
        final long stored0 = data[2 * low];
        final long stored1 = data[2 * low + 1];
        if (stored0 == live[0] && stored1 == live[1]) {
            continue;
        }
        showDifference(cp, live, new long[] { stored0, stored1 });
    }
    System.out.println("Total differences: " + differences);
    return differences;
}
/**
 * Formats a number as upper-case hexadecimal, left-padded with zeros to
 * at least six digits. Longer values are returned unpadded and untruncated.
 *
 * @param n the value to format
 * @return the hexadecimal text
 */
static String hex6(long n) {
    final String digits = Long.toHexString(n).toUpperCase();
    final StringBuilder padded = new StringBuilder();
    for (int width = digits.length(); width < 6; width++) {
        padded.append('0');
    }
    return padded.append(digits).toString();
}
/**
 * Prints the current VM's properties for one code point: first the two
 * packed longs in hexadecimal, then one line per unpacked property.
 *
 * @param c the code point to describe
 */
static void showChar(int c) {
    final long[] packed = new long[2];
    getPackedCharacterData(c, packed);
    final String header = "Current VM properties for '" + hex6(c) + "': " +
            hex6(packed[1]) + ' ' + hex6(packed[0]);
    System.out.println(header);
    for (String line : unpackCharacterData(packed)) {
        System.out.println(" " + escape(line));
    }
}
/**
 * Prints the properties recorded for one code point in a dump file.
 *
 * The file is read in full (version string, then the packed array) and
 * closed before anything is looked up. Only the low 16 bits of {@code c}
 * select the entry, so the file is taken to hold the plane {@code c}
 * belongs to.
 *
 * @param fileName path of a file written by the dump command
 * @param c        code point whose recorded properties are shown
 * @throws Exception if the file cannot be read or deserialized
 */
static void showFileChar(String fileName, int c) throws Exception {
    final long[] data;
    // try-with-resources so the file handle is released even when
    // deserialization fails.
    try (FileInputStream file = new FileInputStream(fileName);
         ObjectInputStream in = new ObjectInputStream(file)) {
        String inVersion = (String)in.readObject();
        System.out.println("Reading file version " + inVersion);
        data = (long[])in.readObject();
    }
    if (data.length != 0x20000) {
        System.out.println("BAD ARRAY LENGTH: " + data.length);
    }
    int offset = c & 0xFFFF;
    long[] chardata = { data[2*offset], data[2*offset+1] };
    String[] datap = unpackCharacterData(chardata);
    System.out.println(escape("File properties for '" + hex6(c)+ "':"));
    for (int i=0; i<datap.length; ++i)
        System.out.println(" " + escape(datap[i]));
}
/**
 * Packs everything java.lang.Character reports about a code point into
 * two longs.
 *
 * data[0], from the low bit upward:
 *   bits  0-4   getType()
 *   bits  5-10  digit(c, MAX_RADIX) + 1
 *   bits 11-16  getNumericValue() + 2 (the value is not masked, so a
 *               numeric value too wide for six bits overlaps the flags)
 *   bits 17-31  one flag per isXxx() test, in this order: isDefined,
 *               isDigit, isIdentifierIgnorable, isISOControl,
 *               isJavaIdentifierPart, isJavaIdentifierStart, isLetter,
 *               isLetterOrDigit, isLowerCase, isSpaceChar, isTitleCase,
 *               isUnicodeIdentifierPart, isUnicodeIdentifierStart,
 *               isUpperCase, isWhitespace
 *   bits 32+    toUpperCase()
 *
 * data[1]:
 *   bits  0-20  toLowerCase()
 *   bits 21+    toTitleCase()
 *
 * @param c    the code point to describe
 * @param data receives the two packed longs in data[0] and data[1]
 */
static void getPackedCharacterData(int c, long[] data) {
    long word0 = Character.getType(c);
    word0 |= (long) (Character.digit(c, Character.MAX_RADIX) + 1) << 5;
    word0 |= (long) (Character.getNumericValue(c) + 2) << 11;

    // Flag i of this list lands in bit 17 + i.
    final boolean[] flags = {
        Character.isDefined(c),
        Character.isDigit(c),
        Character.isIdentifierIgnorable(c),
        Character.isISOControl(c),
        Character.isJavaIdentifierPart(c),
        Character.isJavaIdentifierStart(c),
        Character.isLetter(c),
        Character.isLetterOrDigit(c),
        Character.isLowerCase(c),
        Character.isSpaceChar(c),
        Character.isTitleCase(c),
        Character.isUnicodeIdentifierPart(c),
        Character.isUnicodeIdentifierStart(c),
        Character.isUpperCase(c),
        Character.isWhitespace(c)
    };
    for (int i = 0; i < flags.length; i++) {
        if (flags[i]) {
            word0 |= 1L << (17 + i);
        }
    }

    word0 |= (long) Character.toUpperCase(c) << 32;
    data[0] = word0;

    final long lowerPart = Character.toLowerCase(c);
    final long titlePart = (long) Character.toTitleCase(c) << 21;
    data[1] = lowerPart | titlePart;
}
/**
 * Given a long, set the bits at the given offset and length to the given value.
 *
 * Bits outside the field are preserved; bits of {@code value} above
 * {@code length} are ignored.
 *
 * @param data   word to modify
 * @param offset index of the field's least significant bit (0-63)
 * @param length field width in bits (1-64)
 * @param value  new field contents
 * @return {@code data} with the field replaced
 */
static long setBits(long data, int offset, int length, long value) {
    // Java masks a long shift distance to its low six bits, so "-1L << 64"
    // is -1L rather than 0. Build the field mask from the width instead of
    // from offset+length so a field ending at bit 63 is still cleared.
    long lengthmask = (length >= 64) ? -1L : ~(-1L << length);
    long fieldmask = lengthmask << offset;
    return (data & ~fieldmask) | ((value & lengthmask) << offset);
}
/**
 * Given packed character data, change the attribute
 * toLower
 *
 * The value is stored where getPackedCharacterData writes it and
 * unpackCharacterData reads it: the low 21 bits of {@code data[1]}.
 * The toTitle field sharing that word is left untouched.
 */
static void setToLower(long[] data, int value) {
    data[1] = setBits(data[1], 0, 21, value);
}
/**
 * Given packed character data, change the attribute
 * toUpper
 *
 * The field starts at bit 32 of {@code data[0]} and is 21 bits wide, as
 * read back by unpackCharacterData, so supplementary code points fit.
 */
static void setToUpper(long[] data, int value) {
    data[0] = setBits(data[0], 32, 21, value);
}
/**
 * Given packed character data, change the attribute
 * toTitle
 *
 * The value occupies 21 bits starting at bit 21 of {@code data[1]}, as
 * written by getPackedCharacterData. Only that field is replaced, so the
 * toLower value in the low bits of the same word survives.
 */
static void setToTitle(long[] data, int value) {
    data[1] = setBits(data[1], 21, 21, value);
}
/**
 * Replaces the getType attribute of packed character data.
 * The attribute is the 5-bit field at the bottom of {@code data[0]}.
 */
static void setGetType(long[] data, int value) {
    final long word0 = data[0];
    data[0] = setBits(word0, 0, 5, value);
}
/**
 * Replaces the isDefined attribute of packed character data.
 * The attribute is bit 17 of {@code data[0]}.
 */
static void setIsDefined(long[] data, boolean value) {
    long bit = 0;
    if (value) {
        bit = 1;
    }
    data[0] = setBits(data[0], 17, 1, bit);
}
/**
 * Replaces the isJavaIdentifierPart attribute of packed character data.
 * The attribute is bit 21 of {@code data[0]}.
 */
static void setIsJavaIdentifierPart(long[] data, boolean value) {
    long bit = 0;
    if (value) {
        bit = 1;
    }
    data[0] = setBits(data[0], 21, 1, bit);
}
/**
 * Replaces the isJavaIdentifierStart attribute of packed character data.
 * The attribute is bit 22 of {@code data[0]}.
 */
static void setIsJavaIdentifierStart(long[] data, boolean value) {
    long bit = 0;
    if (value) {
        bit = 1;
    }
    data[0] = setBits(data[0], 22, 1, bit);
}
/**
 * Expands packed character data into one "name=value" string per
 * attribute, in a fixed order: type, digit, numeric, the fifteen boolean
 * predicates (bits 17-31 of {@code dataL[0]}), then toUpper, toLower and
 * toTitle.
 *
 * @param dataL two-element packed data as produced by getPackedCharacterData
 * @return the attribute strings
 */
static String[] unpackCharacterData(long[] dataL) {
    // Predicate names in ascending bit order; entry k describes bit 17 + k.
    final String[] flagNames = {
        "isDefined",
        "isDigit",
        "isIdentifierIgnorable",
        "isISOControl",
        "isJavaIdentifierPart",
        "isJavaIdentifierStart",
        "isLetter",
        "isLetterOrDigit",
        "isLowerCase",
        "isSpaceChar",
        "isTitleCase",
        "isUnicodeIdentifierPart",
        "isUnicodeIdentifierStart",
        "isUpperCase",
        "isWhitespace"
    };

    final long word0 = dataL[0];
    final String[] fields = new String[3 + flagNames.length + 3];
    int next = 0;

    fields[next++] = "type=" + getTypeName((int)(word0 & 0x1F));
    // digit and numeric are stored biased by one and two respectively.
    fields[next++] = "digit=" + (((word0 >> 5) & 0x3F) - 1);
    fields[next++] = "numeric=" + (((word0 >> 11) & 0x3F) - 2);

    for (int k = 0; k < flagNames.length; k++) {
        final boolean isSet = ((word0 >> (17 + k)) & 1) == 1;
        fields[next++] = flagNames[k] + "=" + isSet;
    }

    fields[next++] = "toUpper=" + hex6(((int)(word0 >> 32) & 0X1FFFFF));
    fields[next++] = "toLower=" + hex6((int)(dataL[1] & 0x1FFFFF));
    fields[next++] = "toTitle=" + hex6(((int)(dataL[1] >> 21) & 0x1FFFFF));
    return fields;
}
/**
 * Returns the running VM's attributes for code point {@code c} as
 * "name=value" strings, by packing and immediately unpacking them.
 */
static String[] getCharacterData(int c) {
    final long[] packed = { 0L, 0L };
    getPackedCharacterData(c, packed);
    return unpackCharacterData(packed);
}
/**
 * Reports a mismatch for code point {@code c}: prints every attribute
 * whose unpacked text differs between the VM's data and the file's data,
 * then bumps the class-wide {@code differences} counter by one.
 *
 * @param c           the code point being reported
 * @param currentData packed data from the running VM
 * @param fileData    packed data read from the baseline file
 */
static void showDifference(int c, long[] currentData, long[] fileData) {
    System.out.println("Difference at " + hex6(c));
    final String[] vmFields = unpackCharacterData(currentData);
    final String[] fileFields = unpackCharacterData(fileData);
    int idx = 0;
    while (idx < vmFields.length) {
        final String fromVm = vmFields[idx];
        final String fromFile = fileFields[idx];
        idx++;
        if (fromVm.equals(fromFile)) {
            continue;
        }
        System.out.println(escape(" current " + fromVm +
                ", file " + fromFile));
    }
    differences++;
}
/**
 * Returns {@code s} with every char outside 0x20..0x7F replaced by a
 * backslash-u escape of exactly four lower-case hex digits. Chars inside
 * that range are copied unchanged.
 */
static String escape(String s) {
    StringBuilder out = new StringBuilder();
    for (char ch : s.toCharArray()) {
        boolean passThrough = ch >= 0x20 && ch <= 0x7F;
        if (passThrough) {
            out.append(ch);
            continue;
        }
        String hex = Integer.toHexString(ch);
        out.append("\\u");
        // A char yields one to four hex digits; pad on the left to four.
        for (int width = hex.length(); width < 4; width++) {
            out.append('0');
        }
        out.append(hex);
    }
    return out.toString();
}
/**
 * Returns a printable form of a single code point.
 *
 * Code points in 0x20..0x7F are returned as the character itself. Any
 * other value is written as backslash-u escapes of four lower-case hex
 * digits: one escape for a BMP value, and the surrogate pair (two
 * escapes) for a supplementary code point. Values that are not valid code
 * points keep the previous behavior of escaping their low 16 bits.
 *
 * @param c the code point to escape
 * @return the escaped text
 */
static String escape(int c) {
    StringBuilder buf = new StringBuilder();
    if (c >= 0x20 && c <= 0x7F) {
        // Cast is required: append(int) would write the decimal number.
        buf.append((char)c);
        return buf.toString();
    }
    int[] units = Character.isSupplementaryCodePoint(c)
            ? new int[] { Character.highSurrogate(c), Character.lowSurrogate(c) }
            : new int[] { c };
    for (int unit : units) {
        buf.append("\\u");
        // "000" guarantees at least four characters to take the tail of.
        String h = "000" + Integer.toHexString(unit);
        buf.append(h.substring(h.length() - 4));
    }
    return buf.toString();
}
}
//eof

View File

@ -0,0 +1,301 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 4830803 4886934 6565620 6959267 7070436 7198195 8032446 8072600
* @summary Check that the UnicodeBlock forName() method works as expected and block ranges are correct for all Unicode characters.
* @run main CheckBlocks
* @author John O'Conner
*/
import java.io.*;
import java.util.*;
import java.lang.Character.UnicodeBlock;
/**
 * Checks {@link UnicodeBlock} against the block list in Blocks.txt (read
 * from the directory named by the {@code test.src} property): every block
 * must be resolvable through {@code forName()} in three spellings, and
 * the first, second and last code point of each range must map back to
 * that block through {@code of()}.
 *
 * Failures are printed to {@code System.err} and recorded in {@link #err};
 * {@code main} throws once all blocks have been examined.
 */
public class CheckBlocks {
    static boolean err = false;
    static Class<?> character;

    // List of all Unicode blocks, their start, and end codepoints.
    public static HashSet<Block> blocks = new HashSet<>();

    public static void main(String[] args) throws Exception {
        generateBlockList();

        try {
            character = Class.forName("java.lang.Character$UnicodeBlock");
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(
                "Class.forName(\"java.lang.Character$UnicodeBlock\") failed.", e);
        }

        // The bad-name check does not depend on any block, so run it once
        // instead of once per block.
        test4830803_2();

        for (Block blk : blocks) {
            test4830803_1(blk);
            test4886934(blk);
        }

        if (err) {
            throw new RuntimeException("Failed");
        } else {
            System.out.println("Passed");
        }
    }

    /**
     * Maps a name derived from Blocks.txt to the UnicodeBlock constant name,
     * substituting the three legacy constant names kept for backward
     * compatibility and logging each substitution.
     */
    private static String toConstantName(String blkName) {
        switch (blkName) {
            case "COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS":
                System.out.println("*** COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS is replaced with COMBINING_MARKS_FOR_SYMBOLS for backward compatibility.");
                return "COMBINING_MARKS_FOR_SYMBOLS";
            case "GREEK_AND_COPTIC":
                System.out.println("*** GREEK_AND_COPTIC is replaced with GREEK for backward compatibility.");
                return "GREEK";
            case "CYRILLIC_SUPPLEMENT":
                System.out.println("*** CYRILLIC_SUPPLEMENT is replaced with CYRILLIC_SUPPLEMENTARY for backward compatibility.");
                return "CYRILLIC_SUPPLEMENTARY";
            default:
                return blkName;
        }
    }

    /**
     * Check that the UnicodeBlock forName() method works as expected.
     */
    private static void test4830803_1(Block blk) throws Exception {
        /*
         * Try 3 forms of block name in the forName() method. Each form should
         * produce the same expected block.
         */
        String blkName = toConstantName(blk.getName());

        String expectedBlock = null;
        try {
            expectedBlock = character.getField(blkName).getName();
        } catch (NoSuchFieldException | SecurityException e) {
            System.err.println("Error: " + blkName + " was not found.");
            err = true;
            return;
        }

        String canonicalBlockName = blk.getOriginalName();
        String idBlockName = expectedBlock;
        String regexBlockName = toRegExString(canonicalBlockName);

        checkForName(1, canonicalBlockName, expectedBlock);
        checkForName(2, idBlockName, expectedBlock);
        checkForName(3, regexBlockName, expectedBlock);
    }

    /**
     * Looks up {@code name} with forName() and records error number
     * {@code errNo} if the result is not the expected block. An
     * IllegalArgumentException from forName() propagates to the caller.
     */
    private static void checkForName(int errNo, String name, String expectedBlock) {
        UnicodeBlock found = UnicodeBlock.forName(name);
        if (!expectedBlock.equals(found.toString())) {
            System.err.println("Error #" + errNo + ": UnicodeBlock.forName(\"" +
                    name + "\") returned wrong value.\n\tGot: " +
                    found +
                    "\n\tExpected: " + expectedBlock);
            err = true;
        }
    }

    /**
     * now try a bad block name. This should produce an IAE.
     */
    private static void test4830803_2() {
        boolean threwExpected = false;

        try {
            UnicodeBlock.forName("notdefined");
        } catch (IllegalArgumentException e) {
            threwExpected = true;
        }

        if (!threwExpected) {
            System.err.println("Error: UnicodeBlock.forName(\"notdefined\") should throw IllegalArgumentException.");
            err = true;
        }
    }

    /**
     * Convert the argument to a block name form used by the regex package.
     * That is, remove all spaces.
     */
    private static String toRegExString(String str) {
        return str.replace(" ", "");
    }

    /**
     * Check that the first, second and last code points of the block's
     * range are reported by UnicodeBlock.of() as belonging to the block.
     */
    private static void test4886934(Block blk) {
        String blkName = blk.getName();
        String blkOrigName = blk.getOriginalName();
        int begin = blk.getBegin();
        int end = blk.getEnd();

        if (UnicodeBlock.of(begin) == null) {
            System.err.println("Error: The block for " + blkName +
                    " is missing. Please check java.lang.Character.UnicodeBlock.");
            err = true;
            return;
        }

        blkName = toConstantName(blkName);

        checkBlockOf("Begin-of-block character", begin, blkName, blkOrigName);
        // Skip the second code point for a one-character range; it would
        // lie outside the block.
        if (begin < end) {
            checkBlockOf("Character", begin + 1, blkName, blkOrigName);
        }
        checkBlockOf("End-of-block Character", end, blkName, blkOrigName);
    }

    /**
     * Records an error unless UnicodeBlock.of(ch) is the block named
     * {@code blkName}. An unassigned code point (null result) is reported
     * as block "null" rather than raising a NullPointerException.
     */
    private static void checkBlockOf(String what, int ch, String blkName,
                                     String blkOrigName) {
        String blockName = String.valueOf(UnicodeBlock.of(ch));
        if (!blockName.equals(blkName)) {
            System.err.println("Error: " + what + "(0x" +
                    Integer.toHexString(ch).toUpperCase() +
                    ") should be in \"" + blkName + "\" block " +
                    "(Block name is \"" + blkOrigName + "\")" +
                    " but found in \"" + blockName + "\" block.");
            err = true;
        }
    }

    /**
     * Fills {@link #blocks} from Blocks.txt. Empty lines and lines starting
     * with '#' are skipped; every other line is parsed as
     * {@code BEGIN..END; Name} with hexadecimal bounds.
     */
    private static void generateBlockList() throws Exception {
        File blocksFile = new File(System.getProperty("test.src", "."), "Blocks.txt");
        // try-with-resources: the reader is closed even if a line fails to parse.
        try (BufferedReader f = new BufferedReader(new FileReader(blocksFile))) {
            String line;
            while ((line = f.readLine()) != null) {
                if (line.length() == 0 || line.charAt(0) == '#') {
                    continue;
                }

                int index1 = line.indexOf('.');
                int begin = Integer.parseInt(line.substring(0, index1), 16);
                int index2 = line.indexOf(';');
                // index1 + 2 skips the ".." separator.
                int end = Integer.parseInt(line.substring(index1+2, index2), 16);
                String name = line.substring(index2+1).trim();

                System.out.println(" Adding a Block(" +
                        Integer.toHexString(begin) + ", " + Integer.toHexString(end) +
                        ", " + name + ")");
                blocks.add(new Block(begin, end, name));
            }
        }
    }
}
class Block {
public Block() {
blockBegin = 0;
blockEnd = 0;
blockName = null;
}
public Block(int begin, int end, String name) {
blockBegin = begin;
blockEnd = end;
blockName = name.replaceAll("[ -]", "_").toUpperCase(Locale.ENGLISH);
originalBlockName = name;
}
public int getBegin() {
return blockBegin;
}
public int getEnd() {
return blockEnd;
}
public String getName() {
return blockName;
}
public String getOriginalName() {
return originalBlockName;
}
@Override
public boolean equals(Object obj) {
if (obj == null) return false;
if (!(obj instanceof Block)) return false;
Block other = (Block)obj;
return other.blockBegin == blockBegin &&
other.blockEnd == blockEnd &&
other.blockName.equals(blockName) &&
other.originalBlockName.equals(originalBlockName);
}
int blockBegin, blockEnd;
String blockName, originalBlockName;
}

View File

@ -0,0 +1,111 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 4114080 6565620 6959267 7070436 7198195 8032446 8072600
* @summary Make sure the attributes of Unicode characters, as
* returned by the Character API, are as expected. Do this by
* comparing them to a baseline file together with a list of
* known diffs.
* @build UnicodeSpec CharCheck
* @run main CheckUnicode
* @author Alan Liu
* @author John O'Conner
*/
import java.io.*;
/**
 * Compares the Character API of the running VM against recorded baselines
 * and spot-checks Java identifier classification. Any discrepancy is
 * reported by throwing a RuntimeException from {@code main}.
 */
public class CheckUnicode {
    public static void main(String args[]) throws Exception {
        // 1. Check that the dumped property files for planes 0, 1, 2, 3, 14, 15, and 16
        // are the same as in the current Character properties.
        int[] planes = {0, 1, 2, 3, 14, 15, 16};
        String[] fileNames = {"charprop00.bin", "charprop01.bin", "charprop02.bin", "charprop03.bin",
                "charprop0E.bin", "charprop0F.bin", "charprop10.bin" };

        // Read in the baseline property data, one file per plane
        for (int x=0; x < planes.length && x < fileNames.length; ++x) {
            File unicodeProp = new File(System.getProperty("test.src", "."), fileNames[x]);
            int diffs;
            // Close both streams as soon as the plane has been compared.
            try (FileInputStream fis = new FileInputStream(unicodeProp);
                 ObjectInputStream ois = new ObjectInputStream(fis)) {
                // Find differences -- should be none
                diffs = CharCheck.load(planes[x], ois);
            }
            if (diffs != 0) {
                throw new RuntimeException("Bug 4114080 - Unicode properties have changed " +
                        "in an unexpected way");
            }
        }

        // 2. Check that the current spec file is handled by the current
        // version of Character.
        File unicodeSpec = new File(System.getProperty("test.src", "."), "UnicodeData.txt");
        for (int x=0; x<planes.length; ++x) {
            int diffs = CharCheck.check(planes[x], unicodeSpec);
            if (diffs != 0) {
                throw new RuntimeException("Bug 4114080 - Unicode properties have changed " +
                        "in an unexpected way");
            }
        }

        // 3. Check that Java identifiers are recognized correctly.
        // test a few characters that are good id starts
        char[] idStartChar = {'$', '\u20AC', 'a', 'A', 'z', 'Z', '_', '\u0E3F',
                '\u1004', '\u10A0', '\u3400', '\u4E00', '\uAC00' };
        for (char start : idStartChar) {
            if (!Character.isJavaIdentifierStart(start)) {
                throw new RuntimeException("Java id start characters are not recognized.");
            }
        }

        // test a few characters that are good id parts
        char[] idPartChar = {'0', '9', '\u0000', '\u0008', '\u000E', '\u007F'};
        // every valid start character must also be a valid part character
        for (char start : idStartChar) {
            if (!Character.isJavaIdentifierPart(start)) {
                throw new RuntimeException("Java id part characters are not recognized.");
            }
        }
        for (char part : idPartChar) {
            if (!Character.isJavaIdentifierPart(part)) {
                throw new RuntimeException("Java id part characters are not recognized.");
            }
        }

        // now do some negative checks
        for (char part : idPartChar) {
            if (Character.isJavaIdentifierStart(part)) {
                throw new RuntimeException("These Java id part characters " +
                        "should not be start characters.");
            }
        }
    }
}

View File

@ -0,0 +1,72 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
*
* @author Martin Buchholz
*/
import java.util.*;
import static java.lang.Character.*;
public class DumpCharProperties {
final static Locale turkish = new Locale("tr");
static String charProps(int i) {
String s = new String(new int[]{i},0,1);
return String.format
("%b %b %b %b %b %b %b %b %b %b %b %b %d %d %d %d %d %b %b %d %d %b %d %d",
isLowerCase(i),
isUpperCase(i),
isTitleCase(i),
isDigit(i),
isDefined(i),
isLetter(i),
isLetterOrDigit(i),
isJavaIdentifierStart(i),
isJavaIdentifierPart(i),
isUnicodeIdentifierStart(i),
isUnicodeIdentifierPart(i),
isIdentifierIgnorable(i),
toLowerCase(i),
toUpperCase(i),
toTitleCase(i),
digit(i, 16),
getNumericValue(i),
isSpaceChar(i),
isWhitespace(i),
getType(i),
getDirectionality(i),
isMirrored(i),
(int) s.toUpperCase(Locale.GERMAN).charAt(0),
(int) s.toUpperCase(turkish).charAt(0));
}
public static void main(String[] args) throws Throwable {
for (int i = 0; i < 17*0x10000; i++) {
System.out.println(charProps(i));
}
}
}

View File

@ -1,10 +1,11 @@
# PropList-8.0.0.txt
# Date: 2015-05-16, 17:50:38 GMT [MD]
# PropList-10.0.0.txt
# Date: 2017-03-10, 08:25:30 GMT
# Copyright (c) 2017 Unicode, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# For documentation, see http://www.unicode.org/reports/tr44/
# ================================================
@ -192,10 +193,17 @@ FF64 ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA
111DE..111DF ; Terminal_Punctuation # Po [2] SHARADA SECTION MARK-1..SHARADA SECTION MARK-2
11238..1123C ; Terminal_Punctuation # Po [5] KHOJKI DANDA..KHOJKI DOUBLE SECTION MARK
112A9 ; Terminal_Punctuation # Po MULTANI SECTION MARK
1144B..1144D ; Terminal_Punctuation # Po [3] NEWA DANDA..NEWA COMMA
1145B ; Terminal_Punctuation # Po NEWA PLACEHOLDER MARK
115C2..115C5 ; Terminal_Punctuation # Po [4] SIDDHAM DANDA..SIDDHAM SEPARATOR BAR
115C9..115D7 ; Terminal_Punctuation # Po [15] SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
11641..11642 ; Terminal_Punctuation # Po [2] MODI DANDA..MODI DOUBLE DANDA
1173C..1173E ; Terminal_Punctuation # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
11A42..11A43 ; Terminal_Punctuation # Po [2] ZANABAZAR SQUARE MARK SHAD..ZANABAZAR SQUARE MARK DOUBLE SHAD
11A9B..11A9C ; Terminal_Punctuation # Po [2] SOYOMBO MARK SHAD..SOYOMBO MARK DOUBLE SHAD
11AA1..11AA2 ; Terminal_Punctuation # Po [2] SOYOMBO TERMINAL MARK-1..SOYOMBO TERMINAL MARK-2
11C41..11C43 ; Terminal_Punctuation # Po [3] BHAIKSUKI DANDA..BHAIKSUKI WORD SEPARATOR
11C71 ; Terminal_Punctuation # Po MARCHEN MARK SHAD
12470..12474 ; Terminal_Punctuation # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
16A6E..16A6F ; Terminal_Punctuation # Po [2] MRO DANDA..MRO DOUBLE DANDA
16AF5 ; Terminal_Punctuation # Po BASSA VAH FULL STOP
@ -204,7 +212,7 @@ FF64 ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA
1BC9F ; Terminal_Punctuation # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA87..1DA8A ; Terminal_Punctuation # Po [4] SIGNWRITING COMMA..SIGNWRITING COLON
# Total code points: 238
# Total code points: 252
# ================================================
@ -429,6 +437,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
081B..0823 ; Other_Alphabetic # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A
0825..0827 ; Other_Alphabetic # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082C ; Other_Alphabetic # Mn [4] SAMARITAN VOWEL SIGN LONG I..SAMARITAN VOWEL SIGN SUKUN
08D4..08DF ; Other_Alphabetic # Mn [12] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH WORD WAQFA
08E3..08E9 ; Other_Alphabetic # Mn [7] ARABIC TURNED DAMMA BELOW..ARABIC CURLY KASRATAN
08F0..0902 ; Other_Alphabetic # Mn [19] ARABIC OPEN FATHATAN..DEVANAGARI SIGN ANUSVARA
0903 ; Other_Alphabetic # Mc DEVANAGARI SIGN VISARGA
@ -465,6 +474,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
0AC9 ; Other_Alphabetic # Mc GUJARATI VOWEL SIGN CANDRA O
0ACB..0ACC ; Other_Alphabetic # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU
0AE2..0AE3 ; Other_Alphabetic # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL
0AFA..0AFC ; Other_Alphabetic # Mn [3] GUJARATI SIGN SUKUN..GUJARATI SIGN MADDAH
0B01 ; Other_Alphabetic # Mn ORIYA SIGN CANDRABINDU
0B02..0B03 ; Other_Alphabetic # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA
0B3E ; Other_Alphabetic # Mc ORIYA VOWEL SIGN AA
@ -502,7 +512,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
0CCC ; Other_Alphabetic # Mn KANNADA VOWEL SIGN AU
0CD5..0CD6 ; Other_Alphabetic # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
0CE2..0CE3 ; Other_Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
0D01 ; Other_Alphabetic # Mn MALAYALAM SIGN CANDRABINDU
0D00..0D01 ; Other_Alphabetic # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Other_Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D3E..0D40 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
0D41..0D44 ; Other_Alphabetic # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
@ -556,6 +566,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
17BE..17C5 ; Other_Alphabetic # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
17C6 ; Other_Alphabetic # Mn KHMER SIGN NIKAHIT
17C7..17C8 ; Other_Alphabetic # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
1885..1886 ; Other_Alphabetic # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
18A9 ; Other_Alphabetic # Mn MONGOLIAN LETTER ALI GALI DAGALGA
1920..1922 ; Other_Alphabetic # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
1923..1926 ; Other_Alphabetic # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU
@ -613,6 +624,7 @@ A825..A826 ; Other_Alphabetic # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NA
A827 ; Other_Alphabetic # Mc SYLOTI NAGRI VOWEL SIGN OO
A880..A881 ; Other_Alphabetic # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA
A8B4..A8C3 ; Other_Alphabetic # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU
A8C5 ; Other_Alphabetic # Mn SAURASHTRA SIGN CANDRABINDU
A926..A92A ; Other_Alphabetic # Mn [5] KAYAH LI VOWEL UE..KAYAH LI VOWEL O
A947..A951 ; Other_Alphabetic # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R
A952 ; Other_Alphabetic # Mc REJANG CONSONANT SIGN H
@ -671,6 +683,7 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
11232..11233 ; Other_Alphabetic # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
11234 ; Other_Alphabetic # Mn KHOJKI SIGN ANUSVARA
11237 ; Other_Alphabetic # Mn KHOJKI SIGN SHADDA
1123E ; Other_Alphabetic # Mn KHOJKI SIGN SUKUN
112DF ; Other_Alphabetic # Mn KHUDAWADI SIGN ANUSVARA
112E0..112E2 ; Other_Alphabetic # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
112E3..112E8 ; Other_Alphabetic # Mn [6] KHUDAWADI VOWEL SIGN U..KHUDAWADI VOWEL SIGN AU
@ -683,6 +696,11 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
1134B..1134C ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU
11357 ; Other_Alphabetic # Mc GRANTHA AU LENGTH MARK
11362..11363 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
11435..11437 ; Other_Alphabetic # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II
11438..1143F ; Other_Alphabetic # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI
11440..11441 ; Other_Alphabetic # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU
11443..11444 ; Other_Alphabetic # Mn [2] NEWA SIGN CANDRABINDU..NEWA SIGN ANUSVARA
11445 ; Other_Alphabetic # Mc NEWA SIGN VISARGA
114B0..114B2 ; Other_Alphabetic # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
114B3..114B8 ; Other_Alphabetic # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
114B9 ; Other_Alphabetic # Mc TIRHUTA VOWEL SIGN E
@ -712,14 +730,48 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
11722..11725 ; Other_Alphabetic # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU
11726 ; Other_Alphabetic # Mc AHOM VOWEL SIGN E
11727..1172A ; Other_Alphabetic # Mn [4] AHOM VOWEL SIGN AW..AHOM VOWEL SIGN AM
11A01..11A06 ; Other_Alphabetic # Mn [6] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL SIGN O
11A07..11A08 ; Other_Alphabetic # Mc [2] ZANABAZAR SQUARE VOWEL SIGN AI..ZANABAZAR SQUARE VOWEL SIGN AU
11A09..11A0A ; Other_Alphabetic # Mn [2] ZANABAZAR SQUARE VOWEL SIGN REVERSED I..ZANABAZAR SQUARE VOWEL LENGTH MARK
11A35..11A38 ; Other_Alphabetic # Mn [4] ZANABAZAR SQUARE SIGN CANDRABINDU..ZANABAZAR SQUARE SIGN ANUSVARA
11A39 ; Other_Alphabetic # Mc ZANABAZAR SQUARE SIGN VISARGA
11A3B..11A3E ; Other_Alphabetic # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA
11A51..11A56 ; Other_Alphabetic # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE
11A57..11A58 ; Other_Alphabetic # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU
11A59..11A5B ; Other_Alphabetic # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK
11A8A..11A96 ; Other_Alphabetic # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA
11A97 ; Other_Alphabetic # Mc SOYOMBO SIGN VISARGA
11C2F ; Other_Alphabetic # Mc BHAIKSUKI VOWEL SIGN AA
11C30..11C36 ; Other_Alphabetic # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L
11C38..11C3D ; Other_Alphabetic # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA
11C3E ; Other_Alphabetic # Mc BHAIKSUKI SIGN VISARGA
11C92..11CA7 ; Other_Alphabetic # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA
11CA9 ; Other_Alphabetic # Mc MARCHEN SUBJOINED LETTER YA
11CAA..11CB0 ; Other_Alphabetic # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA
11CB1 ; Other_Alphabetic # Mc MARCHEN VOWEL SIGN I
11CB2..11CB3 ; Other_Alphabetic # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E
11CB4 ; Other_Alphabetic # Mc MARCHEN VOWEL SIGN O
11CB5..11CB6 ; Other_Alphabetic # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU
11D31..11D36 ; Other_Alphabetic # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R
11D3A ; Other_Alphabetic # Mn MASARAM GONDI VOWEL SIGN E
11D3C..11D3D ; Other_Alphabetic # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O
11D3F..11D41 ; Other_Alphabetic # Mn [3] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI SIGN VISARGA
11D43 ; Other_Alphabetic # Mn MASARAM GONDI SIGN CANDRA
11D47 ; Other_Alphabetic # Mn MASARAM GONDI RA-KARA
16B30..16B36 ; Other_Alphabetic # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16F51..16F7E ; Other_Alphabetic # Mc [46] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN NG
1BC9E ; Other_Alphabetic # Mn DUPLOYAN DOUBLE MARK
1E000..1E006 ; Other_Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
1E008..1E018 ; Other_Alphabetic # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
1E01B..1E021 ; Other_Alphabetic # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Other_Alphabetic # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Other_Alphabetic # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
1E947 ; Other_Alphabetic # Mn ADLAM HAMZA
1F130..1F149 ; Other_Alphabetic # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z
1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
# Total code points: 1116
# Total code points: 1300
# ================================================
@ -728,16 +780,20 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
3021..3029 ; Ideographic # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE
3038..303A ; Ideographic # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
3400..4DB5 ; Ideographic # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
4E00..9FD5 ; Ideographic # Lo [20950] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FD5
4E00..9FEA ; Ideographic # Lo [20971] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEA
F900..FA6D ; Ideographic # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
17000..187EC ; Ideographic # Lo [6125] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187EC
18800..18AF2 ; Ideographic # Lo [755] TANGUT COMPONENT-001..TANGUT COMPONENT-755
1B170..1B2FB ; Ideographic # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
20000..2A6D6 ; Ideographic # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
2A700..2B734 ; Ideographic # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; Ideographic # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; Ideographic # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; Ideographic # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; Ideographic # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
# Total code points: 81404
# Total code points: 96174
# ================================================
@ -793,12 +849,14 @@ FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COM
0A4D ; Diacritic # Mn GURMUKHI SIGN VIRAMA
0ABC ; Diacritic # Mn GUJARATI SIGN NUKTA
0ACD ; Diacritic # Mn GUJARATI SIGN VIRAMA
0AFD..0AFF ; Diacritic # Mn [3] GUJARATI SIGN THREE-DOT NUKTA ABOVE..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE
0B3C ; Diacritic # Mn ORIYA SIGN NUKTA
0B4D ; Diacritic # Mn ORIYA SIGN VIRAMA
0BCD ; Diacritic # Mn TAMIL SIGN VIRAMA
0C4D ; Diacritic # Mn TELUGU SIGN VIRAMA
0CBC ; Diacritic # Mn KANNADA SIGN NUKTA
0CCD ; Diacritic # Mn KANNADA SIGN VIRAMA
0D3B..0D3C ; Diacritic # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
0D4D ; Diacritic # Mn MALAYALAM SIGN VIRAMA
0DCA ; Diacritic # Mn SINHALA SIGN AL-LAKUNA
0E47..0E4C ; Diacritic # Mn [6] THAI CHARACTER MAITAIKHU..THAI CHARACTER THANTHAKHAT
@ -838,10 +896,11 @@ FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COM
1CE2..1CE8 ; Diacritic # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CED ; Diacritic # Mn VEDIC SIGN TIRYAK
1CF4 ; Diacritic # Mn VEDIC TONE CANDRA ABOVE
1CF7 ; Diacritic # Mc VEDIC SIGN ATIKRAMA
1CF8..1CF9 ; Diacritic # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
1D2C..1D6A ; Diacritic # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI
1DC4..1DCF ; Diacritic # Mn [12] COMBINING MACRON-ACUTE..COMBINING ZIGZAG BELOW
1DF5 ; Diacritic # Mn COMBINING UP TACK ABOVE
1DF5..1DF9 ; Diacritic # Mn [5] COMBINING UP TACK ABOVE..COMBINING WIDE INVERTED BRIDGE BELOW
1DFD..1DFF ; Diacritic # Mn [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
1FBD ; Diacritic # Sk GREEK KORONIS
1FBF..1FC1 ; Diacritic # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
@ -906,12 +965,20 @@ FFE3 ; Diacritic # Sk FULLWIDTH MACRON
1134D ; Diacritic # Mc GRANTHA SIGN VIRAMA
11366..1136C ; Diacritic # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
11370..11374 ; Diacritic # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
11442 ; Diacritic # Mn NEWA SIGN VIRAMA
11446 ; Diacritic # Mn NEWA SIGN NUKTA
114C2..114C3 ; Diacritic # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
115BF..115C0 ; Diacritic # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
1163F ; Diacritic # Mn MODI SIGN VIRAMA
116B6 ; Diacritic # Mc TAKRI SIGN VIRAMA
116B7 ; Diacritic # Mn TAKRI SIGN NUKTA
1172B ; Diacritic # Mn AHOM SIGN KILLER
11A34 ; Diacritic # Mn ZANABAZAR SQUARE SIGN VIRAMA
11A47 ; Diacritic # Mn ZANABAZAR SQUARE SUBJOINER
11A99 ; Diacritic # Mn SOYOMBO SUBJOINER
11C3F ; Diacritic # Mn BHAIKSUKI SIGN VIRAMA
11D42 ; Diacritic # Mn MASARAM GONDI SIGN NUKTA
11D44..11D45 ; Diacritic # Mn [2] MASARAM GONDI SIGN HALANTA..MASARAM GONDI VIRAMA
16AF0..16AF4 ; Diacritic # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
16F8F..16F92 ; Diacritic # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW
16F93..16F9F ; Diacritic # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
@ -921,8 +988,10 @@ FFE3 ; Diacritic # Sk FULLWIDTH MACRON
1D185..1D18B ; Diacritic # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
1D1AA..1D1AD ; Diacritic # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
1E8D0..1E8D6 ; Diacritic # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
1E944..1E946 ; Diacritic # Mn [3] ADLAM ALIF LENGTHENER..ADLAM GEMINATION MARK
1E948..1E94A ; Diacritic # Mn [3] ADLAM CONSONANT MODIFIER..ADLAM NUKTA
# Total code points: 773
# Total code points: 798
# ================================================
@ -951,9 +1020,12 @@ AAF3..AAF4 ; Extender # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETE
FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1135D ; Extender # Lo GRANTHA SIGN PLUTA
115C6..115C8 ; Extender # Po [3] SIDDHAM REPETITION MARK-1..SIDDHAM REPETITION MARK-3
11A98 ; Extender # Mn SOYOMBO GEMINATION MARK
16B42..16B43 ; Extender # Lm [2] PAHAWH HMONG SIGN VOS NRUA..PAHAWH HMONG SIGN IB YAM
16FE0..16FE1 ; Extender # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
1E944..1E946 ; Extender # Mn [3] ADLAM ALIF LENGTHENER..ADLAM GEMINATION MARK
# Total code points: 38
# Total code points: 44
# ================================================
@ -1027,7 +1099,7 @@ FFFFE..FFFFF ; Noncharacter_Code_Point # Cn [2] <noncharacter-FFFFE>..<noncha
0D57 ; Other_Grapheme_Extend # Mc MALAYALAM AU LENGTH MARK
0DCF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN AELA-PILLA
0DDF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA
200C..200D ; Other_Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
200C ; Other_Grapheme_Extend # Cf ZERO WIDTH NON-JOINER
302E..302F ; Other_Grapheme_Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
1133E ; Other_Grapheme_Extend # Mc GRANTHA VOWEL SIGN AA
@ -1037,8 +1109,9 @@ FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND
115AF ; Other_Grapheme_Extend # Mc SIDDHAM VOWEL SIGN AA
1D165 ; Other_Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM
1D16E..1D172 ; Other_Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5
E0020..E007F ; Other_Grapheme_Extend # Cf [96] TAG SPACE..CANCEL TAG
# Total code points: 30
# Total code points: 125
# ================================================
@ -1064,7 +1137,7 @@ FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND
# ================================================
3400..4DB5 ; Unified_Ideograph # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
4E00..9FD5 ; Unified_Ideograph # Lo [20950] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FD5
4E00..9FEA ; Unified_Ideograph # Lo [20971] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEA
FA0E..FA0F ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA0E..CJK COMPATIBILITY IDEOGRAPH-FA0F
FA11 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA11
FA13..FA14 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA13..CJK COMPATIBILITY IDEOGRAPH-FA14
@ -1076,8 +1149,9 @@ FA27..FA29 ; Unified_Ideograph # Lo [3] CJK COMPATIBILITY IDEOGRAPH-FA27..C
2A700..2B734 ; Unified_Ideograph # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; Unified_Ideograph # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; Unified_Ideograph # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; Unified_Ideograph # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
# Total code points: 80388
# Total code points: 87882
# ================================================
@ -1106,9 +1180,8 @@ E01F0..E0FFF ; Other_Default_Ignorable_Code_Point # Cn [3600] <reserved-E01F0>.
2329 ; Deprecated # Ps LEFT-POINTING ANGLE BRACKET
232A ; Deprecated # Pe RIGHT-POINTING ANGLE BRACKET
E0001 ; Deprecated # Cf LANGUAGE TAG
E007F ; Deprecated # Cf CANCEL TAG
# Total code points: 16
# Total code points: 15
# ================================================
@ -1160,11 +1233,12 @@ AABB..AABC ; Logical_Order_Exception # Lo [2] TAI VIET VOWEL AUE..TAI VIET
# ================================================
1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
# Total code points: 4
# Total code points: 6
# ================================================
@ -1177,72 +1251,76 @@ AABB..AABC ; Logical_Order_Exception # Lo [2] TAI VIET VOWEL AUE..TAI VIET
# ================================================
0021 ; STerm # Po EXCLAMATION MARK
002E ; STerm # Po FULL STOP
003F ; STerm # Po QUESTION MARK
0589 ; STerm # Po ARMENIAN FULL STOP
061F ; STerm # Po ARABIC QUESTION MARK
06D4 ; STerm # Po ARABIC FULL STOP
0700..0702 ; STerm # Po [3] SYRIAC END OF PARAGRAPH..SYRIAC SUBLINEAR FULL STOP
07F9 ; STerm # Po NKO EXCLAMATION MARK
0964..0965 ; STerm # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
104A..104B ; STerm # Po [2] MYANMAR SIGN LITTLE SECTION..MYANMAR SIGN SECTION
1362 ; STerm # Po ETHIOPIC FULL STOP
1367..1368 ; STerm # Po [2] ETHIOPIC QUESTION MARK..ETHIOPIC PARAGRAPH SEPARATOR
166E ; STerm # Po CANADIAN SYLLABICS FULL STOP
1735..1736 ; STerm # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
1803 ; STerm # Po MONGOLIAN FULL STOP
1809 ; STerm # Po MONGOLIAN MANCHU FULL STOP
1944..1945 ; STerm # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
1AA8..1AAB ; STerm # Po [4] TAI THAM SIGN KAAN..TAI THAM SIGN SATKAANKUU
1B5A..1B5B ; STerm # Po [2] BALINESE PANTI..BALINESE PAMADA
1B5E..1B5F ; STerm # Po [2] BALINESE CARIK SIKI..BALINESE CARIK PAREREN
1C3B..1C3C ; STerm # Po [2] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION NYET THYOOM TA-ROL
1C7E..1C7F ; STerm # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
203C..203D ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2E2E ; STerm # Po REVERSED QUESTION MARK
2E3C ; STerm # Po STENOGRAPHIC FULL STOP
3002 ; STerm # Po IDEOGRAPHIC FULL STOP
A4FF ; STerm # Po LISU PUNCTUATION FULL STOP
A60E..A60F ; STerm # Po [2] VAI FULL STOP..VAI QUESTION MARK
A6F3 ; STerm # Po BAMUM FULL STOP
A6F7 ; STerm # Po BAMUM QUESTION MARK
A876..A877 ; STerm # Po [2] PHAGS-PA MARK SHAD..PHAGS-PA MARK DOUBLE SHAD
A8CE..A8CF ; STerm # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA
A92F ; STerm # Po KAYAH LI SIGN SHYA
A9C8..A9C9 ; STerm # Po [2] JAVANESE PADA LINGSA..JAVANESE PADA LUNGSI
AA5D..AA5F ; STerm # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA
AAF0..AAF1 ; STerm # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM
ABEB ; STerm # Po MEETEI MAYEK CHEIKHEI
FE52 ; STerm # Po SMALL FULL STOP
FE56..FE57 ; STerm # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK
FF01 ; STerm # Po FULLWIDTH EXCLAMATION MARK
FF0E ; STerm # Po FULLWIDTH FULL STOP
FF1F ; STerm # Po FULLWIDTH QUESTION MARK
FF61 ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP
10A56..10A57 ; STerm # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA
11047..11048 ; STerm # Po [2] BRAHMI DANDA..BRAHMI DOUBLE DANDA
110BE..110C1 ; STerm # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
11141..11143 ; STerm # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK
111C5..111C6 ; STerm # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA
111CD ; STerm # Po SHARADA SUTRA MARK
111DE..111DF ; STerm # Po [2] SHARADA SECTION MARK-1..SHARADA SECTION MARK-2
11238..11239 ; STerm # Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA
1123B..1123C ; STerm # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
112A9 ; STerm # Po MULTANI SECTION MARK
115C2..115C3 ; STerm # Po [2] SIDDHAM DANDA..SIDDHAM DOUBLE DANDA
115C9..115D7 ; STerm # Po [15] SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
11641..11642 ; STerm # Po [2] MODI DANDA..MODI DOUBLE DANDA
1173C..1173E ; STerm # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
16A6E..16A6F ; STerm # Po [2] MRO DANDA..MRO DOUBLE DANDA
16AF5 ; STerm # Po BASSA VAH FULL STOP
16B37..16B38 ; STerm # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB
16B44 ; STerm # Po PAHAWH HMONG SIGN XAUS
1BC9F ; STerm # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA88 ; STerm # Po SIGNWRITING FULL STOP
0021 ; Sentence_Terminal # Po EXCLAMATION MARK
002E ; Sentence_Terminal # Po FULL STOP
003F ; Sentence_Terminal # Po QUESTION MARK
0589 ; Sentence_Terminal # Po ARMENIAN FULL STOP
061F ; Sentence_Terminal # Po ARABIC QUESTION MARK
06D4 ; Sentence_Terminal # Po ARABIC FULL STOP
0700..0702 ; Sentence_Terminal # Po [3] SYRIAC END OF PARAGRAPH..SYRIAC SUBLINEAR FULL STOP
07F9 ; Sentence_Terminal # Po NKO EXCLAMATION MARK
0964..0965 ; Sentence_Terminal # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
104A..104B ; Sentence_Terminal # Po [2] MYANMAR SIGN LITTLE SECTION..MYANMAR SIGN SECTION
1362 ; Sentence_Terminal # Po ETHIOPIC FULL STOP
1367..1368 ; Sentence_Terminal # Po [2] ETHIOPIC QUESTION MARK..ETHIOPIC PARAGRAPH SEPARATOR
166E ; Sentence_Terminal # Po CANADIAN SYLLABICS FULL STOP
1735..1736 ; Sentence_Terminal # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
1803 ; Sentence_Terminal # Po MONGOLIAN FULL STOP
1809 ; Sentence_Terminal # Po MONGOLIAN MANCHU FULL STOP
1944..1945 ; Sentence_Terminal # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
1AA8..1AAB ; Sentence_Terminal # Po [4] TAI THAM SIGN KAAN..TAI THAM SIGN SATKAANKUU
1B5A..1B5B ; Sentence_Terminal # Po [2] BALINESE PANTI..BALINESE PAMADA
1B5E..1B5F ; Sentence_Terminal # Po [2] BALINESE CARIK SIKI..BALINESE CARIK PAREREN
1C3B..1C3C ; Sentence_Terminal # Po [2] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION NYET THYOOM TA-ROL
1C7E..1C7F ; Sentence_Terminal # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
203C..203D ; Sentence_Terminal # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; Sentence_Terminal # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2E2E ; Sentence_Terminal # Po REVERSED QUESTION MARK
2E3C ; Sentence_Terminal # Po STENOGRAPHIC FULL STOP
3002 ; Sentence_Terminal # Po IDEOGRAPHIC FULL STOP
A4FF ; Sentence_Terminal # Po LISU PUNCTUATION FULL STOP
A60E..A60F ; Sentence_Terminal # Po [2] VAI FULL STOP..VAI QUESTION MARK
A6F3 ; Sentence_Terminal # Po BAMUM FULL STOP
A6F7 ; Sentence_Terminal # Po BAMUM QUESTION MARK
A876..A877 ; Sentence_Terminal # Po [2] PHAGS-PA MARK SHAD..PHAGS-PA MARK DOUBLE SHAD
A8CE..A8CF ; Sentence_Terminal # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA
A92F ; Sentence_Terminal # Po KAYAH LI SIGN SHYA
A9C8..A9C9 ; Sentence_Terminal # Po [2] JAVANESE PADA LINGSA..JAVANESE PADA LUNGSI
AA5D..AA5F ; Sentence_Terminal # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA
AAF0..AAF1 ; Sentence_Terminal # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM
ABEB ; Sentence_Terminal # Po MEETEI MAYEK CHEIKHEI
FE52 ; Sentence_Terminal # Po SMALL FULL STOP
FE56..FE57 ; Sentence_Terminal # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK
FF01 ; Sentence_Terminal # Po FULLWIDTH EXCLAMATION MARK
FF0E ; Sentence_Terminal # Po FULLWIDTH FULL STOP
FF1F ; Sentence_Terminal # Po FULLWIDTH QUESTION MARK
FF61 ; Sentence_Terminal # Po HALFWIDTH IDEOGRAPHIC FULL STOP
10A56..10A57 ; Sentence_Terminal # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA
11047..11048 ; Sentence_Terminal # Po [2] BRAHMI DANDA..BRAHMI DOUBLE DANDA
110BE..110C1 ; Sentence_Terminal # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
11141..11143 ; Sentence_Terminal # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK
111C5..111C6 ; Sentence_Terminal # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA
111CD ; Sentence_Terminal # Po SHARADA SUTRA MARK
111DE..111DF ; Sentence_Terminal # Po [2] SHARADA SECTION MARK-1..SHARADA SECTION MARK-2
11238..11239 ; Sentence_Terminal # Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA
1123B..1123C ; Sentence_Terminal # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
112A9 ; Sentence_Terminal # Po MULTANI SECTION MARK
1144B..1144C ; Sentence_Terminal # Po [2] NEWA DANDA..NEWA DOUBLE DANDA
115C2..115C3 ; Sentence_Terminal # Po [2] SIDDHAM DANDA..SIDDHAM DOUBLE DANDA
115C9..115D7 ; Sentence_Terminal # Po [15] SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
11641..11642 ; Sentence_Terminal # Po [2] MODI DANDA..MODI DOUBLE DANDA
1173C..1173E ; Sentence_Terminal # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
11A42..11A43 ; Sentence_Terminal # Po [2] ZANABAZAR SQUARE MARK SHAD..ZANABAZAR SQUARE MARK DOUBLE SHAD
11A9B..11A9C ; Sentence_Terminal # Po [2] SOYOMBO MARK SHAD..SOYOMBO MARK DOUBLE SHAD
11C41..11C42 ; Sentence_Terminal # Po [2] BHAIKSUKI DANDA..BHAIKSUKI DOUBLE DANDA
16A6E..16A6F ; Sentence_Terminal # Po [2] MRO DANDA..MRO DOUBLE DANDA
16AF5 ; Sentence_Terminal # Po BASSA VAH FULL STOP
16B37..16B38 ; Sentence_Terminal # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB
16B44 ; Sentence_Terminal # Po PAHAWH HMONG SIGN XAUS
1BC9F ; Sentence_Terminal # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA88 ; Sentence_Terminal # Po SIGNWRITING FULL STOP
# Total code points: 120
# Total code points: 128
# ================================================
@ -1359,9 +1437,7 @@ E0100..E01EF ; Variation_Selector # Mn [240] VARIATION SELECTOR-17..VARIATION S
239B..23B3 ; Pattern_Syntax # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; Pattern_Syntax # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; Pattern_Syntax # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
23E2..23FA ; Pattern_Syntax # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD
23FB..23FF ; Pattern_Syntax # Cn [5] <reserved-23FB>..<reserved-23FF>
2400..2426 ; Pattern_Syntax # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
23E2..2426 ; Pattern_Syntax # So [69] WHITE TRAPEZIUM..SYMBOL FOR SUBSTITUTE FORM TWO
2427..243F ; Pattern_Syntax # Cn [25] <reserved-2427>..<reserved-243F>
2440..244A ; Pattern_Syntax # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
244B..245F ; Pattern_Syntax # Cn [21] <reserved-244B>..<reserved-245F>
@ -1449,8 +1525,8 @@ E0100..E01EF ; Variation_Selector # Mn [240] VARIATION SELECTOR-17..VARIATION S
2BBA..2BBC ; Pattern_Syntax # Cn [3] <reserved-2BBA>..<reserved-2BBC>
2BBD..2BC8 ; Pattern_Syntax # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED
2BC9 ; Pattern_Syntax # Cn <reserved-2BC9>
2BCA..2BD1 ; Pattern_Syntax # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN
2BD2..2BEB ; Pattern_Syntax # Cn [26] <reserved-2BD2>..<reserved-2BEB>
2BCA..2BD2 ; Pattern_Syntax # So [9] TOP HALF BLACK CIRCLE..GROUP MARK
2BD3..2BEB ; Pattern_Syntax # Cn [25] <reserved-2BD3>..<reserved-2BEB>
2BEC..2BEF ; Pattern_Syntax # So [4] LEFTWARDS TWO-HEADED ARROW WITH TRIANGLE ARROWHEADS..DOWNWARDS TWO-HEADED ARROW WITH TRIANGLE ARROWHEADS
2BF0..2BFF ; Pattern_Syntax # Cn [16] <reserved-2BF0>..<reserved-2BFF>
2E00..2E01 ; Pattern_Syntax # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
@ -1490,7 +1566,8 @@ E0100..E01EF ; Variation_Selector # Mn [240] VARIATION SELECTOR-17..VARIATION S
2E40 ; Pattern_Syntax # Pd DOUBLE HYPHEN
2E41 ; Pattern_Syntax # Po REVERSED COMMA
2E42 ; Pattern_Syntax # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
2E43..2E7F ; Pattern_Syntax # Cn [61] <reserved-2E43>..<reserved-2E7F>
2E43..2E49 ; Pattern_Syntax # Po [7] DASH WITH LEFT UPTURN..DOUBLE STACKED COMMA
2E4A..2E7F ; Pattern_Syntax # Cn [54] <reserved-2E4A>..<reserved-2E7F>
3001..3003 ; Pattern_Syntax # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
3008 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET
3009 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET
@ -1522,4 +1599,20 @@ FE45..FE46 ; Pattern_Syntax # Po [2] SESAME DOT..WHITE SESAME DOT
# Total code points: 2760
# ================================================
0600..0605 ; Prepended_Concatenation_Mark # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE
06DD ; Prepended_Concatenation_Mark # Cf ARABIC END OF AYAH
070F ; Prepended_Concatenation_Mark # Cf SYRIAC ABBREVIATION MARK
08E2 ; Prepended_Concatenation_Mark # Cf ARABIC DISPUTED END OF AYAH
110BD ; Prepended_Concatenation_Mark # Cf KAITHI NUMBER SIGN
# Total code points: 10
# ================================================
1F1E6..1F1FF ; Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z
# Total code points: 26
# EOF

View File

@ -1,10 +1,11 @@
# PropertyValueAliases-8.0.0.txt
# Date: 2015-03-11, 22:29:33 GMT [MD]
# PropertyValueAliases-10.0.0.txt
# Date: 2017-05-17, 08:45:34 GMT
# Copyright (c) 2017 Unicode, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# For documentation, see http://www.unicode.org/reports/tr44/
#
# This file contains aliases for property values used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
@ -78,6 +79,8 @@ age; 6.2 ; V6_2
age; 6.3 ; V6_3
age; 7.0 ; V7_0
age; 8.0 ; V8_0
age; 9.0 ; V9_0
age; 10.0 ; V10_0
age; NA ; Unassigned
# Alphabetic (Alpha)
@ -138,6 +141,7 @@ bpt; o ; Open
# Block (blk)
blk; Adlam ; Adlam
blk; Aegean_Numbers ; Aegean_Numbers
blk; Ahom ; Ahom
blk; Alchemical ; Alchemical_Symbols
@ -162,6 +166,7 @@ blk; Bamum_Sup ; Bamum_Supplement
blk; Bassa_Vah ; Bassa_Vah
blk; Batak ; Batak
blk; Bengali ; Bengali
blk; Bhaiksuki ; Bhaiksuki
blk; Block_Elements ; Block_Elements
blk; Bopomofo ; Bopomofo
blk; Bopomofo_Ext ; Bopomofo_Extended
@ -187,6 +192,7 @@ blk; CJK_Ext_B ; CJK_Unified_Ideographs_Extension_B
blk; CJK_Ext_C ; CJK_Unified_Ideographs_Extension_C
blk; CJK_Ext_D ; CJK_Unified_Ideographs_Extension_D
blk; CJK_Ext_E ; CJK_Unified_Ideographs_Extension_E
blk; CJK_Ext_F ; CJK_Unified_Ideographs_Extension_F
blk; CJK_Radicals_Sup ; CJK_Radicals_Supplement
blk; CJK_Strokes ; CJK_Strokes
blk; CJK_Symbols ; CJK_Symbols_And_Punctuation
@ -202,6 +208,7 @@ blk; Cypriot_Syllabary ; Cypriot_Syllabary
blk; Cyrillic ; Cyrillic
blk; Cyrillic_Ext_A ; Cyrillic_Extended_A
blk; Cyrillic_Ext_B ; Cyrillic_Extended_B
blk; Cyrillic_Ext_C ; Cyrillic_Extended_C
blk; Cyrillic_Sup ; Cyrillic_Supplement ; Cyrillic_Supplementary
blk; Deseret ; Deseret
blk; Devanagari ; Devanagari
@ -230,6 +237,7 @@ blk; Geometric_Shapes_Ext ; Geometric_Shapes_Extended
blk; Georgian ; Georgian
blk; Georgian_Sup ; Georgian_Supplement
blk; Glagolitic ; Glagolitic
blk; Glagolitic_Sup ; Glagolitic_Supplement
blk; Gothic ; Gothic
blk; Grantha ; Grantha
blk; Greek ; Greek_And_Coptic
@ -246,6 +254,7 @@ blk; High_PU_Surrogates ; High_Private_Use_Surrogates
blk; High_Surrogates ; High_Surrogates
blk; Hiragana ; Hiragana
blk; IDC ; Ideographic_Description_Characters
blk; Ideographic_Symbols ; Ideographic_Symbols_And_Punctuation
blk; Imperial_Aramaic ; Imperial_Aramaic
blk; Indic_Number_Forms ; Common_Indic_Number_Forms
blk; Inscriptional_Pahlavi ; Inscriptional_Pahlavi
@ -256,6 +265,7 @@ blk; Jamo_Ext_A ; Hangul_Jamo_Extended_A
blk; Jamo_Ext_B ; Hangul_Jamo_Extended_B
blk; Javanese ; Javanese
blk; Kaithi ; Kaithi
blk; Kana_Ext_A ; Kana_Extended_A
blk; Kana_Sup ; Kana_Supplement
blk; Kanbun ; Kanbun
blk; Kangxi ; Kangxi_Radicals
@ -291,6 +301,8 @@ blk; Mahjong ; Mahjong_Tiles
blk; Malayalam ; Malayalam
blk; Mandaic ; Mandaic
blk; Manichaean ; Manichaean
blk; Marchen ; Marchen
blk; Masaram_Gondi ; Masaram_Gondi
blk; Math_Alphanum ; Mathematical_Alphanumeric_Symbols
blk; Math_Operators ; Mathematical_Operators
blk; Meetei_Mayek ; Meetei_Mayek
@ -309,6 +321,7 @@ blk; Modi ; Modi
blk; Modifier_Letters ; Spacing_Modifier_Letters
blk; Modifier_Tone_Letters ; Modifier_Tone_Letters
blk; Mongolian ; Mongolian
blk; Mongolian_Sup ; Mongolian_Supplement
blk; Mro ; Mro
blk; Multani ; Multani
blk; Music ; Musical_Symbols
@ -318,8 +331,10 @@ blk; Myanmar_Ext_B ; Myanmar_Extended_B
blk; Nabataean ; Nabataean
blk; NB ; No_Block
blk; New_Tai_Lue ; New_Tai_Lue
blk; Newa ; Newa
blk; NKo ; NKo
blk; Number_Forms ; Number_Forms
blk; Nushu ; Nushu
blk; OCR ; Optical_Character_Recognition
blk; Ogham ; Ogham
blk; Ol_Chiki ; Ol_Chiki
@ -332,6 +347,7 @@ blk; Old_South_Arabian ; Old_South_Arabian
blk; Old_Turkic ; Old_Turkic
blk; Oriya ; Oriya
blk; Ornamental_Dingbats ; Ornamental_Dingbats
blk; Osage ; Osage
blk; Osmanya ; Osmanya
blk; Pahawh_Hmong ; Pahawh_Hmong
blk; Palmyrene ; Palmyrene
@ -358,6 +374,7 @@ blk; Sinhala ; Sinhala
blk; Sinhala_Archaic_Numbers ; Sinhala_Archaic_Numbers
blk; Small_Forms ; Small_Form_Variants
blk; Sora_Sompeng ; Sora_Sompeng
blk; Soyombo ; Soyombo
blk; Specials ; Specials
blk; Sundanese ; Sundanese
blk; Sundanese_Sup ; Sundanese_Supplement
@ -373,6 +390,7 @@ blk; Super_And_Sub ; Superscripts_And_Subscripts
blk; Sutton_SignWriting ; Sutton_SignWriting
blk; Syloti_Nagri ; Syloti_Nagri
blk; Syriac ; Syriac
blk; Syriac_Sup ; Syriac_Supplement
blk; Tagalog ; Tagalog
blk; Tagbanwa ; Tagbanwa
blk; Tags ; Tags
@ -382,6 +400,8 @@ blk; Tai_Viet ; Tai_Viet
blk; Tai_Xuan_Jing ; Tai_Xuan_Jing_Symbols
blk; Takri ; Takri
blk; Tamil ; Tamil
blk; Tangut ; Tangut
blk; Tangut_Components ; Tangut_Components
blk; Telugu ; Telugu
blk; Thaana ; Thaana
blk; Thai ; Thai
@ -401,6 +421,7 @@ blk; Warang_Citi ; Warang_Citi
blk; Yi_Radicals ; Yi_Radicals
blk; Yi_Syllables ; Yi_Syllables
blk; Yijing ; Yijing_Hexagram_Symbols
blk; Zanabazar_Square ; Zanabazar_Square
# Canonical_Combining_Class (ccc)
@ -650,7 +671,11 @@ Gr_Base; Y ; Yes ; T
GCB; CN ; Control
GCB; CR ; CR
GCB; EB ; E_Base
GCB; EBG ; E_Base_GAZ
GCB; EM ; E_Modifier
GCB; EX ; Extend
GCB; GAZ ; Glue_After_Zwj
GCB; L ; L
GCB; LF ; LF
GCB; LV ; LV
@ -661,6 +686,7 @@ GCB; SM ; SpacingMark
GCB; T ; T
GCB; V ; V
GCB; XX ; Other
GCB; ZWJ ; ZWJ
# Grapheme_Extend (Gr_Ext)
@ -723,6 +749,7 @@ Ideo; Y ; Yes ; T
# Indic_Positional_Category (InPC)
InPC; Bottom ; Bottom
InPC; Bottom_And_Left ; Bottom_And_Left
InPC; Bottom_And_Right ; Bottom_And_Right
InPC; Left ; Left
InPC; Left_And_Right ; Left_And_Right
@ -838,6 +865,9 @@ Join_C; Y ; Yes ; T
# Joining_Group (jg)
jg ; African_Feh ; African_Feh
jg ; African_Noon ; African_Noon
jg ; African_Qaf ; African_Qaf
jg ; Ain ; Ain
jg ; Alaph ; Alaph
jg ; Alef ; Alef
@ -864,6 +894,17 @@ jg ; Khaph ; Khaph
jg ; Knotted_Heh ; Knotted_Heh
jg ; Lam ; Lam
jg ; Lamadh ; Lamadh
jg ; Malayalam_Bha ; Malayalam_Bha
jg ; Malayalam_Ja ; Malayalam_Ja
jg ; Malayalam_Lla ; Malayalam_Lla
jg ; Malayalam_Llla ; Malayalam_Llla
jg ; Malayalam_Nga ; Malayalam_Nga
jg ; Malayalam_Nna ; Malayalam_Nna
jg ; Malayalam_Nnna ; Malayalam_Nnna
jg ; Malayalam_Nya ; Malayalam_Nya
jg ; Malayalam_Ra ; Malayalam_Ra
jg ; Malayalam_Ssa ; Malayalam_Ssa
jg ; Malayalam_Tta ; Malayalam_Tta
jg ; Manichaean_Aleph ; Manichaean_Aleph
jg ; Manichaean_Ayin ; Manichaean_Ayin
jg ; Manichaean_Beth ; Manichaean_Beth
@ -948,6 +989,8 @@ lb ; CL ; Close_Punctuation
lb ; CM ; Combining_Mark
lb ; CP ; Close_Parenthesis
lb ; CR ; Carriage_Return
lb ; EB ; E_Base
lb ; EM ; E_Modifier
lb ; EX ; Exclamation
lb ; GL ; Glue
lb ; H2 ; H2
@ -976,6 +1019,7 @@ lb ; SY ; Break_Symbols
lb ; WJ ; Word_Joiner
lb ; XX ; Unknown
lb ; ZW ; ZWSpace
lb ; ZWJ ; ZWJ
# Logical_Order_Exception (LOE)
@ -1096,6 +1140,11 @@ Pat_Syn; Y ; Yes ; T
Pat_WS; N ; No ; F ; False
Pat_WS; Y ; Yes ; T ; True
# Prepended_Concatenation_Mark (PCM)
PCM; N ; No ; F ; False
PCM; Y ; Yes ; T ; True
# Quotation_Mark (QMark)
QMark; N ; No ; F ; False
@ -1106,13 +1155,14 @@ QMark; Y ; Yes ; T
Radical; N ; No ; F ; False
Radical; Y ; Yes ; T ; True
# STerm (STerm)
# Regional_Indicator (RI)
STerm; N ; No ; F ; False
STerm; Y ; Yes ; T ; True
RI ; N ; No ; F ; False
RI ; Y ; Yes ; T ; True
# Script (sc)
sc ; Adlm ; Adlam
sc ; Aghb ; Caucasian_Albanian
sc ; Ahom ; Ahom
sc ; Arab ; Arabic
@ -1124,6 +1174,7 @@ sc ; Bamu ; Bamum
sc ; Bass ; Bassa_Vah
sc ; Batk ; Batak
sc ; Beng ; Bengali
sc ; Bhks ; Bhaiksuki
sc ; Bopo ; Bopomofo
sc ; Brah ; Brahmi
sc ; Brai ; Braille
@ -1145,6 +1196,7 @@ sc ; Elba ; Elbasan
sc ; Ethi ; Ethiopic
sc ; Geor ; Georgian
sc ; Glag ; Glagolitic
sc ; Gonm ; Masaram_Gondi
sc ; Goth ; Gothic
sc ; Gran ; Grantha
sc ; Grek ; Greek
@ -1182,6 +1234,7 @@ sc ; Lydi ; Lydian
sc ; Mahj ; Mahajani
sc ; Mand ; Mandaic
sc ; Mani ; Manichaean
sc ; Marc ; Marchen
sc ; Mend ; Mende_Kikakui
sc ; Merc ; Meroitic_Cursive
sc ; Mero ; Meroitic_Hieroglyphs
@ -1194,11 +1247,14 @@ sc ; Mult ; Multani
sc ; Mymr ; Myanmar
sc ; Narb ; Old_North_Arabian
sc ; Nbat ; Nabataean
sc ; Newa ; Newa
sc ; Nkoo ; Nko
sc ; Nshu ; Nushu
sc ; Ogam ; Ogham
sc ; Olck ; Ol_Chiki
sc ; Orkh ; Old_Turkic
sc ; Orya ; Oriya
sc ; Osge ; Osage
sc ; Osma ; Osmanya
sc ; Palm ; Palmyrene
sc ; Pauc ; Pau_Cin_Hau
@ -1221,6 +1277,7 @@ sc ; Sidd ; Siddham
sc ; Sind ; Khudawadi
sc ; Sinh ; Sinhala
sc ; Sora ; Sora_Sompeng
sc ; Soyo ; Soyombo
sc ; Sund ; Sundanese
sc ; Sylo ; Syloti_Nagri
sc ; Syrc ; Syriac
@ -1229,6 +1286,7 @@ sc ; Takr ; Takri
sc ; Tale ; Tai_Le
sc ; Talu ; New_Tai_Lue
sc ; Taml ; Tamil
sc ; Tang ; Tangut
sc ; Tavt ; Tai_Viet
sc ; Telu ; Telugu
sc ; Tfng ; Tifinagh
@ -1243,6 +1301,7 @@ sc ; Wara ; Warang_Citi
sc ; Xpeo ; Old_Persian
sc ; Xsux ; Cuneiform
sc ; Yiii ; Yi
sc ; Zanb ; Zanabazar_Square
sc ; Zinh ; Inherited ; Qaai
sc ; Zyyy ; Common
sc ; Zzzz ; Unknown
@ -1269,6 +1328,11 @@ SB ; ST ; STerm
SB ; UP ; Upper
SB ; XX ; Other
# Sentence_Terminal (STerm)
STerm; N ; No ; F ; False
STerm; Y ; Yes ; T ; True
# Simple_Case_Folding (scf)
# @missing: 0000..10FFFF; Simple_Case_Folding; <code point>
@ -1322,6 +1386,13 @@ Upper; Y ; Yes ; T
VS ; N ; No ; F ; False
VS ; Y ; Yes ; T ; True
# Vertical_Orientation (vo)
vo ; R ; Rotated
vo ; Tr ; Transformed_Rotated
vo ; Tu ; Transformed_Upright
vo ; U ; Upright
# White_Space (WSpace)
WSpace; N ; No ; F ; False
@ -1331,9 +1402,13 @@ WSpace; Y ; Yes ; T
WB ; CR ; CR
WB ; DQ ; Double_Quote
WB ; EB ; E_Base
WB ; EBG ; E_Base_GAZ
WB ; EM ; E_Modifier
WB ; EX ; ExtendNumLet
WB ; Extend ; Extend
WB ; FO ; Format
WB ; GAZ ; Glue_After_Zwj
WB ; HL ; Hebrew_Letter
WB ; KA ; Katakana
WB ; LE ; ALetter
@ -1346,6 +1421,7 @@ WB ; NU ; Numeric
WB ; RI ; Regional_Indicator
WB ; SQ ; Single_Quote
WB ; XX ; Other
WB ; ZWJ ; ZWJ
# XID_Continue (XIDC)

View File

@ -1,10 +1,11 @@
# Scripts-8.0.0.txt
# Date: 2015-03-11, 22:29:42 GMT [MD]
# Scripts-10.0.0.txt
# Date: 2017-03-11, 06:40:37 GMT
# Copyright (c) 2017 Unicode, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# For documentation, see http://www.unicode.org/reports/tr44/
# For more information, see:
# UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
# Especially the sections:
@ -92,10 +93,10 @@
0605 ; Common # Cf ARABIC NUMBER MARK ABOVE
060C ; Common # Po ARABIC COMMA
061B ; Common # Po ARABIC SEMICOLON
061C ; Common # Cf ARABIC LETTER MARK
061F ; Common # Po ARABIC QUESTION MARK
0640 ; Common # Lm ARABIC TATWEEL
06DD ; Common # Cf ARABIC END OF AYAH
08E2 ; Common # Cf ARABIC DISPUTED END OF AYAH
0964..0965 ; Common # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
0E3F ; Common # Sc THAI CURRENCY SYMBOL BAHT
0FD5..0FD8 ; Common # So [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS
@ -110,6 +111,7 @@
1CEE..1CF1 ; Common # Lo [4] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ANUSVARA UBHAYATO MUKHA
1CF2..1CF3 ; Common # Mc [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA
1CF5..1CF6 ; Common # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
1CF7 ; Common # Mc VEDIC SIGN ATIKRAMA
2000..200A ; Common # Zs [11] EN QUAD..HAIR SPACE
200B ; Common # Cf ZERO WIDTH SPACE
200E..200F ; Common # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK
@ -153,7 +155,7 @@
208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN
208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS
208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS
20A0..20BE ; Common # Sc [31] EURO-CURRENCY SIGN..LARI SIGN
20A0..20BF ; Common # Sc [32] EURO-CURRENCY SIGN..BITCOIN SIGN
2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT
2102 ; Common # L& DOUBLE-STRUCK CAPITAL C
2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA
@ -223,8 +225,7 @@
239B..23B3 ; Common # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; Common # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; Common # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
23E2..23FA ; Common # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD
2400..2426 ; Common # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
23E2..2426 ; Common # So [69] WHITE TRAPEZIUM..SYMBOL FOR SUBSTITUTE FORM TWO
2440..244A ; Common # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
2460..249B ; Common # No [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP
249C..24E9 ; Common # So [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
@ -309,7 +310,7 @@
2B76..2B95 ; Common # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW
2B98..2BB9 ; Common # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX
2BBD..2BC8 ; Common # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED
2BCA..2BD1 ; Common # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN
2BCA..2BD2 ; Common # So [9] TOP HALF BLACK CIRCLE..GROUP MARK
2BEC..2BEF ; Common # So [4] LEFTWARDS TWO-HEADED ARROW WITH TRIANGLE ARROWHEADS..DOWNWARDS TWO-HEADED ARROW WITH TRIANGLE ARROWHEADS
2E00..2E01 ; Common # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
2E02 ; Common # Pi LEFT SUBSTITUTION BRACKET
@ -348,6 +349,7 @@
2E40 ; Common # Pd DOUBLE HYPHEN
2E41 ; Common # Po REVERSED COMMA
2E42 ; Common # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
2E43..2E49 ; Common # Po [7] DASH WITH LEFT UPTURN..DOUBLE STACKED COMMA
2FF0..2FFB ; Common # So [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
3000 ; Common # Zs IDEOGRAPHIC SPACE
3001..3003 ; Common # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
@ -572,19 +574,18 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR
1F100..1F10C ; Common # No [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO
1F110..1F12E ; Common # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ
1F130..1F16B ; Common # So [60] SQUARED LATIN CAPITAL LETTER A..RAISED MD SIGN
1F170..1F19A ; Common # So [43] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VS
1F170..1F1AC ; Common # So [61] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VOD
1F1E6..1F1FF ; Common # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z
1F201..1F202 ; Common # So [2] SQUARED KATAKANA KOKO..SQUARED KATAKANA SA
1F210..1F23A ; Common # So [43] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-55B6
1F210..1F23B ; Common # So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D
1F240..1F248 ; Common # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
1F250..1F251 ; Common # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
1F260..1F265 ; Common # So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
1F300..1F3FA ; Common # So [251] CYCLONE..AMPHORA
1F3FB..1F3FF ; Common # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
1F400..1F579 ; Common # So [378] RAT..JOYSTICK
1F57B..1F5A3 ; Common # So [41] LEFT HAND TELEPHONE RECEIVER..BLACK DOWN POINTING BACKHAND INDEX
1F5A5..1F6D0 ; Common # So [300] DESKTOP COMPUTER..PLACE OF WORSHIP
1F400..1F6D4 ; Common # So [725] RAT..PAGODA
1F6E0..1F6EC ; Common # So [13] HAMMER AND WRENCH..AIRPLANE ARRIVING
1F6F0..1F6F3 ; Common # So [4] SATELLITE..PASSENGER SHIP
1F6F0..1F6F8 ; Common # So [9] SATELLITE..FLYING SAUCER
1F700..1F773 ; Common # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
1F780..1F7D4 ; Common # So [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR
1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
@ -592,13 +593,17 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR
1F850..1F859 ; Common # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
1F860..1F887 ; Common # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
1F910..1F918 ; Common # So [9] ZIPPER-MOUTH FACE..SIGN OF THE HORNS
1F980..1F984 ; Common # So [5] CRAB..UNICORN FACE
1F900..1F90B ; Common # So [12] CIRCLED CROSS FORMEE WITH FOUR DOTS..DOWNWARD FACING NOTCHED HOOK WITH DOT
1F910..1F93E ; Common # So [47] ZIPPER-MOUTH FACE..HANDBALL
1F940..1F94C ; Common # So [13] WILTED FLOWER..CURLING STONE
1F950..1F96B ; Common # So [28] CROISSANT..CANNED FOOD
1F980..1F997 ; Common # So [24] CRAB..CRICKET
1F9C0 ; Common # So CHEESE WEDGE
1F9D0..1F9E6 ; Common # So [23] FACE WITH MONOCLE..SOCKS
E0001 ; Common # Cf LANGUAGE TAG
E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG
# Total code points: 7179
# Total code points: 7363
# ================================================
@ -641,7 +646,7 @@ A770 ; Latin # Lm MODIFIER LETTER US
A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
A78F ; Latin # Lo LATIN LETTER SINOLOGICAL DOT
A790..A7AD ; Latin # L& [30] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER L WITH BELT
A790..A7AE ; Latin # L& [31] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER SMALL CAPITAL I
A7B0..A7B7 ; Latin # L& [8] LATIN CAPITAL LETTER TURNED K..LATIN SMALL LETTER OMEGA
A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
@ -654,7 +659,7 @@ FB00..FB06 ; Latin # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE S
FF21..FF3A ; Latin # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
# Total code points: 1349
# Total code points: 1350
# ================================================
@ -708,13 +713,13 @@ AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA
10175..10178 ; Greek # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN
10179..10189 ; Greek # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN
1018A..1018B ; Greek # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN
1018C ; Greek # So GREEK SINUSOID SIGN
1018C..1018E ; Greek # So [3] GREEK SINUSOID SIGN..NOMISMA SIGN
101A0 ; Greek # So GREEK SYMBOL TAU RHO
1D200..1D241 ; Greek # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
1D242..1D244 ; Greek # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
1D245 ; Greek # So GREEK MUSICAL LEIMMA
# Total code points: 516
# Total code points: 518
# ================================================
@ -724,6 +729,7 @@ AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA
0487 ; Cyrillic # Mn COMBINING CYRILLIC POKRYTIE
0488..0489 ; Cyrillic # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
048A..052F ; Cyrillic # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER
1C80..1C88 ; Cyrillic # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK
1D2B ; Cyrillic # L& CYRILLIC LETTER SMALL CAPITAL EL
1D78 ; Cyrillic # Lm MODIFIER LETTER CYRILLIC EN
2DE0..2DFF ; Cyrillic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
@ -740,7 +746,7 @@ A69C..A69D ; Cyrillic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER
A69E..A69F ; Cyrillic # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E
FE2E..FE2F ; Cyrillic # Mn [2] COMBINING CYRILLIC TITLO LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF
# Total code points: 434
# Total code points: 443
# ================================================
@ -791,6 +797,7 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU
060D ; Arabic # Po ARABIC DATE SEPARATOR
060E..060F ; Arabic # So [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA
0610..061A ; Arabic # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA
061C ; Arabic # Cf ARABIC LETTER MARK
061E ; Arabic # Po ARABIC TRIPLE DOT PUNCTUATION MARK
0620..063F ; Arabic # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
0641..064A ; Arabic # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH
@ -815,6 +822,8 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU
06FF ; Arabic # Lo ARABIC LETTER HEH WITH INVERTED V
0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE
08A0..08B4 ; Arabic # Lo [21] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER KAF WITH DOT BELOW
08B6..08BD ; Arabic # Lo [8] ARABIC LETTER BEH WITH SMALL MEEM ABOVE..ARABIC LETTER AFRICAN NOON
08D4..08E1 ; Arabic # Mn [14] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH SIGN SAFHA
08E3..08FF ; Arabic # Mn [29] ARABIC TURNED DAMMA BELOW..ARABIC MARK SIDEWAYS NOON GHUNNA
FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
FBB2..FBC1 ; Arabic # Sk [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
@ -862,7 +871,7 @@ FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LA
1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL
# Total code points: 1257
# Total code points: 1280
# ================================================
@ -873,8 +882,9 @@ FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LA
0712..072F ; Syriac # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH
0730..074A ; Syriac # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH
074D..074F ; Syriac # Lo [3] SYRIAC LETTER SOGDIAN ZHAIN..SYRIAC LETTER SOGDIAN FE
0860..086A ; Syriac # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
# Total code points: 77
# Total code points: 88
# ================================================
@ -944,8 +954,10 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
09F4..09F9 ; Bengali # No [6] BENGALI CURRENCY NUMERATOR ONE..BENGALI CURRENCY DENOMINATOR SIXTEEN
09FA ; Bengali # So BENGALI ISSHAR
09FB ; Bengali # Sc BENGALI GANDA MARK
09FC ; Bengali # Lo BENGALI LETTER VEDIC ANUSVARA
09FD ; Bengali # Po BENGALI ABBREVIATION SIGN
# Total code points: 93
# Total code points: 95
# ================================================
@ -998,8 +1010,9 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
0AF0 ; Gujarati # Po GUJARATI ABBREVIATION SIGN
0AF1 ; Gujarati # Sc GUJARATI RUPEE SIGN
0AF9 ; Gujarati # Lo GUJARATI LETTER ZHA
0AFA..0AFF ; Gujarati # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE
# Total code points: 85
# Total code points: 91
# ================================================
@ -1086,6 +1099,7 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
# ================================================
0C80 ; Kannada # Lo KANNADA SIGN SPACING CANDRABINDU
0C81 ; Kannada # Mn KANNADA SIGN CANDRABINDU
0C82..0C83 ; Kannada # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
0C85..0C8C ; Kannada # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
@ -1109,15 +1123,16 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
# Total code points: 87
# Total code points: 88
# ================================================
0D01 ; Malayalam # Mn MALAYALAM SIGN CANDRABINDU
0D00..0D01 ; Malayalam # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Malayalam # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D05..0D0C ; Malayalam # Lo [8] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L
0D0E..0D10 ; Malayalam # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
0D12..0D3A ; Malayalam # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA
0D3B..0D3C ; Malayalam # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
0D3D ; Malayalam # Lo MALAYALAM SIGN AVAGRAHA
0D3E..0D40 ; Malayalam # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
0D41..0D44 ; Malayalam # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
@ -1125,15 +1140,18 @@ A8FD ; Devanagari # Lo DEVANAGARI JAIN OM
0D4A..0D4C ; Malayalam # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU
0D4D ; Malayalam # Mn MALAYALAM SIGN VIRAMA
0D4E ; Malayalam # Lo MALAYALAM LETTER DOT REPH
0D4F ; Malayalam # So MALAYALAM SIGN PARA
0D54..0D56 ; Malayalam # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL
0D57 ; Malayalam # Mc MALAYALAM AU LENGTH MARK
0D58..0D5E ; Malayalam # No [7] MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH..MALAYALAM FRACTION ONE FIFTH
0D5F..0D61 ; Malayalam # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL
0D62..0D63 ; Malayalam # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL
0D66..0D6F ; Malayalam # Nd [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE
0D70..0D75 ; Malayalam # No [6] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE QUARTERS
0D70..0D78 ; Malayalam # No [9] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE SIXTEENTHS
0D79 ; Malayalam # So MALAYALAM DATE MARK
0D7A..0D7F ; Malayalam # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K
# Total code points: 100
# Total code points: 117
# ================================================
@ -1436,21 +1454,24 @@ AB70..ABBF ; Cherokee # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETT
1820..1842 ; Mongolian # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
1843 ; Mongolian # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN
1844..1877 ; Mongolian # Lo [52] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER MANCHU ZHA
1880..18A8 ; Mongolian # Lo [41] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER MANCHU ALI GALI BHA
1880..1884 ; Mongolian # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA
1885..1886 ; Mongolian # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
1887..18A8 ; Mongolian # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA
18A9 ; Mongolian # Mn MONGOLIAN LETTER ALI GALI DAGALGA
18AA ; Mongolian # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA
11660..1166C ; Mongolian # Po [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT
# Total code points: 153
# Total code points: 166
# ================================================
3041..3096 ; Hiragana # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
309D..309E ; Hiragana # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK
309F ; Hiragana # Lo HIRAGANA DIGRAPH YORI
1B001 ; Hiragana # Lo HIRAGANA LETTER ARCHAIC YE
1B001..1B11E ; Hiragana # Lo [286] HIRAGANA LETTER ARCHAIC YE..HENTAIGANA LETTER N-MU-MO-2
1F200 ; Hiragana # So SQUARE HIRAGANA HOKA
# Total code points: 91
# Total code points: 376
# ================================================
@ -1469,10 +1490,10 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
# ================================================
02EA..02EB ; Bopomofo # Sk [2] MODIFIER LETTER YIN DEPARTING TONE MARK..MODIFIER LETTER YANG DEPARTING TONE MARK
3105..312D ; Bopomofo # Lo [41] BOPOMOFO LETTER B..BOPOMOFO LETTER IH
3105..312E ; Bopomofo # Lo [42] BOPOMOFO LETTER B..BOPOMOFO LETTER O WITH DOT ABOVE
31A0..31BA ; Bopomofo # Lo [27] BOPOMOFO LETTER BU..BOPOMOFO LETTER ZY
# Total code points: 70
# Total code points: 71
# ================================================
@ -1485,16 +1506,17 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
3038..303A ; Han # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
303B ; Han # Lm VERTICAL IDEOGRAPHIC ITERATION MARK
3400..4DB5 ; Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
4E00..9FD5 ; Han # Lo [20950] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FD5
4E00..9FEA ; Han # Lo [20971] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEA
F900..FA6D ; Han # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
20000..2A6D6 ; Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
2A700..2B734 ; Han # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; Han # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2CEB0..2EBE0 ; Han # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
# Total code points: 81734
# Total code points: 89228
# ================================================
@ -1509,8 +1531,9 @@ A490..A4C6 ; Yi # So [55] YI RADICAL QOT..YI RADICAL KE
10300..1031F ; Old_Italic # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS
10320..10323 ; Old_Italic # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY
1032D..1032F ; Old_Italic # Lo [3] OLD ITALIC LETTER YE..OLD ITALIC LETTER SOUTHERN TSE
# Total code points: 36
# Total code points: 39
# ================================================
@ -1542,8 +1565,8 @@ A490..A4C6 ; Yi # So [55] YI RADICAL QOT..YI RADICAL KE
1CED ; Inherited # Mn VEDIC SIGN TIRYAK
1CF4 ; Inherited # Mn VEDIC TONE CANDRA ABOVE
1CF8..1CF9 ; Inherited # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
1DC0..1DF5 ; Inherited # Mn [54] COMBINING DOTTED GRAVE ACCENT..COMBINING UP TACK ABOVE
1DFC..1DFF ; Inherited # Mn [4] COMBINING DOUBLE INVERTED BREVE BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
1DC0..1DF9 ; Inherited # Mn [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
1DFB..1DFF ; Inherited # Mn [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
200C..200D ; Inherited # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
20D0..20DC ; Inherited # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
20DD..20E0 ; Inherited # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
@ -1562,7 +1585,7 @@ FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CON
1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
# Total code points: 563
# Total code points: 568
# ================================================
@ -1705,8 +1728,13 @@ E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-2
2C00..2C2E ; Glagolitic # L& [47] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
2C30..2C5E ; Glagolitic # L& [47] GLAGOLITIC SMALL LETTER AZU..GLAGOLITIC SMALL LETTER LATINATE MYSLITE
1E000..1E006 ; Glagolitic # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
1E008..1E018 ; Glagolitic # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
1E01B..1E021 ; Glagolitic # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
1E023..1E024 ; Glagolitic # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
1E026..1E02A ; Glagolitic # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
# Total code points: 94
# Total code points: 132
# ================================================
@ -1872,11 +1900,11 @@ A62A..A62B ; Vai # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO
A880..A881 ; Saurashtra # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA
A882..A8B3 ; Saurashtra # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA
A8B4..A8C3 ; Saurashtra # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU
A8C4 ; Saurashtra # Mn SAURASHTRA SIGN VIRAMA
A8C4..A8C5 ; Saurashtra # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU
A8CE..A8CF ; Saurashtra # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA
A8D0..A8D9 ; Saurashtra # Nd [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE
# Total code points: 81
# Total code points: 82
# ================================================
@ -2314,8 +2342,9 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI
11235 ; Khojki # Mc KHOJKI SIGN VIRAMA
11236..11237 ; Khojki # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
11238..1123D ; Khojki # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
1123E ; Khojki # Mn KHOJKI SIGN SUKUN
# Total code points: 61
# Total code points: 62
# ================================================
@ -2536,4 +2565,129 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI
# Total code points: 672
# ================================================
1E900..1E943 ; Adlam # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA
1E944..1E94A ; Adlam # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
1E950..1E959 ; Adlam # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
1E95E..1E95F ; Adlam # Po [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK
# Total code points: 87
# ================================================
11C00..11C08 ; Bhaiksuki # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
11C0A..11C2E ; Bhaiksuki # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
11C2F ; Bhaiksuki # Mc BHAIKSUKI VOWEL SIGN AA
11C30..11C36 ; Bhaiksuki # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L
11C38..11C3D ; Bhaiksuki # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA
11C3E ; Bhaiksuki # Mc BHAIKSUKI SIGN VISARGA
11C3F ; Bhaiksuki # Mn BHAIKSUKI SIGN VIRAMA
11C40 ; Bhaiksuki # Lo BHAIKSUKI SIGN AVAGRAHA
11C41..11C45 ; Bhaiksuki # Po [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2
11C50..11C59 ; Bhaiksuki # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE
11C5A..11C6C ; Bhaiksuki # No [19] BHAIKSUKI NUMBER ONE..BHAIKSUKI HUNDREDS UNIT MARK
# Total code points: 97
# ================================================
11C70..11C71 ; Marchen # Po [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD
11C72..11C8F ; Marchen # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A
11C92..11CA7 ; Marchen # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA
11CA9 ; Marchen # Mc MARCHEN SUBJOINED LETTER YA
11CAA..11CB0 ; Marchen # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA
11CB1 ; Marchen # Mc MARCHEN VOWEL SIGN I
11CB2..11CB3 ; Marchen # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E
11CB4 ; Marchen # Mc MARCHEN VOWEL SIGN O
11CB5..11CB6 ; Marchen # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU
# Total code points: 68
# ================================================
11400..11434 ; Newa # Lo [53] NEWA LETTER A..NEWA LETTER HA
11435..11437 ; Newa # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II
11438..1143F ; Newa # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI
11440..11441 ; Newa # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU
11442..11444 ; Newa # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA
11445 ; Newa # Mc NEWA SIGN VISARGA
11446 ; Newa # Mn NEWA SIGN NUKTA
11447..1144A ; Newa # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI
1144B..1144F ; Newa # Po [5] NEWA DANDA..NEWA ABBREVIATION SIGN
11450..11459 ; Newa # Nd [10] NEWA DIGIT ZERO..NEWA DIGIT NINE
1145B ; Newa # Po NEWA PLACEHOLDER MARK
1145D ; Newa # Po NEWA INSERTION SIGN
# Total code points: 92
# ================================================
104B0..104D3 ; Osage # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA
104D8..104FB ; Osage # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA
# Total code points: 72
# ================================================
16FE0 ; Tangut # Lm TANGUT ITERATION MARK
17000..187EC ; Tangut # Lo [6125] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187EC
18800..18AF2 ; Tangut # Lo [755] TANGUT COMPONENT-001..TANGUT COMPONENT-755
# Total code points: 6881
# ================================================
11D00..11D06 ; Masaram_Gondi # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E
11D08..11D09 ; Masaram_Gondi # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O
11D0B..11D30 ; Masaram_Gondi # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA
11D31..11D36 ; Masaram_Gondi # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R
11D3A ; Masaram_Gondi # Mn MASARAM GONDI VOWEL SIGN E
11D3C..11D3D ; Masaram_Gondi # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O
11D3F..11D45 ; Masaram_Gondi # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA
11D46 ; Masaram_Gondi # Lo MASARAM GONDI REPHA
11D47 ; Masaram_Gondi # Mn MASARAM GONDI RA-KARA
11D50..11D59 ; Masaram_Gondi # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
# Total code points: 75
# ================================================
16FE1 ; Nushu # Lm NUSHU ITERATION MARK
1B170..1B2FB ; Nushu # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
# Total code points: 397
# ================================================
11A50 ; Soyombo # Lo SOYOMBO LETTER A
11A51..11A56 ; Soyombo # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE
11A57..11A58 ; Soyombo # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU
11A59..11A5B ; Soyombo # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK
11A5C..11A83 ; Soyombo # Lo [40] SOYOMBO LETTER KA..SOYOMBO LETTER KSSA
11A86..11A89 ; Soyombo # Lo [4] SOYOMBO CLUSTER-INITIAL LETTER RA..SOYOMBO CLUSTER-INITIAL LETTER SA
11A8A..11A96 ; Soyombo # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA
11A97 ; Soyombo # Mc SOYOMBO SIGN VISARGA
11A98..11A99 ; Soyombo # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER
11A9A..11A9C ; Soyombo # Po [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD
11A9E..11AA2 ; Soyombo # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2
# Total code points: 80
# ================================================
11A00 ; Zanabazar_Square # Lo ZANABAZAR SQUARE LETTER A
11A01..11A06 ; Zanabazar_Square # Mn [6] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL SIGN O
11A07..11A08 ; Zanabazar_Square # Mc [2] ZANABAZAR SQUARE VOWEL SIGN AI..ZANABAZAR SQUARE VOWEL SIGN AU
11A09..11A0A ; Zanabazar_Square # Mn [2] ZANABAZAR SQUARE VOWEL SIGN REVERSED I..ZANABAZAR SQUARE VOWEL LENGTH MARK
11A0B..11A32 ; Zanabazar_Square # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA
11A33..11A38 ; Zanabazar_Square # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA
11A39 ; Zanabazar_Square # Mc ZANABAZAR SQUARE SIGN VISARGA
11A3A ; Zanabazar_Square # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA
11A3B..11A3E ; Zanabazar_Square # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA
11A3F..11A46 ; Zanabazar_Square # Po [8] ZANABAZAR SQUARE INITIAL HEAD MARK..ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK
11A47 ; Zanabazar_Square # Mn ZANABAZAR SQUARE SUBJOINER
# Total code points: 72
# EOF

View File

@ -0,0 +1,281 @@
# SpecialCasing-10.0.0.txt
# Date: 2017-04-14, 05:40:43 GMT
# Copyright (c) 2017 Unicode, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
#
# Special Casing
#
# This file is a supplement to the UnicodeData.txt file. It does not define any
# properties, but rather provides additional information about the casing of
# Unicode characters, for situations when casing incurs a change in string length
# or is dependent on context or locale. For compatibility, the UnicodeData.txt
# file only contains simple case mappings for characters where they are one-to-one
# and independent of context and language. The data in this file, combined with
# the simple case mappings in UnicodeData.txt, defines the full case mappings
# Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc).
#
# Note that the preferred mechanism for defining tailored casing operations is
# the Unicode Common Locale Data Repository (CLDR). For more information, see the
# discussion of case mappings and case algorithms in the Unicode Standard.
#
# All code points not listed in this file that do not have a simple case mappings
# in UnicodeData.txt map to themselves.
# ================================================================================
# Format
# ================================================================================
# The entries in this file are in the following machine-readable format:
#
# <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
#
# <code>, <lower>, <title>, and <upper> provide the respective full case mappings
# of <code>, expressed as character values in hex. If there is more than one character,
# they are separated by spaces. Other than as used to separate elements, spaces are
# to be ignored.
#
# The <condition_list> is optional. Where present, it consists of one or more language IDs
# or casing contexts, separated by spaces. In these conditions:
# - A condition list overrides the normal behavior if all of the listed conditions are true.
# - The casing context is always the context of the characters in the original string,
# NOT in the resulting string.
# - Case distinctions in the condition list are not significant.
# - Conditions preceded by "Not_" represent the negation of the condition.
# The condition list is not represented in the UCD as a formal property.
#
# A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
#
# A casing context for a character is defined by Section 3.13 Default Case Algorithms
# of The Unicode Standard.
#
# Parsers of this file must be prepared to deal with future additions to this format:
# * Additional contexts
# * Additional fields
# ================================================================================
# ================================================================================
# Unconditional mappings
# ================================================================================
# The German es-zed is special--the normal mapping is to SS.
# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))
00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
# Preserve canonical equivalence for I with dot. Turkic is handled below.
0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
# Ligatures
FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF
FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI
FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL
FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI
FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL
FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T
FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST
0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN
FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW
FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH
FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI
FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW
FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
# No corresponding uppercase precomposed character
0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON
1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW
1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS
1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE
1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE
1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING
1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI
1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI
1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI
1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI
1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI
# IMPORTANT-when iota-subscript (0345) is uppercased or titlecased,
# the result will be incorrect unless the iota-subscript is moved to the end
# of any sequence of combining marks. Otherwise, the accents will go on the capital iota.
# This process can be achieved by first transforming the text to NFC before casing.
# E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA>
# The following cases are already in the UnicodeData.txt file, so are only commented here.
# 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI
# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
# have special uppercases.
# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!
1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
# Some characters with YPOGEGRAMMENI also have no corresponding titlecases
1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
# ================================================================================
# Conditional Mappings
# The remainder of this file provides conditional casing data used to produce
# full case mappings.
# ================================================================================
# Language-Insensitive Mappings
# These are characters whose full case mappings do not depend on language, but do
# depend on context (which characters come before or after). For more information
# see the header of this file and the Unicode Standard.
# ================================================================================
# Special case for final form of sigma
03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
# Note: the following cases for non-final are already in the UnicodeData.txt file.
# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
# Note: the following cases are not included, since they would case-fold in lowercasing
# 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA
# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
# ================================================================================
# Language-Sensitive Mappings
# These are characters whose full case mappings depend on language and perhaps also
# context (which characters come before or after). For more information
# see the header of this file and the Unicode Standard.
# ================================================================================
# Lithuanian
# Lithuanian retains the dot in a lowercase i when followed by accents.
# Remove DOT ABOVE after "i" with upper or titlecase
0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
# Introduce an explicit dot above when lowercasing capital I's and J's
# whenever there are more accents above.
# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
# ================================================================================
# Turkish and Azeri
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# The following rules handle those cases.
0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
# This matches the behavior of the canonically equivalent I-dot_above
0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
# When uppercasing, i turns into a dotted capital I
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
# Note: the following case is already in the UnicodeData.txt file.
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
# EOF

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -23,7 +23,7 @@
/**
* @test
* @bug 8080535
* @bug 8080535 8191410
* @summary Expected size of Character.UnicodeBlock.map is not optimal
* @library /lib/testlibrary
* @modules java.base/java.lang:open
@ -41,8 +41,8 @@ import jdk.testlibrary.OptimalCapacity;
// According to http://www.unicode.org/versions/beta-8.0.0.html ,
// in Unicode 8 there will be added 10 more blocks (30 with aliases).
//
// After implementing support of Unicode 7 and 8 in Java, there will
// be 510+96+30 = 636 entries in Character.UnicodeBlock.map.
// After implementing support of Unicode 9 and 10 in Java, there will
// be 638 entries in Character.UnicodeBlock.map.
//
// Initialization of the map and this test will have to be adjusted
// accordingly then.
@ -51,7 +51,7 @@ public class OptimalMapSize {
public static void main(String[] args) throws Throwable {
// The initial size of Character.UnicodeBlock.map.
// See src/java.base/share/classes/java/lang/Character.java
int initialCapacity = (int)(510 / 0.75f + 1.0f);
int initialCapacity = (int)(638 / 0.75f + 1.0f);
OptimalCapacity.ofHashMap(Character.UnicodeBlock.class,
"map", initialCapacity);

View File

@ -0,0 +1,177 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 4397357 6565620 6959267 8032446 8072600
* @summary Confirm normal case mappings are handled correctly.
* @run main/timeout=200 UnicodeCasingTest
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
/**
 * Checks {@link Character#toUpperCase(int)}, {@link Character#toLowerCase(int)}
 * and {@link Character#toTitleCase(int)} against the simple case mappings
 * listed in a {@code UnicodeData.txt} file.
 *
 * <p>The data file is looked up in the directory named by the
 * {@code test.src} system property (current directory if unset). The whole
 * file is replayed once per locale in {@link #locales}, with that locale
 * installed as the JVM default, and the original default locale is restored
 * before the test returns or throws.
 */
public class UnicodeCasingTest {

    /** Index of the simple uppercase mapping in a split data line. */
    private static final int UPPER_FIELD = 12;
    /** Index of the simple lowercase mapping in a split data line. */
    private static final int LOWER_FIELD = 13;
    /** Index of the simple titlecase mapping in a split data line. */
    private static final int TITLE_FIELD = 14;

    /** Set as soon as any mapping mismatch or read/parse error is seen. */
    private static boolean err = false;

    // Locales which are used for testing
    private static List<Locale> locales = new ArrayList<>();

    static {
        locales.add(new Locale("az", ""));
        locales.addAll(java.util.Arrays.asList(Locale.getAvailableLocales()));
    }

    /**
     * Runs the test.
     *
     * @param args ignored
     * @throws RuntimeException if any mapping differs from the data file or
     *         the data file cannot be read or parsed
     */
    public static void main(String[] args) {
        UnicodeCasingTest specialCasingTest = new UnicodeCasingTest();
        specialCasingTest.test();
    }

    /**
     * Replays the data file under every test locale and reports the overall
     * outcome.
     */
    private void test() {
        Locale defaultLocale = Locale.getDefault();
        Exception failure = null;

        try {
            File file = new File(System.getProperty("test.src", "."),
                                 "UnicodeData.txt");

            for (Locale locale : locales) {
                Locale.setDefault(locale);
                System.out.println("Testing on " + locale + " locale....");

                // try-with-resources closes the reader on every exit path,
                // so no close() failure is silently discarded.
                try (BufferedReader in = new BufferedReader(new FileReader(file))) {
                    String line;
                    while ((line = in.readLine()) != null) {
                        if (line.length() == 0 || line.charAt(0) == '#') {
                            continue;
                        }
                        test(line);
                    }
                }
            }
        } catch (Exception e) {
            err = true;
            failure = e;
            e.printStackTrace();
        } finally {
            Locale.setDefault(defaultLocale);
        }

        // Reported outside the finally block so that an in-flight Error is
        // not replaced by this exception.
        if (err) {
            throw new RuntimeException("UnicodeCasingTest failed.", failure);
        }
        System.out.println("UnicodeCasingTest passed.");
    }

    /**
     * Checks the three simple case mappings described by one data line.
     * An empty mapping field means the character is expected to map to itself.
     *
     * @param line one non-comment, non-empty line of the data file
     */
    private void test(String line) {
        // A limit of 15 keeps trailing empty fields, so index 14 always exists
        // for a well-formed line.
        String[] fields = line.split(";", 15);
        int orig = convert(fields[0]);

        testUpperCase(orig, expectedMapping(fields[UPPER_FIELD], orig));
        testLowerCase(orig, expectedMapping(fields[LOWER_FIELD], orig));
        testTitleCase(orig, expectedMapping(fields[TITLE_FIELD], orig));
    }

    /** Returns the parsed mapping, or {@code orig} when the field is empty. */
    private int expectedMapping(String field, int orig) {
        return field.length() != 0 ? convert(field) : orig;
    }

    private void testUpperCase(int orig, int expected) {
        check("toUpperCase", orig, Character.toUpperCase(orig), expected);
    }

    private void testLowerCase(int orig, int expected) {
        check("toLowerCase", orig, Character.toLowerCase(orig), expected);
    }

    private void testTitleCase(int orig, int expected) {
        check("toTitleCase", orig, Character.toTitleCase(orig), expected);
    }

    /**
     * Records and prints a failure when {@code got} differs from
     * {@code expected}.
     *
     * @param method   name of the {@code Character} method under test
     * @param orig     the code point that was converted
     * @param got      the value returned by the method
     * @param expected the value taken from the data file
     */
    private void check(String method, int orig, int got, int expected) {
        if (expected != got) {
            err = true;
            System.err.println(method + "(" + toString(orig) +
                ") failed.\n\tOriginal: " + toString(orig) +
                "\n\tGot:      " + toString(got) +
                "\n\tExpected: " + toString(expected));
        }
    }

    /** Parses a hexadecimal code point field. */
    private int convert(String str) {
        return Integer.parseInt(str, 16);
    }

    /** Formats a code point as upper-case hex. */
    private String toString(int i) {
        // Locale.ROOT: the default locale is being changed by this very test.
        return Integer.toHexString(i).toUpperCase(Locale.ROOT);
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,756 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.File;
import java.util.regex.Pattern;
import java.util.ArrayList;
/**
* The UnicodeSpec class provides a way to read in Unicode character
* properties from a Unicode data file. One instance of class UnicodeSpec
* holds a decoded version of one line of the data file. The file may
* be obtained from www.unicode.org. The method readSpecFile returns an array
* of UnicodeSpec objects.
*
* @author Guy Steele
* @author John O'Conner
*/
public class UnicodeSpec {
/**
 * Creates a spec for code point 0xFFFF by delegating to
 * {@link #UnicodeSpec(int)}.
 */
public UnicodeSpec() {
    this(0xFFFF);
}
/**
 * Creates a spec for the given code point with every other property set
 * to its "not defined" default: category UNASSIGNED, directionality
 * DIRECTIONALITY_UNDEFINED, not mirrored, all three case mappings 0xFFFF,
 * decimal and digit values -1, an empty numeric value, and null names
 * and comment.
 *
 * @param codePoint the code point this spec describes
 */
public UnicodeSpec(int codePoint) {
    this.codePoint = codePoint;

    // Names and free-text properties start out absent.
    this.name = null;
    this.oldName = null;
    this.comment = null;

    // Classification defaults.
    this.generalCategory = UNASSIGNED;
    this.bidiCategory = DIRECTIONALITY_UNDEFINED;
    this.mirrored = false;

    // 0xFFFF marks "no case mapping".
    this.upperMap = 0xFFFF;
    this.lowerMap = 0xFFFF;
    this.titleMap = 0xFFFF;

    // -1 / empty string mark "no numeric property".
    this.decimalValue = -1;
    this.digitValue = -1;
    this.numericValue = "";
}
/**
 * Returns the code point as six hex digits, followed by
 * {@code ", upper=..."}, {@code ", lower=..."} and {@code ", title=..."}
 * for each case mapping whose getter does not return 0xFFFF.
 */
public String toString() {
    final int noMapping = 0xffff;
    final StringBuilder text = new StringBuilder(hex6(codePoint));

    if (getUpperMap() != noMapping) {
        text.append(", upper=");
        text.append(hex6(upperMap));
    }
    if (getLowerMap() != noMapping) {
        text.append(", lower=");
        text.append(hex6(lowerMap));
    }
    if (getTitleMap() != noMapping) {
        text.append(", title=");
        text.append(hex6(titleMap));
    }
    return text.toString();
}
/**
 * Formats the low 16 bits of {@code n} as exactly four upper-case
 * hexadecimal digits, padding on the left with zeros.
 */
static String hex4(int n) {
    final String digits = Long.toHexString(n & 0xFFFF).toUpperCase();
    final StringBuilder padded = new StringBuilder(4);
    for (int missing = 4 - digits.length(); missing > 0; missing--) {
        padded.append('0');
    }
    return padded.append(digits).toString();
}
/**
 * Formats the low 24 bits of {@code n} as exactly six upper-case
 * hexadecimal digits, padding on the left with zeros.
 */
static String hex6(int n) {
    final String digits = Integer.toHexString(n & 0xFFFFFF).toUpperCase();
    final StringBuilder padded = new StringBuilder(6);
    for (int missing = 6 - digits.length(); missing > 0; missing--) {
        padded.append('0');
    }
    return padded.append(digits).toString();
}
/**
 * Decodes one line of a Unicode data file into a UnicodeSpec.
 *
 * <p>The line is split with {@code tokenSeparator} into at most
 * {@code REQUIRED_FIELDS} tokens and each token is handed to the matching
 * {@code parseXxx} helper. If any step throws, the message
 * "Error parsing spec line." is printed to standard output and
 * {@code null} is returned.
 *
 * @param s a line of the Unicode data file to be parsed
 * @return the populated UnicodeSpec, or null if the line could not be parsed
 */
public static UnicodeSpec parse(String s) {
    try {
        final String[] fields = tokenSeparator.split(s, REQUIRED_FIELDS);
        final UnicodeSpec parsed = new UnicodeSpec();

        parsed.setCodePoint(parseCodePoint(fields[FIELD_VALUE]));
        parsed.setName(parseName(fields[FIELD_NAME]));
        parsed.setGeneralCategory(parseGeneralCategory(fields[FIELD_CATEGORY]));
        parsed.setBidiCategory(parseBidiCategory(fields[FIELD_BIDI]));
        parsed.setCombiningClass(parseCombiningClass(fields[FIELD_CLASS]));
        parsed.setDecomposition(parseDecomposition(fields[FIELD_DECOMPOSITION]));
        parsed.setDecimalValue(parseDecimalValue(fields[FIELD_DECIMAL]));
        parsed.setDigitValue(parseDigitValue(fields[FIELD_DIGIT]));
        parsed.setNumericValue(parseNumericValue(fields[FIELD_NUMERIC]));
        parsed.setMirrored(parseMirrored(fields[FIELD_MIRRORED]));
        parsed.setOldName(parseOldName(fields[FIELD_OLDNAME]));
        parsed.setComment(parseComment(fields[FIELD_COMMENT]));
        parsed.setUpperMap(parseUpperMap(fields[FIELD_UPPERCASE]));
        parsed.setLowerMap(parseLowerMap(fields[FIELD_LOWERCASE]));
        parsed.setTitleMap(parseTitleMap(fields[FIELD_TITLECASE]));

        return parsed;
    } catch (Exception e) {
        System.out.println("Error parsing spec line.");
        return null;
    }
}
/**
 * Parses the code point field of a Unicode data line.
 *
 * The field is interpreted as a hexadecimal integer.
 *
 * @param s the code point field extracted from a line of the Unicode data file
 * @return the code point value
 * @throws NumberFormatException if {@code s} is not a valid hexadecimal integer
 */
public static int parseCodePoint(String s) throws NumberFormatException {
    final int hexRadix = 16;
    return Integer.parseInt(s, hexRadix);
}
/**
 * Returns the character name field unchanged.
 *
 * @param s the name field
 * @return {@code s}
 * @throws Exception if {@code s} is null
 */
public static String parseName(String s) throws Exception {
    if (s != null) {
        return s;
    }
    throw new Exception("Cannot parse name.");
}
/**
 * Maps a general category abbreviation to its index in
 * {@code generalCategoryList}.
 *
 * @param s the general category field
 * @return the index of the first entry whose SHORT name equals {@code s}
 * @throws Exception if no entry matches, or the matching index is not
 *         below GENERAL_CATEGORY_COUNT
 */
public static byte parseGeneralCategory(String s) throws Exception {
    byte match = GENERAL_CATEGORY_COUNT;
    byte index = 0;
    while (index < generalCategoryList.length) {
        if (s.equals(generalCategoryList[index][SHORT])) {
            match = index;
            break;
        }
        index++;
    }
    if (match < GENERAL_CATEGORY_COUNT) {
        return match;
    }
    throw new Exception("Could not parse general category.");
}
/**
 * Maps a bidirectional category abbreviation to its index in
 * {@code bidiCategoryList}.
 *
 * @param s the bidi category field
 * @return the index of the first entry whose SHORT name equals {@code s}
 * @throws Exception if no entry matches, or the matching index is not
 *         below DIRECTIONALITY_CATEGORY_COUNT
 */
public static byte parseBidiCategory(String s) throws Exception {
    byte match = DIRECTIONALITY_CATEGORY_COUNT;
    byte index = 0;
    while (index < bidiCategoryList.length) {
        if (s.equals(bidiCategoryList[index][SHORT])) {
            match = index;
            break;
        }
        index++;
    }
    if (match < DIRECTIONALITY_CATEGORY_COUNT) {
        return match;
    }
    throw new Exception("Could not parse bidi category.");
}
/**
 * Parses the combining class field of a Unicode data line.
 *
 * A non-empty field is read as a decimal integer; an empty field means
 * the property is not defined.
 *
 * @param s the combining class field extracted from a line of the Unicode data file
 * @return the combining class value, or -1 if the field is empty
 * @throws Exception if the field is non-empty and is not a decimal integer
 */
public static int parseCombiningClass(String s) throws Exception {
    if (s.length() == 0) {
        return -1;
    }
    return Integer.parseInt(s, 10);
}
/**
 * Returns the decomposition field unchanged.
 *
 * The decomposition attribute is complicated; for now, it is treated as
 * an opaque string.
 *
 * @param s the decomposition field extracted from a line of the Unicode data file
 * @return {@code s}
 * @throws Exception if {@code s} is null
 */
public static String parseDecomposition(String s) throws Exception {
    if (s != null) {
        return s;
    }
    throw new Exception("Cannot parse decomposition.");
}
/**
 * Parses the decimal value field of a Unicode data line.
 *
 * A non-empty field is read as a decimal integer; an empty field means
 * the character has no decimal value.
 *
 * @param s the decimal value field extracted from a line of the Unicode data file
 * @return the decimal value, or -1 if the field is empty
 * @throws NumberFormatException if the field is non-empty and is not a decimal integer
 */
public static int parseDecimalValue(String s) throws NumberFormatException {
    if (s.length() == 0) {
        return -1;
    }
    return Integer.parseInt(s, 10);
}
/**
 * Parses the digit value field of a Unicode data line.
 *
 * A non-empty field is read as a decimal integer; an empty field means
 * the character has no digit value.
 *
 * @param s the digit value field extracted from a line of the Unicode data file
 * @return the digit value, or -1 if the field is empty
 * @throws NumberFormatException if the field is non-empty and is not a decimal integer
 */
public static int parseDigitValue(String s) throws NumberFormatException {
    if (s.length() == 0) {
        return -1;
    }
    return Integer.parseInt(s, 10);
}
/**
 * Returns the numeric value field unchanged; it is kept as a string.
 *
 * @param s the numeric value field
 * @return {@code s}
 * @throws Exception if {@code s} is null
 */
public static String parseNumericValue(String s) throws Exception {
    if (s != null) {
        return s;
    }
    throw new Exception("Cannot parse numeric value.");
}
/**
 * Returns the comment field unchanged.
 *
 * @param s the comment field
 * @return {@code s}
 * @throws Exception if {@code s} is null
 */
public static String parseComment(String s) throws Exception {
    if (s != null) {
        return s;
    }
    throw new Exception("Cannot parse comment.");
}
/**
 * Parses the mirrored field, which must be exactly {@code "Y"} or
 * {@code "N"}.
 *
 * @param s the mirrored field
 * @return true for {@code "Y"}, false for {@code "N"}
 * @throws Exception if {@code s} is any other string
 */
public static boolean parseMirrored(String s) throws Exception {
    if (s.length() != 1) {
        throw new Exception("Cannot parse mirrored property.");
    }
    switch (s.charAt(0)) {
        case 'Y':
            return true;
        case 'N':
            return false;
        default:
            throw new Exception("Cannot parse mirrored property.");
    }
}
/**
 * Validates the old name attribute of a Unicode data line.
 *
 * @param s the old name attribute
 * @return {@code s} itself
 * @exception Exception if {@code s} is null
 */
public static String parseOldName(String s) throws Exception {
    if (s != null) {
        return s;
    }
    throw new Exception("Cannot parse old name");
}
/**
 * Converts the uppercase mapping attribute of a Unicode data line to a code point.
 *
 * An empty attribute means no uppercase mapping is defined. A non-empty
 * attribute must have at least four characters and is parsed as a
 * hexadecimal integer.
 *
 * @param s the uppercase mapping attribute extracted from a line of the Unicode data file
 * @return the uppercase mapping, or 0xFFFF if the attribute is empty
 * @exception NumberFormatException if the attribute has one to three characters
 *            or is not valid hexadecimal
 */
public static int parseUpperMap(String s) throws NumberFormatException {
    final int length = s.length();
    if (length == 0) {
        return 0xFFFF;
    }
    if (length < 4) {
        throw new NumberFormatException();
    }
    return Integer.parseInt(s, 16);
}
/**
 * Converts the lowercase mapping attribute of a Unicode data line to a code point.
 *
 * An empty attribute means no lowercase mapping is defined. A non-empty
 * attribute must have at least four characters and is parsed as a
 * hexadecimal integer.
 *
 * @param s the lowercase mapping attribute extracted from a line of the Unicode data file
 * @return the lowercase mapping, or 0xFFFF if the attribute is empty
 * @exception NumberFormatException if the attribute has one to three characters
 *            or is not valid hexadecimal
 */
public static int parseLowerMap(String s) throws NumberFormatException {
    final int length = s.length();
    if (length == 0) {
        return 0xFFFF;
    }
    if (length < 4) {
        throw new NumberFormatException();
    }
    return Integer.parseInt(s, 16);
}
/**
 * Converts the titlecase mapping attribute of a Unicode data line to a code point.
 *
 * An empty attribute means no titlecase mapping is defined. A non-empty
 * attribute must have at least four characters and is parsed as a
 * hexadecimal integer.
 *
 * @param s the titlecase mapping attribute extracted from a line of the Unicode data file
 * @return the titlecase mapping, or 0xFFFF if the attribute is empty
 * @exception NumberFormatException if the attribute has one to three characters
 *            or is not valid hexadecimal
 */
public static int parseTitleMap(String s) throws NumberFormatException {
    final int length = s.length();
    if (length == 0) {
        return 0xFFFF;
    }
    if (length < 4) {
        throw new NumberFormatException();
    }
    return Integer.parseInt(s, 16);
}
/**
 * Read and parse a Unicode data file, keeping only the entries of one plane.
 *
 * Lines are read in order. Entries whose plane (code point >>> 16) is lower
 * than {@code plane} are skipped, and reading stops at the first entry whose
 * plane is higher. Lines for which {@code parse} yields null are skipped.
 * An I/O error while reading ends the scan early; the entries collected up
 * to that point are still returned.
 *
 * @param file a file specifying the Unicode data file to be read
 * @param plane the Unicode plane whose entries are wanted
 * @return an array of UnicodeSpec objects, one for each line of the
 *         Unicode data file that could be successfully parsed as
 *         specifying Unicode character attributes in the requested plane
 * @exception FileNotFoundException if the file cannot be opened
 */
public static UnicodeSpec[] readSpecFile(File file, int plane) throws FileNotFoundException {
    ArrayList<UnicodeSpec> list = new ArrayList<>(3000);
    // Opened outside the try block so that FileNotFoundException reaches the
    // caller instead of being absorbed by the IOException handler below.
    BufferedReader source = new BufferedReader(new FileReader(file));
    try (BufferedReader f = source) {
        String line;
        while ((line = f.readLine()) != null) {
            UnicodeSpec item = parse(line.trim());
            // Check for null before touching the item: the previous code
            // dereferenced it first and only tested for null afterwards.
            if (item == null) {
                continue;
            }
            int specPlane = item.getCodePoint() >>> 16;
            if (specPlane < plane) {
                continue;
            }
            if (specPlane > plane) {
                break;
            }
            list.add(item);
        }
    } catch (IOException e) {
        // Best effort, as before: stop reading and return what was collected.
    }
    return list.toArray(new UnicodeSpec[list.size()]);
}
/**
 * Store the code point described by this Unicode specification.
 * @param value the code point
 */
void setCodePoint(int value) {
    this.codePoint = value;
}
/**
 * Return the code point in this Unicode specification.
 * @return the code point represented by the specification
 */
public int getCodePoint() {
    return this.codePoint;
}
/**
 * Store the character name.
 * @param name the name to record
 */
void setName(String name) {
    this.name = name;
}
/**
 * Return the character name.
 * @return the name last stored with setName
 */
public String getName() {
    return this.name;
}
/**
 * Store the general category code.
 * @param category the general category code
 */
void setGeneralCategory(byte category) {
    this.generalCategory = category;
}
/**
 * Return the general category code.
 * @return the value last stored with setGeneralCategory
 */
public byte getGeneralCategory() {
    return this.generalCategory;
}
/**
 * Store the bidirectional category code.
 * @param category the bidirectional category code
 */
void setBidiCategory(byte category) {
    this.bidiCategory = category;
}
/**
 * Return the bidirectional category code.
 * @return the value last stored with setBidiCategory
 */
public byte getBidiCategory() {
    return this.bidiCategory;
}
/**
 * Store the combining class.
 * @param combiningClass the combining class
 */
void setCombiningClass(int combiningClass) {
    this.combiningClass = combiningClass;
}
/**
 * Return the combining class.
 * @return the value last stored with setCombiningClass
 */
public int getCombiningClass() {
    return this.combiningClass;
}
/**
 * Store the decomposition attribute.
 * @param decomposition the decomposition text
 */
void setDecomposition(String decomposition) {
    this.decomposition = decomposition;
}
/**
 * Return the decomposition attribute.
 * @return the value last stored with setDecomposition
 */
public String getDecomposition() {
    return this.decomposition;
}
/**
 * Store the decimal digit value.
 * @param value the decimal value, or -1 when none is defined
 */
void setDecimalValue(int value) {
    this.decimalValue = value;
}
/**
 * Return the decimal digit value.
 * @return the stored decimal value; -1 marks "not defined"
 */
public int getDecimalValue() {
    return this.decimalValue;
}
/**
 * Tell whether a decimal value is defined.
 * @return true unless the stored decimal value is -1
 */
public boolean isDecimalValue() {
    return this.decimalValue != -1;
}
/**
 * Store the digit value.
 * @param value the digit value, or -1 when none is defined
 */
void setDigitValue(int value) {
    this.digitValue = value;
}
/**
 * Return the digit value.
 * @return the stored digit value; -1 marks "not defined"
 */
public int getDigitValue() {
    return this.digitValue;
}
/**
 * Tell whether a digit value is defined.
 * @return true unless the stored digit value is -1
 */
public boolean isDigitValue() {
    return this.digitValue != -1;
}
/**
 * Store the numeric value attribute, kept as text.
 * @param value the numeric value text
 */
void setNumericValue(String value) {
    this.numericValue = value;
}
/**
 * Return the numeric value attribute.
 * @return the text last stored with setNumericValue
 */
public String getNumericValue() {
    return this.numericValue;
}
/**
 * Tell whether a numeric value is defined.
 * @return true if the stored numeric value text is not empty
 */
public boolean isNumericValue() {
    return this.numericValue.length() > 0;
}
/**
 * Store the mirrored flag.
 * @param value the mirrored flag
 */
void setMirrored(boolean value) {
    this.mirrored = value;
}
/**
 * Return the mirrored flag.
 * @return the value last stored with setMirrored
 */
public boolean isMirrored() {
    return this.mirrored;
}
/**
 * Store the old name attribute.
 * @param name the old name
 */
void setOldName(String name) {
    this.oldName = name;
}
/**
 * Return the old name attribute.
 * @return the value last stored with setOldName
 */
public String getOldName() {
    return this.oldName;
}
/**
 * Store the comment attribute.
 * @param comment the comment text
 */
void setComment(String comment) {
    this.comment = comment;
}
/**
 * Return the comment attribute.
 * @return the value last stored with setComment
 */
public String getComment() {
    return this.comment;
}
/**
 * Store the uppercase mapping.
 * @param ch the uppercase mapping, or 0xFFFF when none is defined
 */
void setUpperMap(int ch) {
    this.upperMap = ch;
}
/**
 * Return the uppercase mapping.
 * @return the stored uppercase mapping; 0xFFFF marks "not defined"
 */
public int getUpperMap() {
    return this.upperMap;
}
/**
 * Tell whether an uppercase mapping is defined.
 * @return true unless the stored uppercase mapping is 0xFFFF
 */
public boolean hasUpperMap() {
    return this.upperMap != 0xffff;
}
/**
 * Store the lowercase mapping.
 * @param ch the lowercase mapping, or 0xFFFF when none is defined
 */
void setLowerMap(int ch) {
    this.lowerMap = ch;
}
/**
 * Return the lowercase mapping.
 * @return the stored lowercase mapping; 0xFFFF marks "not defined"
 */
public int getLowerMap() {
    return this.lowerMap;
}
/**
 * Tell whether a lowercase mapping is defined.
 * @return true unless the stored lowercase mapping is 0xFFFF
 */
public boolean hasLowerMap() {
    return this.lowerMap != 0xffff;
}
/**
 * Store the titlecase mapping.
 * @param ch the titlecase mapping, or 0xFFFF when none is defined
 */
void setTitleMap(int ch) {
    this.titleMap = ch;
}
/**
 * Return the titlecase mapping.
 * @return the stored titlecase mapping; 0xFFFF marks "not defined"
 */
public int getTitleMap() {
    return this.titleMap;
}
/**
 * Tell whether a titlecase mapping is defined.
 * @return true unless the stored titlecase mapping is 0xFFFF
 */
public boolean hasTitleMap() {
    return this.titleMap != 0xffff;
}
// Attributes of one character entry. Each field is written by the matching
// package-private setter and read by the matching public getter.
int codePoint; // the character's UTF-32 code value
String name; // the ASCII name
byte generalCategory; // general category, available via Character.getType()
byte bidiCategory; // available via Character.getBidiType()
int combiningClass; // not used in Character
String decomposition; // not used in Character
int decimalValue; // decimal digit value; -1 means none (see isDecimalValue)
int digitValue; // not all digits are decimal; -1 means none (see isDigitValue)
String numericValue; // numeric value if digit or non-digit; empty means none (see isNumericValue)
boolean mirrored; // mirrored flag (see isMirrored)
String oldName; // old name attribute
String comment; // comment attribute
int upperMap; // uppercase mapping; 0xffff means none (see hasUpperMap)
int lowerMap; // lowercase mapping; 0xffff means none (see hasLowerMap)
int titleMap; // titlecase mapping; 0xffff means none (see hasTitleMap)
// this is the number of fields in one line of the UnicodeData.txt file
// each field is separated by a semicolon (a token)
static final int REQUIRED_FIELDS = 15;
/**
 * General category types.
 * To preserve compatibility, these values cannot be changed.
 * Each value is also the row index of the matching entry in
 * generalCategoryList, so the two must be kept in step.
 */
public static final byte
UNASSIGNED = 0, // Cn normative
UPPERCASE_LETTER = 1, // Lu normative
LOWERCASE_LETTER = 2, // Ll normative
TITLECASE_LETTER = 3, // Lt normative
MODIFIER_LETTER = 4, // Lm normative
OTHER_LETTER = 5, // Lo normative
NON_SPACING_MARK = 6, // Mn informative
ENCLOSING_MARK = 7, // Me informative
COMBINING_SPACING_MARK = 8, // Mc normative
DECIMAL_DIGIT_NUMBER = 9, // Nd normative
LETTER_NUMBER = 10, // Nl normative
OTHER_NUMBER = 11, // No normative
SPACE_SEPARATOR = 12, // Zs normative
LINE_SEPARATOR = 13, // Zl normative
PARAGRAPH_SEPARATOR = 14, // Zp normative
CONTROL = 15, // Cc normative
FORMAT = 16, // Cf normative
// 17 is unused for no apparent reason,
// but must preserve forward compatibility
PRIVATE_USE = 18, // Co normative
SURROGATE = 19, // Cs normative
DASH_PUNCTUATION = 20, // Pd informative
START_PUNCTUATION = 21, // Ps informative
END_PUNCTUATION = 22, // Pe informative
CONNECTOR_PUNCTUATION = 23, // Pc informative
OTHER_PUNCTUATION = 24, // Po informative
MATH_SYMBOL = 25, // Sm informative
CURRENCY_SYMBOL = 26, // Sc informative
MODIFIER_SYMBOL = 27, // Sk informative
OTHER_SYMBOL = 28, // So informative
INITIAL_QUOTE_PUNCTUATION = 29, // Pi informative
FINAL_QUOTE_PUNCTUATION = 30, // Pf informative
// this value is only used in the character generation tool
// it can change to accommodate the addition of new categories.
GENERAL_CATEGORY_COUNT = 31; // sentinel value
// Column indices into the category name tables: the two-letter
// abbreviation (SHORT) or the constant name (LONG).
static final byte SHORT = 0, LONG = 1;
// general category type strings
// NOTE: The order of this category array is dependent on the assignment of
// category constants above. We want to access this array using constants above.
// [][SHORT] is the SHORT name, [][LONG] is the LONG name
static final String[][] generalCategoryList = {
{"Cn", "UNASSIGNED"},
{"Lu", "UPPERCASE_LETTER"},
{"Ll", "LOWERCASE_LETTER"},
{"Lt", "TITLECASE_LETTER"},
{"Lm", "MODIFIER_LETTER"},
{"Lo", "OTHER_LETTER"},
{"Mn", "NON_SPACING_MARK"},
{"Me", "ENCLOSING_MARK"},
{"Mc", "COMBINING_SPACING_MARK"},
{"Nd", "DECIMAL_DIGIT_NUMBER"},
{"Nl", "LETTER_NUMBER"},
{"No", "OTHER_NUMBER"},
{"Zs", "SPACE_SEPARATOR"},
{"Zl", "LINE_SEPARATOR"},
{"Zp", "PARAGRAPH_SEPARATOR"},
{"Cc", "CONTROL"},
{"Cf", "FORMAT"},
// placeholder row for the unused value 17, so later rows keep their index
{"xx", "unused"},
{"Co", "PRIVATE_USE"},
{"Cs", "SURROGATE"},
{"Pd", "DASH_PUNCTUATION"},
{"Ps", "START_PUNCTUATION"},
{"Pe", "END_PUNCTUATION"},
{"Pc", "CONNECTOR_PUNCTUATION"},
{"Po", "OTHER_PUNCTUATION"},
{"Sm", "MATH_SYMBOL"},
{"Sc", "CURRENCY_SYMBOL"},
{"Sk", "MODIFIER_SYMBOL"},
{"So", "OTHER_SYMBOL"},
{"Pi", "INITIAL_QUOTE_PUNCTUATION"},
{"Pf", "FINAL_QUOTE_PUNCTUATION"}
};
/**
 * Bidirectional categories.
 * Each non-negative value is the row index of the matching entry in
 * bidiCategoryList; DIRECTIONALITY_UNDEFINED (-1) has no row there.
 */
public static final byte
DIRECTIONALITY_UNDEFINED = -1,
// Strong category
DIRECTIONALITY_LEFT_TO_RIGHT = 0, // L
DIRECTIONALITY_RIGHT_TO_LEFT = 1, // R
DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2, // AL
// Weak category
DIRECTIONALITY_EUROPEAN_NUMBER = 3, // EN
DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4, // ES
DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5, // ET
DIRECTIONALITY_ARABIC_NUMBER = 6, // AN
DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7, // CS
DIRECTIONALITY_NONSPACING_MARK = 8, // NSM
DIRECTIONALITY_BOUNDARY_NEUTRAL = 9, // BN
// Neutral category
DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10, // B
DIRECTIONALITY_SEGMENT_SEPARATOR = 11, // S
DIRECTIONALITY_WHITESPACE = 12, // WS
DIRECTIONALITY_OTHER_NEUTRALS = 13, // ON
DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14, // LRE
DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15, // LRO
DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16, // RLE
DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17, // RLO
DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18, // PDF
DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE = 19, // LRI
DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE = 20, // RLI
DIRECTIONALITY_FIRST_STRONG_ISOLATE = 21, // FSI
DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE = 22, // PDI
DIRECTIONALITY_CATEGORY_COUNT = 23; // sentinel value
// If changes are made to the above bidi category assignments, this
// list of bidi category names must be changed to keep their order in sync.
// Access this list using the bidi category constants above.
// Each row holds the abbreviation first and the constant name second.
static final String[][] bidiCategoryList = {
{"L", "DIRECTIONALITY_LEFT_TO_RIGHT"},
{"R", "DIRECTIONALITY_RIGHT_TO_LEFT"},
{"AL", "DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC"},
{"EN", "DIRECTIONALITY_EUROPEAN_NUMBER"},
{"ES", "DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR"},
{"ET", "DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR"},
{"AN", "DIRECTIONALITY_ARABIC_NUMBER"},
{"CS", "DIRECTIONALITY_COMMON_NUMBER_SEPARATOR"},
{"NSM", "DIRECTIONALITY_NONSPACING_MARK"},
{"BN", "DIRECTIONALITY_BOUNDARY_NEUTRAL"},
{"B", "DIRECTIONALITY_PARAGRAPH_SEPARATOR"},
{"S", "DIRECTIONALITY_SEGMENT_SEPARATOR"},
{"WS", "DIRECTIONALITY_WHITESPACE"},
{"ON", "DIRECTIONALITY_OTHER_NEUTRALS"},
{"LRE", "DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING"},
{"LRO", "DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE"},
{"RLE", "DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING"},
{"RLO", "DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE"},
{"PDF", "DIRECTIONALITY_POP_DIRECTIONAL_FORMAT"},
{"LRI", "DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE"},
{"RLI", "DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE"},
{"FSI", "DIRECTIONALITY_FIRST_STRONG_ISOLATE"},
{"PDI", "DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE"},
};
// Unicode specification lines have fields in this order.
// Each constant is the zero-based position of that field on the line;
// there are REQUIRED_FIELDS (15) positions in total.
static final byte
FIELD_VALUE = 0,
FIELD_NAME = 1,
FIELD_CATEGORY = 2,
FIELD_CLASS = 3,
FIELD_BIDI = 4,
FIELD_DECOMPOSITION = 5,
FIELD_DECIMAL = 6,
FIELD_DIGIT = 7,
FIELD_NUMERIC = 8,
FIELD_MIRRORED = 9,
FIELD_OLDNAME = 10,
FIELD_COMMENT = 11,
FIELD_UPPERCASE = 12,
FIELD_LOWERCASE = 13,
FIELD_TITLECASE = 14;
// Precompiled pattern matching the semicolon that separates fields.
static final Pattern tokenSeparator = Pattern.compile(";");
/**
 * Command-line driver: reads the given Unicode data file and prints every
 * entry of the requested plane.
 *
 * Does nothing unless exactly two arguments are supplied. Any failure is
 * reported by printing its stack trace.
 *
 * @param args the data file path followed by the plane number
 */
public static void main(String[] args) {
    if (args.length != 2) {
        return;
    }
    try {
        File file = new File(args[0]);
        int plane = Integer.parseInt(args[1]);
        UnicodeSpec[] spec = UnicodeSpec.readSpecFile(file, plane);
        System.out.println("UnicodeSpec[" + spec.length + "]:");
        for (UnicodeSpec entry : spec) {
            System.out.println(entry.toString());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,354 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 4397357 6565620 6959267 7070436 7198195 8041791 8032446 8072600
* @summary Confirm special case mappings are handled correctly.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
/**
 * Checks String.toLowerCase and String.toUpperCase against the mappings in
 * SpecialCasing.txt and against the additional counterpart cases below.
 * The whole data set is run once for every locale in the locale list, with
 * that locale installed as the default.
 *
 * The data file is located through the "test.src.path" system property.
 */
public class SpecialCasingTest {

    private static boolean err = false;

    // Locales which are used for testing
    private static List<Locale> locales = new ArrayList<>();
    static {
        locales.add(new Locale("az", ""));
        locales.addAll(java.util.Arrays.asList(Locale.getAvailableLocales()));
    }

    // Language of the locale currently installed as the default
    private static String defaultLang;

    // Additional test cases
    // Pseudo-locales which are used here:
    // L1: locales other than lt
    // L2: locales other than az and tr
    // L3: locales other than az, lt and tr
    private static final String[] additionalTestData = {
        // Format:
        // <code>; <lower>; <title>; <upper>; (<condition_list>)

        // Counterpart of Final_Sigma test case
        // 03A3; 03C2; 03A3; 03A3; Final_Sigma
        "03A3; 03C3; 03A3; 03A3; SunSpecific_Not_Final_Sigma1",
        "03A3; 03C3; 03A3; 03A3; SunSpecific_Not_Final_Sigma2",

        // Counterpart of After_Soft_Dotted test case
        // 0307; 0307; ; ; lt After_Soft_Dotted
        "0307; 0307; 0307; 0307; L1 After_Soft_Dotted",
        "0307; 0307; 0307; 0307; lt SunSpecific_Not_After_Soft_Dotted",
        "0307; 0307; 0307; 0307; L1 SunSpecific_Not_After_Soft_Dotted",

        // Counterpart of More_Above test cases
        // 0049; 0069 0307; 0049; 0049; lt More_Above
        "0049; 0131 ; 0049; 0049; az More_Above",
        "0049; 0131 ; 0049; 0049; tr More_Above",
        "0049; 0069 ; 0049; 0049; L3 More_Above",
        "0049; 0069 ; 0049; 0049; lt SunSpecific_Not_More_Above",
        "0049; 0131 ; 0049; 0049; az SunSpecific_Not_More_Above",
        "0049; 0131 ; 0049; 0049; tr SunSpecific_Not_More_Above",
        "0049; 0069 ; 0049; 0049; L3 SunSpecific_Not_More_Above",
        // 004A; 006A 0307; 004A; 004A; lt More_Above
        "004A; 006A ; 004A; 004A; L1 More_Above",
        "004A; 006A ; 004A; 004A; lt SunSpecific_Not_More_Above",
        "004A; 006A ; 004A; 004A; L1 SunSpecific_Not_More_Above",
        // 012E; 012F 0307; 012E; 012E; lt More_Above
        "012E; 012F ; 012E; 012E; L1 More_Above",
        "012E; 012F ; 012E; 012E; lt SunSpecific_Not_More_Above",
        "012E; 012F ; 012E; 012E; L1 SunSpecific_Not_More_Above",

        // Counterpart of After_I test cases
        // 0307; ; 0307; 0307; tr After_I
        // 0307; ; 0307; 0307; az After_I
        "0307; 0307 0307; 0307; 0307; lt After_I",
        "0307; 0307 ; 0307; 0307; L3 After_I",
        "0307; 0307 ; 0307; 0307; tr SunSpecific_Not_After_I",
        "0307; 0307 ; 0307; 0307; az SunSpecific_Not_After_I",
        "0307; 0307 ; 0307; 0307; L2 SunSpecific_Not_After_I",

        // Counterpart of Not_Before_Dot test cases
        // 0049; 0131 ; 0049; 0049; tr Not_Before_Dot
        // 0049; 0131 ; 0049; 0049; az Not_Before_Dot
        "0049; 0069 ; 0049; 0049; L2 Not_Before_Dot",
        "0049; 0069 ; 0049; 0049; tr SunSpecific_Before_Dot",
        "0049; 0069 ; 0049; 0049; az SunSpecific_Before_Dot",
        "0049; 0069 0307 0307; 0049; 0049; lt SunSpecific_Before_Dot",
        "0049; 0069 0307 ; 0049; 0049; L3 SunSpecific_Before_Dot",
    };

    public static void main (String[] args) {
        SpecialCasingTest specialCasingTest = new SpecialCasingTest();
        specialCasingTest.test();
    }

    /**
     * Runs every data line and every additional case under each locale.
     * The original default locale is restored before the result is reported.
     *
     * @throws RuntimeException if any mapping differed or the data file
     *         could not be read
     */
    private void test () {
        Locale defaultLocale = Locale.getDefault();
        try {
            // The data file does not depend on the locale, so read it once
            // instead of reopening it for every locale.
            List<String> dataLines = Files.readAllLines(
                Paths.get(System.getProperty("test.src.path"), "..", "/Character/SpecialCasing.txt")
                    .toRealPath());

            for (Locale locale : locales) {
                Locale.setDefault(locale);
                System.out.println("Testing on " + locale + " locale....");
                defaultLang = locale.getLanguage();

                for (String line : dataLines) {
                    if (line.length() == 0 || line.charAt(0) == '#') {
                        continue;
                    }
                    test(line);
                }

                System.out.println("Testing with Sun original data....");
                for (String extraCase : additionalTestData) {
                    test(extraCase);
                }
            }
        }
        catch (IOException e) {
            err = true;
            e.printStackTrace();
        }
        finally {
            Locale.setDefault(defaultLocale);
        }

        // Reported after the finally block so that an unexpected exception
        // from a test case is neither replaced by the generic failure nor
        // preceded by a misleading "passed" message.
        if (err) {
            throw new RuntimeException("SpecialCasingTest failed.");
        }
        System.out.println("*** SpecialCasingTest passed.");
    }

    /**
     * Runs one entry in SpecialCasing.txt format:
     * code; lower; title; upper; optional condition list.
     * Any trailing '#' comment is ignored. The condition list may name a
     * language (or pseudo-locale L1/L2/L3) and a context condition; the
     * context is built by wrapping the strings with suitable neighbours.
     *
     * @param line the entry to test
     * @throws RuntimeException if the entry has fewer than four fields, or
     *         names an unknown condition or unsupported language
     */
    private void test(String line) {
        int index = line.indexOf('#');
        if (index != -1) {
            line = line.substring(0, index);
        }

        String lang = null;
        String condition = null;
        String[] fields = line.split("; ");
        // Fail with the offending entry rather than an index error.
        if (fields.length < 4) {
            throw new RuntimeException("Malformed special casing entry: " + line);
        }

        for (int i = 0; i < 4; i++) {
            if (fields[i].length() != 0) {
                fields[i] = convert(fields[i]);
            }
        }

        if (fields.length != 4) {
            StringTokenizer st = new StringTokenizer(fields[4]);

            while (st.hasMoreTokens()) {
                String token = st.nextToken();

                if (token.equals("Final_Sigma")) {
                    condition = "Final Sigma";
                    fields[0] = "Abc" + fields[0];
                    fields[1] = "abc" + fields[1];
                    fields[3] = "ABC" + fields[3];
                } else if (token.equals("SunSpecific_Not_Final_Sigma1")) {
                    condition = "*Sun Specific* Not Final Sigma 1";
                    fields[0] = "Abc" + fields[0] + "xyz";
                    fields[1] = "abc" + fields[1] + "xyz";
                    fields[3] = "ABC" + fields[3] + "XYZ";
                } else if (token.equals("SunSpecific_Not_Final_Sigma2")) {
                    condition = "*Sun Specific* Not Final Sigma 2";
                } else if (token.equals("After_Soft_Dotted")) {
                    condition = "After Soft-Dotted";
                    fields[0] = "\u1E2D" + fields[0];
                    fields[1] = "\u1E2D" + fields[1];
                    fields[3] = "\u1E2C" + fields[3];
                } else if (token.equals("SunSpecific_Not_After_Soft_Dotted")) {
                    condition = "*Sun Specific* Not After Soft-Dotted";
                    fields[0] = "Dot" + fields[0];
                    fields[1] = "dot" + fields[1];
                    fields[3] = "DOT" + fields[3];
                } else if (token.equals("More_Above")) {
                    condition = "More Above";
                    fields[0] = fields[0] + "\u0306";
                    fields[1] = fields[1] + "\u0306";
                    fields[3] = fields[3] + "\u0306";
                } else if (token.equals("SunSpecific_Not_More_Above")) {
                    condition = "*Sun Specific* Not More Above";
                    fields[0] = fields[0] + "breve";
                    fields[1] = fields[1] + "breve";
                    fields[3] = fields[3] + "BREVE";
                } else if (token.equals("After_I")) {
                    condition = "After I";
                    fields[0] = "I" + fields[0];
                    fields[1] = "i" + fields[1];
                    fields[3] = "I" + fields[3];
                } else if (token.equals("SunSpecific_Not_After_I")) {
                    condition = "*Sun Specific* Not After I";
                    fields[0] = "A" + fields[0];
                    fields[1] = "a" + fields[1];
                    fields[3] = "A" + fields[3];
                } else if (token.equals("Not_Before_Dot")) {
                    condition = "Not Before Dot";
                    fields[0] = fields[0] + "Z";
                    fields[1] = fields[1] + "z";
                    fields[3] = fields[3] + "Z";
                } else if (token.equals("SunSpecific_Before_Dot")) {
                    condition = "*Sun Specific* Before Dot";
                    fields[0] = fields[0] + "\u0307";
                    fields[3] = fields[3] + "\u0307";
                } else if (token.length() == 2) {
                    lang = token;

                    if (lang.equals("L1")) {
                        if (defaultLang.equals("lt")) {
                            lang = "en";
                        } else {
                            lang = defaultLang;
                        }
                    } else if (lang.equals("L2")) {
                        if (defaultLang.equals("az") ||
                            defaultLang.equals("tr")) {
                            lang = "en";
                        } else {
                            lang = defaultLang;
                        }
                    } else if (lang.equals("L3")) {
                        if (defaultLang.equals("az") ||
                            defaultLang.equals("lt") ||
                            defaultLang.equals("tr")) {
                            lang = "en";
                        } else {
                            lang = defaultLang;
                        }
                    // I want to have another test case here for double-check.
                    // Current implementation for Character and String considers
                    // only az, lt, and tr locales. I want to detect if other
                    // locales are specified.
                    } else if (!lang.equals("az") &&
                               !lang.equals("lt") &&
                               !lang.equals("tr")) {
                        throw new RuntimeException("Unsupported locale: " +
                            lang + ". It may need to be considered in ConditionalSpecialCasing.java. Please confirm.");
                    }
                } else {
                    throw new RuntimeException("Unknown condition: " + token);
                }
            }
        } else if (fields[0].equals("\u0130")) {
            // special case for U+0130
            if (defaultLang.equals("az") ||
                defaultLang.equals("tr")) {
                lang = "en";
            } else {
                lang = defaultLang;
            }
        }

        testLowerCase(fields[0], fields[1], lang, condition);
        testUpperCase(fields[0], fields[3], lang, condition);
    }

    /**
     * Lowercases orig (with the default locale when lang is null, otherwise
     * with the locale for lang) and records a failure on a mismatch.
     */
    private void testLowerCase(String orig, String expected,
                               String lang, String condition) {
        String got = (lang == null) ?
            orig.toLowerCase() : orig.toLowerCase(new Locale(lang, ""));

        if (!expected.equals(got)) {
            err = true;
            System.err.println("toLowerCase(lang=" + lang +
                ") failed.\n\tOriginal: " + toString(orig) +
                "\n\tGot: " + toString(got) +
                "\n\tExpected: " + toString(expected) +
                ((condition == null) ? "" : ("\n under condition(" +
                condition + ")")));
        }
    }

    /**
     * Uppercases orig (with the default locale when lang is null, otherwise
     * with the locale for lang) and records a failure on a mismatch.
     */
    private void testUpperCase(String orig, String expected,
                               String lang, String condition) {
        String got = (lang == null) ?
            orig.toUpperCase() : orig.toUpperCase(new Locale(lang, ""));

        if (!expected.equals(got)) {
            err = true;
            System.err.println("toUpperCase(lang=" + lang +
                ") failed.\n\tOriginal: " + toString(orig) +
                "\n\tGot: " + toString(got) +
                "\n\tExpected: " + toString(expected) +
                ((condition == null) ? "" : ("\n under condition(" +
                condition + ")")));
        }
    }

    // Scratch buffer shared by convert and toString; each call resets it.
    StringBuilder sb = new StringBuilder();

    /**
     * Turns space-separated hexadecimal code points into the string they
     * denote. Code points above U+FFFF become surrogate pairs instead of
     * being truncated to a single char.
     */
    private String convert(String str) {
        sb.setLength(0);

        for (String token : str.split(" ")) {
            sb.appendCodePoint(Integer.parseInt(token, 16));
        }

        return sb.toString();
    }

    /**
     * Renders each char of str as "0x" plus its uppercase hexadecimal value,
     * followed by a space, for use in failure messages.
     */
    private String toString(String str) {
        sb.setLength(0);

        int len = str.length();
        for (int i = 0; i < len; i++) {
            sb.append("0x").append(Integer.toHexString(str.charAt(i)).toUpperCase()).append(" ");
        }

        return sb.toString();
    }
}

View File

@ -0,0 +1,233 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 4397357 6565620 6959267 7070436 7198195 8032446 8072600
* @summary Confirm normal case mappings are handled correctly.
* @run main/timeout=200 UnicodeCasingTest
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Locale;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Checks String.toLowerCase and String.toUpperCase against the simple case
 * mappings in UnicodeData.txt, once for every locale in the locale list with
 * that locale installed as the default. Characters whose mappings are
 * overridden in SpecialCasing.txt are skipped.
 *
 * Both data files are located through the "test.src.path" system property.
 */
public class UnicodeCasingTest {

    private static boolean err = false;

    // Locales which are used for testing
    private static List<Locale> locales = new ArrayList<>();
    static {
        locales.add(new Locale("az", ""));
        locales.addAll(java.util.Arrays.asList(Locale.getAvailableLocales()));
    }

    // Language of the locale currently installed as the default
    private static String defaultLang;

    // List for Unicode characters whose mappings are included in
    // SpecialCasing.txt and mappings in UnicodeData.txt isn't applicable.
    // Key: the code point text as written in the data files.
    // Value: "all", or the two-letter language the exclusion applies to.
    private static Map<String, String> excludeList = new HashMap<>();

    public static void main(String[] args) {
        UnicodeCasingTest unicodeCasingTest = new UnicodeCasingTest();
        unicodeCasingTest.test();
    }

    /**
     * Builds the exclude list, then runs every UnicodeData.txt entry under
     * each locale. The original default locale is restored before the
     * result is reported.
     *
     * @throws RuntimeException if any mapping differed or a data file could
     *         not be read
     */
    private void test() {
        Locale defaultLocale = Locale.getDefault();

        try {
            // First, we create exclude lists of characters whose mappings exist
            // in SpecialCasing.txt and mapping rules in UnicodeData.txt aren't
            // applicable.
            for (String line : readDataFile("/Character/SpecialCasing.txt")) {
                if (line.length() == 0 || line.charAt(0) == '#') {
                    continue;
                }
                updateExcludeList(line);
            }

            // UnicodeData.txt does not depend on the locale, so read it once
            // instead of reopening it for every locale.
            List<String> unicodeData = readDataFile("/Character/UnicodeData.txt");

            for (Locale locale : locales) {
                Locale.setDefault(locale);
                defaultLang = locale.getLanguage();
                System.err.println("Testing on " + locale + " locale....");

                for (String line : unicodeData) {
                    if (line.length() == 0 || line.charAt(0) == '#') {
                        continue;
                    }
                    test(line);
                }
            }
        }
        catch (IOException e) {
            err = true;
            e.printStackTrace();
        }
        finally {
            Locale.setDefault(defaultLocale);
        }

        // Reported after the finally block so that an unexpected exception
        // from a test case is neither replaced by the generic failure nor
        // preceded by a misleading "passed" message.
        if (err) {
            throw new RuntimeException("UnicodeCasingTest failed.");
        }
        System.out.println("*** UnicodeCasingTest passed.");
    }

    /**
     * Reads all lines of a data file found relative to the parent of the
     * directory named by the "test.src.path" system property.
     */
    private static List<String> readDataFile(String relativePath) throws IOException {
        return Files.readAllLines(
            Paths.get(System.getProperty("test.src.path"), "..", relativePath)
                .toRealPath());
    }

    /**
     * Records the character of one SpecialCasing.txt entry in the exclude
     * list. An entry with four fields is excluded for all locales. An entry
     * whose fifth field is a two-letter language is excluded for that
     * language, unless the character is already listed. Entries with any
     * other condition are ignored.
     */
    private void updateExcludeList(String line) {
        int index = line.indexOf('#');
        if (index != -1) {
            line = line.substring(0, index);
        }

        String[] fields = line.split("; ");

        // If the given character is mapped to multiple characters under the
        // normal condition, add it to the exclude list.
        if (fields.length == 4) {
            excludeList.put(fields[0], "all");
        } else if (fields.length == 5 && fields[4].length() == 2) {
            // locale-specific entry; the first one seen for a character wins
            excludeList.putIfAbsent(fields[0], fields[4]);
        }
    }

    /**
     * Tests the upper- and lowercase mapping of one UnicodeData.txt entry
     * under the current default locale, unless the character is excluded.
     * An empty mapping field means the character maps to itself.
     *
     * @param line the semicolon-separated entry
     * @throws RuntimeException if the entry has too few fields
     */
    private void test(String line) {
        String[] fields = line.split(";", 15);
        // Fields 12 (uppercase) and 13 (lowercase) are read below; fail with
        // the offending entry rather than an index error.
        if (fields.length < 14) {
            throw new RuntimeException("Malformed UnicodeData line: " + line);
        }

        String orig = convert(fields[0]);
        String lang = excludeList.get(fields[0]);
        if ("all".equals(lang) || defaultLang.equals(lang)) {
            return;
        }

        testUpperCase(orig, fields[12].length() == 0 ? orig : convert(fields[12]));
        testLowerCase(orig, fields[13].length() == 0 ? orig : convert(fields[13]));
    }

    /**
     * Uppercases orig with the default locale and records a failure on a
     * mismatch.
     */
    private void testUpperCase(String orig, String expected) {
        String got = orig.toUpperCase();

        // Ugly workaround for special mappings for az and tr locales....
        if (orig.equals("\u0069") &&
            (defaultLang.equals("az") || defaultLang.equals("tr"))) {
            expected = "\u0130";
        }

        if (!expected.equals(got)) {
            err = true;
            System.err.println("toUpperCase() failed.\n\tOriginal: " + toString(orig) +
                "\n\tGot: " + toString(got) +
                "\n\tExpected: " + toString(expected));
        }
    }

    /**
     * Lowercases orig with the default locale and records a failure on a
     * mismatch.
     */
    private void testLowerCase(String orig, String expected) {
        String got = orig.toLowerCase();

        // Ugly workaround for special mappings for az and tr locales....
        if (orig.equals("\u0049") &&
            (defaultLang.equals("az") || defaultLang.equals("tr"))) {
            expected = "\u0131";
        }

        if (!expected.equals(got)) {
            err = true;
            System.err.println("toLowerCase() failed.\n\tOriginal: " + toString(orig) +
                "\n\tGot: " + toString(got) +
                "\n\tExpected: " + toString(expected));
        }
    }

    // Scratch buffer shared by convert and toString; each call resets it.
    StringBuilder sb = new StringBuilder();

    /**
     * Turns space-separated hexadecimal code points into the string they
     * denote; code points above U+FFFF become surrogate pairs.
     */
    private String convert(String str) {
        sb.setLength(0);

        for (String token : str.split(" ")) {
            int codePoint = Integer.parseInt(token, 16);
            if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
                sb.append((char) codePoint);
            } else {
                sb.append(Character.toChars(codePoint));
            }
        }

        return sb.toString();
    }

    /**
     * Renders each char of str as "0x" plus its uppercase hexadecimal value,
     * followed by a space, for use in failure messages.
     */
    private String toString(String str) {
        sb.setLength(0);

        int len = str.length();
        for (int i = 0; i < len; i++) {
            sb.append("0x").append(Integer.toHexString(str.charAt(i)).toUpperCase()).append(" ");
        }

        return sb.toString();
    }
}