/* * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* * @test * @bug 4221795 8032446 8174270 * @summary Confirm Normalizer's fundamental behavior. Imported from ICU4J 3.2's * src/com/ibm/icu/dev/test and modified. * @modules java.base/sun.text java.base/jdk.internal.icu.text * @compile -XDignore.symbol.file ICUBasicTest.java * @run junit/timeout=30 ICUBasicTest */ /* ******************************************************************************* * Copyright (C) 1996-2004, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ import sun.text.Normalizer; import jdk.internal.icu.text.NormalizerBase; import java.util.HexFormat; import static java.text.Normalizer.Form.*; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.fail; public class ICUBasicTest { /* * Normalization modes */ private static final NormalizerBase.Mode NFCmode = NormalizerBase.NFC; private static final NormalizerBase.Mode NFDmode = NormalizerBase.NFD; private static final NormalizerBase.Mode NFKCmode = NormalizerBase.NFKC; private static final NormalizerBase.Mode NFKDmode = NormalizerBase.NFKD; private static final NormalizerBase.Mode NONEmode = NormalizerBase.NONE; /* * Normalization options */ /* Normal Unicode versions */ private static final int UNICODE_3_2_0 = Normalizer.UNICODE_3_2; private static final int UNICODE_LATEST = NormalizerBase.UNICODE_LATEST; /* * Special cases for UAX #15 bug * see Unicode Public Review Issue #29 * at http://www.unicode.org/review/resolved-pri.html#pri29 * * Note: * PRI #29 is supported in Unicode 4.1.0. Therefore, expected results are * different for earlier Unicode versions. */ @Test public void TestComposition() { final TestCompositionCase cases[] = new TestCompositionCase[] { new TestCompositionCase(NFC, UNICODE_3_2_0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"), new TestCompositionCase(NFC, UNICODE_LATEST, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"), new TestCompositionCase(NFC, UNICODE_3_2_0, "\u1100\u0300\u1161\u0327\u11a8", "\u1100\u0300\u1161\u0327\u11a8"), new TestCompositionCase(NFC, UNICODE_LATEST, "\u1100\u0300\u1161\u0327\u11a8", "\u1100\u0300\u1161\u0327\u11a8"), new TestCompositionCase(NFC, UNICODE_3_2_0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"), new TestCompositionCase(NFC, UNICODE_LATEST, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"), new TestCompositionCase(NFC, UNICODE_3_2_0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"), new TestCompositionCase(NFC, UNICODE_LATEST, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"), }; String output; int i, length; for (i=0; i col2 // Expect col2 x DECOMP => col3 "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300", "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300", "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300", "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327", "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321", }; for (int i=0; i " + HexFormat.of().withDelimiter(" ") .formatHex(b.getBytes())); } else { fail("FAIL: " + HexFormat.of().withDelimiter(" ") .formatHex(b.getBytes()) + " x COMPOSE_COMPAT => " + HexFormat.of().withDelimiter(" ") .formatHex(a.getBytes()) + ", expect " + HexFormat.of().withDelimiter(" ") .formatHex(exp.getBytes())); } a = NormalizerBase.normalize(b, NFD); exp = DATA[i+2]; if (a.equals(exp)) { System.out.println("Ok: " + HexFormat.of().withDelimiter(" ").formatHex(b.getBytes()) + " x DECOMP => " + HexFormat.of().withDelimiter(" ").formatHex(a.getBytes())); } else { fail("FAIL: " + HexFormat.of().withDelimiter(" ").formatHex(b.getBytes()) + " x DECOMP => " + HexFormat.of().withDelimiter(" ").formatHex(a.getBytes()) + ", expect " + HexFormat.of().withDelimiter(" ").formatHex(exp.getBytes())); } } } /** * Make sure characters in the CompositionExclusion.txt list do not get * composed to. */ @Test public void TestCompositionExclusion() throws Exception { // This list is generated from CompositionExclusion.txt. // Update whenever the normalizer tables are updated. Note // that we test all characters listed, even those that can be // derived from the Unicode DB and are therefore commented // out. /* * kyuka's note: * Original data seemed to be based on Unicode 3.0.0(the initial * Composition Exclusions list) and seemed to have some mistakes. * Updated in order to correct mistakes and to support Unicode 4.0.0. * And, this table can be used also for Unicode 3.2.0. */ String[][] EXCLUDED_UNICODE_3_2_0 = { {"\u0340"}, {"\u0341"}, {"\u0343"}, {"\u0344"}, {"\u0374"}, {"\u037E"}, {"\u0387"}, {"\u0958"}, {"\u0959", "\u095F"}, {"\u09DC"}, {"\u09DD"}, {"\u09DF"}, {"\u0A33"}, {"\u0A36"}, {"\u0A59", "\u0A5B"}, {"\u0A5E"}, {"\u0B5C"}, {"\u0B5D"}, {"\u0F43"}, {"\u0F4D"}, {"\u0F52"}, {"\u0F57"}, {"\u0F5C"}, {"\u0F69"}, {"\u0F73"}, {"\u0F75"}, {"\u0F76"}, {"\u0F78"}, {"\u0F81"}, {"\u0F93"}, {"\u0F9D"}, {"\u0FA2"}, {"\u0FA7"}, {"\u0FAC"}, {"\u0FB9"}, {"\u1F71"}, {"\u1F73"}, {"\u1F75"}, {"\u1F77"}, {"\u1F79"}, {"\u1F7B"}, {"\u1F7D"}, {"\u1FBB"}, {"\u1FBE"}, {"\u1FC9"}, {"\u1FCB"}, {"\u1FD3"}, {"\u1FDB"}, {"\u1FE3"}, {"\u1FEB"}, {"\u1FEE"}, {"\u1FEF"}, {"\u1FF9"}, {"\u1FFB"}, {"\u1FFD"}, {"\u2000"}, {"\u2001"}, {"\u2126"}, {"\u212A"}, {"\u212B"}, {"\u2329"}, {"\u232A"}, {"\u2ADC"}, {"\uF900", "\uFA0D"}, {"\uFA10"}, {"\uFA12"}, {"\uFA15", "\uFA1E"}, {"\uFA20"}, {"\uFA22"}, {"\uFA25"}, {"\uFA26"}, {"\uFA2A", "\uFA2D"}, {"\uFA30", "\uFA6A"}, {"\uFB1D"}, {"\uFB1F"}, {"\uFB2A", "\uFB36"}, {"\uFB38", "\uFB3C"}, {"\uFB3E"}, {"\uFB40"}, {"\uFB41"}, {"\uFB43"}, {"\uFB44"}, {"\uFB46", "\uFB4E"}, {"\uD834\uDD5E", "\uD834\uDD64"}, {"\uD834\uDDBB", "\uD834\uDDC0"}, {"\uD87E\uDC00", "\uD87E\uDE1D"} }; String[][] EXCLUDED_LATEST = { }; for (int i = 0; i < EXCLUDED_UNICODE_3_2_0.length; ++i) { if (EXCLUDED_UNICODE_3_2_0[i].length == 1) { checkCompositionExclusion_320(EXCLUDED_UNICODE_3_2_0[i][0]); } else { int from, to; from = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][0], 0); to = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][1], 0); for (int j = from; j <= to; j++) { checkCompositionExclusion_320(String.valueOf(Character.toChars(j))); } } } } private void checkCompositionExclusion_320(String s) throws Exception { String a = String.valueOf(s); String b = NormalizerBase.normalize(a, NFKD); String c = NormalizerBase.normalize(b, NFC); if (c.equals(a)) { fail("FAIL: " + HexFormat.of().withDelimiter(" ") .formatHex(a.getBytes()) + " x DECOMP_COMPAT => " + HexFormat.of().withDelimiter(" ") .formatHex(b.getBytes()) + " x COMPOSE => " + HexFormat.of().withDelimiter(" ") .formatHex(c.getBytes()) + " for the latest Unicode"); } b = NormalizerBase.normalize(a, NFKD, Normalizer.UNICODE_3_2); c = NormalizerBase.normalize(b, NFC, Normalizer.UNICODE_3_2); if (c.equals(a)) { fail("FAIL: " + HexFormat.of().withDelimiter(" ") .formatHex(a.getBytes()) + " x DECOMP_COMPAT => " + HexFormat.of().withDelimiter(" ").formatHex(b.getBytes()) + " x COMPOSE => " + HexFormat.of().withDelimiter(" ").formatHex(c.getBytes()) + " for Unicode 3.2.0"); } } @Test public void TestTibetan() throws Exception { String[][] decomp = { { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" } }; String[][] compose = { { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" } }; staticTest(NFD, decomp, 1); staticTest(NFKD,decomp, 2); staticTest(NFC, compose, 1); staticTest(NFKC,compose, 2); } @Test public void TestExplodingBase() throws Exception{ // \u017f - Latin small letter long s // \u0307 - combining dot above // \u1e61 - Latin small letter s with dot above // \u1e9b - Latin small letter long s with dot above String[][] canon = { // Input Decomposed Composed { "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" }, { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" }, }; String[][] compat = { // Input Decomposed Composed { "\u017f", "s", "s" }, { "\u1e9b", "s\u0307", "\u1e61" }, }; staticTest(NFD, canon, 1); staticTest(NFC, canon, 2); staticTest(NFKD, compat, 1); staticTest(NFKC, compat, 2); } private String[][] canonTests = { // Input Decomposed Composed { "cat", "cat", "cat" }, { "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", }, // D-dot_above { "\u1e0a", "D\u0307", "\u1e0a" }, // D dot_above { "D\u0307", "D\u0307", "\u1e0a" }, // D-dot_below dot_above { "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below { "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above { "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below cedilla dot_above { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"}, // D dot_above ogonek dot_below { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"}, // E-macron-grave { "\u1E14", "E\u0304\u0300", "\u1E14" }, // E-macron + grave { "\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-grave + macron { "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // angstrom_sign { "\u212b", "A\u030a", "\u00c5" }, // A-ring { "\u00c5", "A\u030a", "\u00c5" }, { "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" }, { "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" }, //updated with 3.0 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, { "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, { "Henry IV", "Henry IV", "Henry IV" }, { "Henry \u2163", "Henry \u2163", "Henry \u2163" }, // ga(Zenkaku-Katakana) { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ka(Zenkaku-Katakana) + ten(Zenkaku) { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka(Hankaku-Katakana) + ten(Hankaku-Katakana) { "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // ka(Zenkaku-Katakana) + ten(Hankaku) { "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka(Hankaku-Katakana) + ten(Zenkaku) { "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" }, { "\ud834\udd5e\ud834\udd57\ud834\udd65\ud834\udd5e", "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65", "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65" }, }; private String[][] compatTests = { // Input Decomposed Composed { "cat", "cat", "cat" }, // Alef-Lamed vs. Alef, Lamed { "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", }, { "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" }, // ffi ligature -> f + f + i { "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" }, //updated for 3.0 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i { "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, { "Henry IV", "Henry IV", "Henry IV" }, { "Henry \u2163", "Henry IV", "Henry IV" }, // ga(Zenkaku-Katakana) { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ka(Zenkaku-Katakana) + ten(Zenkaku) { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka(Hankaku-Katakana) + ten(Zenkaku) { "\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/ // ka(Hankaku-Katakana) + ten(Hankaku) { "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" }, // ka(Zenkaku-Katakana) + ten(Hankaku) { "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" }, }; @Test public void TestNFD() throws Exception{ staticTest(NFD, canonTests, 1); } @Test public void TestNFC() throws Exception{ staticTest(NFC, canonTests, 2); } @Test public void TestNFKD() throws Exception{ staticTest(NFKD, compatTests, 1); } @Test public void TestNFKC() throws Exception{ staticTest(NFKC, compatTests, 2); } private void staticTest(java.text.Normalizer.Form form, String[][] tests, int outCol) throws Exception { for (int i = 0; i < tests.length; i++) { String input = tests[i][0]; System.out.println("Normalizing '" + input + "' (" + HexFormat.of() .withDelimiter(" ").formatHex(input.getBytes()) + ")" ); String expect =tests[i][outCol]; String output = java.text.Normalizer.normalize(input, form); if (!output.equals(expect)) { fail("FAIL: case " + i + " expected '" + expect + "' (" + HexFormat.of() .withDelimiter(" ").formatHex(expect.getBytes()) + ")" + " but got '" + output + "' (" + HexFormat.of() .withDelimiter(" ").formatHex(output.getBytes()) + ")" ); } } } // With Canonical decomposition, Hangul syllables should get decomposed // into Jamo, but Jamo characters should not be decomposed into // conjoining Jamo private String[][] hangulCanon = { // Input Decomposed Composed { "\ud4db", "\u1111\u1171\u11b6", "\ud4db" }, { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" }, }; @Test public void TestHangulCompose() throws Exception{ System.out.println("Canonical composition..."); staticTest(NFC, hangulCanon, 2); } @Test public void TestHangulDecomp() throws Exception{ System.out.println("Canonical decomposition..."); staticTest(NFD, hangulCanon, 1); } }