jdk-24/test/jdk/java/text/Normalizer/ICUBasicTest.java
2023-10-03 16:38:17 +00:00

633 lines
23 KiB
Java

/*
* Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 4221795 8032446 8174270
* @summary Confirm Normalizer's fundamental behavior. Imported from ICU4J 3.2's
* src/com/ibm/icu/dev/test and modified.
* @modules java.base/sun.text java.base/jdk.internal.icu.text
* @compile -XDignore.symbol.file ICUBasicTest.java
* @run junit/timeout=30 ICUBasicTest
*/
/*
*******************************************************************************
* Copyright (C) 1996-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
import sun.text.Normalizer;
import jdk.internal.icu.text.NormalizerBase;
import java.util.HexFormat;
import static java.text.Normalizer.Form.*;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.fail;
public class ICUBasicTest {
/*
* Normalization modes
*
* Short aliases for the NormalizerBase mode constants.
* NOTE(review): none of the tests visible in this file reference these
* aliases; the tests pass java.text.Normalizer.Form values (NFC, NFD, ...)
* obtained through the static import instead. Confirm before removing.
*/
private static final NormalizerBase.Mode NFCmode = NormalizerBase.NFC;
private static final NormalizerBase.Mode NFDmode = NormalizerBase.NFD;
private static final NormalizerBase.Mode NFKCmode = NormalizerBase.NFKC;
private static final NormalizerBase.Mode NFKDmode = NormalizerBase.NFKD;
private static final NormalizerBase.Mode NONEmode = NormalizerBase.NONE;
/*
* Normalization options
*
* These values are used as the "options" argument of the test cases
* built in TestComposition().
*/
/* Normal Unicode versions */
private static final int UNICODE_3_2_0 = Normalizer.UNICODE_3_2;
private static final int UNICODE_LATEST = NormalizerBase.UNICODE_LATEST;
/*
 * Special cases for UAX #15 bug
 * see Unicode Public Review Issue #29
 * at http://www.unicode.org/review/resolved-pri.html#pri29
 *
 * Note:
 * PRI #29 is supported in Unicode 4.1.0. Therefore, expected results are
 * different for earlier Unicode versions.
 *
 * Each case normalizes "input" with the given form and option flags and
 * requires the result to equal "expect"; the first mismatch fails the test
 * and reports the index of the offending case.
 *
 * NOTE(review): as the table stands, every input has the same expected
 * output for UNICODE_3_2_0 and UNICODE_LATEST - confirm this is intended
 * given the note above.
 */
@Test
public void TestComposition() {
    final TestCompositionCase[] cases = {
        new TestCompositionCase(NFC, UNICODE_3_2_0,
            "\u1100\u0300\u1161\u0327",
            "\u1100\u0300\u1161\u0327"),
        new TestCompositionCase(NFC, UNICODE_LATEST,
            "\u1100\u0300\u1161\u0327",
            "\u1100\u0300\u1161\u0327"),
        new TestCompositionCase(NFC, UNICODE_3_2_0,
            "\u1100\u0300\u1161\u0327\u11a8",
            "\u1100\u0300\u1161\u0327\u11a8"),
        new TestCompositionCase(NFC, UNICODE_LATEST,
            "\u1100\u0300\u1161\u0327\u11a8",
            "\u1100\u0300\u1161\u0327\u11a8"),
        new TestCompositionCase(NFC, UNICODE_3_2_0,
            "\uac00\u0300\u0327\u11a8",
            "\uac00\u0327\u0300\u11a8"),
        new TestCompositionCase(NFC, UNICODE_LATEST,
            "\uac00\u0300\u0327\u11a8",
            "\uac00\u0327\u0300\u11a8"),
        new TestCompositionCase(NFC, UNICODE_3_2_0,
            "\u0b47\u0300\u0b3e",
            "\u0b47\u0300\u0b3e"),
        new TestCompositionCase(NFC, UNICODE_LATEST,
            "\u0b47\u0300\u0b3e",
            "\u0b47\u0300\u0b3e"),
    };

    // The index is kept (rather than a for-each) because it is part of the
    // failure message.
    for (int i = 0; i < cases.length; ++i) {
        final TestCompositionCase testCase = cases[i];
        final String output = Normalizer.normalize(testCase.input,
                testCase.form, testCase.options);
        if (!output.equals(testCase.expect)) {
            fail("unexpected result for case " + i + ". Expected="
                    + testCase.expect + ", Actual=" + output);
        }
    }
}
/**
 * Immutable description of one composition test case: the normalization
 * form and option flags to apply, the input string, and the string the
 * normalizer is expected to produce.
 */
private static final class TestCompositionCase {
    /** Normalization form to apply to {@link #input}. */
    public final java.text.Normalizer.Form form;
    /** Option flags handed to the normalizer together with the form. */
    public final int options;
    /** String to be normalized. */
    public final String input;
    /** Expected normalization result for {@link #input}. */
    public final String expect;

    /**
     * @param form    normalization form to apply
     * @param options option flags passed to the normalizer
     * @param input   string to normalize
     * @param expect  expected result of normalizing {@code input}
     */
    TestCompositionCase(java.text.Normalizer.Form form,
                        int options,
                        String input,
                        String expect) {
        this.form = form;
        this.options = options;
        this.input = input;
        this.expect = expect;
    }
}
/*
 * Added in order to detect a regression.
 *
 * Normalizes a run of combining marks to NFD and compares the result with
 * the expected expanded and reordered sequence; on mismatch both strings
 * are reported as hex dumps of their bytes.
 */
@Test
public void TestCombiningMarks() {
    final String source = "\u0f71\u0f72\u0f73\u0f74\u0f75";
    final String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";

    final String actual = NormalizerBase.normalize(source, NFD);
    if (expected.equals(actual)) {
        return;
    }

    final HexFormat hex = HexFormat.of().withDelimiter(" ");
    fail("Reordering of combining marks failed. Expected: "
            + hex.formatHex(expected.getBytes())
            + " Got: " + hex.formatHex(actual.getBytes()));
}
/*
 * Added in order to detect a regression.
 *
 * The input sequence is expected to come back from NFC unchanged.
 */
@Test
public void TestBengali() throws Exception {
    final String source = "\u09bc\u09be\u09cd\u09be";
    final String normalized = NormalizerBase.normalize(source, NFC);
    if (source.equals(normalized)) {
        return;
    }
    fail("ERROR in NFC of string");
}
/**
 * Added in order to detect a regression.
 *
 * Test for a problem found by Verisign. Problem is that
 * characters at the start of a string are not put in canonical
 * order correctly by compose() if there is no starter.
 *
 * Each input is normalized with both NFD and NFC; both results are
 * compared against the same expected string from {@code outputs}.
 */
@Test
public void TestVerisign() throws Exception {
    String[] inputs = {
        "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
        "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
    };
    String[] outputs = {
        "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
        "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
    };
    final HexFormat hex = HexFormat.of().withDelimiter(" ");

    for (int i = 0; i < inputs.length; ++i) {
        final String input = inputs[i];
        final String expected = outputs[i];

        String result = NormalizerBase.normalize(input, NFD);
        if (!result.equals(expected)) {
            fail("FAIL input: " + hex.formatHex(input.getBytes()) + "\n" +
                 " decompose: " + hex.formatHex(result.getBytes()) + "\n" +
                 " expected: " + hex.formatHex(expected.getBytes()));
        }

        result = NormalizerBase.normalize(input, NFC);
        if (!result.equals(expected)) {
            // Report the actual NFC result here; printing the expected
            // string twice would hide what the normalizer really produced.
            fail("FAIL input: " + hex.formatHex(input.getBytes()) + "\n" +
                 " compose: " + hex.formatHex(result.getBytes()) + "\n" +
                 " expected: " + hex.formatHex(expected.getBytes()));
        }
    }
}
/**
 * Test for a problem that showed up just before ICU 1.6 release
 * having to do with combining characters with an index of zero.
 * Such characters do not participate in any canonical
 * decompositions. However, having an index of zero means that
 * they all share one typeMask[] entry, that is, they all have to
 * map to the same canonical class, which is not the case, in
 * reality.
 *
 * The data table is read three entries at a time: the first entry is
 * normalized with NFKC and compared with the second; that NFKC result is
 * then normalized with NFD and compared with the third.
 */
@Test
public void TestZeroIndex() throws Exception {
    String[] DATA = {
        // Expect col1 x COMPOSE_COMPAT => col2
        // Expect col2 x DECOMP => col3
        "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
        "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
        "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
        "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
        "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
    };
    final HexFormat hex = HexFormat.of().withDelimiter(" ");

    for (int i = 0; i < DATA.length; i += 3) {
        final String source = DATA[i];
        final String composed = NormalizerBase.normalize(source, NFKC);
        final String expectedComposed = DATA[i + 1];
        if (composed.equals(expectedComposed)) {
            System.out.println("Ok: " + hex.formatHex(source.getBytes())
                    + " x COMPOSE_COMPAT => "
                    + hex.formatHex(composed.getBytes()));
        } else {
            // Input first, actual result second - same order as the
            // "Ok" line above.
            fail("FAIL: " + hex.formatHex(source.getBytes())
                    + " x COMPOSE_COMPAT => "
                    + hex.formatHex(composed.getBytes()) + ", expect "
                    + hex.formatHex(expectedComposed.getBytes()));
        }

        // Second step starts from the NFKC result, not from the raw input.
        final String decomposed = NormalizerBase.normalize(composed, NFD);
        final String expectedDecomposed = DATA[i + 2];
        if (decomposed.equals(expectedDecomposed)) {
            System.out.println("Ok: " + hex.formatHex(composed.getBytes())
                    + " x DECOMP => "
                    + hex.formatHex(decomposed.getBytes()));
        } else {
            fail("FAIL: " + hex.formatHex(composed.getBytes())
                    + " x DECOMP => "
                    + hex.formatHex(decomposed.getBytes()) + ", expect "
                    + hex.formatHex(expectedDecomposed.getBytes()));
        }
    }
}
/**
 * Make sure characters in the CompositionExclusion.txt list do not get
 * composed to.
 *
 * Every table entry is either a single character or an inclusive
 * {first, last} range; each code point covered is handed to
 * {@code checkCompositionExclusion_320}.
 */
@Test
public void TestCompositionExclusion() throws Exception {
    // This list is generated from CompositionExclusion.txt.
    // Update whenever the normalizer tables are updated. Note
    // that we test all characters listed, even those that can be
    // derived from the Unicode DB and are therefore commented
    // out.
    /*
     * kyuka's note:
     * Original data seemed to be based on Unicode 3.0.0(the initial
     * Composition Exclusions list) and seemed to have some mistakes.
     * Updated in order to correct mistakes and to support Unicode 4.0.0.
     * And, this table can be used also for Unicode 3.2.0.
     */
    String[][] EXCLUDED_UNICODE_3_2_0 = {
        {"\u0340"},
        {"\u0341"},
        {"\u0343"},
        {"\u0344"},
        {"\u0374"},
        {"\u037E"},
        {"\u0387"},
        {"\u0958"},
        {"\u0959", "\u095F"},
        {"\u09DC"},
        {"\u09DD"},
        {"\u09DF"},
        {"\u0A33"},
        {"\u0A36"},
        {"\u0A59", "\u0A5B"},
        {"\u0A5E"},
        {"\u0B5C"},
        {"\u0B5D"},
        {"\u0F43"},
        {"\u0F4D"},
        {"\u0F52"},
        {"\u0F57"},
        {"\u0F5C"},
        {"\u0F69"},
        {"\u0F73"},
        {"\u0F75"},
        {"\u0F76"},
        {"\u0F78"},
        {"\u0F81"},
        {"\u0F93"},
        {"\u0F9D"},
        {"\u0FA2"},
        {"\u0FA7"},
        {"\u0FAC"},
        {"\u0FB9"},
        {"\u1F71"},
        {"\u1F73"},
        {"\u1F75"},
        {"\u1F77"},
        {"\u1F79"},
        {"\u1F7B"},
        {"\u1F7D"},
        {"\u1FBB"},
        {"\u1FBE"},
        {"\u1FC9"},
        {"\u1FCB"},
        {"\u1FD3"},
        {"\u1FDB"},
        {"\u1FE3"},
        {"\u1FEB"},
        {"\u1FEE"},
        {"\u1FEF"},
        {"\u1FF9"},
        {"\u1FFB"},
        {"\u1FFD"},
        {"\u2000"},
        {"\u2001"},
        {"\u2126"},
        {"\u212A"},
        {"\u212B"},
        {"\u2329"},
        {"\u232A"},
        {"\u2ADC"},
        {"\uF900", "\uFA0D"},
        {"\uFA10"},
        {"\uFA12"},
        {"\uFA15", "\uFA1E"},
        {"\uFA20"},
        {"\uFA22"},
        {"\uFA25"},
        {"\uFA26"},
        {"\uFA2A", "\uFA2D"},
        {"\uFA30", "\uFA6A"},
        {"\uFB1D"},
        {"\uFB1F"},
        {"\uFB2A", "\uFB36"},
        {"\uFB38", "\uFB3C"},
        {"\uFB3E"},
        {"\uFB40"},
        {"\uFB41"},
        {"\uFB43"},
        {"\uFB44"},
        {"\uFB46", "\uFB4E"},
        {"\uD834\uDD5E", "\uD834\uDD64"},
        {"\uD834\uDDBB", "\uD834\uDDC0"},
        {"\uD87E\uDC00", "\uD87E\uDE1D"}
    };

    for (String[] entry : EXCLUDED_UNICODE_3_2_0) {
        // A single-element entry is treated as a range of one code point.
        // codePointAt(0) reads a whole surrogate pair, so supplementary
        // ranges iterate over real code points rather than UTF-16 units.
        final int first = entry[0].codePointAt(0);
        final int last = entry[entry.length - 1].codePointAt(0);
        for (int codePoint = first; codePoint <= last; codePoint++) {
            checkCompositionExclusion_320(Character.toString(codePoint));
        }
    }
}
/**
 * Fails the test if {@code s} survives a round trip of compatibility
 * decomposition (NFKD) followed by canonical composition (NFC). The round
 * trip is performed twice: once with the default normalizer data and once
 * with the {@code Normalizer.UNICODE_3_2} option.
 *
 * @param s the string (one excluded character) to check
 */
private void checkCompositionExclusion_320(String s) throws Exception {
    final HexFormat hex = HexFormat.of().withDelimiter(" ");
    final String original = String.valueOf(s);

    // Round trip with the default (latest) Unicode data.
    String decomposed = NormalizerBase.normalize(original, NFKD);
    String recomposed = NormalizerBase.normalize(decomposed, NFC);
    if (recomposed.equals(original)) {
        fail("FAIL: " + hex.formatHex(original.getBytes())
                + " x DECOMP_COMPAT => " + hex.formatHex(decomposed.getBytes())
                + " x COMPOSE => " + hex.formatHex(recomposed.getBytes())
                + " for the latest Unicode");
    }

    // Same round trip with the Unicode 3.2 option.
    decomposed = NormalizerBase.normalize(original, NFKD, Normalizer.UNICODE_3_2);
    recomposed = NormalizerBase.normalize(decomposed, NFC, Normalizer.UNICODE_3_2);
    if (recomposed.equals(original)) {
        fail("FAIL: " + hex.formatHex(original.getBytes())
                + " x DECOMP_COMPAT => " + hex.formatHex(decomposed.getBytes())
                + " x COMPOSE => " + hex.formatHex(recomposed.getBytes())
                + " for Unicode 3.2.0");
    }
}
/**
 * Runs the shared table-driven check over one precomposed character and
 * its three-character expansion. Per the tables below, the canonical forms
 * (NFD/NFC) read column 1 and the compatibility forms (NFKD/NFKC) read
 * column 2.
 */
@Test
public void TestTibetan() throws Exception {
    final String single = "\u0f77";
    final String expanded = "\u0fb2\u0f71\u0f80";

    String[][] decomp = {
        { single, single, expanded }
    };
    String[][] compose = {
        { expanded, expanded, expanded }
    };

    staticTest(NFD, decomp, 1);
    staticTest(NFKD, decomp, 2);
    staticTest(NFC, compose, 1);
    staticTest(NFKC, compose, 2);
}
/**
 * Exercises the long-s characters under all four normalization forms.
 * In both tables column 0 is the input, column 1 the decomposed form and
 * column 2 the composed form.
 */
@Test
public void TestExplodingBase() throws Exception{
    final String longS = "\u017f";          // Latin small letter long s
    final String dotAbove = "\u0307";       // combining dot above
    final String sDotAbove = "\u1e61";      // Latin small letter s with dot above
    final String longSDotAbove = "\u1e9b";  // Latin small letter long s with dot above

    String[][] canon = {
        // Input                   Decomposed                    Composed
        { "Tschu" + longS,         "Tschu" + longS,              "Tschu" + longS },
        { "Tschu" + longSDotAbove, "Tschu" + longS + dotAbove,   "Tschu" + longSDotAbove },
    };
    String[][] compat = {
        // Input         Decomposed       Composed
        { longS,         "s",             "s" },
        { longSDotAbove, "s" + dotAbove,  sDotAbove },
    };

    staticTest(NFD, canon, 1);
    staticTest(NFC, canon, 2);
    staticTest(NFKD, compat, 1);
    staticTest(NFKC, compat, 2);
}
// Table for the canonical forms. TestNFD() checks column 1 and TestNFC()
// checks column 2, each against the normalization of column 0.
private String[][] canonTests = {
// Input Decomposed Composed
{ "cat", "cat", "cat" },
{ "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", },
// D-dot_above
{ "\u1e0a", "D\u0307", "\u1e0a" },
// D dot_above
{ "D\u0307", "D\u0307", "\u1e0a" },
// D-dot_below dot_above
{ "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" },
// D-dot_above dot_below
{ "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" },
// D dot_above dot_below (input order; the decomposed column reorders the marks)
{ "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" },
// D-cedilla dot_above dot_below
{ "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"},
// D dot_above ogonek dot_below
{ "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"},
// E-macron-grave
{ "\u1E14", "E\u0304\u0300", "\u1E14" },
// E-macron + grave
{ "\u0112\u0300", "E\u0304\u0300", "\u1E14" },
// E-grave + macron
{ "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" },
// angstrom_sign
{ "\u212b", "A\u030a", "\u00c5" },
// A-ring
{ "\u00c5", "A\u030a", "\u00c5" },
{ "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" },
{ "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" },
//updated with 3.0
{ "\u00fdffin", "y\u0301ffin", "\u00fdffin" },
{ "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" },
{ "Henry IV", "Henry IV", "Henry IV" },
{ "Henry \u2163", "Henry \u2163", "Henry \u2163" },
// ga(Zenkaku-Katakana)
{ "\u30AC", "\u30AB\u3099", "\u30AC" },
// ka(Zenkaku-Katakana) + ten(Zenkaku)
{ "\u30AB\u3099", "\u30AB\u3099", "\u30AC" },
// ka(Hankaku-Katakana) + ten(Hankaku-Katakana)
{ "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" },
// ka(Zenkaku-Katakana) + ten(Hankaku)
{ "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" },
// ka(Hankaku-Katakana) + ten(Zenkaku)
{ "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" },
{ "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },
// Supplementary characters, written as surrogate pairs
{ "\ud834\udd5e\ud834\udd57\ud834\udd65\ud834\udd5e",
"\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65",
"\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65" },
};
// Table for the compatibility forms. TestNFKD() checks column 1 and
// TestNFKC() checks column 2, each against the normalization of column 0.
private String[][] compatTests = {
// Input Decomposed Composed
{ "cat", "cat", "cat" },
// Alef-Lamed vs. Alef, Lamed
{ "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", },
{ "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" },
// ffi ligature -> f + f + i
{ "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" },
//updated for 3.0
{ "\u00fdffin", "y\u0301ffin", "\u00fdffin" },
// ffi ligature -> f + f + i
{ "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" },
{ "Henry IV", "Henry IV", "Henry IV" },
// Unlike the canonical table, the single character after "Henry " is
// expected to become the two letters "IV" here.
{ "Henry \u2163", "Henry IV", "Henry IV" },
// ga(Zenkaku-Katakana)
{ "\u30AC", "\u30AB\u3099", "\u30AC" },
// ka(Zenkaku-Katakana) + ten(Zenkaku)
{ "\u30AB\u3099", "\u30AB\u3099", "\u30AC" },
// ka(Hankaku-Katakana) + ten(Zenkaku)
{ "\uFF76\u3099", "\u30AB\u3099", "\u30AC" },
/* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
// ka(Hankaku-Katakana) + ten(Hankaku)
{ "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" },
// ka(Zenkaku-Katakana) + ten(Hankaku)
{ "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" },
};
/** Checks NFD against the "Decomposed" column of {@code canonTests}. */
@Test
public void TestNFD() throws Exception{
    final int decomposedColumn = 1;
    staticTest(NFD, canonTests, decomposedColumn);
}
/** Checks NFC against the "Composed" column of {@code canonTests}. */
@Test
public void TestNFC() throws Exception{
    final int composedColumn = 2;
    staticTest(NFC, canonTests, composedColumn);
}
/** Checks NFKD against the "Decomposed" column of {@code compatTests}. */
@Test
public void TestNFKD() throws Exception{
    final int decomposedColumn = 1;
    staticTest(NFKD, compatTests, decomposedColumn);
}
/** Checks NFKC against the "Composed" column of {@code compatTests}. */
@Test
public void TestNFKC() throws Exception{
    final int composedColumn = 2;
    staticTest(NFKC, compatTests, composedColumn);
}
/**
 * Table-driven check shared by the tests in this class. For every row,
 * column 0 is normalized with {@code java.text.Normalizer} using
 * {@code form}, and the result must equal the string in column
 * {@code outCol}. Each input is logged to standard output before it is
 * normalized; the first mismatch fails the test.
 *
 * @param form   normalization form to apply
 * @param tests  rows of {input, expected...} strings
 * @param outCol index of the column holding the expected result
 */
private void staticTest(java.text.Normalizer.Form form,
                        String[][] tests,
                        int outCol) throws Exception {
    final HexFormat hex = HexFormat.of().withDelimiter(" ");

    for (int row = 0; row < tests.length; row++) {
        final String source = tests[row][0];
        System.out.println("Normalizing '" + source + "' ("
                + hex.formatHex(source.getBytes()) + ")" );

        final String wanted = tests[row][outCol];
        final String actual = java.text.Normalizer.normalize(source, form);
        if (actual.equals(wanted)) {
            continue;
        }

        fail("FAIL: case " + row
                + " expected '" + wanted + "' ("
                + hex.formatHex(wanted.getBytes()) + ")"
                + " but got '" + actual + "' ("
                + hex.formatHex(actual.getBytes()) + ")"
        );
    }
}
// With Canonical decomposition, Hangul syllables should get decomposed
// into Jamo, but Jamo characters should not be decomposed into
// conjoining Jamo
//
// TestHangulDecomp() checks column 1 (NFD) and TestHangulCompose() checks
// column 2 (NFC), each against the normalization of column 0.
private String[][] hangulCanon = {
// Input Decomposed Composed
{ "\ud4db", "\u1111\u1171\u11b6", "\ud4db" },
{ "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" },
};
/** Checks NFC against the "Composed" column of {@code hangulCanon}. */
@Test
public void TestHangulCompose() throws Exception{
    final int composedColumn = 2;
    System.out.println("Canonical composition...");
    staticTest(NFC, hangulCanon, composedColumn);
}
/** Checks NFD against the "Decomposed" column of {@code hangulCanon}. */
@Test
public void TestHangulDecomp() throws Exception{
    final int decomposedColumn = 1;
    System.out.println("Canonical decomposition...");
    staticTest(NFD, hangulCanon, decomposedColumn);
}
}