2019-05-23 12:21:21 -07:00
|
|
|
/*
|
2020-01-13 08:05:59 -08:00
|
|
|
* Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
|
2019-05-23 12:21:21 -07:00
|
|
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
|
|
*
|
|
|
|
* This code is free software; you can redistribute it and/or modify it
|
|
|
|
* under the terms of the GNU General Public License version 2 only, as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
|
|
* version 2 for more details (a copy is included in the LICENSE file that
|
|
|
|
* accompanied this code).
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License version
|
|
|
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
*
|
|
|
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
|
|
* or visit www.oracle.com if you need additional information or have any
|
|
|
|
* questions.
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* @test
|
2020-01-13 08:05:59 -08:00
|
|
|
* @bug 4221795 8032446 8174270
|
2019-05-23 12:21:21 -07:00
|
|
|
* @summary Confirm Normalizer's fundamental behavior. Imported from ICU4J 3.2's
|
|
|
|
* src/com/ibm/icu/dev/test and modified.
|
2020-01-13 08:05:59 -08:00
|
|
|
* @modules java.base/sun.text java.base/jdk.internal.icu.text
|
2019-05-23 12:21:21 -07:00
|
|
|
* @library /java/text/testlib
|
|
|
|
* @compile -XDignore.symbol.file ICUBasicTest.java
|
|
|
|
* @run main/timeout=30 ICUBasicTest
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
* Copyright (C) 1996-2004, International Business Machines Corporation and *
|
|
|
|
* others. All Rights Reserved. *
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
import sun.text.Normalizer;
|
2020-01-13 08:05:59 -08:00
|
|
|
import jdk.internal.icu.text.NormalizerBase;
|
2019-05-23 12:21:21 -07:00
|
|
|
|
2023-04-28 00:16:29 +00:00
|
|
|
import java.util.HexFormat;
|
|
|
|
|
2019-05-23 12:21:21 -07:00
|
|
|
import static java.text.Normalizer.Form.*;
|
|
|
|
|
|
|
|
public class ICUBasicTest extends IntlTest {
|
|
|
|
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
|
|
new ICUBasicTest().run(args);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Normalization modes
|
|
|
|
*/
|
|
|
|
private static final NormalizerBase.Mode NFCmode = NormalizerBase.NFC;
|
|
|
|
private static final NormalizerBase.Mode NFDmode = NormalizerBase.NFD;
|
|
|
|
private static final NormalizerBase.Mode NFKCmode = NormalizerBase.NFKC;
|
|
|
|
private static final NormalizerBase.Mode NFKDmode = NormalizerBase.NFKD;
|
|
|
|
private static final NormalizerBase.Mode NONEmode = NormalizerBase.NONE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Normalization options
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Normal Unicode versions */
|
|
|
|
private static final int UNICODE_3_2_0 = Normalizer.UNICODE_3_2;
|
|
|
|
private static final int UNICODE_LATEST = NormalizerBase.UNICODE_LATEST;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Special cases for UAX #15 bug
|
|
|
|
* see Unicode Public Review Issue #29
|
|
|
|
* at http://www.unicode.org/review/resolved-pri.html#pri29
|
|
|
|
*
|
|
|
|
* Note:
|
|
|
|
* PRI #29 is supported in Unicode 4.1.0. Therefore, expected results are
|
|
|
|
* different for earlier Unicode versions.
|
|
|
|
*/
|
|
|
|
public void TestComposition() {
|
|
|
|
|
|
|
|
final TestCompositionCase cases[] = new TestCompositionCase[] {
|
|
|
|
new TestCompositionCase(NFC, UNICODE_3_2_0,
|
|
|
|
"\u1100\u0300\u1161\u0327",
|
|
|
|
"\u1100\u0300\u1161\u0327"),
|
|
|
|
new TestCompositionCase(NFC, UNICODE_LATEST,
|
|
|
|
"\u1100\u0300\u1161\u0327",
|
|
|
|
"\u1100\u0300\u1161\u0327"),
|
|
|
|
|
|
|
|
new TestCompositionCase(NFC, UNICODE_3_2_0,
|
|
|
|
"\u1100\u0300\u1161\u0327\u11a8",
|
|
|
|
"\u1100\u0300\u1161\u0327\u11a8"),
|
|
|
|
new TestCompositionCase(NFC, UNICODE_LATEST,
|
|
|
|
"\u1100\u0300\u1161\u0327\u11a8",
|
|
|
|
"\u1100\u0300\u1161\u0327\u11a8"),
|
|
|
|
|
|
|
|
new TestCompositionCase(NFC, UNICODE_3_2_0,
|
|
|
|
"\uac00\u0300\u0327\u11a8",
|
|
|
|
"\uac00\u0327\u0300\u11a8"),
|
|
|
|
new TestCompositionCase(NFC, UNICODE_LATEST,
|
|
|
|
"\uac00\u0300\u0327\u11a8",
|
|
|
|
"\uac00\u0327\u0300\u11a8"),
|
|
|
|
|
|
|
|
new TestCompositionCase(NFC, UNICODE_3_2_0,
|
|
|
|
"\u0b47\u0300\u0b3e",
|
|
|
|
"\u0b47\u0300\u0b3e"),
|
|
|
|
new TestCompositionCase(NFC, UNICODE_LATEST,
|
|
|
|
"\u0b47\u0300\u0b3e",
|
|
|
|
"\u0b47\u0300\u0b3e"),
|
|
|
|
};
|
|
|
|
|
|
|
|
String output;
|
|
|
|
int i, length;
|
|
|
|
|
|
|
|
for (i=0; i<cases.length; ++i) {
|
|
|
|
output = Normalizer.normalize(cases[i].input,
|
|
|
|
cases[i].form, cases[i].options);
|
|
|
|
if (!output.equals(cases[i].expect)) {
|
|
|
|
errln("unexpected result for case " + i + ". Expected="
|
|
|
|
+ cases[i].expect + ", Actual=" + output);
|
|
|
|
} else if (verbose) {
|
|
|
|
logln("expected result for case " + i + ". Expected="
|
|
|
|
+ cases[i].expect + ", Actual=" + output);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private final static class TestCompositionCase {
|
|
|
|
public java.text.Normalizer.Form form;
|
|
|
|
public int options;
|
|
|
|
public String input, expect;
|
|
|
|
|
|
|
|
TestCompositionCase(java.text.Normalizer.Form form,
|
|
|
|
int options,
|
|
|
|
String input,
|
|
|
|
String expect) {
|
|
|
|
this.form = form;
|
|
|
|
this.options = options;
|
|
|
|
this.input = input;
|
|
|
|
this.expect = expect;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Added in order to detect a regression.
|
|
|
|
*/
|
|
|
|
public void TestCombiningMarks() {
|
|
|
|
String src = "\u0f71\u0f72\u0f73\u0f74\u0f75";
|
|
|
|
String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";
|
|
|
|
String result = NormalizerBase.normalize(src, NFD);
|
|
|
|
|
|
|
|
if (!expected.equals(result)) {
|
|
|
|
errln("Reordering of combining marks failed. Expected: " +
|
2023-04-28 00:16:29 +00:00
|
|
|
HexFormat.of().withDelimiter(" ").formatHex(expected.getBytes())
|
|
|
|
+ " Got: "+ HexFormat.of().withDelimiter(" ").formatHex(result.getBytes()));
|
2019-05-23 12:21:21 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Added in order to detect a regression.
|
|
|
|
*/
|
|
|
|
public void TestBengali() throws Exception {
|
|
|
|
String input = "\u09bc\u09be\u09cd\u09be";
|
|
|
|
String output=NormalizerBase.normalize(input, NFC);
|
|
|
|
|
|
|
|
if (!input.equals(output)) {
|
|
|
|
errln("ERROR in NFC of string");
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Added in order to detect a regression.
|
|
|
|
*/
|
|
|
|
/**
|
|
|
|
* Test for a problem found by Verisign. Problem is that
|
|
|
|
* characters at the start of a string are not put in canonical
|
|
|
|
* order correctly by compose() if there is no starter.
|
|
|
|
*/
|
|
|
|
public void TestVerisign() throws Exception {
|
|
|
|
String[] inputs = {
|
|
|
|
"\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
|
|
|
|
"\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
|
|
|
|
};
|
|
|
|
String[] outputs = {
|
|
|
|
"\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
|
|
|
|
"\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
|
|
|
|
};
|
|
|
|
|
|
|
|
for (int i = 0; i < inputs.length; ++i) {
|
|
|
|
String input = inputs[i];
|
|
|
|
String output = outputs[i];
|
|
|
|
|
|
|
|
String result = NormalizerBase.normalize(input, NFD);
|
|
|
|
if (!result.equals(output)) {
|
2023-04-28 00:16:29 +00:00
|
|
|
errln("FAIL input: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(input.getBytes()) + "\n" +
|
|
|
|
" decompose: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(result.getBytes()) + "\n" +
|
|
|
|
" expected: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(output.getBytes()));
|
2019-05-23 12:21:21 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
result = NormalizerBase.normalize(input, NFC);
|
|
|
|
if (!result.equals(output)) {
|
2023-04-28 00:16:29 +00:00
|
|
|
errln("FAIL input: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(input.getBytes()) + "\n" +
|
|
|
|
" compose: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(output.getBytes()) + "\n" +
|
|
|
|
" expected: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(output.getBytes()));
|
2019-05-23 12:21:21 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Test for a problem that showed up just before ICU 1.6 release
|
|
|
|
* having to do with combining characters with an index of zero.
|
|
|
|
* Such characters do not participate in any canonical
|
|
|
|
* decompositions. However, having an index of zero means that
|
|
|
|
* they all share one typeMask[] entry, that is, they all have to
|
|
|
|
* map to the same canonical class, which is not the case, in
|
|
|
|
* reality.
|
|
|
|
*/
|
|
|
|
public void TestZeroIndex() throws Exception {
|
|
|
|
String[] DATA = {
|
|
|
|
// Expect col1 x COMPOSE_COMPAT => col2
|
|
|
|
// Expect col2 x DECOMP => col3
|
|
|
|
"A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
|
|
|
|
"A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
|
|
|
|
"A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
|
|
|
|
"c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
|
|
|
|
"c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
|
|
|
|
};
|
|
|
|
|
|
|
|
for (int i=0; i<DATA.length; i+=3) {
|
|
|
|
String a = DATA[i];
|
|
|
|
String b = NormalizerBase.normalize(a, NFKC);
|
|
|
|
String exp = DATA[i+1];
|
|
|
|
|
|
|
|
if (b.equals(exp)) {
|
2023-04-28 00:16:29 +00:00
|
|
|
logln("Ok: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(a.getBytes()) + " x COMPOSE_COMPAT => " +
|
|
|
|
HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(b.getBytes()));
|
2019-05-23 12:21:21 -07:00
|
|
|
} else {
|
2023-04-28 00:16:29 +00:00
|
|
|
errln("FAIL: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(b.getBytes()) + " x COMPOSE_COMPAT => " +
|
|
|
|
HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(a.getBytes()) + ", expect " +
|
|
|
|
HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(exp.getBytes()));
|
2019-05-23 12:21:21 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
a = NormalizerBase.normalize(b, NFD);
|
|
|
|
exp = DATA[i+2];
|
|
|
|
if (a.equals(exp)) {
|
2023-04-28 00:16:29 +00:00
|
|
|
logln("Ok: " + HexFormat.of().withDelimiter(" ").formatHex(b.getBytes()) + " x DECOMP => " +
|
|
|
|
HexFormat.of().withDelimiter(" ").formatHex(a.getBytes()));
|
2019-05-23 12:21:21 -07:00
|
|
|
} else {
|
2023-04-28 00:16:29 +00:00
|
|
|
errln("FAIL: " + HexFormat.of().withDelimiter(" ").formatHex(b.getBytes()) + " x DECOMP => " +
|
|
|
|
HexFormat.of().withDelimiter(" ").formatHex(a.getBytes()) + ", expect " + HexFormat.of().withDelimiter(" ").formatHex(exp.getBytes()));
|
2019-05-23 12:21:21 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Make sure characters in the CompositionExclusion.txt list do not get
|
|
|
|
* composed to.
|
|
|
|
*/
|
|
|
|
public void TestCompositionExclusion() throws Exception {
|
|
|
|
// This list is generated from CompositionExclusion.txt.
|
|
|
|
// Update whenever the normalizer tables are updated. Note
|
|
|
|
// that we test all characters listed, even those that can be
|
|
|
|
// derived from the Unicode DB and are therefore commented
|
|
|
|
// out.
|
|
|
|
|
|
|
|
/*
|
|
|
|
* kyuka's note:
|
|
|
|
* Original data seemed to be based on Unicode 3.0.0(the initial
|
|
|
|
* Composition Exclusions list) and seemed to have some mistakes.
|
|
|
|
* Updated in order to correct mistakes and to support Unicode 4.0.0.
|
|
|
|
* And, this table can be used also for Unicode 3.2.0.
|
|
|
|
*/
|
|
|
|
String[][] EXCLUDED_UNICODE_3_2_0 = {
|
|
|
|
{"\u0340"},
|
|
|
|
{"\u0341"},
|
|
|
|
{"\u0343"},
|
|
|
|
{"\u0344"},
|
|
|
|
{"\u0374"},
|
|
|
|
{"\u037E"},
|
|
|
|
{"\u0387"},
|
|
|
|
{"\u0958"},
|
|
|
|
{"\u0959", "\u095F"},
|
|
|
|
{"\u09DC"},
|
|
|
|
{"\u09DD"},
|
|
|
|
{"\u09DF"},
|
|
|
|
{"\u0A33"},
|
|
|
|
{"\u0A36"},
|
|
|
|
{"\u0A59", "\u0A5B"},
|
|
|
|
{"\u0A5E"},
|
|
|
|
{"\u0B5C"},
|
|
|
|
{"\u0B5D"},
|
|
|
|
{"\u0F43"},
|
|
|
|
{"\u0F4D"},
|
|
|
|
{"\u0F52"},
|
|
|
|
{"\u0F57"},
|
|
|
|
{"\u0F5C"},
|
|
|
|
{"\u0F69"},
|
|
|
|
{"\u0F73"},
|
|
|
|
{"\u0F75"},
|
|
|
|
{"\u0F76"},
|
|
|
|
{"\u0F78"},
|
|
|
|
{"\u0F81"},
|
|
|
|
{"\u0F93"},
|
|
|
|
{"\u0F9D"},
|
|
|
|
{"\u0FA2"},
|
|
|
|
{"\u0FA7"},
|
|
|
|
{"\u0FAC"},
|
|
|
|
{"\u0FB9"},
|
|
|
|
{"\u1F71"},
|
|
|
|
{"\u1F73"},
|
|
|
|
{"\u1F75"},
|
|
|
|
{"\u1F77"},
|
|
|
|
{"\u1F79"},
|
|
|
|
{"\u1F7B"},
|
|
|
|
{"\u1F7D"},
|
|
|
|
{"\u1FBB"},
|
|
|
|
{"\u1FBE"},
|
|
|
|
{"\u1FC9"},
|
|
|
|
{"\u1FCB"},
|
|
|
|
{"\u1FD3"},
|
|
|
|
{"\u1FDB"},
|
|
|
|
{"\u1FE3"},
|
|
|
|
{"\u1FEB"},
|
|
|
|
{"\u1FEE"},
|
|
|
|
{"\u1FEF"},
|
|
|
|
{"\u1FF9"},
|
|
|
|
{"\u1FFB"},
|
|
|
|
{"\u1FFD"},
|
|
|
|
{"\u2000"},
|
|
|
|
{"\u2001"},
|
|
|
|
{"\u2126"},
|
|
|
|
{"\u212A"},
|
|
|
|
{"\u212B"},
|
|
|
|
{"\u2329"},
|
|
|
|
{"\u232A"},
|
|
|
|
{"\u2ADC"},
|
|
|
|
{"\uF900", "\uFA0D"},
|
|
|
|
{"\uFA10"},
|
|
|
|
{"\uFA12"},
|
|
|
|
{"\uFA15", "\uFA1E"},
|
|
|
|
{"\uFA20"},
|
|
|
|
{"\uFA22"},
|
|
|
|
{"\uFA25"},
|
|
|
|
{"\uFA26"},
|
|
|
|
{"\uFA2A", "\uFA2D"},
|
|
|
|
{"\uFA30", "\uFA6A"},
|
|
|
|
{"\uFB1D"},
|
|
|
|
{"\uFB1F"},
|
|
|
|
{"\uFB2A", "\uFB36"},
|
|
|
|
{"\uFB38", "\uFB3C"},
|
|
|
|
{"\uFB3E"},
|
|
|
|
{"\uFB40"},
|
|
|
|
{"\uFB41"},
|
|
|
|
{"\uFB43"},
|
|
|
|
{"\uFB44"},
|
|
|
|
{"\uFB46", "\uFB4E"},
|
|
|
|
{"\uD834\uDD5E", "\uD834\uDD64"},
|
|
|
|
{"\uD834\uDDBB", "\uD834\uDDC0"},
|
|
|
|
{"\uD87E\uDC00", "\uD87E\uDE1D"}
|
|
|
|
};
|
|
|
|
|
|
|
|
String[][] EXCLUDED_LATEST = {
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
for (int i = 0; i < EXCLUDED_UNICODE_3_2_0.length; ++i) {
|
|
|
|
if (EXCLUDED_UNICODE_3_2_0[i].length == 1) {
|
|
|
|
checkCompositionExclusion_320(EXCLUDED_UNICODE_3_2_0[i][0]);
|
|
|
|
} else {
|
|
|
|
int from, to;
|
|
|
|
from = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][0], 0);
|
|
|
|
to = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][1], 0);
|
|
|
|
|
|
|
|
for (int j = from; j <= to; j++) {
|
|
|
|
checkCompositionExclusion_320(String.valueOf(Character.toChars(j)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private void checkCompositionExclusion_320(String s) throws Exception {
|
|
|
|
String a = String.valueOf(s);
|
|
|
|
String b = NormalizerBase.normalize(a, NFKD);
|
|
|
|
String c = NormalizerBase.normalize(b, NFC);
|
|
|
|
|
|
|
|
if (c.equals(a)) {
|
2023-04-28 00:16:29 +00:00
|
|
|
errln("FAIL: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(a.getBytes()) + " x DECOMP_COMPAT => " +
|
|
|
|
HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(b.getBytes()) + " x COMPOSE => " +
|
|
|
|
HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(c.getBytes()) + " for the latest Unicode");
|
2019-05-23 12:21:21 -07:00
|
|
|
} else if (verbose) {
|
2023-04-28 00:16:29 +00:00
|
|
|
logln("Ok: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(a.getBytes()) + " x DECOMP_COMPAT => " +
|
|
|
|
HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(b.getBytes()) + " x COMPOSE => " +
|
|
|
|
HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(c.getBytes()) + " for the latest Unicode");
|
2019-05-23 12:21:21 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
b = NormalizerBase.normalize(a, NFKD, Normalizer.UNICODE_3_2);
|
|
|
|
c = NormalizerBase.normalize(b, NFC, Normalizer.UNICODE_3_2);
|
|
|
|
if (c.equals(a)) {
|
2023-04-28 00:16:29 +00:00
|
|
|
errln("FAIL: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(a.getBytes()) + " x DECOMP_COMPAT => " +
|
|
|
|
HexFormat.of().withDelimiter(" ").formatHex(b.getBytes()) + " x COMPOSE => " +
|
|
|
|
HexFormat.of().withDelimiter(" ").formatHex(c.getBytes()) + " for Unicode 3.2.0");
|
2019-05-23 12:21:21 -07:00
|
|
|
} else if (verbose) {
|
2023-04-28 00:16:29 +00:00
|
|
|
logln("Ok: " + HexFormat.of().withDelimiter(" ")
|
|
|
|
.formatHex(a.getBytes()) + " x DECOMP_COMPAT => " +
|
|
|
|
HexFormat.of().withDelimiter(" ").formatHex(b.getBytes()) + " x COMPOSE => " +
|
|
|
|
HexFormat.of().withDelimiter(" ").formatHex(c.getBytes()) + " for Unicode 3.2.0");
|
2019-05-23 12:21:21 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public void TestTibetan() throws Exception {
|
|
|
|
String[][] decomp = {
|
|
|
|
{ "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
|
|
|
|
};
|
|
|
|
String[][] compose = {
|
|
|
|
{ "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
|
|
|
|
};
|
|
|
|
|
|
|
|
staticTest(NFD, decomp, 1);
|
|
|
|
staticTest(NFKD,decomp, 2);
|
|
|
|
staticTest(NFC, compose, 1);
|
|
|
|
staticTest(NFKC,compose, 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
public void TestExplodingBase() throws Exception{
|
|
|
|
// \u017f - Latin small letter long s
|
|
|
|
// \u0307 - combining dot above
|
|
|
|
// \u1e61 - Latin small letter s with dot above
|
|
|
|
// \u1e9b - Latin small letter long s with dot above
|
|
|
|
String[][] canon = {
|
|
|
|
// Input Decomposed Composed
|
|
|
|
{ "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" },
|
|
|
|
{ "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" },
|
|
|
|
};
|
|
|
|
String[][] compat = {
|
|
|
|
// Input Decomposed Composed
|
|
|
|
{ "\u017f", "s", "s" },
|
|
|
|
{ "\u1e9b", "s\u0307", "\u1e61" },
|
|
|
|
};
|
|
|
|
|
|
|
|
staticTest(NFD, canon, 1);
|
|
|
|
staticTest(NFC, canon, 2);
|
|
|
|
staticTest(NFKD, compat, 1);
|
|
|
|
staticTest(NFKC, compat, 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
private String[][] canonTests = {
|
|
|
|
// Input Decomposed Composed
|
|
|
|
|
|
|
|
{ "cat", "cat", "cat" },
|
|
|
|
{ "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", },
|
|
|
|
|
|
|
|
// D-dot_above
|
|
|
|
{ "\u1e0a", "D\u0307", "\u1e0a" },
|
|
|
|
|
|
|
|
// D dot_above
|
|
|
|
{ "D\u0307", "D\u0307", "\u1e0a" },
|
|
|
|
|
|
|
|
// D-dot_below dot_above
|
|
|
|
{ "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" },
|
|
|
|
|
|
|
|
// D-dot_above dot_below
|
|
|
|
{ "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" },
|
|
|
|
|
|
|
|
// D dot_below dot_above
|
|
|
|
{ "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" },
|
|
|
|
|
|
|
|
// D dot_below cedilla dot_above
|
|
|
|
{ "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"},
|
|
|
|
|
|
|
|
// D dot_above ogonek dot_below
|
|
|
|
{ "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"},
|
|
|
|
|
|
|
|
// E-macron-grave
|
|
|
|
{ "\u1E14", "E\u0304\u0300", "\u1E14" },
|
|
|
|
|
|
|
|
// E-macron + grave
|
|
|
|
{ "\u0112\u0300", "E\u0304\u0300", "\u1E14" },
|
|
|
|
|
|
|
|
// E-grave + macron
|
|
|
|
{ "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" },
|
|
|
|
|
|
|
|
// angstrom_sign
|
|
|
|
{ "\u212b", "A\u030a", "\u00c5" },
|
|
|
|
|
|
|
|
// A-ring
|
|
|
|
{ "\u00c5", "A\u030a", "\u00c5" },
|
|
|
|
{ "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" },
|
|
|
|
{ "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" },
|
|
|
|
|
|
|
|
//updated with 3.0
|
|
|
|
{ "\u00fdffin", "y\u0301ffin", "\u00fdffin" },
|
|
|
|
{ "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" },
|
|
|
|
|
|
|
|
{ "Henry IV", "Henry IV", "Henry IV" },
|
|
|
|
{ "Henry \u2163", "Henry \u2163", "Henry \u2163" },
|
|
|
|
|
|
|
|
// ga(Zenkaku-Katakana)
|
|
|
|
{ "\u30AC", "\u30AB\u3099", "\u30AC" },
|
|
|
|
|
|
|
|
// ka(Zenkaku-Katakana) + ten(Zenkaku)
|
|
|
|
{ "\u30AB\u3099", "\u30AB\u3099", "\u30AC" },
|
|
|
|
|
|
|
|
// ka(Hankaku-Katakana) + ten(Hankaku-Katakana)
|
|
|
|
{ "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" },
|
|
|
|
|
|
|
|
// ka(Zenkaku-Katakana) + ten(Hankaku)
|
|
|
|
{ "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" },
|
|
|
|
// ka(Hankaku-Katakana) + ten(Zenkaku)
|
|
|
|
{ "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" },
|
|
|
|
|
|
|
|
{ "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },
|
|
|
|
|
|
|
|
{ "\ud834\udd5e\ud834\udd57\ud834\udd65\ud834\udd5e",
|
|
|
|
"\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65",
|
|
|
|
"\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65" },
|
|
|
|
};
|
|
|
|
|
|
|
|
private String[][] compatTests = {
|
|
|
|
// Input Decomposed Composed
|
|
|
|
|
|
|
|
{ "cat", "cat", "cat" },
|
|
|
|
|
|
|
|
// Alef-Lamed vs. Alef, Lamed
|
|
|
|
{ "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", },
|
|
|
|
|
|
|
|
{ "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" },
|
|
|
|
|
|
|
|
// ffi ligature -> f + f + i
|
|
|
|
{ "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" },
|
|
|
|
|
|
|
|
//updated for 3.0
|
|
|
|
{ "\u00fdffin", "y\u0301ffin", "\u00fdffin" },
|
|
|
|
|
|
|
|
// ffi ligature -> f + f + i
|
|
|
|
{ "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" },
|
|
|
|
|
|
|
|
{ "Henry IV", "Henry IV", "Henry IV" },
|
|
|
|
{ "Henry \u2163", "Henry IV", "Henry IV" },
|
|
|
|
|
|
|
|
// ga(Zenkaku-Katakana)
|
|
|
|
{ "\u30AC", "\u30AB\u3099", "\u30AC" },
|
|
|
|
|
|
|
|
// ka(Zenkaku-Katakana) + ten(Zenkaku)
|
|
|
|
{ "\u30AB\u3099", "\u30AB\u3099", "\u30AC" },
|
|
|
|
|
|
|
|
// ka(Hankaku-Katakana) + ten(Zenkaku)
|
|
|
|
{ "\uFF76\u3099", "\u30AB\u3099", "\u30AC" },
|
|
|
|
|
|
|
|
/* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
|
|
|
|
// ka(Hankaku-Katakana) + ten(Hankaku)
|
|
|
|
{ "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" },
|
|
|
|
|
|
|
|
// ka(Zenkaku-Katakana) + ten(Hankaku)
|
|
|
|
{ "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" },
|
|
|
|
};
|
|
|
|
|
|
|
|
public void TestNFD() throws Exception{
|
|
|
|
staticTest(NFD, canonTests, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
public void TestNFC() throws Exception{
|
|
|
|
staticTest(NFC, canonTests, 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
public void TestNFKD() throws Exception{
|
|
|
|
staticTest(NFKD, compatTests, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
public void TestNFKC() throws Exception{
|
|
|
|
staticTest(NFKC, compatTests, 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
private void staticTest(java.text.Normalizer.Form form,
|
|
|
|
String[][] tests,
|
|
|
|
int outCol) throws Exception {
|
|
|
|
for (int i = 0; i < tests.length; i++) {
|
|
|
|
String input = tests[i][0];
|
2023-04-28 00:16:29 +00:00
|
|
|
logln("Normalizing '" + input + "' (" + HexFormat.of()
|
|
|
|
.withDelimiter(" ").formatHex(input.getBytes()) + ")" );
|
2019-05-23 12:21:21 -07:00
|
|
|
|
|
|
|
String expect =tests[i][outCol];
|
|
|
|
String output = java.text.Normalizer.normalize(input, form);
|
|
|
|
|
|
|
|
if (!output.equals(expect)) {
|
|
|
|
errln("FAIL: case " + i
|
2023-04-28 00:16:29 +00:00
|
|
|
+ " expected '" + expect + "' (" + HexFormat.of()
|
|
|
|
.withDelimiter(" ").formatHex(expect.getBytes()) + ")"
|
|
|
|
+ " but got '" + output + "' (" + HexFormat.of()
|
|
|
|
.withDelimiter(" ").formatHex(output.getBytes()) + ")"
|
2019-05-23 12:21:21 -07:00
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// With Canonical decomposition, Hangul syllables should get decomposed
|
|
|
|
// into Jamo, but Jamo characters should not be decomposed into
|
|
|
|
// conjoining Jamo
|
|
|
|
private String[][] hangulCanon = {
|
|
|
|
// Input Decomposed Composed
|
|
|
|
{ "\ud4db", "\u1111\u1171\u11b6", "\ud4db" },
|
|
|
|
{ "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" },
|
|
|
|
};
|
|
|
|
|
|
|
|
public void TestHangulCompose() throws Exception{
|
|
|
|
logln("Canonical composition...");
|
|
|
|
staticTest(NFC, hangulCanon, 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
public void TestHangulDecomp() throws Exception{
|
|
|
|
logln("Canonical decomposition...");
|
|
|
|
staticTest(NFD, hangulCanon, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|