jdk-24/test/jdk/java/text/BreakIterator/BreakIteratorTest.java
2023-10-03 16:38:17 +00:00

1505 lines
62 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 1996, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 4035266 4052418 4068133 4068137 4068139 4086052 4095322 4097779
* 4097920 4098467 4111338 4113835 4117554 4143071 4146175 4152117
* 4152416 4153072 4158381 4214367 4217703 4638433 8264765 8291660
* 8294008
* @run junit/timeout=2000 BreakIteratorTest
* @summary test BreakIterator
*/
/*
* This file is available under and governed by the GNU General Public
* License version 2 only, as published by the Free Software Foundation.
* However, the following notice accompanied the original version of this
* file and, per its terms, should not be removed:
*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
*
* Portions copyright (c) 2007 Sun Microsystems, Inc.
* All Rights Reserved.
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
* Permission to use, copy, modify, and distribute this software
* and its documentation for NON-COMMERCIAL purposes and without
* fee is hereby granted provided that this copyright notice
* appears in all copies. Please refer to the file "copyright.html"
* for further important copyright and licensing information.
*
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
*/
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.BreakIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;
import java.util.Locale;
import java.util.Vector;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.fail;
public class BreakIteratorTest {
// Iterators shared by the tests below, one per boundary kind. Each is obtained
// from the no-argument factory (i.e. for the default locale) and is pointed at
// new text through setText() by the helper routines.
private final BreakIterator characterBreak = BreakIterator.getCharacterInstance();
private final BreakIterator wordBreak = BreakIterator.getWordInstance();
private final BreakIterator lineBreak = BreakIterator.getLineInstance();
private final BreakIterator sentenceBreak = BreakIterator.getSentenceInstance();
//=========================================================================
// general test subroutines
//=========================================================================
/**
 * Runs the full battery of iterator checks against one data set.
 *
 * The expected fragments are concatenated to form the text under test; the
 * iterator must then split that text back into exactly those fragments, in
 * both directions, and must agree with the derived boundary table for
 * following(), preceding(), isBoundary() and next(n).
 *
 * @param bi             the iterator under test
 * @param expectedResult the expected fragments (Strings), in text order
 */
private void generalIteratorTest(BreakIterator bi, Vector expectedResult) {
    // Glue the expected fragments together to obtain the text to analyse.
    StringBuilder joined = new StringBuilder();
    for (Object fragment : expectedResult) {
        joined.append((String) fragment);
    }
    String text = joined.toString();
    bi.setText(text);

    Vector nextResults = testFirstAndNext(bi, text);
    Vector previousResults = testLastAndPrevious(bi, text);

    System.out.println("comparing forward and backward...");
    compareFragmentLists("forward iteration", "backward iteration", nextResults,
            previousResults);

    System.out.println("comparing expected and actual...");
    compareFragmentLists("expected result", "actual result", expectedResult,
            nextResults);

    // Boundary table layout: DONE, 0, <end offset of each fragment>, DONE.
    int fragmentCount = expectedResult.size();
    int[] boundaries = new int[fragmentCount + 3];
    boundaries[0] = BreakIterator.DONE;
    boundaries[1] = 0;
    int runningOffset = 0;
    for (int idx = 0; idx < fragmentCount; idx++) {
        runningOffset += ((String) expectedResult.elementAt(idx)).length();
        boundaries[idx + 2] = runningOffset;
    }
    boundaries[boundaries.length - 1] = BreakIterator.DONE;

    testFollowing(bi, text, boundaries);
    testPreceding(bi, text, boundaries);
    testIsBoundary(bi, text, boundaries);
    doMultipleSelectionTest(bi, text);
}
/**
 * Walks the text forward with first()/next() and collects the fragments
 * lying between consecutive boundaries.
 *
 * Fails if first() is not 0, if next() does not advance, or if DONE is
 * returned before the end of the text has been reached.
 *
 * @param bi   the iterator under test (text already set)
 * @param text the text the iterator was set to
 * @return the fragments in text order
 */
private Vector testFirstAndNext(BreakIterator bi, String text) {
    int previous = bi.first();
    Vector<String> fragments = new Vector<>();
    if (previous != 0) {
        fail("first() returned " + previous + " instead of 0");
    }
    for (int current = bi.next(); current != BreakIterator.DONE; current = bi.next()) {
        if (current <= previous) {
            fail("next() failed to move forward: next() on position "
                    + previous + " yielded " + current);
        }
        fragments.addElement(text.substring(previous, current));
        previous = current;
    }
    // The last real boundary must coincide with the end of the text.
    if (previous != text.length()) {
        fail("next() returned DONE prematurely: offset was "
                + previous + " instead of " + text.length());
    }
    return fragments;
}
/**
 * Walks the text backward with last()/previous() and collects the fragments
 * lying between consecutive boundaries.
 *
 * Fails if last() is not the text length, if previous() does not move
 * backward, or if DONE is returned before offset 0 has been reached.
 *
 * @param bi   the iterator under test (text already set)
 * @param text the text the iterator was set to
 * @return the fragments in text order (each one is inserted at the front)
 */
private Vector testLastAndPrevious(BreakIterator bi, String text) {
    int upper = bi.last();
    Vector<String> fragments = new Vector<>();
    if (upper != text.length()) {
        fail("last() returned " + upper + " instead of " + text.length());
    }
    for (int lower = bi.previous(); lower != BreakIterator.DONE; lower = bi.previous()) {
        if (lower >= upper) {
            fail("previous() failed to move backward: previous() on position "
                    + upper + " yielded " + lower);
        }
        fragments.insertElementAt(text.substring(lower, upper), 0);
        upper = lower;
    }
    // The last real boundary seen going backward must be the start of the text.
    if (upper != 0) {
        fail("previous() returned DONE prematurely: offset was "
                + upper + " instead of 0");
    }
    return fragments;
}
/**
 * Compares two fragment lists pairwise, logging every matching fragment.
 *
 * On the first mismatch the differing region of each list is logged and the
 * test fails with both complete lists in the message. Comparison stops as
 * soon as either list is exhausted.
 *
 * @param f1Name label for the first list, used in diagnostics
 * @param f2Name label for the second list, used in diagnostics
 * @param f1     first list of fragments (Strings)
 * @param f2     second list of fragments (Strings)
 */
private void compareFragmentLists(String f1Name, String f2Name, Vector f1, Vector f2) {
    int idx1 = 0;
    int idx2 = 0;
    int consumed1 = 0;
    int consumed2 = 0;
    while (idx1 < f1.size() && idx2 < f2.size()) {
        String frag1 = (String) f1.elementAt(idx1);
        String frag2 = (String) f2.elementAt(idx2);
        consumed1 += frag1.length();
        consumed2 += frag2.length();

        if (frag1.equals(frag2)) {
            debugLogln(" >" + frag1 + "<");
            idx1++;
            idx2++;
            continue;
        }

        // Mismatch: scan ahead in both lists until the accumulated lengths
        // line up again, to decide how much of each list to print.
        int scanLen1 = consumed1;
        int scanLen2 = consumed2;
        int scanIdx1 = idx1;
        int scanIdx2 = idx2;
        while (scanLen1 != scanLen2 && scanIdx1 < f1.size() && scanIdx2 < f2.size()) {
            for (; scanLen1 < scanLen2 && scanIdx1 < f1.size(); scanIdx1++) {
                scanLen1 += ((String) f1.elementAt(scanIdx1)).length();
            }
            for (; scanLen2 < scanLen1 && scanIdx2 < f2.size(); scanIdx2++) {
                scanLen2 += ((String) f2.elementAt(scanIdx2)).length();
            }
        }

        System.out.println("*** " + f1Name + " has:");
        for (; idx1 <= scanIdx1 && idx1 < f1.size(); idx1++) {
            frag1 = (String) f1.elementAt(idx1);
            consumed1 += frag1.length();
            debugLogln(" *** >" + frag1 + "<");
        }

        System.out.println("***** " + f2Name + " has:");
        for (; idx2 <= scanIdx2 && idx2 < f2.size(); idx2++) {
            frag2 = (String) f2.elementAt(idx2);
            consumed2 += frag2.length();
            debugLogln(" ***** >" + frag2 + "<");
        }

        fail("Discrepancy between " + f1Name + " and " + f2Name
                + "\n---\n" + f1 +"\n---\n" + f2);
    }
}
/**
 * Checks following(offset) for every offset from 0 to the text length
 * inclusive against the expected boundary table.
 *
 * @param bi         the iterator under test (text already set)
 * @param text       the text the iterator was set to
 * @param boundaries expected boundaries, laid out as DONE, 0, ..., length, DONE
 */
private void testFollowing(BreakIterator bi, String text, int[] boundaries) {
    System.out.println("testFollowing():");
    // Index of the boundary expected to follow the current offset; starts at 2
    // because entries 0 and 1 are the DONE sentinel and offset 0.
    int p = 2;
    int i = 0;
    try {
        for (i = 0; i <= text.length(); i++) {
            // Once the offset reaches the expected boundary, the boundary
            // that follows it is the next table entry.
            if (i == boundaries[p]) {
                ++p;
            }
            int b = bi.following(i);
            System.out.println("bi.following(" + i + ") -> " + b);
            if (b != boundaries[p]) {
                fail("Wrong result from following() for " + i + ": expected " + boundaries[p]
                        + ", got " + b);
            }
        }
    } catch (IllegalArgumentException illargExp) {
        // Keep the original exception as the cause so its stack trace is reported.
        fail("IllegalArgumentException caught from following() for offset: " + i, illargExp);
    }
}
/**
 * Checks preceding(offset) for every offset from 0 to the text length
 * inclusive against the expected boundary table.
 *
 * @param bi         the iterator under test (text already set)
 * @param text       the text the iterator was set to
 * @param boundaries expected boundaries, laid out as DONE, 0, ..., length, DONE
 */
private void testPreceding(BreakIterator bi, String text, int[] boundaries) {
    System.out.println("testPreceding():");
    // Index of the boundary expected to precede the current offset; entry 0
    // is the DONE sentinel, which is what precedes offset 0.
    int p = 0;
    int i = 0;
    try {
        for (i = 0; i <= text.length(); i++) {
            int b = bi.preceding(i);
            System.out.println("bi.preceding(" + i + ") -> " + b);
            if (b != boundaries[p]) {
                fail("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
                        + ", got " + b);
            }
            // After passing a boundary, it becomes the one preceding later offsets.
            if (i == boundaries[p + 1]) {
                ++p;
            }
        }
    } catch (IllegalArgumentException illargExp) {
        // Keep the original exception as the cause so its stack trace is reported.
        fail("IllegalArgumentException caught from preceding() for offset: " + i, illargExp);
    }
}
/**
 * Checks isBoundary(offset) for every offset from 0 to the text length
 * inclusive: it must be true exactly at the offsets listed in the table.
 *
 * @param bi         the iterator under test (text already set)
 * @param text       the text the iterator was set to
 * @param boundaries expected boundaries, laid out as DONE, 0, ..., length, DONE
 */
private void testIsBoundary(BreakIterator bi, String text, int[] boundaries) {
    System.out.println("testIsBoundary():");
    // Index of the next expected boundary; entry 0 is the DONE sentinel.
    int nextBoundary = 1;
    final int end = text.length();
    for (int pos = 0; pos <= end; pos++) {
        boolean actual = bi.isBoundary(pos);
        System.out.println("bi.isBoundary(" + pos + ") -> " + actual);
        boolean expected = (pos == boundaries[nextBoundary]);
        if (expected) {
            if (!actual) {
                fail("Wrong result from isBoundary() for " + pos + ": expected true, got false");
            }
            ++nextBoundary;
        } else if (actual) {
            fail("Wrong result from isBoundary() for " + pos + ": expected false, got true");
        }
    }
}
/**
 * Verifies that next(n) is consistent with repeated single steps.
 *
 * Forward: for each step count k, first() followed by next(k) on a clone
 * must land where k calls of next() landed on the original iterator.
 * Backward: last() followed by next(-k) must land where k calls of
 * previous() landed.
 *
 * @param iterator the iterator under test (text already set)
 * @param testText the text being iterated (not read by this method)
 */
private void doMultipleSelectionTest(BreakIterator iterator, String testText)
{
    System.out.println("Multiple selection test...");
    BreakIterator testIterator = (BreakIterator)iterator.clone();
    int offset = iterator.first();
    int testOffset;
    int count = 0;
    do {
        testIterator.first();
        testOffset = testIterator.next(count);
        System.out.println("next(" + count + ") -> " + testOffset);
        if (offset != testOffset) {
            fail("next(n) and next() not returning consistent results: for step " + count
                    + ", next(n) returned " + testOffset + " and next() had " + offset);
        }
        if (offset != BreakIterator.DONE) {
            count++;
            offset = iterator.next();
        }
    } while (offset != BreakIterator.DONE);

    // now do it backwards...
    offset = iterator.last();
    count = 0;
    do {
        testIterator.last();
        testOffset = testIterator.next(count);
        System.out.println("next(" + count + ") -> " + testOffset);
        if (offset != testOffset) {
            // The reference iterator is stepped with previous() in this pass.
            fail("next(n) and previous() not returning consistent results: for step " + count
                    + ", next(n) returned " + testOffset + " and previous() had " + offset);
        }
        if (offset != BreakIterator.DONE) {
            count--;
            offset = iterator.previous();
        }
    } while (offset != BreakIterator.DONE);
}
/**
 * Verifies that a break is always reported right after a hard line
 * terminator, whatever characters surround it.
 *
 * Every three-character string "x T y" is tried, where T is one of the
 * terminators and x, y range over {@code testChars}; offset 2 must be a
 * boundary. All violations are collected (up to a cap of 75) and reported
 * in a single failure, because fail() throws and would otherwise hide every
 * violation after the first.
 *
 * @param tb        the iterator under test
 * @param testChars the characters to place around the terminator
 */
private void doBreakInvariantTest(BreakIterator tb, String testChars)
{
    final int maxErrors = 75;
    StringBuilder failures = new StringBuilder();
    StringBuffer work = new StringBuffer("aaa");
    int errorCount = 0;
    // a break should always occur after CR (unless followed by LF), LF, PS, and LS
    String breaks = /*"\r\n\u2029\u2028"*/"\n\u2029\u2028";
    // NOTE(review): CR is currently left out of the list above, and '\n' is a
    // CONTROL character so it is skipped by the Cc/Cf filter below; in effect
    // only the PS and LS terminators are exercised.
    for (int i = 0; i < breaks.length(); i++) {
        work.setCharAt(1, breaks.charAt(i));
        for (int j = 0; j < testChars.length(); j++) {
            work.setCharAt(0, testChars.charAt(j));
            for (int k = 0; k < testChars.length(); k++) {
                char c = testChars.charAt(k);
                // if a cr is followed by lf, don't do the check (they stay together)
                if (work.charAt(1) == '\r' && (c == '\n'))
                    continue;
                // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
                // for breaking purposes as per UTR14
                int type1 = Character.getType(work.charAt(1));
                int type2 = Character.getType(c);
                if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
                        type2 == Character.CONTROL || type2 == Character.FORMAT) {
                    continue;
                }
                work.setCharAt(2, c);
                tb.setText(work.toString());
                boolean seen2 = false;
                for (int l = tb.first(); l != BreakIterator.DONE; l = tb.next()) {
                    if (l == 2)
                        seen2 = true;
                }
                if (!seen2) {
                    failures.append("No break between U+")
                            .append(Integer.toHexString((int)(work.charAt(1))))
                            .append(" and U+")
                            .append(Integer.toHexString((int)(work.charAt(2))))
                            .append('\n');
                    errorCount++;
                    if (errorCount >= maxErrors) {
                        // Cap reached: report what has been collected so far.
                        fail(failures.toString());
                    }
                }
            }
        }
    }
    if (errorCount > 0) {
        fail(failures.toString());
    }
}
/**
 * Verifies two "never break here" invariants for the given iterator:
 * no break between CR and LF, and no break before a non-spacing or
 * enclosing mark unless the preceding character is a line terminator.
 *
 * All violations are collected (up to a cap of 75) and reported in a single
 * failure, because fail() throws and would otherwise hide every violation
 * after the first.
 *
 * @param tb        the iterator under test
 * @param testChars the characters to combine into test strings
 */
private void doOtherInvariantTest(BreakIterator tb, String testChars)
{
    final int maxErrors = 75;
    StringBuilder failures = new StringBuilder();
    StringBuffer work = new StringBuffer("a\r\na");
    int errorCount = 0;
    // a break should never occur between CR and LF
    for (int i = 0; i < testChars.length(); i++) {
        work.setCharAt(0, testChars.charAt(i));
        for (int j = 0; j < testChars.length(); j++) {
            work.setCharAt(3, testChars.charAt(j));
            tb.setText(work.toString());
            for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next()) {
                if (k == 2) {
                    failures.append("Break between CR and LF in string U+")
                            .append(Integer.toHexString((int)(work.charAt(0))))
                            .append(", U+d U+a U+")
                            .append(Integer.toHexString((int)(work.charAt(3))))
                            .append('\n');
                    errorCount++;
                    if (errorCount >= maxErrors) {
                        // Cap reached: report what has been collected so far.
                        fail(failures.toString());
                    }
                }
            }
        }
    }
    // a break should never occur before a non-spacing mark, unless it's preceded
    // by a line terminator
    work.setLength(0);
    work.append("aaaa");
    for (int i = 0; i < testChars.length(); i++) {
        char c = testChars.charAt(i);
        if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003')
            continue;
        work.setCharAt(1, c);
        for (int j = 0; j < testChars.length(); j++) {
            c = testChars.charAt(j);
            if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c)
                    != Character.ENCLOSING_MARK)
                continue;
            work.setCharAt(2, c);
            // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
            // for breaking purposes as per UTR14
            int type1 = Character.getType(work.charAt(1));
            int type2 = Character.getType(work.charAt(2));
            if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
                    type2 == Character.CONTROL || type2 == Character.FORMAT) {
                continue;
            }
            tb.setText(work.toString());
            for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next()) {
                if (k == 2) {
                    failures.append("Break between U+")
                            .append(Integer.toHexString((int)(work.charAt(1))))
                            .append(" and U+")
                            .append(Integer.toHexString((int)(work.charAt(2))))
                            .append('\n');
                    errorCount++;
                    if (errorCount >= maxErrors) {
                        // Cap reached: report what has been collected so far.
                        fail(failures.toString());
                    }
                }
            }
        }
    }
    if (errorCount > 0) {
        fail(failures.toString());
    }
}
/**
 * Prints a string to standard output with everything outside printable
 * ASCII (space up to, but not including, DEL) replaced by a four-digit
 * lowercase hexadecimal escape introduced by backslash-u.
 *
 * @param s the string to print
 */
public void debugLogln(String s) {
    StringBuilder escaped = new StringBuilder(s.length());
    for (char ch : s.toCharArray()) {
        boolean printableAscii = ch >= ' ' && ch < '\u007f';
        if (printableAscii) {
            escaped.append(ch);
            continue;
        }
        String hex = Integer.toHexString((int) ch);
        escaped.append("\\u");
        // Left-pad to four hex digits.
        for (int width = hex.length(); width < 4; width++) {
            escaped.append('0');
        }
        escaped.append(hex);
    }
    System.out.println(escaped.toString());
}
//=========================================================================
// tests
//=========================================================================
/**
 * General word-break check over mixed text: numbers with separators,
 * currency signs, hyphenated words, punctuation runs and CR/LF sequences.
 */
@Test
public void TestWordBreak() {
    Vector<String> wordSelectionData = new Vector<>(Arrays.asList(
            "12,34",
            " ",
            "\u00A2", //cent sign
            "\u00A3", //pound sign
            "\u00A4", //currency sign
            "\u00A5", //yen sign
            "alpha-beta-gamma",
            ".",
            " ",
            "Badges",
            "?",
            " ",
            "BADGES",
            "!",
            "?",
            "!",
            " ",
            "We",
            " ",
            "don't",
            " ",
            "need",
            " ",
            "no",
            " ",
            "STINKING",
            " ",
            "BADGES",
            "!",
            "!",
            "!",
            "012.566,5",
            " ",
            "123.3434,900",
            " ",
            "1000,233,456.000",
            " ",
            "1,23.322%",
            " ",
            "123.1222",
            " ",
            "\u0024123,000.20",
            " ",
            "179.01\u0025",
            "Hello",
            ",",
            " ",
            "how",
            " ",
            "are",
            " ",
            "you",
            " ",
            "X",
            " ",
            "Now",
            "\r",
            "is",
            "\n",
            "the",
            "\r\n",
            "time",
            "\n",
            "\r",
            "for",
            "\r",
            "\r",
            "all",
            " "));
    generalIteratorTest(wordBreak, wordSelectionData);
}
/**
 * Bug 4097779: a word containing a combining grave accent must be kept
 * as a single word fragment.
 */
@Test
public void TestBug4097779() {
    Vector<String> wordSelectionData = new Vector<>(Arrays.asList(
            "aa\u0300a",
            " "));
    generalIteratorTest(wordBreak, wordSelectionData);
}
/**
 * Bug 4098467 (words): Korean text written as precomposed syllables and as
 * conjoining jamo is expected to produce the same word breaks.
 */
@Test
public void TestBug4098467Words() {
    // What follows is a string of Korean characters (I found it in the Yellow Pages
    // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
    // it correctly), first as precomposed syllables, and then as conjoining jamo.
    // Both sequences should be semantically identical and break the same way.
    Vector<String> wordSelectionData = new Vector<>(Arrays.asList(
            // precomposed syllables...
            "\uc0c1\ud56d",
            " ",
            "\ud55c\uc778",
            " ",
            "\uc5f0\ud569",
            " ",
            "\uc7a5\ub85c\uad50\ud68c",
            " ",
            // conjoining jamo...
            "\u1109\u1161\u11bc\u1112\u1161\u11bc",
            " ",
            "\u1112\u1161\u11ab\u110b\u1175\u11ab",
            " ",
            "\u110b\u1167\u11ab\u1112\u1161\u11b8",
            " ",
            "\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c",
            " "));
    generalIteratorTest(wordBreak, wordSelectionData);
}
/**
 * Bug 4117554 (words): the ideographic iteration mark (U+3005) should
 * count as a Kanji character for the purposes of word breaking.
 */
@Test
public void TestBug4117554Words() {
    Vector<String> wordSelectionData = new Vector<>(Arrays.asList(
            "abc",
            "\u4e01\u4e02\u3005\u4e03\u4e03",
            "abc"));
    generalIteratorTest(wordBreak, wordSelectionData);
}
/**
 * General sentence-break check over prose containing quotes, parentheses,
 * abbreviations, repeated terminators and paragraph separators.
 */
@Test
public void TestSentenceBreak() {
    Vector<String> sentenceSelectionData = new Vector<>(Arrays.asList(
            "This is a simple sample sentence. ",
            "(This is it.) ",
            "This is a simple sample sentence. ",
            "\"This isn\'t it.\" ",
            "Hi! ",
            "This is a simple sample sentence. ",
            "It does not have to make any sense as you can see. ",
            "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
            "Che la dritta via aveo smarrita. ",
            "He said, that I said, that you said!! ",
            "Don't rock the boat.\u2029",
            "Because I am the daddy, that is why. ",
            "Not on my time (el timo.)! ",
            "So what!!\u2029",
            "\"But now,\" he said, \"I know!\" ",
            "Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ",
            "One species, B. anthracis, is highly virulent.\n",
            "Wolf said about Sounder:\"Beautifully thought-out and directed.\" ",
            "Have you ever said, \"This is where \tI shall live\"? ",
            "He answered, \"You may not!\" ",
            "Another popular saying is: \"How do you do?\". ",
            "Yet another popular saying is: \'I\'m fine thanks.\' ",
            "What is the proper use of the abbreviation pp.? ",
            "Yes, I am definatelly 12\" tall!!"));
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * Bug 4113835: \n and \r count as spaces, not as paragraph breaks, so the
 * whole string is expected to be one sentence.
 */
@Test
public void TestBug4113835() {
    Vector<String> sentenceSelectionData = new Vector<>(Arrays.asList(
            "Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029"));
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * Bug 4111338: don't break sentences at the boundary between CJK
 * and other letters.
 */
@Test
public void TestBug4111338() {
    String quotedJava =
            "\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c"
            + "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba"
            + "\u611d\u57b6\u2510\u5d46\".\u2029";
    String embeddedJava =
            "\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
            + "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
            + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029";
    String embeddedJavaLonger =
            "\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4"
            + "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8"
            + "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029";
    Vector<String> sentenceSelectionData = new Vector<>(Arrays.asList(
            quotedJava,
            embeddedJava,
            embeddedJavaLonger,
            "He said, \"I can go there.\"\u2029"));
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * Bug 4117554 (sentences): fullwidth terminators, CJK/digit boundaries, and
 * a terminator directly followed by opening punctuation.
 */
@Test
public void TestBug4117554Sentences() {
    Vector<String> sentenceSelectionData = new Vector<>(Arrays.asList(
            // Treat fullwidth variants of .!? the same as their
            // normal counterparts
            "I know I'm right\uff0e ",
            "Right\uff1f ",
            "Right\uff01 ",
            // Don't break sentences at boundary between CJK and digits
            "\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
            + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
            + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029",
            // Break sentence between a sentence terminator and
            // opening punctuation
            "no?",
            "(yes)"));
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * Bug 4158381: periods not followed by a space, and markup-heavy text
 * without terminators, must not introduce sentence breaks.
 */
@Test
public void TestBug4158381() {
    Vector<String> sentenceSelectionData = new Vector<>(Arrays.asList(
            // Don't break sentence after period if it isn't followed by a space
            "Test <code>Flags.Flag</code> class. ",
            "Another test.\u2029",
            // No breaks when there are no terminators around
            "<P>Provides a set of "
            + "&quot;lightweight&quot; (all-java<FONT SIZE=\"-2\"><SUP>TM"
            + "</SUP></FONT> language) components that, "
            + "to the maximum degree possible, work the same on all platforms. ",
            "Another test.\u2029"));
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * Bug 4143071: make sure sentences that end with digits work right.
 */
@Test
public void TestBug4143071() {
    Vector<String> sentenceSelectionData = new Vector<>(Arrays.asList(
            "Today is the 27th of May, 1998. ",
            "Tomorrow with be 28 May 1998. ",
            "The day after will be the 30th.\u2029"));
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * Bug 4152416: make sure sentences ending with a capital letter are
 * treated correctly.
 */
@Test
public void TestBug4152416() {
    String firstSentence = "The type of all primitive "
            + "<code>boolean</code> values accessed in the target VM. ";
    String secondSentence = "Calls to xxx will return an "
            + "implementor of this interface.\u2029";
    Vector<String> sentenceSelectionData = new Vector<>(Arrays.asList(
            firstSentence,
            secondSentence));
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * Bug 4152117: make sure sentence breaking is handling punctuation correctly.
 * [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE
 * IT DOESN'T CROP UP]
 */
@Test
public void TestBug4152117() {
    Vector<String> sentenceSelectionData = new Vector<>(Arrays.asList(
            "Constructs a randomly generated "
            + "BigInteger, uniformly distributed over the range <tt>0</tt> "
            + "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive. ",
            "The uniformity of the distribution "
            + "assumes that a fair source of random bits is provided in "
            + "<tt>rnd</tt>. ",
            "Note that this constructor always "
            + "constructs a non-negative BigInteger.\u2029"));
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * Bug 8264765: a comma should not be regarded as the start of a sentence,
 * otherwise the backwards rule would break the sentence used here.
 */
@Test
public void TestBug8264765() {
    Vector<String> sentenceSelectionData = new Vector<>(Arrays.asList(
            "Due to a problem (e.g., software bug), the server is down. "));
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * General line-break check: hyphens, spaces, tabs, parentheses and the
 * various hard line terminators.
 */
@Test
public void TestLineBreak() {
    Vector<String> lineSelectionData = new Vector<>(Arrays.asList(
            "Multi-",
            "Level ",
            "example ",
            "of ",
            "a ",
            "semi-",
            "idiotic ",
            "non-",
            "sensical ",
            "(non-",
            "important) ",
            "sentence. ",
            "Hi ",
            "Hello ",
            "How\n",
            "are\r",
            "you\u2028",
            "fine.\t",
            "good. ",
            "Now\r",
            "is\n",
            "the\r\n",
            "time\n",
            "\r",
            "for\r",
            "\r",
            "all"));
    generalIteratorTest(lineBreak, lineSelectionData);
}
/**
 * Bug 4068133: line breaks around CJK ideographs followed by ideographic
 * and ASCII punctuation; the punctuation stays with the preceding ideograph.
 */
@Test
public void TestBug4068133() {
    Vector<String> lineSelectionData = new Vector<>(Arrays.asList(
            "\u96f6",
            "\u4e00\u3002",
            "\u4e8c\u3001",
            "\u4e09\u3002\u3001",
            "\u56db\u3001\u3002\u3001",
            "\u4e94,",
            "\u516d.",
            "\u4e03.\u3001,\u3002",
            "\u516b"));
    generalIteratorTest(lineBreak, lineSelectionData);
}
/**
 * Bug 4086052: no line break is expected at a no-break space (U+00A0).
 */
@Test
public void TestBug4086052() {
    Vector<String> lineSelectionData = new Vector<>(Arrays.asList(
            "foo\u00a0bar "));
    // lineSelectionData.addElement("foo\ufeffbar");
    generalIteratorTest(lineBreak, lineSelectionData);
}
/**
 * Bug 4097920: line breaks after commas and between adjacent
 * parenthesised groups.
 */
@Test
public void TestBug4097920() {
    Vector<String> lineSelectionData = new Vector<>(Arrays.asList(
            "dog,",
            "cat,",
            "mouse ",
            "(one)",
            "(two)\n"));
    generalIteratorTest(lineBreak, lineSelectionData);
}
/*
@Test
public void TestBug4035266() {
Vector<String> lineSelectionData = new Vector<String>();
lineSelectionData.addElement("The ");
lineSelectionData.addElement("balance ");
lineSelectionData.addElement("is ");
lineSelectionData.addElement("$-23,456.78, ");
lineSelectionData.addElement("not ");
lineSelectionData.addElement("-$32,456.78!\n");
generalIteratorTest(lineBreak, lineSelectionData);
}
*/
/**
 * Bug 4098467 (lines): line breaks in Korean text given as precomposed
 * syllables and as conjoining jamo. Skipped when the default locale's
 * language is Thai.
 */
@Test
public void TestBug4098467Lines() {
    if (Locale.getDefault().getLanguage().equals("th")) {
        System.out.println("This test is skipped in th locale.");
        return;
    }
    // What follows is a string of Korean characters (I found it in the Yellow Pages
    // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
    // it correctly), first as precomposed syllables, and then as conjoining jamo.
    // Both sequences should be semantically identical and break the same way.
    Vector<String> lineSelectionData = new Vector<>(Arrays.asList(
            // precomposed syllables...
            "\uc0c1",
            "\ud56d ",
            "\ud55c",
            "\uc778 ",
            "\uc5f0",
            "\ud569 ",
            "\uc7a5",
            "\ub85c",
            "\uad50",
            "\ud68c ",
            // conjoining jamo...
            "\u1109\u1161\u11bc\u1112\u1161\u11bc ",
            "\u1112\u1161\u11ab\u110b\u1175\u11ab ",
            "\u110b\u1167\u11ab\u1112\u1161\u11b8 ",
            "\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c"));
    generalIteratorTest(lineBreak, lineSelectionData);
}
/**
 * Bug 4117554 (lines): fullwidth .!? should be treated as postJwrd, i.e.
 * they stay attached to the preceding ideograph.
 */
@Test
public void TestBug4117554Lines() {
    Vector<String> lineSelectionData = new Vector<>(Arrays.asList(
            "\u4e01\uff0e",
            "\u4e02\uff01",
            "\u4e03\uff1f"));
    generalIteratorTest(lineBreak, lineSelectionData);
}
/**
 * Bug 4217703: there shouldn't be a line break between sentence-ending
 * punctuation and a closing quote. Skipped when the default locale's
 * language is Thai.
 */
@Test
public void TestBug4217703() {
    boolean thaiLocale = Locale.getDefault().getLanguage().equals("th");
    if (thaiLocale) {
        System.out.println("This test is skipped in th locale.");
        return;
    }
    Vector<String> lineSelectionData = new Vector<>(Arrays.asList(
            "He ",
            "said ",
            "\"Go!\" ",
            "I ",
            "went. ",
            "Hashtable$Enumeration ",
            "getText().",
            "getIndex()"));
    generalIteratorTest(lineBreak, lineSelectionData);
}
// Two-char sequences of a base letter followed by a combining mark. Each is
// added as a single expected fragment in TestCharacterBreak, i.e. the
// character iterator is expected to keep base and mark together.
private static final String graveS = "S\u0300"; // S + U+0300 (combining grave accent)
private static final String acuteBelowI = "i\u0317"; // i + U+0317 (combining acute accent below)
private static final String acuteE = "e\u0301"; // e + U+0301 (combining acute accent)
private static final String circumflexA = "a\u0302"; // a + U+0302 (combining circumflex accent)
private static final String tildeE = "e\u0303"; // e + U+0303 (combining tilde)
/**
 * General character-break check: base letters with combining marks form
 * one unit, plain letters break individually, and CR LF stays together
 * while lone CR and LF are separate units.
 */
@Test
public void TestCharacterBreak() {
    Vector<String> characterSelectionData = new Vector<>(Arrays.asList(
            graveS,
            acuteBelowI,
            "m",
            "p",
            "l",
            acuteE,
            " ",
            "s",
            circumflexA,
            "m",
            "p",
            "l",
            tildeE,
            ".",
            "w",
            circumflexA,
            "w",
            "a",
            "f",
            "q",
            "\n",
            "\r",
            "\r\n",
            "\n"));
    generalIteratorTest(characterBreak, characterSelectionData);
}
/**
 * Character-break check named for bug 4098467, using Korean text.
 *
 * The same phrase is listed twice: first as precomposed syllables and then
 * as conjoining jamo. Per the original author's note the text came from a
 * Yellow Pages ad for the Korean Presbyterian Church of San Francisco, and
 * both spellings should be semantically identical and break the same way.
 */
@Test
public void TestBug4098467Characters() {
    // precomposed syllables...
    String[] precomposed = {
        "\uc0c1", "\ud56d", " ",
        "\ud55c", "\uc778", " ",
        "\uc5f0", "\ud569", " ",
        "\uc7a5", "\ub85c", "\uad50", "\ud68c", " ",
    };
    // conjoining jamo... (one element per syllable-sized jamo sequence)
    String[] conjoiningJamo = {
        "\u1109\u1161\u11bc", "\u1112\u1161\u11bc", " ",
        "\u1112\u1161\u11ab", "\u110b\u1175\u11ab", " ",
        "\u110b\u1167\u11ab", "\u1112\u1161\u11b8", " ",
        "\u110c\u1161\u11bc", "\u1105\u1169", "\u1100\u116d", "\u1112\u116c",
    };
    Vector<String> characterSelectionData = new Vector<String>();
    characterSelectionData.addAll(Arrays.asList(precomposed));
    characterSelectionData.addAll(Arrays.asList(conjoiningJamo));
    generalIteratorTest(characterBreak, characterSelectionData);
}
/**
 * Check named for bug 4153072: isBoundary() on a word iterator whose text is
 * a sub-range of a longer string.
 *
 * The iterator is given a StringCharacterIterator covering only the middle
 * of the string (begin index 3). Offsets from -1 up to and including the
 * begin index are probed: every offset below the begin index must raise
 * IllegalArgumentException, and the begin index itself must not.
 */
@Test
public void TestBug4153072() {
    BreakIterator iter = BreakIterator.getWordInstance();
    String str = "...Hello, World!...";
    int begin = 3;
    int end = str.length() - 3;

    iter.setText(new StringCharacterIterator(str, begin, end, begin));
    for (int index = -1; index < begin + 1; ++index) {
        try {
            // Only the exception behaviour matters; the result is ignored.
            iter.isBoundary(index);
            if (index < begin) {
                fail("Didn't get exception with offset = " + index +
                        " and begin index = " + begin);
            }
        } catch (IllegalArgumentException e) {
            if (index >= begin) {
                fail("Got exception with offset = " + index +
                        " and begin index = " + begin);
            }
        }
    }
}
/**
 * Sentence-break check named for bug 4146175. Passes four expected
 * segments to generalIteratorTest with the sentence-break iterator.
 */
@Test
public void TestBug4146175Sentences() {
    String[] segments = {
        // break between periods and opening punctuation even when there's no
        // intervening space
        "end.",
        "(This is\u2029",
        // treat the fullwidth period as an unambiguous sentence terminator
        "\u7d42\u308f\u308a\uff0e",
        "\u300c\u3053\u308c\u306f",
    };
    Vector<String> sentenceSelectionData = new Vector<String>();
    for (String segment : segments) {
        sentenceSelectionData.addElement(segment);
    }
    generalIteratorTest(sentenceBreak, sentenceSelectionData);
}
/**
 * Line-break check named for bug 4146175. Does nothing except print a
 * notice when the default locale's language is "th".
 */
@Test
public void TestBug4146175Lines() {
    boolean thaiDefault = "th".equals(Locale.getDefault().getLanguage());
    if (thaiDefault) {
        System.out.println("This test is skipped in th locale.");
        return;
    }
    Vector<String> lineSelectionData = new Vector<String>();
    // the fullwidth comma (U+FF0C) should stick to the preceding Japanese
    // character, so it shares a segment with it
    lineSelectionData.add("\u7d42\uff0c");
    lineSelectionData.add("\u308f");
    generalIteratorTest(lineBreak, lineSelectionData);
}
/**
 * Word-break check named for bug 4214367. Does nothing except print a
 * notice when the default locale's language is "th".
 */
@Test
public void TestBug4214367() {
    boolean thaiDefault = "th".equals(Locale.getDefault().getLanguage());
    if (thaiDefault) {
        System.out.println("This test is skipped in th locale.");
        return;
    }
    // the hiragana and katakana iteration marks and the long vowel mark
    // are not being treated correctly by the word-break iterator
    String hiraganaRun = "\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042";
    String katakanaRun = "\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2";
    Vector<String> wordSelectionData = new Vector<String>();
    wordSelectionData.add(hiraganaRun);
    wordSelectionData.add(katakanaRun);
    generalIteratorTest(wordBreak, wordSelectionData);
}
// Shared pool of sample characters. Each of the *Invariants tests appends its
// own iterator-specific extras to this string before running its checks.
// The leading run of control characters is commented out of the literal.
private static final String cannedTestChars // characters of the class Cc are ignorable for breaking
= /*"\u0000\u0001\u0002\u0003\u0004*/" !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2"
+ "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3"
+ "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303"
+ "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000"
+ "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f"
+ "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164";
/**
 * Runs doOtherInvariantTest on the default sentence iterator, using the
 * shared character pool plus some sentence-specific punctuation and kana.
 */
@Test
public void TestSentenceInvariants()
{
    String testChars = cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff";
    BreakIterator sentenceIter = BreakIterator.getSentenceInstance();
    doOtherInvariantTest(sentenceIter, testChars);
}
/**
 * Runs both invariant helpers on the default word iterator with the same
 * character set. Does nothing except print a notice when the default
 * locale's language is "th".
 */
@Test
public void TestWordInvariants()
{
    boolean thaiDefault = "th".equals(Locale.getDefault().getLanguage());
    if (thaiDefault) {
        System.out.println("This test is skipped in th locale.");
        return;
    }
    // Shared pool plus word-specific punctuation, kana and ideographs.
    String testChars = cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
            + "\u30a3\u4e00\u4e01\u4e02";
    BreakIterator wordIter = BreakIterator.getWordInstance();
    doBreakInvariantTest(wordIter, testChars);
    doOtherInvariantTest(wordIter, testChars);
}
/**
 * Invariant checks for the default line-break iterator. Does nothing except
 * print a notice when the default locale's language is "th".
 *
 * After the two generic invariant helpers, every three-character string
 * "X G Y" is tried, where G is one of the non-breaking characters and X, Y
 * come from the test pool. The iterator must not report a boundary at
 * offset 1 or 2, except where a space is immediately followed by a glue
 * character.
 */
@Test
public void TestLineInvariants()
{
    if ("th".equals(Locale.getDefault().getLanguage())) {
        System.out.println("This test is skipped in th locale.");
        return;
    }
    BreakIterator e = BreakIterator.getLineInstance();
    String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045"
            + "\u30a3\u4e00\u4e01\u4e02";
    doBreakInvariantTest(e, testChars);
    doOtherInvariantTest(e, testChars);
    int errorCount = 0;

    // in addition to the other invariants, a line-break iterator should make sure that:
    // it doesn't break around the non-breaking characters
    String noBreak = "\u00a0\u2007\u2011\ufeff";
    // glue characters before which a break is tolerated when a space precedes them
    String glueAfterSpace = "\u00a0\u0f0c\u2007\u2011\u202f\ufeff";
    StringBuffer work = new StringBuffer("aaa");

    for (int first = 0; first < testChars.length(); first++) {
        char lead = testChars.charAt(first);
        // line/paragraph terminators and U+0003 are not used as the leading character
        boolean skippedLead = lead == '\r' || lead == '\n'
                || lead == '\u2029' || lead == '\u2028' || lead == '\u0003';
        if (skippedLead) {
            continue;
        }
        work.setCharAt(0, lead);

        for (int mid = 0; mid < noBreak.length(); mid++) {
            char middle = noBreak.charAt(mid);
            work.setCharAt(1, middle);
            int type1 = Character.getType(middle);

            for (int last = 0; last < testChars.length(); last++) {
                char trail = testChars.charAt(last);
                work.setCharAt(2, trail);
                int type2 = Character.getType(trail);

                // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
                // for breaking purposes as per UTR14
                boolean ignorable = type1 == Character.CONTROL || type1 == Character.FORMAT
                        || type2 == Character.CONTROL || type2 == Character.FORMAT;
                if (ignorable) {
                    continue;
                }

                e.setText(work.toString());
                for (int pos = e.first(); pos != BreakIterator.DONE; pos = e.next()) {
                    if (pos != 1 && pos != 2) {
                        continue;
                    }
                    // as per UTR14 spaces followed by a GLUE character should allow
                    // line breaking
                    boolean spaceThenGlue = work.charAt(pos - 1) == '\u0020'
                            && glueAfterSpace.indexOf(work.charAt(pos)) >= 0;
                    if (spaceThenGlue) {
                        continue;
                    }
                    fail("Got break between U+" + Integer.toHexString((int)
                            (work.charAt(pos - 1))) + " and U+" + Integer.toHexString(
                            (int)(work.charAt(pos))));
                    errorCount++;
                    if (errorCount >= 75) {
                        return;
                    }
                }
            }
        }
    }
    // The following test has so many exceptions that it would be better to write a new set of data
    // that tested exactly what should be tested
    // Until that point it will be commented out
    /*
    // it does break after dashes (unless they're followed by a digit, a non-spacing mark,
    // a currency symbol, a space, a format-control character, a regular control character,
    // a line or paragraph separator, or another dash)
    String dashes = "-\u00ad\u2010\u2012\u2013\u2014";
    for (int i = 0; i < testChars.length(); i++) {
    work.setCharAt(0, testChars.charAt(i));
    for (int j = 0; j < dashes.length(); j++) {
    work.setCharAt(1, dashes.charAt(j));
    for (int k = 0; k < testChars.length(); k++) {
    char c = testChars.charAt(k);
    if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER ||
    Character.getType(c) == Character.OTHER_NUMBER ||
    Character.getType(c) == Character.NON_SPACING_MARK ||
    Character.getType(c) == Character.ENCLOSING_MARK ||
    Character.getType(c) == Character.CURRENCY_SYMBOL ||
    Character.getType(c) == Character.DASH_PUNCTUATION ||
    Character.getType(c) == Character.SPACE_SEPARATOR ||
    Character.getType(c) == Character.FORMAT ||
    Character.getType(c) == Character.CONTROL ||
    Character.getType(c) == Character.END_PUNCTUATION ||
    Character.getType(c) == Character.FINAL_QUOTE_PUNCTUATION ||
    Character.getType(c) == Character.OTHER_PUNCTUATION ||
    c == '\'' || c == '\"' ||
    // category EX as per UTR14
    c == '!' || c == '?' || c == '\ufe56' || c == '\ufe57' || c == '\uff01' || c == '\uff1f' ||
    c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' ||
    c == '\u0003' || c == '\u2007' || c == '\u2011' ||
    c == '\ufeff')
    continue;
    work.setCharAt(2, c);
    e.setText(work.toString());
    boolean saw2 = false;
    for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
    if (l == 2)
    saw2 = true;
    if (!saw2) {
    fail("Didn't get break between U+" + Integer.toHexString((int)
    (work.charAt(1))) + " and U+" + Integer.toHexString(
    (int)(work.charAt(2))));
    errorCount++;
    if (errorCount >= 75)
    return;
    }
    }
    }
    }
    */
}
/**
 * Runs both invariant helpers on the default character iterator, using the
 * shared character pool extended with a handful of Hangul jamo.
 */
@Test
public void TestCharacterInvariants()
{
    String testChars = cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
            + "\u11a9\u11aa";
    BreakIterator charIter = BreakIterator.getCharacterInstance();
    doBreakInvariantTest(charIter, testChars);
    doOtherInvariantTest(charIter, testChars);
}
/**
 * Passes a single empty string as the only expected segment to
 * generalIteratorTest with the line-break iterator.
 */
@Test
public void TestEmptyString()
{
    Vector<String> onlyEmpty = new Vector<String>();
    onlyEmpty.add("");
    generalIteratorTest(lineBreak, onlyEmpty);
}
/**
 * Minimal sanity check: BreakIterator.getAvailableLocales() must return at
 * least one locale. Nothing else about the result is verified.
 */
@Test
public void TestGetAvailableLocales()
{
    int localeCount = BreakIterator.getAvailableLocales().length;
    if (localeCount < 1) {
        fail("getAvailableLocales() returned an empty list!");
    }
    // I have no idea how to test this function...
}
/**
 * Bug 4095322
 *
 * Uses a three-character string (kanji, probe character, kanji) with the
 * Japanese line-break iterator. For each "preceding" character the expected
 * boundaries are 0, 1, 3 (break before the probe, none after it); for each
 * "following" character they are 0, 2, 3 (no break before the probe, break
 * after it).
 */
@Test
public void TestJapaneseLineBreak()
{
    // Position 1 ('x') is overwritten with each probe character in turn.
    StringBuffer testString = new StringBuffer("\u4e00x\u4e8c");
    // Breaking on <Kanji>$<Kanji> is inconsistent
    /* Characters in precedingChars and followingChars have been updated
     * from Unicode 2.0.14-based to 3.0.0-based when 4638433 was fixed.
     * In concrete terms,
     * 0x301F : Its category was changed from Ps to Pe since Unicode 2.1.
     * 0x169B & 0x169C : added since Unicode 3.0.0.
     */
    String precedingChars =
        /* Punctuation, Open */
        "([{\u201a\u201e\u2045\u207d\u208d\u2329\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff62\u169b"
        /* Punctuation, Initial quote */
        + "\u00ab\u2018\u201b\u201c\u201f\u2039"
        /* Symbol, Currency */
        + "\u00a5\u00a3\u00a4\u20a0";
    String followingChars =
        /* Punctuation, Close */
        ")]}\u2046\u207e\u208e\u232a\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e\u301f\ufd3e\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42\ufe44\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff63\u169c"
        /* Punctuation, Final quote */
        + "\u00bb\u2019\u201d\u203a"
        /* Punctuation, Other */
        + "!%,.:;\u3001\u3002\u2030\u2031\u2032\u2033\u2034"
        /* Punctuation, Dash */
        /* NOTE(review): U+2103 and U+2109 look like degree signs rather than dashes; confirm this label */
        + "\u2103\u2109"
        /* Symbol, Currency */
        + "\u00a2"
        /* Letter, Modifier */
        + "\u3005\u309d\u309e"
        /* Letter, Other */
        + "\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc\u30fd\u30fe"
        /* Mark, Non-Spacing */
        + "\u0300\u0301\u0302"
        /* Symbol, Modifier */
        + "\u309b\u309c"
        /* Symbol, Other */
        + "\u00b0";

    BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN);

    // Characters that bind to what follows: expect boundaries 0, 1, 3.
    for (char probe : precedingChars.toCharArray()) {
        testString.setCharAt(1, probe);
        iter.setText(testString.toString());

        int boundary = iter.first();
        if (boundary != 0) {
            fail("ja line break failure: failed to start at 0 and bounced at " + boundary);
        }

        boundary = iter.next();
        if (boundary != 1) {
            fail("ja line break failure: failed to stop before '"
                    + probe + "' (\\u"
                    + Integer.toString(probe, 16)
                    + ") at 1 and bounded at " + boundary);
        }

        boundary = iter.next();
        if (boundary != 3) {
            fail("ja line break failure: failed to skip position after '"
                    + probe + "' (\\u"
                    + Integer.toString(probe, 16)
                    + ") at 3 and bounded at " + boundary);
        }
    }

    // Characters that bind to what precedes: expect boundaries 0, 2, 3.
    for (char probe : followingChars.toCharArray()) {
        testString.setCharAt(1, probe);
        iter.setText(testString.toString());

        int boundary = iter.first();
        if (boundary != 0) {
            fail("ja line break failure: failed to start at 0 and bounded at " + boundary);
        }

        boundary = iter.next();
        if (boundary != 2) {
            fail("ja line break failure: failed to skip position before '"
                    + probe + "' (\\u"
                    + Integer.toString(probe, 16)
                    + ") at 2 and bounded at " + boundary);
        }

        boundary = iter.next();
        if (boundary != 3) {
            fail("ja line break failure: failed to stop after '"
                    + probe + "' (\\u"
                    + Integer.toString(probe, 16)
                    + ") at 3 and bounded at " + boundary);
        }
    }
}
/**
 * Bug 4638433
 *
 * Spot checks on characters whose data is described below as added or
 * re-categorised by Unicode 3.0.0 (or 2.1). Each check sets a short text,
 * rewinds the iterator and compares the next boundary (or two) with the
 * expected offset.
 */
@Test
public void TestLineBreakBasedOnUnicode3_0_0()
{
    int boundary;

    /* Latin Extended-B characters
     * 0x0218-0x0233 which have been added since Unicode 3.0.0.
     */
    BreakIterator wordIter = BreakIterator.getWordInstance(Locale.US);
    wordIter.setText("\u0216\u0217\u0218\u0219\u021A");
    wordIter.first();
    boundary = wordIter.next();
    if (boundary != 5) {
        fail("Word break failure: failed to stop at 5 and bounded at " + boundary);
    }

    BreakIterator lineIter = BreakIterator.getLineInstance(Locale.US);

    /* <Three(Nd)><Two(Nd)><Low Double Prime Quotation Mark(Pe)><One(Nd)>
     * U+301F has changed its category from Ps to Pe since Unicode 2.1.
     */
    lineIter.setText("32\u301f1");
    lineIter.first();
    boundary = lineIter.next();
    if (boundary != 3) {
        fail("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + boundary);
    }

    /* Mongolian <Letter A(Lo)><Todo Soft Hyphen(Pd)><Letter E(Lo)>
     * which have been added since Unicode 3.0.0.
     */
    lineIter.setText("\u1820\u1806\u1821");
    lineIter.first();
    boundary = lineIter.next();
    if (boundary != 2) {
        fail("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + boundary);
    }

    /* Khmer <ZERO(Nd)><Currency Symbol(Sc)><ONE(Nd)> which have
     * been added since Unicode 3.0.0.
     */
    lineIter.setText("\u17E0\u17DB\u17E1");
    lineIter.first();
    boundary = lineIter.next();
    if (boundary != 1) {
        fail("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + boundary);
    }
    boundary = lineIter.next();
    if (boundary != 3) {
        fail("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + boundary);
    }

    /* Ogham <Letter UR(Lo)><Space Mark(Zs)><Letter OR(Lo)> which have
     * been added since Unicode 3.0.0.
     */
    lineIter.setText("\u1692\u1680\u1696");
    lineIter.first();
    boundary = lineIter.next();
    if (boundary != 2) {
        fail("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + boundary);
    }

    // Confirm changes in BreakIteratorRules_th.java have been reflected.
    BreakIterator thaiLineIter = BreakIterator.getLineInstance(Locale.of("th"));

    /* Thai <Seven(Nd)>
     * <Left Double Quotation Mark(Pi)>
     * <Five(Nd)>
     * <Right Double Quotation Mark(Pf)>
     * <Three(Nd)>
     */
    thaiLineIter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
    thaiLineIter.first();
    boundary = thaiLineIter.next();
    if (boundary != 1) {
        fail("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + boundary);
    }
    boundary = thaiLineIter.next();
    if (boundary != 4) {
        fail("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + boundary);
    }
}
/**
 * Bug 4068137
 *
 * Walks the default word iterator over "boo." and expects boundaries at
 * 0 and 3, and then 4 either as the current position or as the next one.
 */
@Test
public void TestEndBehavior()
{
    BreakIterator wb = BreakIterator.getWordInstance();
    wb.setText("boo.");

    if (wb.first() != 0) {
        fail("Didn't get break at beginning of string.");
    }
    if (wb.next() != 3) {
        fail("Didn't get break before period in \"boo.\"");
    }
    // Only advance when the iterator is not already sitting at the end.
    if (wb.current() != 4) {
        if (wb.next() != 4) {
            fail("Didn't get break at end of string.");
        }
    }
}
// [serialization test has been removed pursuant to bug #4152965]
/**
 * Bug 4450804
 *
 * Passes a sentence containing quoted text and a contraction, split into
 * its expected line-break segments, to generalIteratorTest.
 */
@Test
public void TestLineBreakContractions() {
    String[] segments = {
        "These ",
        "are ",
        "'foobles'. ",
        "Don't ",
        "you ",
        "like ",
        "them?",
    };
    Vector<String> expected = new Vector<String>();
    for (String segment : segments) {
        expected.add(segment);
    }
    generalIteratorTest(lineBreak, expected);
}
// Matches a run of four or five characters drawn from 0-9 and upper-case A-F.
// TestGraphemeBreak uses it to find the hexadecimal code points in each line
// of its data file.
private static final Pattern CODEPOINT = Pattern.compile("([0-9A-F]{4,5})");
/**
 * Runs the character-break iterator against every case in the
 * GraphemeBreakTest.txt data file, which is located relative to the
 * "test.root" system property.
 *
 * Each line has its trailing comment removed and is split on the division
 * sign into segments. Within a segment the hexadecimal code points are
 * replaced by the characters they denote and the multiplication-sign
 * separators are removed. The non-empty segments of a line are then passed
 * to generalIteratorTest as the expected breaks.
 *
 * @throws Exception if the data file cannot be read
 */
@Test
public void TestGraphemeBreak() throws Exception {
    // Files.lines keeps the file open until the stream is closed, so the
    // stream is scoped with try-with-resources.
    try (var lines = Files.lines(Paths.get(System.getProperty("test.root"),
            "../../src/java.base/share/data/unicodedata/auxiliary/GraphemeBreakTest.txt"))) {
        lines.map(ln -> ln.replaceFirst("#.*", ""))
            .filter(Predicate.not(String::isEmpty))
            .map(line -> line.split("\\s*÷[\\s\\t]*"))
            .forEach(sa -> {
                Vector<String> expected = new Vector<>(
                    Arrays.stream(sa)
                        // The function result is interpreted as a replacement
                        // pattern, so quote it in case the decoded character
                        // is a dollar sign or a backslash.
                        .map(line -> CODEPOINT.matcher(line).replaceAll(mr ->
                            java.util.regex.Matcher.quoteReplacement(
                                Character.toString(Integer.parseInt(mr.group(), 16)))))
                        .map(line -> line.replaceAll("\\s×\\s", ""))
                        .filter(Predicate.not(String::isEmpty))
                        .toList());
                generalIteratorTest(characterBreak, expected);
            });
    }
}
/**
 * Calls setText on a character-break iterator with a StringCharacterIterator
 * restricted to the range [1, 5) of "abcfefg" and positioned at index 3.
 * There is no explicit assertion; the test passes as long as no exception
 * propagates out of setText.
 */
@Test
public void TestSetTextIOOBException() {
    BreakIterator charIter = BreakIterator.getCharacterInstance();
    StringCharacterIterator subRange = new StringCharacterIterator("abcfefg", 1, 5, 3);
    charIter.setText(subRange);
}
}