8291660: Grapheme support in BreakIterator
Reviewed-by: smarks
This commit is contained in:
parent
a14c3a493a
commit
b8598b0297
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -94,25 +94,20 @@ public class GenerateBreakIteratorData {
|
||||
rules = (ResourceBundle) Class.forName(
|
||||
localizedBundleName("sun.text.resources", "BreakIteratorRules")).getDeclaredConstructor().newInstance();
|
||||
|
||||
if (info.containsKey("CharacterData")) {
|
||||
generateDataFile(info.getString("CharacterData"),
|
||||
rules.getString("CharacterBreakRules"),
|
||||
classNames[0]);
|
||||
}
|
||||
if (info.containsKey("WordData")) {
|
||||
generateDataFile(info.getString("WordData"),
|
||||
rules.getString("WordBreakRules"),
|
||||
classNames[1]);
|
||||
classNames[0]);
|
||||
}
|
||||
if (info.containsKey("LineData")) {
|
||||
generateDataFile(info.getString("LineData"),
|
||||
rules.getString("LineBreakRules"),
|
||||
classNames[2]);
|
||||
classNames[1]);
|
||||
}
|
||||
if (info.containsKey("SentenceData")) {
|
||||
generateDataFile(info.getString("SentenceData"),
|
||||
rules.getString("SentenceBreakRules"),
|
||||
classNames[3]);
|
||||
classNames[2]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -24,12 +24,12 @@
|
||||
#
|
||||
|
||||
#
|
||||
# Rules to create $(SUPPORT_OUTPUTDIR)/gensrc/java.base/java/util/regex/EmojiData.java
|
||||
# Rules to create $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/EmojiData.java
|
||||
#
|
||||
|
||||
GENSRC_EMOJIDATA := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/java/util/regex/EmojiData.java
|
||||
GENSRC_EMOJIDATA := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/EmojiData.java
|
||||
|
||||
EMOJIDATATEMP = $(MODULE_SRC)/share/classes/java/util/regex/EmojiData.java.template
|
||||
EMOJIDATATEMP = $(MODULE_SRC)/share/classes/jdk/internal/util/regex/EmojiData.java.template
|
||||
UNICODEDATA = $(MODULE_SRC)/share/data/unicodedata
|
||||
|
||||
$(GENSRC_EMOJIDATA): $(BUILD_TOOLS_JDK) $(EMOJIDATATEMP) $(UNICODEDATA)/emoji/emoji-data.txt
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1996, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1996, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -99,6 +99,12 @@ import sun.util.locale.provider.LocaleServiceProviderPool;
|
||||
* and a diacritical mark. What users consider to be a character can
|
||||
* differ between languages.
|
||||
*
|
||||
* @implSpec The default implementation of the character boundary analysis
|
||||
* conforms to the Unicode Consortium's Extended Grapheme Cluster breaks.
|
||||
* For more detail, refer to
|
||||
* <a href="https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries">
|
||||
* Grapheme Cluster Boundaries</a> section in the Unicode Standard Annex #29.
|
||||
*
|
||||
* <p>
|
||||
* The {@code BreakIterator} instances returned by the factory methods
|
||||
* of this class are intended for use with natural languages only, not for
|
||||
|
@ -44,6 +44,7 @@ import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import jdk.internal.util.ArraysSupport;
|
||||
import jdk.internal.util.regex.Grapheme;
|
||||
|
||||
/**
|
||||
* A compiled representation of a regular expression.
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -23,7 +23,7 @@
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package java.util.regex;
|
||||
package jdk.internal.util.regex;
|
||||
|
||||
/**
|
||||
* Holds data contained in the Unicode Technical Standard #51: Unicode
|
@ -23,11 +23,11 @@
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package java.util.regex;
|
||||
package jdk.internal.util.regex;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
final class Grapheme {
|
||||
public final class Grapheme {
|
||||
|
||||
/**
|
||||
* Look for the next extended grapheme cluster boundary in a CharSequence.
|
||||
@ -43,7 +43,7 @@ final class Grapheme {
|
||||
* @param limit limit offset in the src (exclusive)
|
||||
* @return the next grapheme boundary
|
||||
*/
|
||||
static int nextBoundary(CharSequence src, int off, int limit) {
|
||||
public static int nextBoundary(CharSequence src, int off, int limit) {
|
||||
Objects.checkFromToIndex(off, limit, src.length());
|
||||
|
||||
int ch0 = Character.codePointAt(src, off);
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -50,7 +50,6 @@ public class BreakIteratorInfo extends ListResourceBundle {
|
||||
// built-in type of BreakIterator
|
||||
{"BreakIteratorClasses",
|
||||
new String[] {
|
||||
"RuleBasedBreakIterator", // character-break iterator class
|
||||
"RuleBasedBreakIterator", // word-break iterator class
|
||||
"RuleBasedBreakIterator", // line-break iterator class
|
||||
"RuleBasedBreakIterator" // sentence-break iterator class
|
||||
@ -58,7 +57,6 @@ public class BreakIteratorInfo extends ListResourceBundle {
|
||||
},
|
||||
|
||||
// Rules filename for each break-iterator
|
||||
{"CharacterData", "CharacterBreakIteratorData"},
|
||||
{"WordData", "WordBreakIteratorData"},
|
||||
{"LineData", "LineBreakIteratorData"},
|
||||
{"SentenceData", "SentenceBreakIteratorData"},
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -67,53 +67,6 @@ import java.util.ListResourceBundle;
|
||||
public class BreakIteratorRules extends ListResourceBundle {
|
||||
protected final Object[][] getContents() {
|
||||
return new Object[][] {
|
||||
// rules describing how to break between logical characters
|
||||
{ "CharacterBreakRules",
|
||||
|
||||
// ignore non-spacing marks and enclosing marks (since we never
|
||||
// put a break before ignore characters, this keeps combining
|
||||
// accents with the base characters they modify)
|
||||
"<enclosing>=[:Mn::Me:];"
|
||||
|
||||
// other category definitions
|
||||
+ "<choseong>=[\u1100-\u115f];"
|
||||
+ "<jungseong>=[\u1160-\u11a7];"
|
||||
+ "<jongseong>=[\u11a8-\u11ff];"
|
||||
+ "<surr-hi>=[\ud800-\udbff];"
|
||||
+ "<surr-lo>=[\udc00-\udfff];"
|
||||
|
||||
// break after every character, except as follows:
|
||||
+ ".;"
|
||||
|
||||
// keep base and combining characters together
|
||||
+ "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
|
||||
+ "<base><enclosing><enclosing>*;"
|
||||
|
||||
// keep CRLF sequences together
|
||||
+ "\r\n;"
|
||||
|
||||
// keep surrogate pairs together
|
||||
+ "<surr-hi><surr-lo>;"
|
||||
|
||||
// keep Hangul syllables spelled out using conjoining jamo together
|
||||
+ "<choseong>*<jungseong>*<jongseong>*;"
|
||||
|
||||
// various additions for Hindi support
|
||||
+ "<nukta>=[\u093c];"
|
||||
+ "<danda>=[\u0964\u0965];"
|
||||
+ "<virama>=[\u094d];"
|
||||
+ "<devVowelSign>=[\u093e-\u094c\u0962\u0963];"
|
||||
+ "<devConsonant>=[\u0915-\u0939];"
|
||||
+ "<devNuktaConsonant>=[\u0958-\u095f];"
|
||||
+ "<devCharEnd>=[\u0902\u0903\u0951-\u0954];"
|
||||
+ "<devCAMN>=(<devConsonant>{<nukta>});"
|
||||
+ "<devConsonant1>=(<devNuktaConsonant>|<devCAMN>);"
|
||||
+ "<zwj>=[\u200d];"
|
||||
+ "<devConjunct>=({<devConsonant1><virama>{<zwj>}}<devConsonant1>);"
|
||||
+ "<devConjunct>{<devVowelSign>}{<devCharEnd>};"
|
||||
+ "<danda><nukta>;"
|
||||
},
|
||||
|
||||
// default rules for finding word boundaries
|
||||
{ "WordBreakRules",
|
||||
// ignore non-spacing marks, enclosing marks, and format characters,
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -25,13 +25,18 @@
|
||||
|
||||
package sun.util.locale.provider;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.BreakIterator;
|
||||
import java.text.CharacterIterator;
|
||||
import java.text.spi.BreakIteratorProvider;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.MissingResourceException;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
import jdk.internal.util.regex.Grapheme;
|
||||
import sun.text.DictionaryBasedBreakIterator;
|
||||
import sun.text.RuleBasedBreakIterator;
|
||||
|
||||
@ -45,10 +50,9 @@ import sun.text.RuleBasedBreakIterator;
|
||||
public class BreakIteratorProviderImpl extends BreakIteratorProvider
|
||||
implements AvailableLanguageTags {
|
||||
|
||||
private static final int CHARACTER_INDEX = 0;
|
||||
private static final int WORD_INDEX = 1;
|
||||
private static final int LINE_INDEX = 2;
|
||||
private static final int SENTENCE_INDEX = 3;
|
||||
private static final int WORD_INDEX = 0;
|
||||
private static final int LINE_INDEX = 1;
|
||||
private static final int SENTENCE_INDEX = 2;
|
||||
|
||||
private final LocaleProviderAdapter.Type type;
|
||||
private final Set<String> langtags;
|
||||
@ -127,10 +131,7 @@ public class BreakIteratorProviderImpl extends BreakIteratorProvider
|
||||
*/
|
||||
@Override
|
||||
public BreakIterator getCharacterInstance(Locale locale) {
|
||||
return getBreakInstance(locale,
|
||||
CHARACTER_INDEX,
|
||||
"CharacterData",
|
||||
"CharacterDictionary");
|
||||
return new GraphemeBreakIterator();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -193,4 +194,151 @@ public class BreakIteratorProviderImpl extends BreakIteratorProvider
|
||||
public boolean isSupportedLocale(Locale locale) {
|
||||
return LocaleProviderAdapter.forType(type).isSupportedProviderLocale(locale, langtags);
|
||||
}
|
||||
|
||||
static final class GraphemeBreakIterator extends BreakIterator {
|
||||
CharacterIterator ci;
|
||||
int offset;
|
||||
List<Integer> boundaries;
|
||||
int boundaryIndex;
|
||||
|
||||
GraphemeBreakIterator() {
|
||||
setText("");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int first() {
|
||||
boundaryIndex = 0;
|
||||
return current();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int last() {
|
||||
boundaryIndex = boundaries.size() - 1;
|
||||
return current();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next(int n) {
|
||||
if (n == 0) {
|
||||
return offset;
|
||||
}
|
||||
|
||||
boundaryIndex = boundaryIndex + n;
|
||||
if (boundaryIndex < 0) {
|
||||
boundaryIndex = 0;
|
||||
current();
|
||||
return DONE;
|
||||
} else if (boundaryIndex >= boundaries.size()) {
|
||||
boundaryIndex = boundaries.size() - 1;
|
||||
current();
|
||||
return DONE;
|
||||
} else {
|
||||
return current();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next() {
|
||||
return next(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int previous() {
|
||||
return next(-1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int following(int offset) {
|
||||
var lastBoundary = boundaries.get(boundaries.size() - 1);
|
||||
|
||||
if (offset < boundaries.get(0) || offset > lastBoundary) {
|
||||
throw new IllegalArgumentException("offset is out of bounds: " + offset);
|
||||
} else if (offset == this.offset && this.offset == lastBoundary) {
|
||||
return DONE;
|
||||
}
|
||||
|
||||
boundaryIndex = Collections.binarySearch(boundaries, Math.min(offset + 1, lastBoundary));
|
||||
if (boundaryIndex < 0) {
|
||||
boundaryIndex = -boundaryIndex - 1;
|
||||
}
|
||||
|
||||
return current();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int current() {
|
||||
offset = boundaries.get(boundaryIndex);
|
||||
return offset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharacterIterator getText() {
|
||||
return ci;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setText(CharacterIterator newText) {
|
||||
ci = newText;
|
||||
var text = new CharacterIteratorCharSequence(ci);
|
||||
var end = ci.getEndIndex();
|
||||
boundaries = new ArrayList<>();
|
||||
|
||||
for (int b = ci.getBeginIndex(); b < end;) {
|
||||
boundaries.add(b);
|
||||
b = Grapheme.nextBoundary(text, b, end);
|
||||
}
|
||||
boundaries.add(end);
|
||||
boundaryIndex = 0;
|
||||
offset = ci.getIndex();
|
||||
}
|
||||
|
||||
// Had to override to suppress the bug in the BreakIterator's default impl.
|
||||
// See the comments in the default impl.
|
||||
@Override
|
||||
public boolean isBoundary(int offset) {
|
||||
if (offset < boundaries.get(0) || offset > boundaries.get(boundaries.size() - 1)) {
|
||||
throw new IllegalArgumentException("offset is out of bounds: " + offset);
|
||||
}
|
||||
return Collections.binarySearch(boundaries, offset) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(ci, offset, boundaries, boundaryIndex);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
return o instanceof GraphemeBreakIterator that &&
|
||||
ci.equals(that.ci) &&
|
||||
offset == that.offset &&
|
||||
boundaries.equals(that.boundaries) &&
|
||||
boundaryIndex == that.boundaryIndex;
|
||||
}
|
||||
}
|
||||
|
||||
// Implementation only for calling Grapheme.nextBoundary()
|
||||
static final class CharacterIteratorCharSequence implements CharSequence {
|
||||
CharacterIterator src;
|
||||
CharacterIteratorCharSequence(CharacterIterator ci) {
|
||||
src = ci;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
return src.getEndIndex() - src.getBeginIndex();
|
||||
}
|
||||
|
||||
@Override
|
||||
public char charAt(int index) {
|
||||
src.setIndex(index);
|
||||
return src.current();
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharSequence subSequence(int start, int end) {
|
||||
// not expected to be called
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -53,7 +53,6 @@ public class BreakIteratorInfo_th extends ListResourceBundle {
|
||||
// built-in type of BreakIterator
|
||||
{"BreakIteratorClasses",
|
||||
new String[] {
|
||||
"RuleBasedBreakIterator", // character-break iterator class
|
||||
"DictionaryBasedBreakIterator", // word-break iterator class
|
||||
"DictionaryBasedBreakIterator", // line-break iterator class
|
||||
"RuleBasedBreakIterator" // sentence-break iterator class
|
||||
|
@ -25,7 +25,7 @@
|
||||
* @test
|
||||
* @bug 4035266 4052418 4068133 4068137 4068139 4086052 4095322 4097779
|
||||
* 4097920 4098467 4111338 4113835 4117554 4143071 4146175 4152117
|
||||
* 4152416 4153072 4158381 4214367 4217703 4638433 8264765
|
||||
* 4152416 4153072 4158381 4214367 4217703 4638433 8264765 8291660
|
||||
* @library /java/text/testlib
|
||||
* @run main/timeout=2000 BreakIteratorTest
|
||||
* @summary test BreakIterator
|
||||
@ -67,13 +67,15 @@
|
||||
*
|
||||
*/
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.text.BreakIterator;
|
||||
import java.text.CharacterIterator;
|
||||
import java.text.StringCharacterIterator;
|
||||
import java.util.Arrays;
|
||||
import java.util.Locale;
|
||||
import java.util.Vector;
|
||||
import java.util.Enumeration;
|
||||
import java.io.*;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class BreakIteratorTest extends IntlTest
|
||||
{
|
||||
@ -1449,4 +1451,21 @@ public class BreakIteratorTest extends IntlTest
|
||||
generalIteratorTest(lineBreak, expected);
|
||||
}
|
||||
|
||||
private static final Pattern CODEPOINT = Pattern.compile("([0-9A-F]{4,5})");
|
||||
public void TestGraphemeBreak() throws Exception {
|
||||
Files.lines(Paths.get(System.getProperty("test.root"),
|
||||
"../../src/java.base/share/data/unicodedata/auxiliary/GraphemeBreakTest.txt"))
|
||||
.map(ln -> ln.replaceFirst("#.*", ""))
|
||||
.filter(Predicate.not(String::isEmpty))
|
||||
.map(line -> line.split("\\s*÷[\\s\\t]*"))
|
||||
.forEach(sa -> {
|
||||
Vector<String> expected = new Vector<>(
|
||||
Arrays.stream(sa)
|
||||
.map(line -> CODEPOINT.matcher(line).replaceAll(mr -> Character.toString(Integer.valueOf(mr.group(),16))))
|
||||
.map(line -> line.replaceAll("\\s×\\s", ""))
|
||||
.filter(Predicate.not(String::isEmpty))
|
||||
.toList());
|
||||
generalIteratorTest(characterBreak, expected);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2007, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2007, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -122,7 +122,7 @@ public class MirroredBreakIterator extends BreakIterator {
|
||||
|
||||
@Override
|
||||
public boolean isBoundary(int offset) {
|
||||
// Call the default impelementation in BreakIterator
|
||||
// Call the default implementation in BreakIterator
|
||||
return super.isBoundary(offset);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2007, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2007, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -23,7 +23,7 @@
|
||||
|
||||
/*
|
||||
* @test
|
||||
* @bug 4052440 8062588 8165804 8210406
|
||||
* @bug 4052440 8062588 8165804 8210406 8291660
|
||||
* @summary BreakIteratorProvider tests
|
||||
* @library providersrc/foobarutils
|
||||
* providersrc/fooprovider
|
||||
@ -102,8 +102,9 @@ public class BreakIteratorProviderTest extends ProviderTest {
|
||||
// JRE
|
||||
String[] jresResult = new String[4];
|
||||
if (jreSupportsLocale) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
jresResult[i] = "sun.text." + classNames[i];
|
||||
jresResult[0] = "sun.util.locale.provider.BreakIteratorProviderImpl$GraphemeBreakIterator";
|
||||
for (int i = 1; i < 4; i++) {
|
||||
jresResult[i] = "sun.text." + classNames[i - 1];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -23,24 +23,33 @@
|
||||
|
||||
/*
|
||||
* @test
|
||||
* @bug 7071819 8221431 8239383 8273430
|
||||
* @bug 7071819 8221431 8239383 8273430 8291660
|
||||
* @summary tests Unicode Extended Grapheme support
|
||||
* @library /lib/testlibrary/java/lang
|
||||
* @build java.base/java.util.regex.GraphemeTestAccessor
|
||||
* @modules java.base/jdk.internal.util.regex:+open
|
||||
* @run testng GraphemeTest
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.lang.invoke.MethodType;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import jdk.internal.util.regex.Grapheme;
|
||||
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
import static org.testng.Assert.fail;
|
||||
import static org.testng.Assert.assertFalse;
|
||||
import java.util.regex.GraphemeTestAccessor;
|
||||
|
||||
public class GraphemeTest {
|
||||
|
||||
private static MethodHandles.Lookup lookup;
|
||||
|
||||
@BeforeClass
|
||||
public static void setup() throws IllegalAccessException {
|
||||
lookup = MethodHandles.privateLookupIn(Grapheme.class, MethodHandles.lookup());
|
||||
}
|
||||
|
||||
@Test
|
||||
public static void testGraphemeBreakProperty() throws Throwable {
|
||||
testProps(UCDFiles.GRAPHEME_BREAK_PROPERTY);
|
||||
@ -52,12 +61,16 @@ public class GraphemeTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public static void testExcludedSpacingMarks() {
|
||||
assertFalse(GraphemeTestAccessor.isExcludedSpacingMark(0x1065));
|
||||
assertFalse(GraphemeTestAccessor.isExcludedSpacingMark(0x1066));
|
||||
public static void testExcludedSpacingMarks() throws Throwable {
|
||||
var mh = lookup.findStatic(
|
||||
Grapheme.class, "isExcludedSpacingMark", MethodType.methodType(boolean.class, int.class));
|
||||
assertFalse((boolean)mh.invokeExact(0x1065));
|
||||
assertFalse((boolean)mh.invokeExact(0x1066));
|
||||
}
|
||||
|
||||
private static void testProps(Path path) throws IOException {
|
||||
private static void testProps(Path path) throws Throwable {
|
||||
var mh = lookup.findStatic(
|
||||
Grapheme.class, "getType", MethodType.methodType(int.class, int.class));
|
||||
Files.lines(path)
|
||||
.map(ln -> ln.replaceFirst("#.*", ""))
|
||||
.filter(ln -> ln.length() != 0)
|
||||
@ -83,15 +96,19 @@ public class GraphemeTest {
|
||||
// canonical equivalence."
|
||||
// For "extended grapheme clusters" support, there is no
|
||||
// need actually to diff "extend" and "spacingmark" given GB9, GB9a.
|
||||
if (!expected.equals(types[GraphemeTestAccessor.getType(cp)])) {
|
||||
if ("Extend".equals(expected) &&
|
||||
"SpacingMark".equals(types[GraphemeTestAccessor.getType(cp)]))
|
||||
System.out.printf("[%x] [%s][%d] -> [%s]%n",
|
||||
cp, expected, Character.getType(cp), types[GraphemeTestAccessor.getType(cp)]);
|
||||
else
|
||||
fail(String.format(
|
||||
"cp=[%x], expected:[%s] result:[%s]%n",
|
||||
cp, expected, types[GraphemeTestAccessor.getType(cp)]));
|
||||
try {
|
||||
if (!expected.equals(types[(int) mh.invokeExact(cp)])) {
|
||||
if ("Extend".equals(expected) &&
|
||||
"SpacingMark".equals(types[(int) mh.invokeExact(cp)]))
|
||||
System.out.printf("[%x] [%s][%d] -> [%s]%n",
|
||||
cp, expected, Character.getType(cp), types[(int) mh.invokeExact(cp)]);
|
||||
else
|
||||
fail(String.format(
|
||||
"cp=[%x], expected:[%s] result:[%s]%n",
|
||||
cp, expected, types[(int) mh.invokeExact(cp)]));
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
@ -1,41 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package java.util.regex;
|
||||
|
||||
public class GraphemeTestAccessor {
|
||||
|
||||
public static boolean isExcludedSpacingMark(int cp) {
|
||||
return Grapheme.isExcludedSpacingMark(cp);
|
||||
}
|
||||
|
||||
public static int getType(int cp) {
|
||||
return Grapheme.getType(cp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user