8041791: String.toLowerCase regression - violates Unicode standard

Reviewed-by: peytoia
2014-05-14 10:52:51 -07:00 · 2014-05-14 10:52:51 -07:00 · d0dadf23c5
commit d0dadf23c5
parent 3b959b9adf
3 changed files with 25 additions and 9 deletions
--- a/jdk/src/share/classes/java/lang/ConditionalSpecialCasing.java
+++ b/jdk/src/share/classes/java/lang/ConditionalSpecialCasing.java
@@ -62,6 +62,7 @@ final class ConditionalSpecialCasing {
        //# Conditional mappings
        //# ================================================================================
        new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
+        new Entry(0x0130, new char[]{0x0069, 0x0307}, new char[]{0x0130}, null, 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE

        //# ================================================================================
        //# Locale-sensitive mappings
@@ -77,8 +78,8 @@ final class ConditionalSpecialCasing {

        //# ================================================================================
        //# Turkish and Azeri
-//      new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
-//      new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
+        new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
+        new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
        new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE
        new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE
        new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
@@ -147,21 +148,25 @@ final class ConditionalSpecialCasing {

    private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) {
        HashSet<Entry> set = entryTable.get(new Integer(src.codePointAt(index)));
+        char[] ret = null;

        if (set != null) {
            Iterator<Entry> iter = set.iterator();
            String currentLang = locale.getLanguage();
            while (iter.hasNext()) {
                Entry entry = iter.next();
-                String conditionLang= entry.getLanguage();
+                String conditionLang = entry.getLanguage();
                if (((conditionLang == null) || (conditionLang.equals(currentLang))) &&
                        isConditionMet(src, index, locale, entry.getCondition())) {
-                    return (bLowerCasing ? entry.getLowerCase() : entry.getUpperCase());
+                    ret = bLowerCasing ? entry.getLowerCase() : entry.getUpperCase();
+                    if (conditionLang != null) {
+                        break;
+                    }
                }
            }
        }

-        return null;
+        return ret;
    }

    private static boolean isConditionMet(String src, int index, Locale locale, int condition) {
--- a/jdk/src/share/classes/java/lang/String.java
+++ b/jdk/src/share/classes/java/lang/String.java
@@ -2583,6 +2583,9 @@ public final class String
            if (cp == '\u03A3') {                       // GREEK CAPITAL LETTER SIGMA
                return toLowerCaseEx(result, i, locale, false);
            }
+            if (cp == '\u0130') {                       // LATIN CAPITAL LETTER I WITH DOT ABOVE
+                return toLowerCaseEx(result, i, locale, true);
+            }
            cp = Character.toLowerCase(cp);
            if (!Character.isBmpCodePoint(cp)) {
                return toLowerCaseEx(result, i, locale, false);
--- a/jdk/test/java/lang/String/ToLowerCase.java
+++ b/jdk/test/java/lang/String/ToLowerCase.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -23,7 +23,7 @@

 /*
    @test
-    @bug 4217441 4533872 4900935 8020037 8032012
+    @bug 4217441 4533872 4900935 8020037 8032012 8041791
    @summary toLowerCase should lower-case Greek Sigma correctly depending
             on the context (final/non-final).  Also it should handle
             Locale specific (lt, tr, and az) lowercasings and supplementary
@@ -72,8 +72,10 @@ public class ToLowerCase {
        // I-dot tests
        test("\u0130", turkish, "i");
        test("\u0130", az, "i");
-        test("\u0130", lt, "i");
-        test("\u0130", Locale.US, "i");
+        test("\u0130", lt, "\u0069\u0307");
+        test("\u0130", Locale.US, "\u0069\u0307");
+        test("\u0130", Locale.JAPAN, "\u0069\u0307");
+        test("\u0130", Locale.ROOT, "\u0069\u0307");

        // Remove dot_above in the sequence I + dot_above (Turkish and Azeri)
        test("I\u0307", turkish, "i");
@@ -111,6 +113,12 @@ public class ToLowerCase {
            if (cp >= Character.MIN_HIGH_SURROGATE && cp <= Character.MAX_HIGH_SURROGATE) {
                continue;
            }
+            if (cp == 0x0130) {
+                // Although UnicodeData.txt has the lower case char as \u0069, it should be
+                // handled with the rules in SpecialCasing.txt, i.e., \u0069\u0307 in
+                // non Turkic locales.
+                continue;
+            }
            int lowerCase = Character.toLowerCase(cp);
            if (lowerCase == -1) {    //Character.ERROR
                continue;