8302877: Speed up latin1 case conversions

Reviewed-by: naoto, redestad
This commit is contained in:
Eirik Bjorsnos 2023-02-21 20:54:36 +00:00 committed by Naoto Sato
parent 1ea5f9f7cd
commit ef1f7bd3b8
5 changed files with 166 additions and 23 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2002, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -135,30 +135,36 @@ class CharacterDataLatin1 extends CharacterData {
}
int toLowerCase(int ch) {
int mapChar = ch;
int val = getProperties(ch);
if (((val & $$maskLowerCase) != 0) &&
((val & $$maskCaseOffset) != $$maskCaseOffset)) {
int offset = val << $$shiftCaseOffsetSign >> ($$shiftCaseOffsetSign+$$shiftCaseOffset);
mapChar = ch + offset;
if (ch < 'A') { // Fast path for low code points
return ch;
}
return mapChar;
int l = ch | 0x20; // Lowercase using 'oldest ASCII trick in the book'
if (l <= 'z' // In range a-z
|| (l >= 0xE0 && l <= 0xFE && l != 0xF7)) { // ..or agrave-thorn, excluding division
return l;
}
return ch;
}
int toUpperCase(int ch) {
int mapChar = ch;
int val = getProperties(ch);
if ((val & $$maskUpperCase) != 0) {
if ((val & $$maskCaseOffset) != $$maskCaseOffset) {
int offset = val << $$shiftCaseOffsetSign >> ($$shiftCaseOffsetSign+$$shiftCaseOffset);
mapChar = ch - offset;
} else if (ch == 0x00B5) {
mapChar = 0x039C;
}
if (ch < 'a') { // Fast path for low code points
return ch;
}
return mapChar;
int U = ch & 0xDF; // Uppercase using 'oldest ASCII trick in the book'
if (U <= 'Z' // In range A-Z
|| (U >= 0xC0 && U <= 0xDE && U != 0xD7)) { // ..or Agrave-Thorn, excluding multiplication
return U;
}
// Special-case for 'y with Diaeresis' which uppercases out of latin1
if (ch == 0xFF) {
return 0x178; // Capital Letter Y with Diaeresis
}
// Special-case for 'Micro Sign' which uppercases out of latin1
if (ch == 0xB5) {
return 0x39C; // Greek Capital Letter Mu
}
return ch;
}
int toTitleCase(int ch) {

View File

@ -0,0 +1,92 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
import org.testng.annotations.Test;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.fail;
/**
* @test
* @bug 8302877
* @summary Provides exhaustive verification of Character.toUpperCase and Character.toLowerCase
* for all code points in the latin1 range 0-255.
* @run testng Latin1CaseConversion
*/
public class Latin1CaseConversion {

    /** Distance between an upper case letter and its lower case form in the ranges checked here. */
    private static final int CASE_OFFSET = 32;

    /**
     * Walks every code point in the latin1 range 0-255 and checks the results of
     * Character.toUpperCase(int) and Character.toLowerCase(int) against the mapping
     * expected for the group the code point belongs to.
     */
    @Test
    public void shouldUpperCaseAndLowerCaseLatin1() {
        for (int cp = 0; cp <= 0xFF; cp++) {
            int upper = Character.toUpperCase(cp);
            int lower = Character.toLowerCase(cp);

            if (cp == 0xB5) {
                // Micro Sign: upper case form is Greek Capital Letter Mu, outside latin1
                assertEquals(upper, 0x39C);
                assertEquals(lower, cp);
            } else if (cp == 0xFF) {
                // y with Diaeresis: upper case form is Capital Letter Y with Diaeresis, outside latin1
                assertEquals(upper, 0x178);
                assertEquals(lower, cp);
            } else if (isUpperCaseLetter(cp)) {
                assertEquals(upper, cp);
                assertEquals(lower, cp + CASE_OFFSET);
            } else if (isLowerCaseLetter(cp)) {
                assertEquals(upper, cp - CASE_OFFSET);
                assertEquals(lower, cp);
            } else if (isCaseless(cp)) {
                assertEquals(upper, cp);
                assertEquals(lower, cp);
            } else {
                fail("Uncovered code point: " + Integer.toHexString(cp));
            }
        }
    }

    /** A-Z, A-grave to O with Diaeresis, and O with slash to Thorn. */
    private static boolean isUpperCaseLetter(int cp) {
        return (cp >= 0x41 && cp <= 0x5A)
                || (cp >= 0xC0 && cp <= 0xD6)
                || (cp >= 0xD8 && cp <= 0xDE);
    }

    /** a-z, a-grave to o with Diaeresis, and o with slash to thorn. */
    private static boolean isLowerCaseLetter(int cp) {
        return (cp >= 0x61 && cp <= 0x7A)
                || (cp >= 0xE0 && cp <= 0xF6)
                || (cp >= 0xF8 && cp <= 0xFE);
    }

    /** Code points that both conversions are expected to return unchanged. */
    private static boolean isCaseless(int cp) {
        return cp < 0x41                       // Before A
                || (cp >= 0x5B && cp <= 0x60)  // Between Z and a
                || (cp >= 0x7B && cp <= 0xB4)  // Between z and Micro Sign
                || (cp >= 0xB6 && cp <= 0xBF)  // Between Micro Sign and A-grave
                || cp == 0xD7                  // Multiplication
                || cp == 0xDF                  // Sharp s
                || cp == 0xF7;                 // Division
    }
}

View File

@ -2230,7 +2230,7 @@ FormatData/ar_YE/NumberElements/8=\u2030
FormatData/ar_YE/NumberElements/9=\u221e
FormatData/ar_YE/NumberElements/10=\ufffd
# bug #4113654 (this is obviously not an exchaustive test; I'm trying it here for a single
# bug #4113654 (this is obviously not an exhaustive test; I'm trying it here for a single
# inheritance chain only. This bug fix also gets tested fairly well by the tests for all
# the other bugs as given above)
FormatData//NumberPatterns/0=#,##0.###;-#,##0.###

View File

@ -2185,7 +2185,7 @@ FormatData/ar_YE/arab.NumberElements/8=\u0609
FormatData/ar_YE/arab.NumberElements/9=\u221e
FormatData/ar_YE/arab.NumberElements/10=\u0644\u064a\u0633\u00a0\u0631\u0642\u0645
# bug #4113654 (this is obviously not an exchaustive test; I'm trying it here for a single
# bug #4113654 (this is obviously not an exhaustive test; I'm trying it here for a single
# inheritance chain only. This bug fix also gets tested fairly well by the tests for all
# the other bugs as given above)
FormatData//latn.NumberPatterns/0=#,##0.###

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -27,11 +27,13 @@ package org.openjdk.bench.java.lang;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
@ -80,4 +82,47 @@ public class Characters {
}
}
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(3)
public static class CaseConversions {
@Param({
"low", // 0x09 pre A
"A", // 0x41 uppercase A
"a", // 0x61 lowercase a
"A-grave", // 0xC0 uppercase A-grave
"a-grave", // 0xE0 lowercase a-grave
"micro", // 0xB5 lowercase 'Micro Sign'
"yD" // 0xFF lowercase 'y with Diaeresis'
})
private String codePoint;
private int cp;
@Setup(Level.Trial)
public void setup() {
cp = switch (codePoint) {
case "low" -> 0x09;
case "A" -> 0x41;
case "a" -> 0x61;
case "A-grave" -> 0xC0;
case "a-grave" -> 0xE0;
case "yD" -> 0xE0;
case "micro" -> 0xFF;
default -> Integer.parseInt(codePoint);;
};
}
@Benchmark
public int toUpperCase() {
return Character.toUpperCase(cp);
}
@Benchmark
public int toLowerCase() {
return Character.toLowerCase(cp);
}
}
}