jdk-24/jdk/test/sun/nio/cs/TestUTF8.java
Xueming Shen 630d73eb0a 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
6636317: Optimize UTF-8 coder for ASCII input

Re-write the UTF-8 charset to obey the standard and improve the performance

Reviewed-by: alanb
2008-08-22 14:37:46 -07:00

394 lines
17 KiB
Java

/*
* Copyright 2008 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
* @test
* @bug 4486841
* @summary Test UTF-8 charset
*/
import java.nio.charset.*;
import java.nio.*;
import java.util.*;
/**
 * Exercises a UTF-8 style charset through the {@link CharsetEncoder} and
 * {@link CharsetDecoder} buffer APIs, using both heap and direct buffers.
 *
 * <p>Every check method reports the individual problems on standard output
 * and then throws a {@link RuntimeException} if anything went wrong, so a
 * failing check actually fails the test run.
 */
public class TestUTF8 {

    // ------------------------------------------------------------------
    // Buffer helpers shared by the encode/decode entry points
    // ------------------------------------------------------------------

    /** Returns a buffer positioned at 0 whose remaining content is {@code bb}. */
    private static ByteBuffer sourceBytes(byte[] bb, boolean direct) {
        if (!direct) {
            return ByteBuffer.wrap(bb);
        }
        ByteBuffer bbf = ByteBuffer.allocateDirect(bb.length);
        bbf.put(bb).flip();
        return bbf;
    }

    /** Returns an empty char buffer with room for {@code capacity} chars. */
    private static CharBuffer targetChars(int capacity, boolean direct) {
        // A direct CharBuffer can only be obtained as a view of a direct
        // ByteBuffer, hence the factor of two (bytes per char).
        return direct ? ByteBuffer.allocateDirect(capacity * 2).asCharBuffer()
                      : CharBuffer.allocate(capacity);
    }

    /** Returns a buffer positioned at 0 whose remaining content is {@code cc}. */
    private static CharBuffer sourceChars(char[] cc, boolean direct) {
        if (!direct) {
            return CharBuffer.wrap(cc);
        }
        CharBuffer cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
        cbf.put(cc).flip();
        return cbf;
    }

    /** Returns an empty byte buffer with room for {@code capacity} bytes. */
    private static ByteBuffer targetBytes(int capacity, boolean direct) {
        return direct ? ByteBuffer.allocateDirect(capacity)
                      : ByteBuffer.allocate(capacity);
    }

    /**
     * Throws a {@link RuntimeException} carrying {@code message} when
     * {@code failed} is true; does nothing otherwise.
     */
    static void throwIfFailed(boolean failed, String message) {
        if (failed) {
            throw new RuntimeException(message);
        }
    }

    /** Formats each byte as an unsigned binary number, separated by spaces. */
    private static String toBinaryString(byte[] bytes) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < bytes.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(Integer.toBinaryString(bytes[i] & 0xff));
        }
        return sb.toString();
    }

    // ------------------------------------------------------------------
    // Encode / decode entry points
    // ------------------------------------------------------------------

    /**
     * Decodes {@code bb} in one call with a fresh decoder of charset
     * {@code csn}.
     *
     * @param bb         the bytes to decode
     * @param csn        the charset name
     * @param testDirect use direct buffers when true, heap buffers otherwise
     * @return the decoded chars
     * @throws RuntimeException if the decoder returns anything other than
     *         {@link CoderResult#UNDERFLOW}
     */
    static char[] decode(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf = sourceBytes(bb, testDirect);
        CharBuffer cbf = targetChars(bb.length, testDirect);
        CoderResult cr = dec.decode(bbf, cbf, true);
        if (cr != CoderResult.UNDERFLOW) {
            throw new RuntimeException("Decoding err: " + csn);
        }
        char[] cc = new char[cbf.position()];
        cbf.flip();
        cbf.get(cc);
        return cc;
    }

    /**
     * Same set-up as {@link #decode}, but returns the raw
     * {@link CoderResult} instead of the decoded chars.
     */
    static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf = sourceBytes(bb, testDirect);
        CharBuffer cbf = targetChars(bb.length, testDirect);
        return dec.decode(bbf, cbf, true);
    }

    /**
     * Encodes {@code cc} in one call with a fresh encoder of charset
     * {@code csn}. The output buffer holds four bytes per input char.
     *
     * @param cc         the chars to encode
     * @param csn        the charset name
     * @param testDirect use direct buffers when true, heap buffers otherwise
     * @return the encoded bytes
     * @throws RuntimeException if the encoder returns anything other than
     *         {@link CoderResult#UNDERFLOW}
     */
    static byte[] encode(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        ByteBuffer bbf = targetBytes(cc.length * 4, testDirect);
        CharBuffer cbf = sourceChars(cc, testDirect);
        CoderResult cr = enc.encode(cbf, bbf, true);
        if (cr != CoderResult.UNDERFLOW) {
            throw new RuntimeException("Encoding err: " + csn);
        }
        byte[] bb = new byte[bbf.position()];
        bbf.flip();
        bbf.get(bb);
        return bb;
    }

    /**
     * Same set-up as {@link #encode}, but returns the raw
     * {@link CoderResult} instead of the encoded bytes.
     */
    static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        ByteBuffer bbf = targetBytes(cc.length * 4, testDirect);
        CharBuffer cbf = sourceChars(cc, testDirect);
        return enc.encode(cbf, bbf, true);
    }

    // ------------------------------------------------------------------
    // Test data builders
    // ------------------------------------------------------------------

    /**
     * Returns every Unicode scalar value in ascending order as UTF-16:
     * U+0000..U+D7FF, U+E000..U+FFFF, then each supplementary code point
     * U+10000..U+10FFFF as a surrogate pair.
     */
    static char[] getUTFChars() {
        char[] cc = new char[0x10000 - 0xe000 + 0xd800 +    // bmp
                             (0x110000 - 0x10000) * 2];     // supp
        int pos = 0;
        for (int i = 0; i < 0xd800; i++) {
            cc[pos++] = (char)i;
        }
        for (int i = 0xe000; i < 0x10000; i++) {
            cc[pos++] = (char)i;
        }
        for (int i = 0x10000; i < 0x110000; i++) {
            pos += Character.toChars(i, cc, pos);
        }
        return cc;
    }

    /**
     * Writes {@code c} into {@code bb} at {@code pos} using the three-byte
     * UTF-8 bit layout (1110xxxx 10xxxxxx 10xxxxxx). No range check is made,
     * so surrogate chars are written as-is.
     *
     * @return the number of bytes written, always 3
     */
    static int to3ByteUTF8(char c, byte[] bb, int pos) {
        bb[pos++] = (byte)(0xe0 | (c >> 12));
        bb[pos++] = (byte)(0x80 | ((c >> 6) & 0x3f));
        bb[pos++] = (byte)(0x80 | (c & 0x3f));
        return 3;
    }

    // ------------------------------------------------------------------
    // Checks
    // ------------------------------------------------------------------

    /**
     * Encodes all chars from {@link #getUTFChars()} with {@code csn}, decodes
     * the result and verifies the chars come back unchanged, for both heap
     * and direct buffers.
     *
     * @throws RuntimeException if either round trip does not reproduce the
     *         input
     */
    static void checkRoundtrip(String csn) throws Exception {
        System.out.printf(" Check roundtrip <%s>...", csn);
        boolean failed = false;
        char[] cc = getUTFChars();

        byte[] bb = encode(cc, csn, false);
        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" non-direct failed");
            failed = true;
        }

        bb = encode(cc, csn, true);
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" (direct) failed");
            failed = true;
        }

        // Terminate the progress line before reporting the overall result.
        System.out.println();
        throwIfFailed(failed, "Check roundtrip failed " + csn);
    }

    /**
     * Builds, for every supplementary code point, the six-byte form made of
     * the two surrogates each written with {@link #to3ByteUTF8}, and verifies
     * that {@code csn} decodes the bytes back to the surrogate pairs.
     *
     * @throws RuntimeException if the decoded chars differ from the expected
     *         surrogate pairs
     */
    static void check6ByteSurrs(String csn) throws Exception {
        System.out.printf(" Check 6-byte Surrogates <%s>...%n", csn);
        boolean failed = false;
        byte[] bb = new byte[(0x110000 - 0x10000) * 6];
        char[] cc = new char[(0x110000 - 0x10000) * 2];
        int bpos = 0;
        int cpos = 0;
        for (int i = 0x10000; i < 0x110000; i++) {
            Character.toChars(i, cc, cpos);
            bpos += to3ByteUTF8(cc[cpos], bb, bpos);
            bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
            cpos += 2;
        }

        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" decoding failed%n");
            failed = true;
        }
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" decoding(direct) failed%n");
            failed = true;
        }
        throwIfFailed(failed, "Check 6-byte Surrogates failed " + csn);
    }

    /**
     * Compares two charsets: both must encode {@link #getUTFChars()} to the
     * same bytes, and both must decode the bytes produced by {@code csn1} to
     * the same chars. Runs with heap and with direct buffers.
     *
     * @throws RuntimeException if any of the comparisons differ
     */
    static void compare(String csn1, String csn2) throws Exception {
        System.out.printf(" Diff <%s> <%s>...%n", csn1, csn2);
        boolean failed = false;
        char[] cc = getUTFChars();

        byte[] bb1 = encode(cc, csn1, false);
        byte[] bb2 = encode(cc, csn2, false);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf(" encoding failed%n");
            failed = true;
        }
        // Both decoders are fed the bytes produced by csn1.
        char[] cc1 = decode(bb1, csn1, false);
        char[] cc2 = decode(bb1, csn2, false);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf(" decoding failed%n");
            failed = true;
        }

        bb1 = encode(cc, csn1, true);
        bb2 = encode(cc, csn2, true);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf(" encoding (direct) failed%n");
            failed = true;
        }
        cc1 = decode(bb1, csn1, true);
        cc2 = decode(bb1, csn2, true);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf(" decoding (direct) failed%n");
            failed = true;
        }
        throwIfFailed(failed, "Diff failed " + csn1 + " " + csn2);
    }

    // Malformed input samples. In each row the first byte is the expected
    // length of the malformed-input result; the remaining bytes are the input.
    static byte[][] malformed = {
        // One-byte sequences:
        {1, (byte)0xFF },
        {1, (byte)0xC0 },
        {1, (byte)0x80 },
        {1, (byte)0xFF, (byte)0xFF}, // all ones
        {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble
        // Two-byte sequences:
        {1, (byte)0xC0, (byte)0x80}, // invalid first byte
        {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
        {1, (byte)0xC2, (byte)0x00}, // invalid second byte
        {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
        {1, (byte)0xD0, (byte)0x00}, // invalid second byte
        {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
        {1, (byte)0xDF, (byte)0x00}, // invalid second byte
        {1, (byte)0xDF, (byte)0xC0}, // invalid second byte
        // Three-byte sequences
        {1, (byte)0xE0, (byte)0x80, (byte)0x80}, // 111x first byte first nibble
        {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
        {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid second and third bytes
        // Four-byte sequences
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid second byte
        {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
        {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        // Five-byte sequences
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },
        // Six-byte sequences
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
        // NOTE(review): the rows below start with the five-byte lead 0xF8
        // although they sit in the six-byte group - confirm this is intended.
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
    };

    /**
     * Decodes every row of {@link #malformed} with {@code csn} and verifies
     * that the result is a malformed-input error of the expected length, for
     * both heap and direct buffers.
     *
     * @throws RuntimeException if any row is accepted or reports a different
     *         malformed length
     */
    static void checkMalformed(String csn) throws Exception {
        boolean failed = false;
        System.out.printf(" Check malformed <%s>...%n", csn);
        for (boolean direct : new boolean[] {false, true}) {
            for (byte[] bins : malformed) {
                int mlen = bins[0];
                byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
                CoderResult cr = decodeCR(bin, csn, direct);
                if (!cr.isMalformed()) {
                    System.out.printf(" FAIL(direct=%b): [%s] not malformed.\n",
                                      direct, toBinaryString(bin));
                    failed = true;
                } else if (cr.length() != mlen) {
                    System.out.printf(" FAIL(direct=%b): [%s] malformed[len=%d].\n",
                                      direct, toBinaryString(bin), cr.length());
                    failed = true;
                }
            }
        }
        throwIfFailed(failed, "Check malformed failed " + csn);
    }

    /**
     * Runs one partial decode and compares the outcome with {@code flow}.
     *
     * <p>{@code flow} holds, in order: input position, input length, output
     * position, output length, expected input position after decoding,
     * expected output position after decoding, and the expected result
     * (0 for underflow, anything else for overflow).
     *
     * @return true if the result and both buffer positions match; otherwise
     *         prints the expectation and the actual values and returns false
     */
    static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
        int inPos = flow[0];
        int inLen = flow[1];
        int outPos = flow[2];
        int outLen = flow[3];
        int expedInPos = flow[4];
        int expedOutPos = flow[5];
        CoderResult expedCR = (flow[6] == 0) ? CoderResult.UNDERFLOW
                                             : CoderResult.OVERFLOW;

        ByteBuffer bbf = targetBytes(inPos + utf8s.length, direct);
        CharBuffer cbf = targetChars(outPos + outLen, direct);

        // Place the input at offset inPos and expose only inLen bytes of it.
        bbf.position(inPos);
        bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen);
        cbf.position(outPos);

        dec.reset();
        CoderResult cr = dec.decode(bbf, cbf, false);
        if (cr == expedCR &&
            bbf.position() == expedInPos &&
            cbf.position() == expedOutPos) {
            return true;
        }

        System.out.printf("Expected(direct=%5b): [", direct);
        for (int i : flow) {
            System.out.print(" " + i);
        }
        System.out.println("] CR=" + cr +
                           ", inPos=" + bbf.position() +
                           ", outPos=" + cbf.position());
        return false;
    }

    /**
     * Decodes a fixed ten-byte sample (a 1-, 2-, 3- and 4-byte sequence) with
     * varying input lengths, output capacities and buffer offsets, and
     * verifies the underflow/overflow result and the final buffer positions.
     *
     * @throws RuntimeException if any combination behaves unexpectedly
     */
    static void checkUnderOverflow(String csn) throws Exception {
        System.out.printf(" Check under/overflow <%s>...%n", csn);
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        boolean failed = false;
        byte[] utf8s = "\u007f\u07ff\ue000\ud800\udc00".getBytes("UTF-8");
        int inlen = utf8s.length;

        for (int inoff = 0; inoff < 20; inoff++) {
            for (int outoff = 0; outoff < 20; outoff++) {
                int[][] flows = {
                    //inpos, inLen, outPos, outLen, inPosEP, outposEP, under(0)/over(1)
                    {inoff, inlen, outoff, 1, inoff + 1, outoff + 1, 1},
                    {inoff, inlen, outoff, 2, inoff + 3, outoff + 2, 1},
                    {inoff, inlen, outoff, 3, inoff + 6, outoff + 3, 1},
                    {inoff, inlen, outoff, 4, inoff + 6, outoff + 3, 1},
                    {inoff, inlen, outoff, 5, inoff + 10,outoff + 5, 0},
                    // underflow
                    {inoff, 1, outoff, 5, inoff + 1, outoff + 1, 0},
                    {inoff, 2, outoff, 5, inoff + 1, outoff + 1, 0},
                    {inoff, 3, outoff, 5, inoff + 3, outoff + 2, 0},
                    {inoff, 4, outoff, 5, inoff + 3, outoff + 2, 0},
                    {inoff, 5, outoff, 5, inoff + 3, outoff + 2, 0},
                    {inoff, 6, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 7, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 8, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 9, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 10, outoff, 5, inoff + 10,outoff + 5, 0},
                    // 2-byte underflow/overflow
                    {inoff, 2, outoff, 1, inoff + 1, outoff + 1, 0},
                    {inoff, 3, outoff, 1, inoff + 1, outoff + 1, 1},
                    // 3-byte underflow/overflow
                    {inoff, 4, outoff, 2, inoff + 3, outoff + 2, 0},
                    {inoff, 5, outoff, 2, inoff + 3, outoff + 2, 0},
                    {inoff, 6, outoff, 2, inoff + 3, outoff + 2, 1},
                    // 4-byte underflow/overflow
                    {inoff, 7, outoff, 4, inoff + 6, outoff + 3, 0},
                    {inoff, 8, outoff, 4, inoff + 6, outoff + 3, 0},
                    {inoff, 9, outoff, 4, inoff + 6, outoff + 3, 0},
                    {inoff, 10, outoff, 4, inoff + 6, outoff + 3, 1},
                };
                for (boolean direct : new boolean[] {false, true}) {
                    for (int[] flow : flows) {
                        if (!check(dec, utf8s, direct, flow)) {
                            failed = true;
                        }
                    }
                }
            }
        }
        throwIfFailed(failed, "Check under/overflow failed " + csn);
    }

    /** Runs all checks against the "UTF-8" charset. */
    public static void main(String[] args) throws Exception {
        checkRoundtrip("UTF-8");
        check6ByteSurrs("UTF-8");
        // Disabled in the original test; kept for manual comparison runs.
        //compare("UTF-8", "UTF-8-OLD");
        checkMalformed("UTF-8");
        checkUnderOverflow("UTF-8");
    }
}