From b1408532df6b0adfb18ec628eb7c0ee61bf1f9e4 Mon Sep 17 00:00:00 2001 From: Xueming Shen Date: Mon, 7 Nov 2011 13:46:02 -0800 Subject: [PATCH] 7096080: UTF8 update and new CESU-8 charset 7082884: Incorrect UTF8 conversion for sequence ED 31 7082883: Incorrect UTF8 conversion for sequence fc 80 80 8f bf bf Updated UTF8 and added CESU-8 to following the latest Standard Reviewed-by: alanb --- jdk/make/java/nio/FILES_java.gmk | 1 + jdk/src/share/classes/sun/nio/cs/CESU_8.java | 604 ++++++++++++++++++ jdk/src/share/classes/sun/nio/cs/UTF_8.java | 185 ++++-- .../classes/sun/nio/cs/standard-charsets | 4 + jdk/test/java/nio/charset/coders/Errors.java | 4 +- jdk/test/sun/nio/cs/TestStringCoding.java | 6 +- jdk/test/sun/nio/cs/TestStringCodingUTF8.java | 10 +- jdk/test/sun/nio/cs/TestUTF8.java | 201 ++++-- 8 files changed, 904 insertions(+), 111 deletions(-) create mode 100644 jdk/src/share/classes/sun/nio/cs/CESU_8.java diff --git a/jdk/make/java/nio/FILES_java.gmk b/jdk/make/java/nio/FILES_java.gmk index 41397cacdc0..b394b89e483 100644 --- a/jdk/make/java/nio/FILES_java.gmk +++ b/jdk/make/java/nio/FILES_java.gmk @@ -232,6 +232,7 @@ FILES_src = \ sun/nio/cs/UTF_16BE.java \ sun/nio/cs/UTF_16LE.java \ sun/nio/cs/UTF_8.java \ + sun/nio/cs/CESU_8.java \ sun/nio/cs/Unicode.java \ sun/nio/cs/UnicodeDecoder.java \ sun/nio/cs/UnicodeEncoder.java \ diff --git a/jdk/src/share/classes/sun/nio/cs/CESU_8.java b/jdk/src/share/classes/sun/nio/cs/CESU_8.java new file mode 100644 index 00000000000..40711f83d86 --- /dev/null +++ b/jdk/src/share/classes/sun/nio/cs/CESU_8.java @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package sun.nio.cs; + +import java.nio.Buffer; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; + +/* Legal CESU-8 Byte Sequences + * + * # Code Points Bits Bit/Byte pattern + * 1 7 0xxxxxxx + * U+0000..U+007F 00..7F + * + * 2 11 110xxxxx 10xxxxxx + * U+0080..U+07FF C2..DF 80..BF + * + * 3 16 1110xxxx 10xxxxxx 10xxxxxx + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+FFFF E1..EF 80..BF 80..BF + * + */ + +class CESU_8 extends Unicode +{ + public CESU_8() { + super("CESU-8", StandardCharsets.aliases_CESU_8); + } + + public String historicalName() { + return "CESU8"; + } + + public CharsetDecoder newDecoder() { + return new Decoder(this); + } + + public CharsetEncoder newEncoder() { + return new Encoder(this); + } + + private static final void updatePositions(Buffer src, int sp, + Buffer dst, int dp) { + src.position(sp - src.arrayOffset()); + dst.position(dp - dst.arrayOffset()); + } + + private static class Decoder extends CharsetDecoder + implements ArrayDecoder { + private Decoder(Charset cs) { + super(cs, 1.0f, 1.0f); + } + + private static boolean isNotContinuation(int b) { + return (b & 0xc0) != 0x80; + } + + // [E0] [A0..BF] [80..BF] + // [E1..EF] [80..BF] [80..BF] + private static boolean isMalformed3(int b1, int b2, int b3) { + return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || + (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; + } + + // only used when there is only one byte left in src buffer + private static boolean isMalformed3_2(int b1, int b2) { + return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || + (b2 & 0xc0) != 0x80; + } + + + // [F0] [90..BF] [80..BF] [80..BF] + // [F1..F3] [80..BF] [80..BF] [80..BF] + // [F4] [80..8F] [80..BF] [80..BF] + // only check 80-be range here, the [0xf0,0x80...] and [0xf4,0x90-...] + // will be checked by Character.isSupplementaryCodePoint(uc) + private static boolean isMalformed4(int b2, int b3, int b4) { + return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || + (b4 & 0xc0) != 0x80; + } + + // only used when there is less than 4 bytes left in src buffer + private static boolean isMalformed4_2(int b1, int b2) { + return (b1 == 0xf0 && b2 == 0x90) || + (b2 & 0xc0) != 0x80; + } + + private static boolean isMalformed4_3(int b3) { + return (b3 & 0xc0) != 0x80; + } + + private static CoderResult malformedN(ByteBuffer src, int nb) { + switch (nb) { + case 1: + case 2: // always 1 + return CoderResult.malformedForLength(1); + case 3: + int b1 = src.get(); + int b2 = src.get(); // no need to lookup b3 + return CoderResult.malformedForLength( + ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || + isNotContinuation(b2)) ? 1 : 2); + case 4: // we don't care the speed here + b1 = src.get() & 0xff; + b2 = src.get() & 0xff; + if (b1 > 0xf4 || + (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || + (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || + isNotContinuation(b2)) + return CoderResult.malformedForLength(1); + if (isNotContinuation(src.get())) + return CoderResult.malformedForLength(2); + return CoderResult.malformedForLength(3); + default: + assert false; + return null; + } + } + + private static CoderResult malformed(ByteBuffer src, int sp, + CharBuffer dst, int dp, + int nb) + { + src.position(sp - src.arrayOffset()); + CoderResult cr = malformedN(src, nb); + updatePositions(src, sp, dst, dp); + return cr; + } + + + private static CoderResult malformed(ByteBuffer src, + int mark, int nb) + { + src.position(mark); + CoderResult cr = malformedN(src, nb); + src.position(mark); + return cr; + } + + private static CoderResult malformedForLength(ByteBuffer src, + int sp, + CharBuffer dst, + int dp, + int malformedNB) + { + updatePositions(src, sp, dst, dp); + return CoderResult.malformedForLength(malformedNB); + } + + private static CoderResult malformedForLength(ByteBuffer src, + int mark, + int malformedNB) + { + src.position(mark); + return CoderResult.malformedForLength(malformedNB); + } + + + private static CoderResult xflow(Buffer src, int sp, int sl, + Buffer dst, int dp, int nb) { + updatePositions(src, sp, dst, dp); + return (nb == 0 || sl - sp < nb) + ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW; + } + + private static CoderResult xflow(Buffer src, int mark, int nb) { + src.position(mark); + return (nb == 0 || src.remaining() < nb) + ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW; + } + + private CoderResult decodeArrayLoop(ByteBuffer src, + CharBuffer dst) + { + // This method is optimized for ASCII input. + byte[] sa = src.array(); + int sp = src.arrayOffset() + src.position(); + int sl = src.arrayOffset() + src.limit(); + + char[] da = dst.array(); + int dp = dst.arrayOffset() + dst.position(); + int dl = dst.arrayOffset() + dst.limit(); + int dlASCII = dp + Math.min(sl - sp, dl - dp); + + // ASCII only loop + while (dp < dlASCII && sa[sp] >= 0) + da[dp++] = (char) sa[sp++]; + while (sp < sl) { + int b1 = sa[sp]; + if (b1 >= 0) { + // 1 byte, 7 bits: 0xxxxxxx + if (dp >= dl) + return xflow(src, sp, sl, dst, dp, 1); + da[dp++] = (char) b1; + sp++; + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { + // 2 bytes, 11 bits: 110xxxxx 10xxxxxx + if (sl - sp < 2 || dp >= dl) + return xflow(src, sp, sl, dst, dp, 2); + int b2 = sa[sp + 1]; + if (isNotContinuation(b2)) + return malformedForLength(src, sp, dst, dp, 1); + da[dp++] = (char) (((b1 << 6) ^ b2) + ^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0))); + sp += 2; + } else if ((b1 >> 4) == -2) { + // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx + int srcRemaining = sl - sp; + if (srcRemaining < 3 || dp >= dl) { + if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1])) + return malformedForLength(src, sp, dst, dp, 1); + return xflow(src, sp, sl, dst, dp, 3); + } + int b2 = sa[sp + 1]; + int b3 = sa[sp + 2]; + if (isMalformed3(b1, b2, b3)) + return malformed(src, sp, dst, dp, 3); + da[dp++] = (char) + ((b1 << 12) ^ + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + sp += 3; + } else { + return malformed(src, sp, dst, dp, 1); + } + } + return xflow(src, sp, sl, dst, dp, 0); + } + + private CoderResult decodeBufferLoop(ByteBuffer src, + CharBuffer dst) + { + int mark = src.position(); + int limit = src.limit(); + while (mark < limit) { + int b1 = src.get(); + if (b1 >= 0) { + // 1 byte, 7 bits: 0xxxxxxx + if (dst.remaining() < 1) + return xflow(src, mark, 1); // overflow + dst.put((char) b1); + mark++; + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { + // 2 bytes, 11 bits: 110xxxxx 10xxxxxx + if (limit - mark < 2|| dst.remaining() < 1) + return xflow(src, mark, 2); + int b2 = src.get(); + if (isNotContinuation(b2)) + return malformedForLength(src, mark, 1); + dst.put((char) (((b1 << 6) ^ b2) + ^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0)))); + mark += 2; + } else if ((b1 >> 4) == -2) { + // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx + int srcRemaining = limit - mark; + if (srcRemaining < 3 || dst.remaining() < 1) { + if (srcRemaining > 1 && isMalformed3_2(b1, src.get())) + return malformedForLength(src, mark, 1); + return xflow(src, mark, 3); + } + int b2 = src.get(); + int b3 = src.get(); + if (isMalformed3(b1, b2, b3)) + return malformed(src, mark, 3); + dst.put((char) + ((b1 << 12) ^ + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0))))); + mark += 3; + } else { + return malformed(src, mark, 1); + } + } + return xflow(src, mark, 0); + } + + protected CoderResult decodeLoop(ByteBuffer src, + CharBuffer dst) + { + if (src.hasArray() && dst.hasArray()) + return decodeArrayLoop(src, dst); + else + return decodeBufferLoop(src, dst); + } + + private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp) + { + if (bb == null) + bb = ByteBuffer.wrap(ba); + bb.position(sp); + return bb; + } + + // returns -1 if there is/are malformed byte(s) and the + // "action" for malformed input is not REPLACE. + public int decode(byte[] sa, int sp, int len, char[] da) { + final int sl = sp + len; + int dp = 0; + int dlASCII = Math.min(len, da.length); + ByteBuffer bb = null; // only necessary if malformed + + // ASCII only optimized loop + while (dp < dlASCII && sa[sp] >= 0) + da[dp++] = (char) sa[sp++]; + + while (sp < sl) { + int b1 = sa[sp++]; + if (b1 >= 0) { + // 1 byte, 7 bits: 0xxxxxxx + da[dp++] = (char) b1; + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { + // 2 bytes, 11 bits: 110xxxxx 10xxxxxx + if (sp < sl) { + int b2 = sa[sp++]; + if (isNotContinuation(b2)) { + if (malformedInputAction() != CodingErrorAction.REPLACE) + return -1; + da[dp++] = replacement().charAt(0); + sp--; // malformedN(bb, 2) always returns 1 + } else { + da[dp++] = (char) (((b1 << 6) ^ b2)^ + (((byte) 0xC0 << 6) ^ + ((byte) 0x80 << 0))); + } + continue; + } + if (malformedInputAction() != CodingErrorAction.REPLACE) + return -1; + da[dp++] = replacement().charAt(0); + return dp; + } else if ((b1 >> 4) == -2) { + // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx + if (sp + 1 < sl) { + int b2 = sa[sp++]; + int b3 = sa[sp++]; + if (isMalformed3(b1, b2, b3)) { + if (malformedInputAction() != CodingErrorAction.REPLACE) + return -1; + da[dp++] = replacement().charAt(0); + sp -=3; + bb = getByteBuffer(bb, sa, sp); + sp += malformedN(bb, 3).length(); + } else { + da[dp++] = (char)((b1 << 12) ^ + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + } + continue; + } + if (malformedInputAction() != CodingErrorAction.REPLACE) + return -1; + if (sp < sl && isMalformed3_2(b1, sa[sp])) { + da[dp++] = replacement().charAt(0); + continue; + + } + da[dp++] = replacement().charAt(0); + return dp; + } else { + if (malformedInputAction() != CodingErrorAction.REPLACE) + return -1; + da[dp++] = replacement().charAt(0); + } + } + return dp; + } + } + + private static class Encoder extends CharsetEncoder + implements ArrayEncoder { + + private Encoder(Charset cs) { + super(cs, 1.1f, 3.0f); + } + + public boolean canEncode(char c) { + return !Character.isSurrogate(c); + } + + public boolean isLegalReplacement(byte[] repl) { + return ((repl.length == 1 && repl[0] >= 0) || + super.isLegalReplacement(repl)); + } + + private static CoderResult overflow(CharBuffer src, int sp, + ByteBuffer dst, int dp) { + updatePositions(src, sp, dst, dp); + return CoderResult.OVERFLOW; + } + + private static CoderResult overflow(CharBuffer src, int mark) { + src.position(mark); + return CoderResult.OVERFLOW; + } + + private static void to3Bytes(byte[] da, int dp, char c) { + da[dp] = (byte)(0xe0 | ((c >> 12))); + da[dp + 1] = (byte)(0x80 | ((c >> 6) & 0x3f)); + da[dp + 2] = (byte)(0x80 | (c & 0x3f)); + } + + private static void to3Bytes(ByteBuffer dst, char c) { + dst.put((byte)(0xe0 | ((c >> 12)))); + dst.put((byte)(0x80 | ((c >> 6) & 0x3f))); + dst.put((byte)(0x80 | (c & 0x3f))); + } + + private Surrogate.Parser sgp; + private char[] c2; + private CoderResult encodeArrayLoop(CharBuffer src, + ByteBuffer dst) + { + char[] sa = src.array(); + int sp = src.arrayOffset() + src.position(); + int sl = src.arrayOffset() + src.limit(); + + byte[] da = dst.array(); + int dp = dst.arrayOffset() + dst.position(); + int dl = dst.arrayOffset() + dst.limit(); + int dlASCII = dp + Math.min(sl - sp, dl - dp); + + // ASCII only loop + while (dp < dlASCII && sa[sp] < '\u0080') + da[dp++] = (byte) sa[sp++]; + while (sp < sl) { + char c = sa[sp]; + if (c < 0x80) { + // Have at most seven bits + if (dp >= dl) + return overflow(src, sp, dst, dp); + da[dp++] = (byte)c; + } else if (c < 0x800) { + // 2 bytes, 11 bits + if (dl - dp < 2) + return overflow(src, sp, dst, dp); + da[dp++] = (byte)(0xc0 | (c >> 6)); + da[dp++] = (byte)(0x80 | (c & 0x3f)); + } else if (Character.isSurrogate(c)) { + // Have a surrogate pair + if (sgp == null) + sgp = new Surrogate.Parser(); + int uc = sgp.parse(c, sa, sp, sl); + if (uc < 0) { + updatePositions(src, sp, dst, dp); + return sgp.error(); + } + if (dl - dp < 6) + return overflow(src, sp, dst, dp); + to3Bytes(da, dp, Character.highSurrogate(uc)); + dp += 3; + to3Bytes(da, dp, Character.lowSurrogate(uc)); + dp += 3; + sp++; // 2 chars + } else { + // 3 bytes, 16 bits + if (dl - dp < 3) + return overflow(src, sp, dst, dp); + to3Bytes(da, dp, c); + dp += 3; + } + sp++; + } + updatePositions(src, sp, dst, dp); + return CoderResult.UNDERFLOW; + } + + private CoderResult encodeBufferLoop(CharBuffer src, + ByteBuffer dst) + { + int mark = src.position(); + while (src.hasRemaining()) { + char c = src.get(); + if (c < 0x80) { + // Have at most seven bits + if (!dst.hasRemaining()) + return overflow(src, mark); + dst.put((byte)c); + } else if (c < 0x800) { + // 2 bytes, 11 bits + if (dst.remaining() < 2) + return overflow(src, mark); + dst.put((byte)(0xc0 | (c >> 6))); + dst.put((byte)(0x80 | (c & 0x3f))); + } else if (Character.isSurrogate(c)) { + // Have a surrogate pair + if (sgp == null) + sgp = new Surrogate.Parser(); + int uc = sgp.parse(c, src); + if (uc < 0) { + src.position(mark); + return sgp.error(); + } + if (dst.remaining() < 6) + return overflow(src, mark); + to3Bytes(dst, Character.highSurrogate(uc)); + to3Bytes(dst, Character.lowSurrogate(uc)); + mark++; // 2 chars + } else { + // 3 bytes, 16 bits + if (dst.remaining() < 3) + return overflow(src, mark); + to3Bytes(dst, c); + } + mark++; + } + src.position(mark); + return CoderResult.UNDERFLOW; + } + + protected final CoderResult encodeLoop(CharBuffer src, + ByteBuffer dst) + { + if (src.hasArray() && dst.hasArray()) + return encodeArrayLoop(src, dst); + else + return encodeBufferLoop(src, dst); + } + + // returns -1 if there is malformed char(s) and the + // "action" for malformed input is not REPLACE. + public int encode(char[] sa, int sp, int len, byte[] da) { + int sl = sp + len; + int dp = 0; + int dlASCII = dp + Math.min(len, da.length); + + // ASCII only optimized loop + while (dp < dlASCII && sa[sp] < '\u0080') + da[dp++] = (byte) sa[sp++]; + + while (sp < sl) { + char c = sa[sp++]; + if (c < 0x80) { + // Have at most seven bits + da[dp++] = (byte)c; + } else if (c < 0x800) { + // 2 bytes, 11 bits + da[dp++] = (byte)(0xc0 | (c >> 6)); + da[dp++] = (byte)(0x80 | (c & 0x3f)); + } else if (Character.isSurrogate(c)) { + if (sgp == null) + sgp = new Surrogate.Parser(); + int uc = sgp.parse(c, sa, sp - 1, sl); + if (uc < 0) { + if (malformedInputAction() != CodingErrorAction.REPLACE) + return -1; + da[dp++] = replacement()[0]; + } else { + to3Bytes(da, dp, Character.highSurrogate(uc)); + dp += 3; + to3Bytes(da, dp, Character.lowSurrogate(uc)); + dp += 3; + sp++; // 2 chars + } + } else { + // 3 bytes, 16 bits + to3Bytes(da, dp, c); + dp += 3; + } + } + return dp; + } + } +} diff --git a/jdk/src/share/classes/sun/nio/cs/UTF_8.java b/jdk/src/share/classes/sun/nio/cs/UTF_8.java index 56d6bcc7b6d..1f7edaad2d8 100644 --- a/jdk/src/share/classes/sun/nio/cs/UTF_8.java +++ b/jdk/src/share/classes/sun/nio/cs/UTF_8.java @@ -72,8 +72,8 @@ class UTF_8 extends Unicode return new Encoder(this); } - static final void updatePositions(Buffer src, int sp, - Buffer dst, int dp) { + private static final void updatePositions(Buffer src, int sp, + Buffer dst, int dp) { src.position(sp - src.arrayOffset()); dst.position(dp - dst.arrayOffset()); } @@ -88,11 +88,6 @@ class UTF_8 extends Unicode return (b & 0xc0) != 0x80; } - // [C2..DF] [80..BF] - private static boolean isMalformed2(int b1, int b2) { - return (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80; - } - // [E0] [A0..BF] [80..BF] // [E1..EF] [80..BF] [80..BF] private static boolean isMalformed3(int b1, int b2, int b3) { @@ -100,6 +95,12 @@ class UTF_8 extends Unicode (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; } + // only used when there is only one byte left in src buffer + private static boolean isMalformed3_2(int b1, int b2) { + return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || + (b2 & 0xc0) != 0x80; + } + // [F0] [90..BF] [80..BF] [80..BF] // [F1..F3] [80..BF] [80..BF] [80..BF] // [F4] [80..8F] [80..BF] [80..BF] @@ -110,6 +111,16 @@ class UTF_8 extends Unicode (b4 & 0xc0) != 0x80; } + // only used when there is less than 4 bytes left in src buffer + private static boolean isMalformed4_2(int b1, int b2) { + return (b1 == 0xf0 && b2 == 0x90) || + (b2 & 0xc0) != 0x80; + } + + private static boolean isMalformed4_3(int b3) { + return (b3 & 0xc0) != 0x80; + } + private static CoderResult lookupN(ByteBuffer src, int n) { for (int i = 1; i < n; i++) { @@ -122,28 +133,14 @@ class UTF_8 extends Unicode private static CoderResult malformedN(ByteBuffer src, int nb) { switch (nb) { case 1: - int b1 = src.get(); - if ((b1 >> 2) == -2) { - // 5 bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - if (src.remaining() < 4) - return CoderResult.UNDERFLOW; - return lookupN(src, 5); - } - if ((b1 >> 1) == -2) { - // 6 bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - if (src.remaining() < 5) - return CoderResult.UNDERFLOW; - return lookupN(src, 6); - } - return CoderResult.malformedForLength(1); case 2: // always 1 return CoderResult.malformedForLength(1); case 3: - b1 = src.get(); + int b1 = src.get(); int b2 = src.get(); // no need to lookup b3 return CoderResult.malformedForLength( ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || - isNotContinuation(b2))?1:2); + isNotContinuation(b2)) ? 1 : 2); case 4: // we don't care the speed here b1 = src.get() & 0xff; b2 = src.get() & 0xff; @@ -171,6 +168,7 @@ class UTF_8 extends Unicode return cr; } + private static CoderResult malformed(ByteBuffer src, int mark, int nb) { @@ -180,18 +178,36 @@ class UTF_8 extends Unicode return cr; } + private static CoderResult malformedForLength(ByteBuffer src, + int sp, + CharBuffer dst, + int dp, + int malformedNB) + { + updatePositions(src, sp, dst, dp); + return CoderResult.malformedForLength(malformedNB); + } + + private static CoderResult malformedForLength(ByteBuffer src, + int mark, + int malformedNB) + { + src.position(mark); + return CoderResult.malformedForLength(malformedNB); + } + + private static CoderResult xflow(Buffer src, int sp, int sl, Buffer dst, int dp, int nb) { updatePositions(src, sp, dst, dp); return (nb == 0 || sl - sp < nb) - ?CoderResult.UNDERFLOW:CoderResult.OVERFLOW; + ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW; } private static CoderResult xflow(Buffer src, int mark, int nb) { - CoderResult cr = (nb == 0 || src.remaining() < (nb - 1)) - ?CoderResult.UNDERFLOW:CoderResult.OVERFLOW; src.position(mark); - return cr; + return (nb == 0 || src.remaining() < nb) + ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW; } private CoderResult decodeArrayLoop(ByteBuffer src, @@ -210,7 +226,6 @@ class UTF_8 extends Unicode // ASCII only loop while (dp < dlASCII && sa[sp] >= 0) da[dp++] = (char) sa[sp++]; - while (sp < sl) { int b1 = sa[sp]; if (b1 >= 0) { @@ -219,13 +234,20 @@ class UTF_8 extends Unicode return xflow(src, sp, sl, dst, dp, 1); da[dp++] = (char) b1; sp++; - } else if ((b1 >> 5) == -2) { + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { // 2 bytes, 11 bits: 110xxxxx 10xxxxxx + // [C2..DF] [80..BF] if (sl - sp < 2 || dp >= dl) return xflow(src, sp, sl, dst, dp, 2); int b2 = sa[sp + 1]; - if (isMalformed2(b1, b2)) - return malformed(src, sp, dst, dp, 2); + // Now we check the first byte of 2-byte sequence as + // if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) + // no longer need to check b1 against c1 & c0 for + // malformed as we did in previous version + // (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80; + // only need to check the second byte b2. + if (isNotContinuation(b2)) + return malformedForLength(src, sp, dst, dp, 1); da[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ @@ -233,24 +255,37 @@ class UTF_8 extends Unicode sp += 2; } else if ((b1 >> 4) == -2) { // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx - if (sl - sp < 3 || dp >= dl) + int srcRemaining = sl - sp; + if (srcRemaining < 3 || dp >= dl) { + if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1])) + return malformedForLength(src, sp, dst, dp, 1); return xflow(src, sp, sl, dst, dp, 3); + } int b2 = sa[sp + 1]; int b3 = sa[sp + 2]; if (isMalformed3(b1, b2, b3)) return malformed(src, sp, dst, dp, 3); - da[dp++] = (char) + char c = (char) ((b1 << 12) ^ (b2 << 6) ^ (b3 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); + if (Character.isSurrogate(c)) + return malformedForLength(src, sp, dst, dp, 3); + da[dp++] = c; sp += 3; } else if ((b1 >> 3) == -2) { // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - if (sl - sp < 4 || dl - dp < 2) + int srcRemaining = sl - sp; + if (srcRemaining < 4 || dl - dp < 2) { + if (srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1])) + return malformedForLength(src, sp, dst, dp, 1); + if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2])) + return malformedForLength(src, sp, dst, dp, 2); return xflow(src, sp, sl, dst, dp, 4); + } int b2 = sa[sp + 1]; int b3 = sa[sp + 2]; int b4 = sa[sp + 3]; @@ -289,38 +324,51 @@ class UTF_8 extends Unicode return xflow(src, mark, 1); // overflow dst.put((char) b1); mark++; - } else if ((b1 >> 5) == -2) { + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { // 2 bytes, 11 bits: 110xxxxx 10xxxxxx if (limit - mark < 2|| dst.remaining() < 1) return xflow(src, mark, 2); int b2 = src.get(); - if (isMalformed2(b1, b2)) - return malformed(src, mark, 2); - dst.put((char) (((b1 << 6) ^ b2) + if (isNotContinuation(b2)) + return malformedForLength(src, mark, 1); + dst.put((char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80 << 0)))); mark += 2; } else if ((b1 >> 4) == -2) { // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx - if (limit - mark < 3 || dst.remaining() < 1) + int srcRemaining = limit - mark; + if (srcRemaining < 3 || dst.remaining() < 1) { + if (srcRemaining > 1 && isMalformed3_2(b1, src.get())) + return malformedForLength(src, mark, 1); return xflow(src, mark, 3); + } int b2 = src.get(); int b3 = src.get(); if (isMalformed3(b1, b2, b3)) return malformed(src, mark, 3); - dst.put((char) - ((b1 << 12) ^ - (b2 << 6) ^ - (b3 ^ - (((byte) 0xE0 << 12) ^ - ((byte) 0x80 << 6) ^ - ((byte) 0x80 << 0))))); + char c = (char) + ((b1 << 12) ^ + (b2 << 6) ^ + (b3 ^ + (((byte) 0xE0 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + if (Character.isSurrogate(c)) + return malformedForLength(src, mark, 3); + dst.put(c); mark += 3; } else if ((b1 >> 3) == -2) { // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - if (limit - mark < 4 || dst.remaining() < 2) + int srcRemaining = limit - mark; + if (srcRemaining < 4 || dst.remaining() < 2) { + if (srcRemaining > 1 && isMalformed4_2(b1, src.get())) + return malformedForLength(src, mark, 1); + if (srcRemaining > 2 && isMalformed4_3(src.get())) + return malformedForLength(src, mark, 2); return xflow(src, mark, 4); + } int b2 = src.get(); int b3 = src.get(); int b4 = src.get(); @@ -364,7 +412,7 @@ class UTF_8 extends Unicode return bb; } - // returns -1 if there is malformed byte(s) and the + // returns -1 if there is/are malformed byte(s) and the // "action" for malformed input is not REPLACE. public int decode(byte[] sa, int sp, int len, char[] da) { final int sl = sp + len; @@ -381,11 +429,11 @@ class UTF_8 extends Unicode if (b1 >= 0) { // 1 byte, 7 bits: 0xxxxxxx da[dp++] = (char) b1; - } else if ((b1 >> 5) == -2) { + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { // 2 bytes, 11 bits: 110xxxxx 10xxxxxx if (sp < sl) { int b2 = sa[sp++]; - if (isMalformed2(b1, b2)) { + if (isNotContinuation(b2)) { if (malformedInputAction() != CodingErrorAction.REPLACE) return -1; da[dp++] = replacement().charAt(0); @@ -410,21 +458,33 @@ class UTF_8 extends Unicode if (malformedInputAction() != CodingErrorAction.REPLACE) return -1; da[dp++] = replacement().charAt(0); - sp -=3; + sp -= 3; bb = getByteBuffer(bb, sa, sp); sp += malformedN(bb, 3).length(); } else { - da[dp++] = (char)((b1 << 12) ^ + char c = (char)((b1 << 12) ^ (b2 << 6) ^ (b3 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); + if (Character.isSurrogate(c)) { + if (malformedInputAction() != CodingErrorAction.REPLACE) + return -1; + da[dp++] = replacement().charAt(0); + } else { + da[dp++] = c; + } } continue; } if (malformedInputAction() != CodingErrorAction.REPLACE) return -1; + if (sp < sl && isMalformed3_2(b1, sa[sp])) { + da[dp++] = replacement().charAt(0); + continue; + + } da[dp++] = replacement().charAt(0); return dp; } else if ((b1 >> 3) == -2) { @@ -458,28 +518,29 @@ class UTF_8 extends Unicode } if (malformedInputAction() != CodingErrorAction.REPLACE) return -1; + + if (sp < sl && isMalformed4_2(b1, sa[sp])) { + da[dp++] = replacement().charAt(0); + continue; + } + sp++; + if (sp < sl && isMalformed4_3(sa[sp])) { + da[dp++] = replacement().charAt(0); + continue; + } da[dp++] = replacement().charAt(0); return dp; } else { if (malformedInputAction() != CodingErrorAction.REPLACE) return -1; da[dp++] = replacement().charAt(0); - sp--; - bb = getByteBuffer(bb, sa, sp); - CoderResult cr = malformedN(bb, 1); - if (!cr.isError()) { - // leading byte for 5 or 6-byte, but don't have enough - // bytes in buffer to check. Consumed rest as malformed. - return dp; - } - sp += cr.length(); } } return dp; } } - private static class Encoder extends CharsetEncoder + private static final class Encoder extends CharsetEncoder implements ArrayEncoder { private Encoder(Charset cs) { diff --git a/jdk/src/share/classes/sun/nio/cs/standard-charsets b/jdk/src/share/classes/sun/nio/cs/standard-charsets index 06120fee8f3..245713500a6 100644 --- a/jdk/src/share/classes/sun/nio/cs/standard-charsets +++ b/jdk/src/share/classes/sun/nio/cs/standard-charsets @@ -63,6 +63,10 @@ charset UTF-8 UTF_8 alias UTF8 # JDK historical alias unicode-1-1-utf-8 +charset CESU-8 CESU_8 + alias CESU8 + alias csCESU-8 + charset UTF-16 UTF_16 alias UTF_16 # JDK historical alias utf16 diff --git a/jdk/test/java/nio/charset/coders/Errors.java b/jdk/test/java/nio/charset/coders/Errors.java index be19c0b403d..0b74001dd6e 100644 --- a/jdk/test/java/nio/charset/coders/Errors.java +++ b/jdk/test/java/nio/charset/coders/Errors.java @@ -23,7 +23,7 @@ /* @test * @summary Check that error cases are replaced correctly in String/ISR/OSW - * @bug 4457851 + * @bug 4457851 7096080 * * @build Errors Util * @run main Errors @@ -193,11 +193,9 @@ public class Errors { t.test("\uFFFF", new byte[] { (byte)0xEF, (byte)0xBF, (byte)0xBF }); t.test(new byte[] { X, (byte)0x7f, Y }, "x\u007Fy"); t.test(new byte[] { X, (byte)0x80, Y }, "x\uFFFDy"); - t.test(new byte[] { (byte)0xf0, (byte)0xf0 }, "\uFFFD"); } public static void main(String[] args) throws Exception { - test_US_ASCII(new TestString("US-ASCII")); test_US_ASCII(new TestStream("US-ASCII")); diff --git a/jdk/test/sun/nio/cs/TestStringCoding.java b/jdk/test/sun/nio/cs/TestStringCoding.java index c4837e956ae..09e61444889 100644 --- a/jdk/test/sun/nio/cs/TestStringCoding.java +++ b/jdk/test/sun/nio/cs/TestStringCoding.java @@ -24,7 +24,7 @@ */ /* @test - @bug 6636323 6636319 7040220 + @bug 6636323 6636319 7040220 7096080 @summary Test if StringCoding and NIO result have the same de/encoding result * @run main/othervm/timeout=2000 TestStringCoding */ @@ -111,7 +111,8 @@ public class TestStringCoding { //encode unmappable surrogates if (enc instanceof sun.nio.cs.ArrayEncoder && cs.contains(Charset.forName("ASCII"))) { - if (cs.name().equals("UTF-8")) // utf8 handles surrogates + if (cs.name().equals("UTF-8") || // utf8 handles surrogates + cs.name().equals("CESU-8")) // utf8 handles surrogates return; enc.replaceWith(new byte[] { (byte)'A'}); sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder)enc; @@ -136,7 +137,6 @@ public class TestStringCoding { cs.name()))) throw new RuntimeException("encode3(surrogates) failed -> " + cs.name()); - ba = new byte[str.length() - 1]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 7 || !"abABABc".equals(new String(ba, 0, n, diff --git a/jdk/test/sun/nio/cs/TestStringCodingUTF8.java b/jdk/test/sun/nio/cs/TestStringCodingUTF8.java index fdc204849b7..d1f69950684 100644 --- a/jdk/test/sun/nio/cs/TestStringCodingUTF8.java +++ b/jdk/test/sun/nio/cs/TestStringCodingUTF8.java @@ -33,14 +33,16 @@ import java.nio.charset.*; public class TestStringCodingUTF8 { public static void main(String[] args) throws Throwable { - test(); + test("UTF-8"); + test("CESU-8"); // security manager on System.setSecurityManager(new PermissiveSecurityManger()); - test(); + test("UTF-8"); + test("CESU-8"); } - static void test() throws Throwable { - Charset cs = Charset.forName("UTF-8"); + static void test(String csn) throws Throwable { + Charset cs = Charset.forName(csn); char[] bmp = new char[0x10000]; for (int i = 0; i < 0x10000; i++) { bmp[i] = (char)i; diff --git a/jdk/test/sun/nio/cs/TestUTF8.java b/jdk/test/sun/nio/cs/TestUTF8.java index f339eae046b..e83f8fbb51a 100644 --- a/jdk/test/sun/nio/cs/TestUTF8.java +++ b/jdk/test/sun/nio/cs/TestUTF8.java @@ -23,7 +23,7 @@ /* * @test - * @bug 4486841 7040220 + * @bug 4486841 7040220 7096080 * @summary Test UTF-8 charset */ @@ -156,15 +156,22 @@ public class TestUTF8 { return 3; } + static int to4ByteUTF8(int uc, byte[] bb, int pos) { + bb[pos++] = (byte)(0xf0 | ((uc >> 18))); + bb[pos++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); + bb[pos++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); + bb[pos++] = (byte)(0x80 | (uc & 0x3f)); + return 4; + } + static void checkRoundtrip(String csn) throws Exception { System.out.printf(" Check roundtrip <%s>...", csn); char[] cc = getUTFChars(); byte[] bb = encode(cc, csn, false); char[] ccO = decode(bb, csn, false); - if (!Arrays.equals(cc, ccO)) { + if (!Arrays.equals(cc, ccO)) System.out.printf(" non-direct failed"); - } bb = encode(cc, csn, true); ccO = decode(bb, csn, true); if (!Arrays.equals(cc, ccO)) { @@ -180,6 +187,40 @@ public class TestUTF8 { System.out.println(); } + static void check4ByteSurrs(String csn) throws Exception { + System.out.printf(" Check 4-byte Surrogates <%s>...%n", csn); + byte[] bb = new byte[(0x110000 - 0x10000) * 4]; + char[] cc = new char[(0x110000 - 0x10000) * 2]; + int bpos = 0; + int cpos = 0; + for (int i = 0x10000; i < 0x110000; i++) { + Character.toChars(i, cc, cpos); + bpos += to4ByteUTF8(i, bb, bpos); + cpos += 2; + } + checkSurrs(csn, bb, cc); + } + + + static void checkSurrs(String csn, byte[] bb, char[] cc) + throws Exception + { + char[] ccO = decode(bb, csn, false); + if (!Arrays.equals(cc, ccO)) { + System.out.printf(" decoding failed%n"); + } + ccO = decode(bb, csn, true); + if (!Arrays.equals(cc, ccO)) { + System.out.printf(" decoding(direct) failed%n"); + } + if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) { + System.out.printf(" String.toCharArray() failed"); + } + if (!Arrays.equals(bb, new String(cc).getBytes(csn))) { + System.out.printf(" String.getBytes() failed"); + } + } + static void check6ByteSurrs(String csn) throws Exception { System.out.printf(" Check 6-byte Surrogates <%s>...%n", csn); byte[] bb = new byte[(0x110000 - 0x10000) * 6]; @@ -192,23 +233,10 @@ public class TestUTF8 { bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos); cpos += 2; } - - char[] ccO = decode(bb, csn, false); - if (!Arrays.equals(cc, ccO)) { - System.out.printf(" decoding failed%n"); - } - ccO = decode(bb, csn, true); - if (!Arrays.equals(cc, ccO)) { - System.out.printf(" decoding(direct) failed%n"); - } - // new String(bb, csn).getBytes(csn) will not return - // the 6 bytes surrogates as in bb, so only test - // toCharArray() here. - if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) { - System.out.printf(" String.toCharArray() failed"); - } + checkSurrs(csn, bb, cc); } + static void compare(String csn1, String csn2) throws Exception { System.out.printf(" Diff <%s> <%s>...%n", csn1, csn2); char[] cc = getUTFChars(); @@ -266,6 +294,10 @@ public class TestUTF8 { {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte + {1, (byte)0xE0, (byte)0x41,}, // invalid second byte & 2 bytes + {3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate + {3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate + // Four-byte sequences {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded @@ -276,8 +308,13 @@ public class TestUTF8 { {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid second byte {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte + {1, (byte)0xF0, (byte)41 }, // invalid second byte + // & only 2 bytes + {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte - {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid third byte + {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid forth byte + {2, (byte)0xF0, (byte)0x90, (byte)0x41 }, // invalid third byte + // & 3 bytes input {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte @@ -287,30 +324,113 @@ public class TestUTF8 { {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte // Five-byte sequences - {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte - {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded - {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded - {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded - {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded + {1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80}, - {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 }, - {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF }, - {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 }, + {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 }, // Six-byte sequences - {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded - {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded - {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded - {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded + {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded + {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, - {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 }, - {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 }, - {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 }, - {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 }, + {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 }, }; - static void checkMalformed(String csn) throws Exception { + // The first byte is the length of malformed bytes + static byte[][] malformed_cesu8 = { + // One-byte sequences: + {1, (byte)0xFF }, + {1, (byte)0xC0 }, + {1, (byte)0x80 }, + + {1, (byte)0xFF, (byte)0xFF}, // all ones + {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble + + // Two-byte sequences: + {1, (byte)0xC0, (byte)0x80}, // invalid first byte + {1, (byte)0xC1, (byte)0xBF}, // invalid first byte + {1, (byte)0xC2, (byte)0x00}, // invalid second byte + {1, (byte)0xC2, (byte)0xC0}, // invalid second byte + {1, (byte)0xD0, (byte)0x00}, // invalid second byte + {1, (byte)0xD0, (byte)0xC0}, // invalid second byte + {1, (byte)0xDF, (byte)0x00}, // invalid second byte + {1, (byte)0xDF, (byte)0xC0}, // invalid second byte + + // Three-byte sequences + {1, (byte)0xE0, (byte)0x80, (byte)0x80}, // 111x first byte first nibble + {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded + + {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte + {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte + {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte + {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones + {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte + {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte + {1, (byte)0xE0, (byte)0x41,}, // invalid second byte & 2 bytes + + // CESU-8 does not have 4, 5, 6 bytes sequenc + // Four-byte sequences + {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+07FF zero-padded + + {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones + {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid second byte + {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte + {1, (byte)0xF0, (byte)41 }, // invalid second byte + // & only 2 bytes + {1, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte + {1, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid forth byte + {1, (byte)0xF0, (byte)0x90, (byte)0x41 }, // invalid third byte + // & 3 bytes input + + {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte + {1, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte + {1, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid forth byte + {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte + {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte + {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte + + // Five-byte sequences + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded + {1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded + + {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80}, + {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 }, + + // Six-byte sequences + {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded + {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded + {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, + {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 }, + {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 }, + }; + + + static void checkMalformed(String csn, byte[][] malformed) throws Exception { boolean failed = false; System.out.printf(" Check malformed <%s>...%n", csn); Charset cs = Charset.forName(csn); @@ -430,9 +550,12 @@ public class TestUTF8 { public static void main(String[] args) throws Exception { checkRoundtrip("UTF-8"); - check6ByteSurrs("UTF-8"); - //compare("UTF-8", "UTF-8-OLD"); - checkMalformed("UTF-8"); + check4ByteSurrs("UTF-8"); + checkMalformed("UTF-8", malformed); checkUnderOverflow("UTF-8"); + + checkRoundtrip("CESU-8"); + check6ByteSurrs("CESU-8"); + checkMalformed("CESU-8", malformed_cesu8); } }