From 8decb5de90d313b1441f7b0bc3ed2668e59d6c33 Mon Sep 17 00:00:00 2001 From: Xueming Shen Date: Sat, 12 Apr 2014 14:38:50 -0700 Subject: [PATCH] 8039751: UTF-8 decoder fails to handle some edge cases correctly To update decoder.isMalformed4_2() to correctly detect out of range 2nd byte Reviewed-by: alanb --- jdk/src/share/classes/sun/nio/cs/UTF_8.java | 23 ++++++++++---- jdk/test/sun/nio/cs/TestUTF8.java | 33 +++++++++++++++++++-- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/jdk/src/share/classes/sun/nio/cs/UTF_8.java b/jdk/src/share/classes/sun/nio/cs/UTF_8.java index bfb7b7a977b..3ee2341f000 100644 --- a/jdk/src/share/classes/sun/nio/cs/UTF_8.java +++ b/jdk/src/share/classes/sun/nio/cs/UTF_8.java @@ -111,12 +111,18 @@ class UTF_8 extends Unicode (b4 & 0xc0) != 0x80; } - // only used when there is less than 4 bytes left in src buffer + // only used when there is less than 4 bytes left in src buffer. + // both b1 and b2 should be "& 0xff" before passed in. private static boolean isMalformed4_2(int b1, int b2) { - return (b1 == 0xf0 && b2 == 0x90) || + return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || + (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || (b2 & 0xc0) != 0x80; } + // tests if b1 and b2 are malformed as the first 2 bytes of a + // legal`4-byte utf-8 byte sequence. + // only used when there is less than 4 bytes left in src buffer, + // after isMalformed4_2 has been invoked. private static boolean isMalformed4_3(int b3) { return (b3 & 0xc0) != 0x80; } @@ -280,7 +286,9 @@ class UTF_8 extends Unicode // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx int srcRemaining = sl - sp; if (srcRemaining < 4 || dl - dp < 2) { - if (srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1])) + b1 &= 0xff; + if (b1 > 0xf4 || + srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff)) return malformedForLength(src, sp, dst, dp, 1); if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2])) return malformedForLength(src, sp, dst, dp, 2); @@ -363,7 +371,9 @@ class UTF_8 extends Unicode // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx int srcRemaining = limit - mark; if (srcRemaining < 4 || dst.remaining() < 2) { - if (srcRemaining > 1 && isMalformed4_2(b1, src.get())) + b1 &= 0xff; + if (b1 > 0xf4 || + srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff)) return malformedForLength(src, mark, 1); if (srcRemaining > 2 && isMalformed4_3(src.get())) return malformedForLength(src, mark, 2); @@ -518,8 +528,9 @@ class UTF_8 extends Unicode } if (malformedInputAction() != CodingErrorAction.REPLACE) return -1; - - if (sp < sl && isMalformed4_2(b1, sa[sp])) { + b1 &= 0xff; + if (b1 > 0xf4 || + sp < sl && isMalformed4_2(b1, sa[sp] & 0xff)) { da[dp++] = replacement().charAt(0); continue; } diff --git a/jdk/test/sun/nio/cs/TestUTF8.java b/jdk/test/sun/nio/cs/TestUTF8.java index c54ae66f4d9..010fbf17b43 100644 --- a/jdk/test/sun/nio/cs/TestUTF8.java +++ b/jdk/test/sun/nio/cs/TestUTF8.java @@ -23,7 +23,7 @@ /* * @test - * @bug 4486841 7040220 7096080 + * @bug 4486841 7040220 7096080 8039751 * @summary Test UTF-8 charset */ @@ -291,14 +291,18 @@ public class TestUTF8 { {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte + {2, (byte)0xE1, (byte)0x80, (byte)0x42}, // invalid third byte + {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte {1, (byte)0xE0, (byte)0x41,}, // invalid second byte & 2 bytes + {1, (byte)0xE1, (byte)0x40,}, // invalid second byte & 2 bytes {3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate {3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate + // Four-byte sequences {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded @@ -323,6 +327,32 @@ public class TestUTF8 { {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte + // #8039751 + {1, (byte)0xF6, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte + {1, (byte)0xF6, (byte)0x80, (byte)0x80, }, + {1, (byte)0xF6, (byte)0x80, }, + {1, (byte)0xF6, }, + {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte + {1, (byte)0xF5, (byte)0x80, (byte)0x80, }, + {1, (byte)0xF5, (byte)0x80, }, + {1, (byte)0xF5 }, + + {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0x80 }, // out-range 2nd byte + {1, (byte)0xF4, (byte)0x90, (byte)0x80 }, + {1, (byte)0xF4, (byte)0x90 }, + + {1, (byte)0xF4, (byte)0x7f, (byte)0x80, (byte)0x80 }, // out-range/ascii 2nd byte + {1, (byte)0xF4, (byte)0x7f, (byte)0x80 }, + {1, (byte)0xF4, (byte)0x7f }, + + {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 2nd byte + {1, (byte)0xF0, (byte)0x80, (byte)0x80 }, + {1, (byte)0xF0, (byte)0x80 }, + + {1, (byte)0xF0, (byte)0xc0, (byte)0x80, (byte)0x80 }, // out-range 2nd byte + {1, (byte)0xF0, (byte)0xc0, (byte)0x80 }, + {1, (byte)0xF0, (byte)0xc0 }, + // Five-byte sequences {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded @@ -553,7 +583,6 @@ public class TestUTF8 { check4ByteSurrs("UTF-8"); checkMalformed("UTF-8", malformed); checkUnderOverflow("UTF-8"); - checkRoundtrip("CESU-8"); check6ByteSurrs("CESU-8"); checkMalformed("CESU-8", malformed_cesu8);