8039751: UTF-8 decoder fails to handle some edge cases correctly

Update decoder.isMalformed4_2() to correctly detect an out-of-range 2nd byte

Reviewed-by: alanb
This commit is contained in:
Xueming Shen 2014-04-12 14:38:50 -07:00
parent e90c029bad
commit 8decb5de90
2 changed files with 48 additions and 8 deletions

View File

@ -111,12 +111,18 @@ class UTF_8 extends Unicode
(b4 & 0xc0) != 0x80;
}
// only used when there is less than 4 bytes left in src buffer
// only used when there is less than 4 bytes left in src buffer.
// both b1 and b2 should be "& 0xff" before passed in.
//
// Returns true when b2 cannot follow b1 as the 2nd byte of a 4-byte
// sequence. NOTE(review): this is a rendered diff hunk, so the two
// consecutive "return" lines below appear to be the pre-change line
// followed by its replacement -- confirm against the committed file.
private static boolean isMalformed4_2(int b1, int b2) {
// presumably the removed line: with b1 == 0xf0 it flagged only b2 == 0x90
return (b1 == 0xf0 && b2 == 0x90) ||
// presumably the added line: with b1 == 0xf0, b2 must lie in 0x90..0xbf
return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
// with b1 == 0xf4, the high nibble of b2 must be 0x8 (i.e. 0x80..0x8f)
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
// in every case b2 must have the continuation-byte form 10xxxxxx
(b2 & 0xc0) != 0x80;
}
// tests if b1 and b2 are malformed as the first 2 bytes of a
// legal 4-byte utf-8 byte sequence.
// only used when there is less than 4 bytes left in src buffer,
// after isMalformed4_2 has been invoked.
// NOTE(review): the first sentence talks about b1 and b2, but this
// method only receives b3; it may belong with isMalformed4_2 -- confirm.
// Returns true when b3 does not have the continuation-byte form 10xxxxxx.
private static boolean isMalformed4_3(int b3) {
return (b3 & 0xc0) != 0x80;
}
@ -280,7 +286,9 @@ class UTF_8 extends Unicode
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
int srcRemaining = sl - sp;
if (srcRemaining < 4 || dl - dp < 2) {
if (srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1]))
b1 &= 0xff;
if (b1 > 0xf4 ||
srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff))
return malformedForLength(src, sp, dst, dp, 1);
if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2]))
return malformedForLength(src, sp, dst, dp, 2);
@ -363,7 +371,9 @@ class UTF_8 extends Unicode
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
int srcRemaining = limit - mark;
if (srcRemaining < 4 || dst.remaining() < 2) {
if (srcRemaining > 1 && isMalformed4_2(b1, src.get()))
b1 &= 0xff;
if (b1 > 0xf4 ||
srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff))
return malformedForLength(src, mark, 1);
if (srcRemaining > 2 && isMalformed4_3(src.get()))
return malformedForLength(src, mark, 2);
@ -518,8 +528,9 @@ class UTF_8 extends Unicode
}
if (malformedInputAction() != CodingErrorAction.REPLACE)
return -1;
if (sp < sl && isMalformed4_2(b1, sa[sp])) {
b1 &= 0xff;
if (b1 > 0xf4 ||
sp < sl && isMalformed4_2(b1, sa[sp] & 0xff)) {
da[dp++] = replacement().charAt(0);
continue;
}

View File

@ -23,7 +23,7 @@
/*
* @test
* @bug 4486841 7040220 7096080
* @bug 4486841 7040220 7096080 8039751
* @summary Test UTF-8 charset
*/
@ -291,14 +291,18 @@ public class TestUTF8 {
{1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
{2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
{2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
{2, (byte)0xE1, (byte)0x80, (byte)0x42}, // invalid third byte
{1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
{1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
{1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
{1, (byte)0xE0, (byte)0x41,}, // invalid second byte & 2 bytes
{1, (byte)0xE1, (byte)0x40,}, // invalid second byte & 2 bytes
{3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate
{3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate
// Four-byte sequences
{1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
{1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
@ -323,6 +327,32 @@ public class TestUTF8 {
{1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
{1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
// #8039751
{1, (byte)0xF6, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
{1, (byte)0xF6, (byte)0x80, (byte)0x80, },
{1, (byte)0xF6, (byte)0x80, },
{1, (byte)0xF6, },
{1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
{1, (byte)0xF5, (byte)0x80, (byte)0x80, },
{1, (byte)0xF5, (byte)0x80, },
{1, (byte)0xF5 },
{1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
{1, (byte)0xF4, (byte)0x90, (byte)0x80 },
{1, (byte)0xF4, (byte)0x90 },
{1, (byte)0xF4, (byte)0x7f, (byte)0x80, (byte)0x80 }, // out-range/ascii 2nd byte
{1, (byte)0xF4, (byte)0x7f, (byte)0x80 },
{1, (byte)0xF4, (byte)0x7f },
{1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
{1, (byte)0xF0, (byte)0x80, (byte)0x80 },
{1, (byte)0xF0, (byte)0x80 },
{1, (byte)0xF0, (byte)0xc0, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
{1, (byte)0xF0, (byte)0xc0, (byte)0x80 },
{1, (byte)0xF0, (byte)0xc0 },
// Five-byte sequences
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
@ -553,7 +583,6 @@ public class TestUTF8 {
check4ByteSurrs("UTF-8");
checkMalformed("UTF-8", malformed);
checkUnderOverflow("UTF-8");
checkRoundtrip("CESU-8");
check6ByteSurrs("CESU-8");
checkMalformed("CESU-8", malformed_cesu8);