8039751: UTF-8 decoder fails to handle some edge cases correctly
To update decoder.isMalformed4_2() to correctly detect out of range 2nd byte Reviewed-by: alanb
This commit is contained in:
parent
e90c029bad
commit
8decb5de90
@ -111,12 +111,18 @@ class UTF_8 extends Unicode
|
||||
(b4 & 0xc0) != 0x80;
|
||||
}
|
||||
|
||||
// only used when there is less than 4 bytes left in src buffer
|
||||
// only used when there is less than 4 bytes left in src buffer.
|
||||
// both b1 and b2 should be "& 0xff" before passed in.
|
||||
private static boolean isMalformed4_2(int b1, int b2) {
|
||||
return (b1 == 0xf0 && b2 == 0x90) ||
|
||||
return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
|
||||
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
|
||||
(b2 & 0xc0) != 0x80;
|
||||
}
|
||||
|
||||
// tests if b3 is malformed as the third byte of a
|
||||
// legal 4-byte utf-8 byte sequence.
|
||||
// only used when there is less than 4 bytes left in src buffer,
|
||||
// after isMalformed4_2 has been invoked.
|
||||
private static boolean isMalformed4_3(int b3) {
|
||||
return (b3 & 0xc0) != 0x80;
|
||||
}
|
||||
@ -280,7 +286,9 @@ class UTF_8 extends Unicode
|
||||
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
int srcRemaining = sl - sp;
|
||||
if (srcRemaining < 4 || dl - dp < 2) {
|
||||
if (srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1]))
|
||||
b1 &= 0xff;
|
||||
if (b1 > 0xf4 ||
|
||||
srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff))
|
||||
return malformedForLength(src, sp, dst, dp, 1);
|
||||
if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2]))
|
||||
return malformedForLength(src, sp, dst, dp, 2);
|
||||
@ -363,7 +371,9 @@ class UTF_8 extends Unicode
|
||||
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
int srcRemaining = limit - mark;
|
||||
if (srcRemaining < 4 || dst.remaining() < 2) {
|
||||
if (srcRemaining > 1 && isMalformed4_2(b1, src.get()))
|
||||
b1 &= 0xff;
|
||||
if (b1 > 0xf4 ||
|
||||
srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff))
|
||||
return malformedForLength(src, mark, 1);
|
||||
if (srcRemaining > 2 && isMalformed4_3(src.get()))
|
||||
return malformedForLength(src, mark, 2);
|
||||
@ -518,8 +528,9 @@ class UTF_8 extends Unicode
|
||||
}
|
||||
if (malformedInputAction() != CodingErrorAction.REPLACE)
|
||||
return -1;
|
||||
|
||||
if (sp < sl && isMalformed4_2(b1, sa[sp])) {
|
||||
b1 &= 0xff;
|
||||
if (b1 > 0xf4 ||
|
||||
sp < sl && isMalformed4_2(b1, sa[sp] & 0xff)) {
|
||||
da[dp++] = replacement().charAt(0);
|
||||
continue;
|
||||
}
|
||||
|
@ -23,7 +23,7 @@
|
||||
|
||||
/*
|
||||
* @test
|
||||
* @bug 4486841 7040220 7096080
|
||||
* @bug 4486841 7040220 7096080 8039751
|
||||
* @summary Test UTF-8 charset
|
||||
*/
|
||||
|
||||
@ -291,14 +291,18 @@ public class TestUTF8 {
|
||||
{1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
|
||||
{2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
|
||||
{2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
|
||||
{2, (byte)0xE1, (byte)0x80, (byte)0x42}, // invalid third byte
|
||||
|
||||
{1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
|
||||
{1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
|
||||
{1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
|
||||
{1, (byte)0xE0, (byte)0x41,}, // invalid second byte & 2 bytes
|
||||
{1, (byte)0xE1, (byte)0x40,}, // invalid second byte & 2 bytes
|
||||
{3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate
|
||||
{3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate
|
||||
|
||||
|
||||
|
||||
// Four-byte sequences
|
||||
{1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
|
||||
{1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
|
||||
@ -323,6 +327,32 @@ public class TestUTF8 {
|
||||
{1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
|
||||
{1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
|
||||
|
||||
// #8039751
|
||||
{1, (byte)0xF6, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
|
||||
{1, (byte)0xF6, (byte)0x80, (byte)0x80, },
|
||||
{1, (byte)0xF6, (byte)0x80, },
|
||||
{1, (byte)0xF6, },
|
||||
{1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
|
||||
{1, (byte)0xF5, (byte)0x80, (byte)0x80, },
|
||||
{1, (byte)0xF5, (byte)0x80, },
|
||||
{1, (byte)0xF5 },
|
||||
|
||||
{1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
|
||||
{1, (byte)0xF4, (byte)0x90, (byte)0x80 },
|
||||
{1, (byte)0xF4, (byte)0x90 },
|
||||
|
||||
{1, (byte)0xF4, (byte)0x7f, (byte)0x80, (byte)0x80 }, // out-range/ascii 2nd byte
|
||||
{1, (byte)0xF4, (byte)0x7f, (byte)0x80 },
|
||||
{1, (byte)0xF4, (byte)0x7f },
|
||||
|
||||
{1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
|
||||
{1, (byte)0xF0, (byte)0x80, (byte)0x80 },
|
||||
{1, (byte)0xF0, (byte)0x80 },
|
||||
|
||||
{1, (byte)0xF0, (byte)0xc0, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
|
||||
{1, (byte)0xF0, (byte)0xc0, (byte)0x80 },
|
||||
{1, (byte)0xF0, (byte)0xc0 },
|
||||
|
||||
// Five-byte sequences
|
||||
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte
|
||||
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
|
||||
@ -553,7 +583,6 @@ public class TestUTF8 {
|
||||
check4ByteSurrs("UTF-8");
|
||||
checkMalformed("UTF-8", malformed);
|
||||
checkUnderOverflow("UTF-8");
|
||||
|
||||
checkRoundtrip("CESU-8");
|
||||
check6ByteSurrs("CESU-8");
|
||||
checkMalformed("CESU-8", malformed_cesu8);
|
||||
|
Loading…
x
Reference in New Issue
Block a user