8325002: Exceptions::fthrow needs to ensure it truncates to a valid utf8 string
Reviewed-by: djelinski, stuefe
This commit is contained in:
parent
d39e7af2e5
commit
5b7bb40d1f
@ -43,6 +43,7 @@
|
||||
#include "runtime/atomic.hpp"
|
||||
#include "utilities/events.hpp"
|
||||
#include "utilities/exceptions.hpp"
|
||||
#include "utilities/utf8.hpp"
|
||||
|
||||
// Limit exception message components to 64K (the same max as Symbols)
|
||||
#define MAX_LEN 65535
|
||||
@ -262,8 +263,32 @@ void Exceptions::fthrow(JavaThread* thread, const char* file, int line, Symbol*
|
||||
va_list ap;
|
||||
va_start(ap, format);
|
||||
char msg[max_msg_size];
|
||||
os::vsnprintf(msg, max_msg_size, format, ap);
|
||||
int ret = os::vsnprintf(msg, max_msg_size, format, ap);
|
||||
va_end(ap);
|
||||
|
||||
// If ret == -1 then either there was a format conversion error, or the required buffer size
|
||||
// exceeds INT_MAX and so couldn't be returned (undocumented behaviour of vsnprintf). Depending
|
||||
// on the platform the buffer may be filled to its capacity (Linux), filled to the conversion
|
||||
// that encountered the overflow (macOS), or is empty (Windows), so it is possible we
|
||||
// have a truncated UTF-8 sequence. Similarly, if the buffer was too small and ret >= max_msg_size
|
||||
// we may also have a truncated UTF-8 sequence. In such cases we need to fix the buffer so the UTF-8
|
||||
// sequence is valid.
|
||||
if (ret == -1 || ret >= max_msg_size) {
|
||||
int len = (int) strlen(msg);
|
||||
if (len > 0) {
|
||||
// Truncation will only happen if the buffer was filled by vsnprintf,
|
||||
// otherwise vsnprintf already terminated filling it at a well-defined point.
|
||||
// But as this is not a clearly specified area we will perform our own UTF8
|
||||
// truncation anyway - though for those well-defined termination points it
|
||||
// will be a no-op.
|
||||
UTF8::truncate_to_legal_utf8((unsigned char*)msg, len + 1);
|
||||
}
|
||||
}
|
||||
// UTF8::is_legal_utf8 should actually be called is_legal_utf8_class_name as the final
|
||||
// parameter controls a check for a specific character appearing in the "name", which is only
|
||||
// allowed for classfile versions <= 47. We pass `true` so that we allow such strings as this code
|
||||
// knows nothing about the actual string content.
|
||||
assert(UTF8::is_legal_utf8((const unsigned char*)msg, (int)strlen(msg), true), "must be");
|
||||
_throw_msg(thread, file, line, h_name, msg);
|
||||
}
|
||||
|
||||
|
@ -392,6 +392,69 @@ bool UTF8::is_legal_utf8(const unsigned char* buffer, int length,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reports whether `b` lies in the inclusive range [0xC0, 0xEF], i.e. whether it
// could be the leading byte of a 2, 3 or 6 byte encoded sequence.
static bool is_starting_byte(unsigned char b) {
  if (b < 0xC0) {
    return false;  // below the range of leading bytes
  }
  return b <= 0xEF;
}
|
||||
|
||||
// Takes an incoming buffer that was valid UTF-8, but which has been truncated such that
|
||||
// the last encoding may be partial, and returns the same buffer with a NUL-terminator
|
||||
// inserted such that any partial encoding has gone.
|
||||
// Note: if the incoming buffer is already valid then we may still drop the last encoding.
|
||||
// To avoid that the caller can choose to check for validity first.
|
||||
// The incoming buffer is still expected to be NUL-terminated.
|
||||
// The incoming buffer is expected to be a realistic size - we assert if it is too small.
|
||||
void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
|
||||
assert(length > 5, "invalid length");
|
||||
assert(buffer[length - 1] == '\0', "Buffer should be NUL-terminated");
|
||||
|
||||
if (buffer[length - 2] < 128) { // valid "ascii" - common case
|
||||
return;
|
||||
}
|
||||
|
||||
// Modified UTF-8 encodes characters in sequences of 1, 2, 3 or 6 bytes.
|
||||
// The last byte is invalid if it is:
|
||||
// - the 1st byte of a 2, 3 or 6 byte sequence
|
||||
// 0b110xxxxx
|
||||
// 0b1110xxxx
|
||||
// 0b11101101
|
||||
// - the 2nd byte of a 3 or 6 byte sequence
|
||||
// 0b10xxxxxx
|
||||
// 0b1010xxxx
|
||||
// - the 3rd, 4th or 5th byte of a 6 byte sequence
|
||||
// 0b10xxxxxx
|
||||
// 0b11101101
|
||||
// 0b1011xxxx
|
||||
//
|
||||
// Rather than checking all possible situations we simplify things noting that as we have already
|
||||
// got a truncated string, then dropping one more character is not significant. So we work from the
|
||||
// end of the buffer looking for the first byte that can be the starting byte of a UTF-8 encoded sequence,
|
||||
// then we insert NUL at that location to terminate the buffer. There is an added complexity with 6 byte
|
||||
// encodings as the first and fourth bytes are the same and overlap with the 3 byte encoding.
|
||||
|
||||
for (int index = length - 2; index > 0; index--) {
|
||||
if (is_starting_byte(buffer[index])) {
|
||||
if (buffer[index] == 0xED) {
|
||||
// Could be first byte of 3 or 6, or fourth byte of 6.
|
||||
// If fourth the previous three bytes will encode a high
|
||||
// surrogate value in the range EDA080 to EDAFBF. We only
|
||||
// need to check for EDA to establish this as the "missing"
|
||||
// values in EDAxxx would not be valid 3 byte encodings.
|
||||
if ((index - 3) >= 0 &&
|
||||
(buffer[index - 3] == 0xED) &&
|
||||
((buffer[index - 2] & 0xF0) == 0xA0)) {
|
||||
assert(buffer[index - 1] >= 0x80 && buffer[index - 1] <= 0xBF, "sanity check");
|
||||
// It was fourth byte so truncate 3 bytes earlier
|
||||
index -= 3;
|
||||
}
|
||||
}
|
||||
buffer[index] = '\0';
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
bool UNICODE::is_latin1(jchar c) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -88,6 +88,7 @@ class UTF8 : AllStatic {
|
||||
|
||||
static bool is_legal_utf8(const unsigned char* buffer, int length,
|
||||
bool version_leq_47);
|
||||
static void truncate_to_legal_utf8(unsigned char* buffer, int length);
|
||||
};
|
||||
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -22,6 +22,8 @@
|
||||
*/
|
||||
|
||||
#include "precompiled.hpp"
|
||||
#include "nmt/memflags.hpp"
|
||||
#include "runtime/os.hpp"
|
||||
#include "utilities/utf8.hpp"
|
||||
#include "unittest.hpp"
|
||||
|
||||
@ -101,5 +103,105 @@ TEST_VM(utf8, jbyte_length) {
|
||||
UNICODE::as_utf8(str, 19, res, i);
|
||||
EXPECT_TRUE(test_stamp(res + i, sizeof(res) - i));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_VM(utf8, truncation) {
|
||||
|
||||
// Test that truncation removes partial encodings as expected.
|
||||
|
||||
const char orig_bytes[] = { 'A', 'B', 'C', 'D', 'E', '\0' };
|
||||
const int orig_length = sizeof(orig_bytes)/sizeof(char);
|
||||
ASSERT_TRUE(UTF8::is_legal_utf8((const unsigned char*)orig_bytes, orig_length - 1, false));
|
||||
const char* orig_str = &orig_bytes[0];
|
||||
ASSERT_EQ((int)strlen(orig_str), orig_length - 1);
|
||||
|
||||
unsigned char* temp_bytes;
|
||||
const char* temp_str;
|
||||
char* utf8;
|
||||
int n_utf8; // Number of bytes in the encoding
|
||||
|
||||
// Test 1: a valid UTF8 "ascii" ending string should be returned as-is
|
||||
|
||||
temp_bytes = (unsigned char*) os::malloc(sizeof(unsigned char) * orig_length, mtTest);
|
||||
strcpy((char*)temp_bytes, orig_str);
|
||||
temp_str = (const char*) temp_bytes;
|
||||
UTF8::truncate_to_legal_utf8(temp_bytes, orig_length);
|
||||
ASSERT_EQ((int)strlen(temp_str), orig_length - 1) << "bytes should be unchanged";
|
||||
ASSERT_EQ(strcmp(orig_str, temp_str), 0) << "bytes should be unchanged";
|
||||
os::free(temp_bytes);
|
||||
|
||||
// Test 2: a UTF8 sequence that "ends" with a 2-byte encoding
|
||||
// drops the 2-byte encoding
|
||||
|
||||
jchar two_byte_char[] = { 0x00D1 }; // N with tilde
|
||||
n_utf8 = 2;
|
||||
utf8 = (char*) os::malloc(sizeof(char) * (n_utf8 + 1), mtTest); // plus NUL
|
||||
UNICODE::convert_to_utf8(two_byte_char, 1, utf8);
|
||||
int utf8_len = (int)strlen(utf8);
|
||||
ASSERT_EQ(utf8_len, n_utf8) << "setup error";
|
||||
|
||||
// Now drop zero or one byte from the end and check it truncates as expected
|
||||
for (int drop = 0; drop < n_utf8; drop++) {
|
||||
int temp_len = orig_length + utf8_len - drop;
|
||||
temp_bytes = (unsigned char*) os::malloc(sizeof(unsigned char) * temp_len, mtTest);
|
||||
temp_str = (const char*) temp_bytes;
|
||||
strcpy((char*)temp_bytes, orig_str);
|
||||
strncat((char*)temp_bytes, utf8, utf8_len - drop);
|
||||
ASSERT_EQ((int)strlen(temp_str), temp_len - 1) << "setup error";
|
||||
UTF8::truncate_to_legal_utf8(temp_bytes, temp_len);
|
||||
ASSERT_EQ((int)strlen(temp_str), orig_length - 1) << "bytes should be truncated to original length";
|
||||
ASSERT_EQ(strcmp(orig_str, temp_str), 0) << "bytes should be truncated to original";
|
||||
os::free(temp_bytes);
|
||||
}
|
||||
os::free(utf8);
|
||||
|
||||
// Test 3: a UTF8 sequence that "ends" with a 3-byte encoding
|
||||
// drops the 3-byte encoding
|
||||
n_utf8 = 3;
|
||||
jchar three_byte_char[] = { 0x0800 };
|
||||
utf8 = (char*) os::malloc(sizeof(char) * (n_utf8 + 1), mtTest); // plus NUL
|
||||
UNICODE::convert_to_utf8(three_byte_char, 1, utf8);
|
||||
utf8_len = (int)strlen(utf8);
|
||||
ASSERT_EQ(utf8_len, n_utf8) << "setup error";
|
||||
|
||||
// Now drop zero, to two bytes from the end and check it truncates as expected
|
||||
for (int drop = 0; drop < n_utf8; drop++) {
|
||||
int temp_len = orig_length + utf8_len - drop;
|
||||
temp_bytes = (unsigned char*) os::malloc(sizeof(unsigned char) * temp_len, mtTest);
|
||||
temp_str = (const char*) temp_bytes;
|
||||
strcpy((char*)temp_bytes, orig_str);
|
||||
strncat((char*)temp_bytes, utf8, utf8_len - drop);
|
||||
ASSERT_EQ((int)strlen(temp_str), temp_len - 1) << "setup error";
|
||||
UTF8::truncate_to_legal_utf8(temp_bytes, temp_len);
|
||||
ASSERT_EQ((int)strlen(temp_str), orig_length - 1) << "bytes should be truncated to original length";
|
||||
ASSERT_EQ(strcmp(orig_str, temp_str), 0) << "bytes should be truncated to original";
|
||||
os::free(temp_bytes);
|
||||
}
|
||||
os::free(utf8);
|
||||
|
||||
// Test 4: a UTF8 sequence that "ends" with a 6-byte encoding
|
||||
// drops the 6-byte encoding
|
||||
n_utf8 = 6;
|
||||
jchar six_byte_char[] = { 0xD801, 0xDC37 }; // U+10437 as its UTF-16 surrogate pairs
|
||||
utf8 = (char*) os::malloc(sizeof(char) * (n_utf8 + 1), mtTest); // plus NUL
|
||||
UNICODE::convert_to_utf8(six_byte_char, 2, utf8);
|
||||
utf8_len = (int)strlen(utf8);
|
||||
ASSERT_EQ(utf8_len, n_utf8) << "setup error";
|
||||
|
||||
// Now drop zero to five bytes from the end and check it truncates as expected
|
||||
for (int drop = 0; drop < n_utf8; drop++) {
|
||||
int temp_len = orig_length + utf8_len - drop;
|
||||
temp_bytes = (unsigned char*) os::malloc(sizeof(unsigned char) * temp_len, mtTest);
|
||||
temp_str = (const char*) temp_bytes;
|
||||
strcpy((char*)temp_bytes, orig_str);
|
||||
strncat((char*)temp_bytes, utf8, utf8_len - drop);
|
||||
ASSERT_EQ((int)strlen(temp_str), temp_len - 1) << "setup error";
|
||||
UTF8::truncate_to_legal_utf8(temp_bytes, temp_len);
|
||||
ASSERT_EQ((int)strlen(temp_str), orig_length - 1) << "bytes should be truncated to original length";
|
||||
ASSERT_EQ(strcmp(orig_str, temp_str), 0) << "bytes should be truncated to original";
|
||||
os::free(temp_bytes);
|
||||
}
|
||||
os::free(utf8);
|
||||
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user