8338257: UTF8 lengths should be size_t not int
Reviewed-by: stuefe, coleenp, dlong
This commit is contained in:
parent
777ed2b5d2
commit
a4962ace4d
@ -431,10 +431,10 @@ void HashtableTextDump::get_utf8(char* utf8_buffer, int utf8_length) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: the content is NOT the same as
|
// NOTE: the content is NOT the same as
|
||||||
// UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen).
|
// UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, size_t buflen).
|
||||||
// We want to escape \r\n\t so that output [1] is more readable; [2] can be more easily
|
// We want to escape \r\n\t so that output [1] is more readable; [2] can be more easily
|
||||||
// parsed by scripts; [3] quickly processed by HashtableTextDump::get_utf8()
|
// parsed by scripts; [3] quickly processed by HashtableTextDump::get_utf8()
|
||||||
void HashtableTextDump::put_utf8(outputStream* st, const char* utf8_string, int utf8_length) {
|
void HashtableTextDump::put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length) {
|
||||||
const char *c = utf8_string;
|
const char *c = utf8_string;
|
||||||
const char *end = c + utf8_length;
|
const char *end = c + utf8_length;
|
||||||
for (; c < end; c++) {
|
for (; c < end; c++) {
|
||||||
|
@ -431,7 +431,7 @@ public:
|
|||||||
|
|
||||||
int unescape(const char* from, const char* end, int count);
|
int unescape(const char* from, const char* end, int count);
|
||||||
void get_utf8(char* utf8_buffer, int utf8_length);
|
void get_utf8(char* utf8_buffer, int utf8_length);
|
||||||
static void put_utf8(outputStream* st, const char* utf8_string, int utf8_length);
|
static void put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
|
#endif // SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
|
||||||
|
@ -304,7 +304,8 @@ Handle java_lang_String::create_from_unicode(const jchar* unicode, int length, T
|
|||||||
#ifdef ASSERT
|
#ifdef ASSERT
|
||||||
{
|
{
|
||||||
ResourceMark rm;
|
ResourceMark rm;
|
||||||
char* expected = UNICODE::as_utf8(unicode, length);
|
size_t utf8_len = static_cast<size_t>(length);
|
||||||
|
char* expected = UNICODE::as_utf8(unicode, utf8_len);
|
||||||
char* actual = as_utf8_string(h_obj());
|
char* actual = as_utf8_string(h_obj());
|
||||||
if (strcmp(expected, actual) != 0) {
|
if (strcmp(expected, actual) != 0) {
|
||||||
fatal("Unicode conversion failure: %s --> %s", expected, actual);
|
fatal("Unicode conversion failure: %s --> %s", expected, actual);
|
||||||
@ -346,7 +347,7 @@ Handle java_lang_String::create_from_str(const char* utf8_str, TRAPS) {
|
|||||||
#ifdef ASSERT
|
#ifdef ASSERT
|
||||||
// This check is too strict when the input string is not a valid UTF8.
|
// This check is too strict when the input string is not a valid UTF8.
|
||||||
// For example, it may be created with arbitrary content via jni_NewStringUTF.
|
// For example, it may be created with arbitrary content via jni_NewStringUTF.
|
||||||
if (UTF8::is_legal_utf8((const unsigned char*)utf8_str, (int)strlen(utf8_str), false)) {
|
if (UTF8::is_legal_utf8((const unsigned char*)utf8_str, strlen(utf8_str), false)) {
|
||||||
ResourceMark rm;
|
ResourceMark rm;
|
||||||
const char* expected = utf8_str;
|
const char* expected = utf8_str;
|
||||||
char* actual = as_utf8_string(h_obj());
|
char* actual = as_utf8_string(h_obj());
|
||||||
@ -554,7 +555,7 @@ char* java_lang_String::as_quoted_ascii(oop java_string) {
|
|||||||
if (length == 0) return nullptr;
|
if (length == 0) return nullptr;
|
||||||
|
|
||||||
char* result;
|
char* result;
|
||||||
int result_length;
|
size_t result_length;
|
||||||
if (!is_latin1) {
|
if (!is_latin1) {
|
||||||
jchar* base = value->char_at_addr(0);
|
jchar* base = value->char_at_addr(0);
|
||||||
result_length = UNICODE::quoted_ascii_length(base, length) + 1;
|
result_length = UNICODE::quoted_ascii_length(base, length) + 1;
|
||||||
@ -566,8 +567,8 @@ char* java_lang_String::as_quoted_ascii(oop java_string) {
|
|||||||
result = NEW_RESOURCE_ARRAY(char, result_length);
|
result = NEW_RESOURCE_ARRAY(char, result_length);
|
||||||
UNICODE::as_quoted_ascii(base, length, result, result_length);
|
UNICODE::as_quoted_ascii(base, length, result, result_length);
|
||||||
}
|
}
|
||||||
assert(result_length >= length + 1, "must not be shorter");
|
assert(result_length >= (size_t)length + 1, "must not be shorter");
|
||||||
assert(result_length == (int)strlen(result) + 1, "must match");
|
assert(result_length == strlen(result) + 1, "must match");
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -582,8 +583,9 @@ Symbol* java_lang_String::as_symbol(oop java_string) {
|
|||||||
} else {
|
} else {
|
||||||
ResourceMark rm;
|
ResourceMark rm;
|
||||||
jbyte* position = (length == 0) ? nullptr : value->byte_at_addr(0);
|
jbyte* position = (length == 0) ? nullptr : value->byte_at_addr(0);
|
||||||
const char* base = UNICODE::as_utf8(position, length);
|
size_t utf8_len = static_cast<size_t>(length);
|
||||||
Symbol* sym = SymbolTable::new_symbol(base, length);
|
const char* base = UNICODE::as_utf8(position, utf8_len);
|
||||||
|
Symbol* sym = SymbolTable::new_symbol(base, checked_cast<int>(utf8_len));
|
||||||
return sym;
|
return sym;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -598,12 +600,13 @@ Symbol* java_lang_String::as_symbol_or_null(oop java_string) {
|
|||||||
} else {
|
} else {
|
||||||
ResourceMark rm;
|
ResourceMark rm;
|
||||||
jbyte* position = (length == 0) ? nullptr : value->byte_at_addr(0);
|
jbyte* position = (length == 0) ? nullptr : value->byte_at_addr(0);
|
||||||
const char* base = UNICODE::as_utf8(position, length);
|
size_t utf8_len = static_cast<size_t>(length);
|
||||||
return SymbolTable::probe(base, length);
|
const char* base = UNICODE::as_utf8(position, utf8_len);
|
||||||
|
return SymbolTable::probe(base, checked_cast<int>(utf8_len));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int java_lang_String::utf8_length(oop java_string, typeArrayOop value) {
|
size_t java_lang_String::utf8_length(oop java_string, typeArrayOop value) {
|
||||||
assert(value_equals(value, java_lang_String::value(java_string)),
|
assert(value_equals(value, java_lang_String::value(java_string)),
|
||||||
"value must be same as java_lang_String::value(java_string)");
|
"value must be same as java_lang_String::value(java_string)");
|
||||||
int length = java_lang_String::length(java_string, value);
|
int length = java_lang_String::length(java_string, value);
|
||||||
@ -617,18 +620,39 @@ int java_lang_String::utf8_length(oop java_string, typeArrayOop value) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int java_lang_String::utf8_length(oop java_string) {
|
size_t java_lang_String::utf8_length(oop java_string) {
|
||||||
typeArrayOop value = java_lang_String::value(java_string);
|
typeArrayOop value = java_lang_String::value(java_string);
|
||||||
return utf8_length(java_string, value);
|
return utf8_length(java_string, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int java_lang_String::utf8_length_as_int(oop java_string) {
|
||||||
|
typeArrayOop value = java_lang_String::value(java_string);
|
||||||
|
return utf8_length_as_int(java_string, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
int java_lang_String::utf8_length_as_int(oop java_string, typeArrayOop value) {
|
||||||
|
assert(value_equals(value, java_lang_String::value(java_string)),
|
||||||
|
"value must be same as java_lang_String::value(java_string)");
|
||||||
|
int length = java_lang_String::length(java_string, value);
|
||||||
|
if (length == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (!java_lang_String::is_latin1(java_string)) {
|
||||||
|
return UNICODE::utf8_length_as_int(value->char_at_addr(0), length);
|
||||||
|
} else {
|
||||||
|
return UNICODE::utf8_length_as_int(value->byte_at_addr(0), length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
char* java_lang_String::as_utf8_string(oop java_string) {
|
char* java_lang_String::as_utf8_string(oop java_string) {
|
||||||
int length;
|
size_t length;
|
||||||
return as_utf8_string(java_string, length);
|
return as_utf8_string(java_string, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
char* java_lang_String::as_utf8_string(oop java_string, int& length) {
|
char* java_lang_String::as_utf8_string(oop java_string, size_t& length) {
|
||||||
typeArrayOop value = java_lang_String::value(java_string);
|
typeArrayOop value = java_lang_String::value(java_string);
|
||||||
|
// `length` is used as the incoming number of characters to
|
||||||
|
// convert, and then set as the number of bytes in the UTF8 sequence.
|
||||||
length = java_lang_String::length(java_string, value);
|
length = java_lang_String::length(java_string, value);
|
||||||
bool is_latin1 = java_lang_String::is_latin1(java_string);
|
bool is_latin1 = java_lang_String::is_latin1(java_string);
|
||||||
if (!is_latin1) {
|
if (!is_latin1) {
|
||||||
@ -642,7 +666,7 @@ char* java_lang_String::as_utf8_string(oop java_string, int& length) {
|
|||||||
|
|
||||||
// Uses a provided buffer if it's sufficiently large, otherwise allocates
|
// Uses a provided buffer if it's sufficiently large, otherwise allocates
|
||||||
// a resource array to fit
|
// a resource array to fit
|
||||||
char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, int buflen, int& utf8_len) {
|
char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, size_t buflen, size_t& utf8_len) {
|
||||||
typeArrayOop value = java_lang_String::value(java_string);
|
typeArrayOop value = java_lang_String::value(java_string);
|
||||||
int len = java_lang_String::length(java_string, value);
|
int len = java_lang_String::length(java_string, value);
|
||||||
bool is_latin1 = java_lang_String::is_latin1(java_string);
|
bool is_latin1 = java_lang_String::is_latin1(java_string);
|
||||||
@ -663,7 +687,7 @@ char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, int bufl
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char* buf, int buflen) {
|
char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char* buf, size_t buflen) {
|
||||||
assert(value_equals(value, java_lang_String::value(java_string)),
|
assert(value_equals(value, java_lang_String::value(java_string)),
|
||||||
"value must be same as java_lang_String::value(java_string)");
|
"value must be same as java_lang_String::value(java_string)");
|
||||||
int length = java_lang_String::length(java_string, value);
|
int length = java_lang_String::length(java_string, value);
|
||||||
@ -677,25 +701,28 @@ char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
char* java_lang_String::as_utf8_string(oop java_string, char* buf, int buflen) {
|
char* java_lang_String::as_utf8_string(oop java_string, char* buf, size_t buflen) {
|
||||||
typeArrayOop value = java_lang_String::value(java_string);
|
typeArrayOop value = java_lang_String::value(java_string);
|
||||||
return as_utf8_string(java_string, value, buf, buflen);
|
return as_utf8_string(java_string, value, buf, buflen);
|
||||||
}
|
}
|
||||||
|
|
||||||
char* java_lang_String::as_utf8_string(oop java_string, int start, int len) {
|
char* java_lang_String::as_utf8_string(oop java_string, int start, int len) {
|
||||||
|
// `length` is used as the incoming number of characters to
|
||||||
|
// convert, and then set as the number of bytes in the UTF8 sequence.
|
||||||
|
size_t length = static_cast<size_t>(len);
|
||||||
typeArrayOop value = java_lang_String::value(java_string);
|
typeArrayOop value = java_lang_String::value(java_string);
|
||||||
bool is_latin1 = java_lang_String::is_latin1(java_string);
|
bool is_latin1 = java_lang_String::is_latin1(java_string);
|
||||||
assert(start + len <= java_lang_String::length(java_string), "just checking");
|
assert(start + len <= java_lang_String::length(java_string), "just checking");
|
||||||
if (!is_latin1) {
|
if (!is_latin1) {
|
||||||
jchar* position = value->char_at_addr(start);
|
jchar* position = value->char_at_addr(start);
|
||||||
return UNICODE::as_utf8(position, len);
|
return UNICODE::as_utf8(position, length);
|
||||||
} else {
|
} else {
|
||||||
jbyte* position = value->byte_at_addr(start);
|
jbyte* position = value->byte_at_addr(start);
|
||||||
return UNICODE::as_utf8(position, len);
|
return UNICODE::as_utf8(position, length);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, int buflen) {
|
char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, size_t buflen) {
|
||||||
assert(value_equals(value, java_lang_String::value(java_string)),
|
assert(value_equals(value, java_lang_String::value(java_string)),
|
||||||
"value must be same as java_lang_String::value(java_string)");
|
"value must be same as java_lang_String::value(java_string)");
|
||||||
assert(start + len <= java_lang_String::length(java_string), "just checking");
|
assert(start + len <= java_lang_String::length(java_string), "just checking");
|
||||||
|
@ -131,17 +131,21 @@ class java_lang_String : AllStatic {
|
|||||||
static inline bool deduplication_requested(oop java_string);
|
static inline bool deduplication_requested(oop java_string);
|
||||||
static inline int length(oop java_string);
|
static inline int length(oop java_string);
|
||||||
static inline int length(oop java_string, typeArrayOop string_value);
|
static inline int length(oop java_string, typeArrayOop string_value);
|
||||||
static int utf8_length(oop java_string);
|
static size_t utf8_length(oop java_string);
|
||||||
static int utf8_length(oop java_string, typeArrayOop string_value);
|
static size_t utf8_length(oop java_string, typeArrayOop string_value);
|
||||||
|
// Legacy variants that truncate the length if needed
|
||||||
|
static int utf8_length_as_int(oop java_string);
|
||||||
|
static int utf8_length_as_int(oop java_string, typeArrayOop string_value);
|
||||||
|
|
||||||
// String converters
|
// String converters
|
||||||
static char* as_utf8_string(oop java_string);
|
static char* as_utf8_string(oop java_string);
|
||||||
static char* as_utf8_string(oop java_string, int& length);
|
// `length` is set to the length of the utf8 sequence.
|
||||||
static char* as_utf8_string_full(oop java_string, char* buf, int buflen, int& length);
|
static char* as_utf8_string(oop java_string, size_t& length);
|
||||||
static char* as_utf8_string(oop java_string, char* buf, int buflen);
|
static char* as_utf8_string_full(oop java_string, char* buf, size_t buflen, size_t& length);
|
||||||
|
static char* as_utf8_string(oop java_string, char* buf, size_t buflen);
|
||||||
static char* as_utf8_string(oop java_string, int start, int len);
|
static char* as_utf8_string(oop java_string, int start, int len);
|
||||||
static char* as_utf8_string(oop java_string, typeArrayOop value, char* buf, int buflen);
|
static char* as_utf8_string(oop java_string, typeArrayOop value, char* buf, size_t buflen);
|
||||||
static char* as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, int buflen);
|
static char* as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, size_t buflen);
|
||||||
static char* as_platform_dependent_str(Handle java_string, TRAPS);
|
static char* as_platform_dependent_str(Handle java_string, TRAPS);
|
||||||
static jchar* as_unicode_string(oop java_string, int& length, TRAPS);
|
static jchar* as_unicode_string(oop java_string, int& length, TRAPS);
|
||||||
static jchar* as_unicode_string_or_null(oop java_string, int& length);
|
static jchar* as_unicode_string_or_null(oop java_string, int& length);
|
||||||
|
@ -72,7 +72,9 @@ static char* get_module_name(oop module, int& len, TRAPS) {
|
|||||||
if (name_oop == nullptr) {
|
if (name_oop == nullptr) {
|
||||||
THROW_MSG_NULL(vmSymbols::java_lang_NullPointerException(), "Null module name");
|
THROW_MSG_NULL(vmSymbols::java_lang_NullPointerException(), "Null module name");
|
||||||
}
|
}
|
||||||
char* module_name = java_lang_String::as_utf8_string(name_oop, len);
|
size_t utf8_len;
|
||||||
|
char* module_name = java_lang_String::as_utf8_string(name_oop, utf8_len);
|
||||||
|
len = checked_cast<int>(utf8_len); // module names are < 64K
|
||||||
if (!verify_module_name(module_name, len)) {
|
if (!verify_module_name(module_name, len)) {
|
||||||
THROW_MSG_NULL(vmSymbols::java_lang_IllegalArgumentException(),
|
THROW_MSG_NULL(vmSymbols::java_lang_IllegalArgumentException(),
|
||||||
err_msg("Invalid module name: %s", module_name));
|
err_msg("Invalid module name: %s", module_name));
|
||||||
@ -84,9 +86,9 @@ static Symbol* as_symbol(jstring str_object) {
|
|||||||
if (str_object == nullptr) {
|
if (str_object == nullptr) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
int len;
|
size_t len;
|
||||||
char* str = java_lang_String::as_utf8_string(JNIHandles::resolve_non_null(str_object), len);
|
char* str = java_lang_String::as_utf8_string(JNIHandles::resolve_non_null(str_object), len);
|
||||||
return SymbolTable::new_symbol(str, len);
|
return SymbolTable::new_symbol(str, checked_cast<int>(len));
|
||||||
}
|
}
|
||||||
|
|
||||||
ModuleEntryTable* Modules::get_module_entry_table(Handle h_loader) {
|
ModuleEntryTable* Modules::get_module_entry_table(Handle h_loader) {
|
||||||
@ -142,8 +144,10 @@ bool Modules::is_package_defined(Symbol* package, Handle h_loader) {
|
|||||||
// Will use the provided buffer if it's sufficiently large, otherwise allocates
|
// Will use the provided buffer if it's sufficiently large, otherwise allocates
|
||||||
// a resource array
|
// a resource array
|
||||||
// The length of the resulting string will be assigned to utf8_len
|
// The length of the resulting string will be assigned to utf8_len
|
||||||
static const char* as_internal_package(oop package_string, char* buf, int buflen, int& utf8_len) {
|
static const char* as_internal_package(oop package_string, char* buf, size_t buflen, int& utf8_len) {
|
||||||
char* package_name = java_lang_String::as_utf8_string_full(package_string, buf, buflen, utf8_len);
|
size_t full_utf8_len;
|
||||||
|
char* package_name = java_lang_String::as_utf8_string_full(package_string, buf, buflen, full_utf8_len);
|
||||||
|
utf8_len = checked_cast<int>(full_utf8_len); // package names are < 64K
|
||||||
|
|
||||||
// Turn all '/'s into '.'s
|
// Turn all '/'s into '.'s
|
||||||
for (int index = 0; index < utf8_len; index++) {
|
for (int index = 0; index < utf8_len; index++) {
|
||||||
|
@ -686,7 +686,7 @@ static void print_string(Thread* current, outputStream* st, oop s) {
|
|||||||
st->print("%d: ", length);
|
st->print("%d: ", length);
|
||||||
} else {
|
} else {
|
||||||
ResourceMark rm(current);
|
ResourceMark rm(current);
|
||||||
int utf8_length = length;
|
size_t utf8_length = length;
|
||||||
char* utf8_string;
|
char* utf8_string;
|
||||||
|
|
||||||
if (!is_latin1) {
|
if (!is_latin1) {
|
||||||
@ -697,7 +697,7 @@ static void print_string(Thread* current, outputStream* st, oop s) {
|
|||||||
utf8_string = UNICODE::as_utf8(bytes, utf8_length);
|
utf8_string = UNICODE::as_utf8(bytes, utf8_length);
|
||||||
}
|
}
|
||||||
|
|
||||||
st->print("%d: ", utf8_length);
|
st->print("%zu: ", utf8_length);
|
||||||
HashtableTextDump::put_utf8(st, utf8_string, utf8_length);
|
HashtableTextDump::put_utf8(st, utf8_string, utf8_length);
|
||||||
}
|
}
|
||||||
st->cr();
|
st->cr();
|
||||||
|
@ -349,6 +349,7 @@ Symbol* SymbolTable::lookup_common(const char* name,
|
|||||||
// to be used for arbitrary strings. For debug builds we will assert if
|
// to be used for arbitrary strings. For debug builds we will assert if
|
||||||
// a string is too long, whereas product builds will truncate it.
|
// a string is too long, whereas product builds will truncate it.
|
||||||
static int check_length(const char* name, int len) {
|
static int check_length(const char* name, int len) {
|
||||||
|
assert(len >= 0, "negative length %d suggests integer overflow in the caller", len);
|
||||||
assert(len <= Symbol::max_length(),
|
assert(len <= Symbol::max_length(),
|
||||||
"String length %d exceeds the maximum Symbol length of %d", len, Symbol::max_length());
|
"String length %d exceeds the maximum Symbol length of %d", len, Symbol::max_length());
|
||||||
if (len > Symbol::max_length()) {
|
if (len > Symbol::max_length()) {
|
||||||
@ -461,33 +462,33 @@ Symbol* SymbolTable::lookup_only(const char* name, int len, unsigned int& hash)
|
|||||||
// and probing logic, so there is no need for convert_to_utf8 until
|
// and probing logic, so there is no need for convert_to_utf8 until
|
||||||
// an actual new Symbol* is created.
|
// an actual new Symbol* is created.
|
||||||
Symbol* SymbolTable::new_symbol(const jchar* name, int utf16_length) {
|
Symbol* SymbolTable::new_symbol(const jchar* name, int utf16_length) {
|
||||||
int utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
|
size_t utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
|
||||||
char stack_buf[ON_STACK_BUFFER_LENGTH];
|
char stack_buf[ON_STACK_BUFFER_LENGTH];
|
||||||
if (utf8_length < (int) sizeof(stack_buf)) {
|
if (utf8_length < sizeof(stack_buf)) {
|
||||||
char* chars = stack_buf;
|
char* chars = stack_buf;
|
||||||
UNICODE::convert_to_utf8(name, utf16_length, chars);
|
UNICODE::convert_to_utf8(name, utf16_length, chars);
|
||||||
return new_symbol(chars, utf8_length);
|
return new_symbol(chars, checked_cast<int>(utf8_length));
|
||||||
} else {
|
} else {
|
||||||
ResourceMark rm;
|
ResourceMark rm;
|
||||||
char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1);
|
char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1);
|
||||||
UNICODE::convert_to_utf8(name, utf16_length, chars);
|
UNICODE::convert_to_utf8(name, utf16_length, chars);
|
||||||
return new_symbol(chars, utf8_length);
|
return new_symbol(chars, checked_cast<int>(utf8_length));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Symbol* SymbolTable::lookup_only_unicode(const jchar* name, int utf16_length,
|
Symbol* SymbolTable::lookup_only_unicode(const jchar* name, int utf16_length,
|
||||||
unsigned int& hash) {
|
unsigned int& hash) {
|
||||||
int utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
|
size_t utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
|
||||||
char stack_buf[ON_STACK_BUFFER_LENGTH];
|
char stack_buf[ON_STACK_BUFFER_LENGTH];
|
||||||
if (utf8_length < (int) sizeof(stack_buf)) {
|
if (utf8_length < sizeof(stack_buf)) {
|
||||||
char* chars = stack_buf;
|
char* chars = stack_buf;
|
||||||
UNICODE::convert_to_utf8(name, utf16_length, chars);
|
UNICODE::convert_to_utf8(name, utf16_length, chars);
|
||||||
return lookup_only(chars, utf8_length, hash);
|
return lookup_only(chars, checked_cast<int>(utf8_length), hash);
|
||||||
} else {
|
} else {
|
||||||
ResourceMark rm;
|
ResourceMark rm;
|
||||||
char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1);
|
char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1);
|
||||||
UNICODE::convert_to_utf8(name, utf16_length, chars);
|
UNICODE::convert_to_utf8(name, utf16_length, chars);
|
||||||
return lookup_only(chars, utf8_length, hash);
|
return lookup_only(chars, checked_cast<int>(utf8_length), hash);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2012, 2023, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2012, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
*
|
*
|
||||||
* This code is free software; you can redistribute it and/or modify it
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
@ -292,10 +292,10 @@ static const char* get_as_dcmd_arena_string(oop string) {
|
|||||||
char* str = nullptr;
|
char* str = nullptr;
|
||||||
const typeArrayOop value = java_lang_String::value(string);
|
const typeArrayOop value = java_lang_String::value(string);
|
||||||
if (value != nullptr) {
|
if (value != nullptr) {
|
||||||
const size_t length = static_cast<size_t>(java_lang_String::utf8_length(string, value)) + 1;
|
const size_t length = java_lang_String::utf8_length(string, value) + 1;
|
||||||
str = dcmd_arena_allocate(length);
|
str = dcmd_arena_allocate(length);
|
||||||
assert(str != nullptr, "invariant");
|
assert(str != nullptr, "invariant");
|
||||||
java_lang_String::as_utf8_string(string, value, str, static_cast<int>(length));
|
java_lang_String::as_utf8_string(string, value, str, length);
|
||||||
}
|
}
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
*
|
*
|
||||||
* This code is free software; you can redistribute it and/or modify it
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
@ -502,7 +502,7 @@ Klass* JfrJavaSupport::klass(const jobject handle) {
|
|||||||
return obj->klass();
|
return obj->klass();
|
||||||
}
|
}
|
||||||
|
|
||||||
static char* allocate_string(bool c_heap, int length, Thread* thread) {
|
static char* allocate_string(bool c_heap, size_t length, Thread* thread) {
|
||||||
return c_heap ? NEW_C_HEAP_ARRAY(char, length, mtTracing) :
|
return c_heap ? NEW_C_HEAP_ARRAY(char, length, mtTracing) :
|
||||||
NEW_RESOURCE_ARRAY_IN_THREAD(thread, char, length);
|
NEW_RESOURCE_ARRAY_IN_THREAD(thread, char, length);
|
||||||
}
|
}
|
||||||
@ -511,7 +511,7 @@ const char* JfrJavaSupport::c_str(oop string, Thread* thread, bool c_heap /* fal
|
|||||||
char* str = nullptr;
|
char* str = nullptr;
|
||||||
const typeArrayOop value = java_lang_String::value(string);
|
const typeArrayOop value = java_lang_String::value(string);
|
||||||
if (value != nullptr) {
|
if (value != nullptr) {
|
||||||
const int length = java_lang_String::utf8_length(string, value);
|
const size_t length = java_lang_String::utf8_length(string, value);
|
||||||
str = allocate_string(c_heap, length + 1, thread);
|
str = allocate_string(c_heap, length + 1, thread);
|
||||||
if (str == nullptr) {
|
if (str == nullptr) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
*
|
*
|
||||||
* This code is free software; you can redistribute it and/or modify it
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
@ -121,7 +121,10 @@ static const char* get_java_thread_name(const JavaThread* jt, int& length, oop v
|
|||||||
}
|
}
|
||||||
assert(thread_obj != nullptr, "invariant");
|
assert(thread_obj != nullptr, "invariant");
|
||||||
const oop name = java_lang_Thread::name(thread_obj);
|
const oop name = java_lang_Thread::name(thread_obj);
|
||||||
return name != nullptr ? java_lang_String::as_utf8_string(name, length) : nullptr;
|
size_t utf8_len;
|
||||||
|
const char* ret = name != nullptr ? java_lang_String::as_utf8_string(name, utf8_len) : nullptr;
|
||||||
|
length = checked_cast<int>(utf8_len); // Thread names should be short
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char* JfrThreadName::name(const Thread* t, int& length, oop vthread) {
|
const char* JfrThreadName::name(const Thread* t, int& length, oop vthread) {
|
||||||
|
@ -166,7 +166,7 @@ void Symbol::print_symbol_on(outputStream* st) const {
|
|||||||
|
|
||||||
char* Symbol::as_quoted_ascii() const {
|
char* Symbol::as_quoted_ascii() const {
|
||||||
const char *ptr = (const char *)&_body[0];
|
const char *ptr = (const char *)&_body[0];
|
||||||
int quoted_length = UTF8::quoted_ascii_length(ptr, utf8_length());
|
size_t quoted_length = UTF8::quoted_ascii_length(ptr, utf8_length());
|
||||||
char* result = NEW_RESOURCE_ARRAY(char, quoted_length + 1);
|
char* result = NEW_RESOURCE_ARRAY(char, quoted_length + 1);
|
||||||
UTF8::as_quoted_ascii(ptr, utf8_length(), result, quoted_length + 1);
|
UTF8::as_quoted_ascii(ptr, utf8_length(), result, quoted_length + 1);
|
||||||
return result;
|
return result;
|
||||||
|
@ -2223,7 +2223,7 @@ JNI_END
|
|||||||
JNI_ENTRY(jsize, jni_GetStringUTFLength(JNIEnv *env, jstring string))
|
JNI_ENTRY(jsize, jni_GetStringUTFLength(JNIEnv *env, jstring string))
|
||||||
HOTSPOT_JNI_GETSTRINGUTFLENGTH_ENTRY(env, string);
|
HOTSPOT_JNI_GETSTRINGUTFLENGTH_ENTRY(env, string);
|
||||||
oop java_string = JNIHandles::resolve_non_null(string);
|
oop java_string = JNIHandles::resolve_non_null(string);
|
||||||
jsize ret = java_lang_String::utf8_length(java_string);
|
jsize ret = java_lang_String::utf8_length_as_int(java_string);
|
||||||
HOTSPOT_JNI_GETSTRINGUTFLENGTH_RETURN(ret);
|
HOTSPOT_JNI_GETSTRINGUTFLENGTH_RETURN(ret);
|
||||||
return ret;
|
return ret;
|
||||||
JNI_END
|
JNI_END
|
||||||
@ -2236,10 +2236,11 @@ JNI_ENTRY(const char*, jni_GetStringUTFChars(JNIEnv *env, jstring string, jboole
|
|||||||
typeArrayOop s_value = java_lang_String::value(java_string);
|
typeArrayOop s_value = java_lang_String::value(java_string);
|
||||||
if (s_value != nullptr) {
|
if (s_value != nullptr) {
|
||||||
size_t length = java_lang_String::utf8_length(java_string, s_value);
|
size_t length = java_lang_String::utf8_length(java_string, s_value);
|
||||||
/* JNI Specification states return null on OOM */
|
// JNI Specification states return null on OOM.
|
||||||
|
// The resulting sequence doesn't have to be NUL-terminated but we do.
|
||||||
result = AllocateHeap(length + 1, mtInternal, AllocFailStrategy::RETURN_NULL);
|
result = AllocateHeap(length + 1, mtInternal, AllocFailStrategy::RETURN_NULL);
|
||||||
if (result != nullptr) {
|
if (result != nullptr) {
|
||||||
java_lang_String::as_utf8_string(java_string, s_value, result, (int) length + 1);
|
java_lang_String::as_utf8_string(java_string, s_value, result, length + 1);
|
||||||
if (isCopy != nullptr) {
|
if (isCopy != nullptr) {
|
||||||
*isCopy = JNI_TRUE;
|
*isCopy = JNI_TRUE;
|
||||||
}
|
}
|
||||||
|
@ -1321,7 +1321,7 @@ JvmtiEnv::GetThreadInfo(jthread thread, jvmtiThreadInfo* info_ptr) {
|
|||||||
if (name() != nullptr) {
|
if (name() != nullptr) {
|
||||||
n = java_lang_String::as_utf8_string(name());
|
n = java_lang_String::as_utf8_string(name());
|
||||||
} else {
|
} else {
|
||||||
int utf8_length = 0;
|
size_t utf8_length = 0;
|
||||||
n = UNICODE::as_utf8((jchar*) nullptr, utf8_length);
|
n = UNICODE::as_utf8((jchar*) nullptr, utf8_length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
*
|
*
|
||||||
* This code is free software; you can redistribute it and/or modify it
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
@ -45,7 +45,7 @@ static const char* allocate(oop string) {
|
|||||||
char* str = nullptr;
|
char* str = nullptr;
|
||||||
const typeArrayOop value = java_lang_String::value(string);
|
const typeArrayOop value = java_lang_String::value(string);
|
||||||
if (value != nullptr) {
|
if (value != nullptr) {
|
||||||
const int length = java_lang_String::utf8_length(string, value);
|
const size_t length = java_lang_String::utf8_length(string, value);
|
||||||
str = NEW_C_HEAP_ARRAY(char, length + 1, mtServiceability);
|
str = NEW_C_HEAP_ARRAY(char, length + 1, mtServiceability);
|
||||||
java_lang_String::as_utf8_string(string, value, str, length + 1);
|
java_lang_String::as_utf8_string(string, value, str, length + 1);
|
||||||
}
|
}
|
||||||
|
@ -98,15 +98,21 @@ char* UTF8::next_character(const char* str, jint* value) {
|
|||||||
return next_ch;
|
return next_ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count bytes of the form 10xxxxxx and deduct this count
|
// The number of unicode characters in a utf8 sequence can be easily
|
||||||
|
// determined by noting that bytes of the form 10xxxxxx are part of
|
||||||
|
// a 2 or 3-byte multi-byte sequence, all others are either characters
|
||||||
|
// themselves or else the start of a multi-byte character.
|
||||||
|
|
||||||
|
// Calculate the unicode length of a utf8 string of known size
|
||||||
|
// by counting bytes of the form 10xxxxxx and deducting this count
|
||||||
// from the total byte count. The utf8 string must be in
|
// from the total byte count. The utf8 string must be in
|
||||||
// legal form which has been verified in the format checker.
|
// legal form which has been verified in the format checker.
|
||||||
int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) {
|
int UTF8::unicode_length(const char* str, size_t len, bool& is_latin1, bool& has_multibyte) {
|
||||||
int num_chars = len;
|
size_t num_chars = len;
|
||||||
has_multibyte = false;
|
has_multibyte = false;
|
||||||
is_latin1 = true;
|
is_latin1 = true;
|
||||||
unsigned char prev = 0;
|
unsigned char prev = 0;
|
||||||
for (int i = 0; i < len; i++) {
|
for (size_t i = 0; i < len; i++) {
|
||||||
unsigned char c = str[i];
|
unsigned char c = str[i];
|
||||||
if ((c & 0xC0) == 0x80) {
|
if ((c & 0xC0) == 0x80) {
|
||||||
// Multibyte, check if valid latin1 character.
|
// Multibyte, check if valid latin1 character.
|
||||||
@ -118,12 +124,12 @@ int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_mu
|
|||||||
}
|
}
|
||||||
prev = c;
|
prev = c;
|
||||||
}
|
}
|
||||||
return num_chars;
|
return checked_cast<int>(num_chars);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count bytes of the utf8 string except those in form
|
// Calculate the unicode length of a nul-terminated utf8 string
|
||||||
// 10xxxxxx which only appear in multibyte characters.
|
// by counting bytes of the utf8 string except those in the form
|
||||||
// The utf8 string must be in legal form and has been
|
// 10xxxxxx. The utf8 string must be in legal form and has been
|
||||||
// verified in the format checker.
|
// verified in the format checker.
|
||||||
int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) {
|
int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) {
|
||||||
int num_chars = 0;
|
int num_chars = 0;
|
||||||
@ -195,10 +201,10 @@ template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unico
|
|||||||
template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length);
|
template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length);
|
||||||
|
|
||||||
// returns the quoted ascii length of a utf8 string of the given length
|
// returns the quoted ascii length of a utf8 string of the given length
|
||||||
int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
|
size_t UTF8::quoted_ascii_length(const char* utf8_str, size_t utf8_length) {
|
||||||
const char *ptr = utf8_str;
|
const char *ptr = utf8_str;
|
||||||
const char* end = ptr + utf8_length;
|
const char* end = ptr + utf8_length;
|
||||||
int result = 0;
|
size_t result = 0;
|
||||||
while (ptr < end) {
|
while (ptr < end) {
|
||||||
jchar c;
|
jchar c;
|
||||||
ptr = UTF8::next(ptr, &c);
|
ptr = UTF8::next(ptr, &c);
|
||||||
@ -212,7 +218,7 @@ int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// converts a utf8 string to quoted ascii
|
// converts a utf8 string to quoted ascii
|
||||||
void UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen) {
|
void UTF8::as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen) {
|
||||||
const char *ptr = utf8_str;
|
const char *ptr = utf8_str;
|
||||||
const char *utf8_end = ptr + utf8_length;
|
const char *utf8_end = ptr + utf8_length;
|
||||||
char* p = buf;
|
char* p = buf;
|
||||||
@ -248,7 +254,7 @@ const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) {
|
|||||||
return quoted_ascii_str;
|
return quoted_ascii_str;
|
||||||
}
|
}
|
||||||
// everything up to this point was ok.
|
// everything up to this point was ok.
|
||||||
int length = ptr - quoted_ascii_str;
|
size_t length = ptr - quoted_ascii_str;
|
||||||
char* buffer = nullptr;
|
char* buffer = nullptr;
|
||||||
for (int round = 0; round < 2; round++) {
|
for (int round = 0; round < 2; round++) {
|
||||||
while (*ptr != '\0') {
|
while (*ptr != '\0') {
|
||||||
@ -330,11 +336,11 @@ jint UTF8::get_supplementary_character(const unsigned char* str) {
|
|||||||
+ ((str[4] & 0x0f) << 6) + (str[5] & 0x3f);
|
+ ((str[4] & 0x0f) << 6) + (str[5] & 0x3f);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UTF8::is_legal_utf8(const unsigned char* buffer, int length,
|
bool UTF8::is_legal_utf8(const unsigned char* buffer, size_t length,
|
||||||
bool version_leq_47) {
|
bool version_leq_47) {
|
||||||
int i = 0;
|
size_t i = 0;
|
||||||
int count = length >> 2;
|
size_t count = length >> 2;
|
||||||
for (int k=0; k<count; k++) {
|
for (size_t k = 0; k < count; k++) {
|
||||||
unsigned char b0 = buffer[i];
|
unsigned char b0 = buffer[i];
|
||||||
unsigned char b1 = buffer[i+1];
|
unsigned char b1 = buffer[i+1];
|
||||||
unsigned char b2 = buffer[i+2];
|
unsigned char b2 = buffer[i+2];
|
||||||
@ -405,7 +411,7 @@ static bool is_starting_byte(unsigned char b) {
|
|||||||
// To avoid that the caller can choose to check for validity first.
|
// To avoid that the caller can choose to check for validity first.
|
||||||
// The incoming buffer is still expected to be NUL-terminated.
|
// The incoming buffer is still expected to be NUL-terminated.
|
||||||
// The incoming buffer is expected to be a realistic size - we assert if it is too small.
|
// The incoming buffer is expected to be a realistic size - we assert if it is too small.
|
||||||
void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
|
void UTF8::truncate_to_legal_utf8(unsigned char* buffer, size_t length) {
|
||||||
assert(length > 5, "invalid length");
|
assert(length > 5, "invalid length");
|
||||||
assert(buffer[length - 1] == '\0', "Buffer should be NUL-terminated");
|
assert(buffer[length - 1] == '\0', "Buffer should be NUL-terminated");
|
||||||
|
|
||||||
@ -433,7 +439,7 @@ void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
|
|||||||
// then we insert NUL at that location to terminate the buffer. There is an added complexity with 6 byte
|
// then we insert NUL at that location to terminate the buffer. There is an added complexity with 6 byte
|
||||||
// encodings as the first and fourth bytes are the same and overlap with the 3 byte encoding.
|
// encodings as the first and fourth bytes are the same and overlap with the 3 byte encoding.
|
||||||
|
|
||||||
for (int index = length - 2; index > 0; index--) {
|
for (size_t index = length - 2; index > 0; index--) {
|
||||||
if (is_starting_byte(buffer[index])) {
|
if (is_starting_byte(buffer[index])) {
|
||||||
if (buffer[index] == 0xED) {
|
if (buffer[index] == 0xED) {
|
||||||
// Could be first byte of 3 or 6, or fourth byte of 6.
|
// Could be first byte of 3 or 6, or fourth byte of 6.
|
||||||
@ -441,7 +447,7 @@ void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
|
|||||||
// surrogate value in the range EDA080 to EDAFBF. We only
|
// surrogate value in the range EDA080 to EDAFBF. We only
|
||||||
// need to check for EDA to establish this as the "missing"
|
// need to check for EDA to establish this as the "missing"
|
||||||
// values in EDAxxx would not be valid 3 byte encodings.
|
// values in EDAxxx would not be valid 3 byte encodings.
|
||||||
if ((index - 3) >= 0 &&
|
if (index >= 3 &&
|
||||||
(buffer[index - 3] == 0xED) &&
|
(buffer[index - 3] == 0xED) &&
|
||||||
((buffer[index - 2] & 0xF0) == 0xA0)) {
|
((buffer[index - 2] & 0xF0) == 0xA0)) {
|
||||||
assert(buffer[index - 1] >= 0x80 && buffer[index - 1] <= 0xBF, "sanity check");
|
assert(buffer[index - 1] >= 0x80 && buffer[index - 1] <= 0xBF, "sanity check");
|
||||||
@ -470,7 +476,7 @@ bool UNICODE::is_latin1(const jchar* base, int length) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
int UNICODE::utf8_size(jchar c) {
|
size_t UNICODE::utf8_size(jchar c) {
|
||||||
if ((0x0001 <= c) && (c <= 0x007F)) {
|
if ((0x0001 <= c) && (c <= 0x007F)) {
|
||||||
// ASCII character
|
// ASCII character
|
||||||
return 1;
|
return 1;
|
||||||
@ -481,7 +487,7 @@ int UNICODE::utf8_size(jchar c) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int UNICODE::utf8_size(jbyte c) {
|
size_t UNICODE::utf8_size(jbyte c) {
|
||||||
if (c >= 0x01) {
|
if (c >= 0x01) {
|
||||||
// ASCII character. Check is equivalent to
|
// ASCII character. Check is equivalent to
|
||||||
// (0x01 <= c) && (c <= 0x7F) because c is signed.
|
// (0x01 <= c) && (c <= 0x7F) because c is signed.
|
||||||
@ -494,11 +500,23 @@ int UNICODE::utf8_size(jbyte c) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int UNICODE::utf8_length(const T* base, int length) {
|
size_t UNICODE::utf8_length(const T* base, int length) {
|
||||||
|
size_t result = 0;
|
||||||
|
for (int index = 0; index < length; index++) {
|
||||||
|
result += utf8_size(base[index]);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
int UNICODE::utf8_length_as_int(const T* base, int length) {
|
||||||
size_t result = 0;
|
size_t result = 0;
|
||||||
for (int index = 0; index < length; index++) {
|
for (int index = 0; index < length; index++) {
|
||||||
T c = base[index];
|
T c = base[index];
|
||||||
int sz = utf8_size(c);
|
size_t sz = utf8_size(c);
|
||||||
|
// If the length is > INT_MAX-1 we truncate at a completed
|
||||||
|
// modified-UTF8 encoding. This allows for +1 to be added
|
||||||
|
// by the caller for NUL-termination, without overflow.
|
||||||
if (result + sz > INT_MAX-1) {
|
if (result + sz > INT_MAX-1) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -508,41 +526,44 @@ int UNICODE::utf8_length(const T* base, int length) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
char* UNICODE::as_utf8(const T* base, int& length) {
|
char* UNICODE::as_utf8(const T* base, size_t& length) {
|
||||||
int utf8_len = utf8_length(base, length);
|
// Incoming length must be <= INT_MAX
|
||||||
|
size_t utf8_len = utf8_length(base, static_cast<int>(length));
|
||||||
u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
|
u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
|
||||||
char* result = as_utf8(base, length, (char*) buf, utf8_len + 1);
|
char* result = as_utf8(base, static_cast<int>(length), (char*) buf, utf8_len + 1);
|
||||||
assert((int) strlen(result) == utf8_len, "length prediction must be correct");
|
assert(strlen(result) == utf8_len, "length prediction must be correct");
|
||||||
// Set string length to utf8 length
|
// Set outgoing string length to utf8 length
|
||||||
length = utf8_len;
|
length = utf8_len;
|
||||||
return (char*) result;
|
return (char*) result;
|
||||||
}
|
}
|
||||||
|
|
||||||
char* UNICODE::as_utf8(const jchar* base, int length, char* buf, int buflen) {
|
char* UNICODE::as_utf8(const jchar* base, int length, char* buf, size_t buflen) {
|
||||||
assert(buflen > 0, "zero length output buffer");
|
assert(buflen > 0, "zero length output buffer");
|
||||||
u_char* p = (u_char*)buf;
|
u_char* p = (u_char*)buf;
|
||||||
for (int index = 0; index < length; index++) {
|
for (int index = 0; index < length; index++) {
|
||||||
jchar c = base[index];
|
jchar c = base[index];
|
||||||
buflen -= utf8_size(c);
|
size_t sz = utf8_size(c);
|
||||||
if (buflen <= 0) break; // string is truncated
|
if (sz >= buflen) break; // string is truncated
|
||||||
|
buflen -= sz;
|
||||||
p = utf8_write(p, c);
|
p = utf8_write(p, c);
|
||||||
}
|
}
|
||||||
*p = '\0';
|
*p = '\0';
|
||||||
return buf;
|
return buf;
|
||||||
}
|
}
|
||||||
|
|
||||||
char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, int buflen) {
|
char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, size_t buflen) {
|
||||||
assert(buflen > 0, "zero length output buffer");
|
assert(buflen > 0, "zero length output buffer");
|
||||||
u_char* p = (u_char*)buf;
|
u_char* p = (u_char*)buf;
|
||||||
for (int index = 0; index < length; index++) {
|
for (int index = 0; index < length; index++) {
|
||||||
jbyte c = base[index];
|
jbyte c = base[index];
|
||||||
int sz = utf8_size(c);
|
size_t sz = utf8_size(c);
|
||||||
|
if (sz >= buflen) break; // string is truncated
|
||||||
buflen -= sz;
|
buflen -= sz;
|
||||||
if (buflen <= 0) break; // string is truncated
|
|
||||||
if (sz == 1) {
|
if (sz == 1) {
|
||||||
// Copy ASCII characters (UTF-8 is ASCII compatible)
|
// Copy ASCII characters (UTF-8 is ASCII compatible)
|
||||||
*p++ = c;
|
*p++ = c;
|
||||||
} else {
|
} else {
|
||||||
|
assert(sz == 2, "must be!");
|
||||||
// Non-ASCII character or 0x00 which should
|
// Non-ASCII character or 0x00 which should
|
||||||
// be encoded as 0xC080 in "modified" UTF8.
|
// be encoded as 0xC080 in "modified" UTF8.
|
||||||
p = utf8_write(p, ((jchar) c) & 0xff);
|
p = utf8_write(p, ((jchar) c) & 0xff);
|
||||||
@ -561,8 +582,8 @@ void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer)
|
|||||||
|
|
||||||
// returns the quoted ascii length of a unicode string
|
// returns the quoted ascii length of a unicode string
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int UNICODE::quoted_ascii_length(const T* base, int length) {
|
size_t UNICODE::quoted_ascii_length(const T* base, int length) {
|
||||||
int result = 0;
|
size_t result = 0;
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
T c = base[i];
|
T c = base[i];
|
||||||
if (c >= 32 && c < 127) {
|
if (c >= 32 && c < 127) {
|
||||||
@ -576,7 +597,7 @@ int UNICODE::quoted_ascii_length(const T* base, int length) {
|
|||||||
|
|
||||||
// converts a unicode string to quoted ascii
|
// converts a unicode string to quoted ascii
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) {
|
void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, size_t buflen) {
|
||||||
char* p = buf;
|
char* p = buf;
|
||||||
char* end = buf + buflen;
|
char* end = buf + buflen;
|
||||||
for (int index = 0; index < length; index++) {
|
for (int index = 0; index < length; index++) {
|
||||||
@ -594,11 +615,13 @@ void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Explicit instantiation for all supported types.
|
// Explicit instantiation for all supported types.
|
||||||
template int UNICODE::utf8_length(const jbyte* base, int length);
|
template size_t UNICODE::utf8_length(const jbyte* base, int length);
|
||||||
template int UNICODE::utf8_length(const jchar* base, int length);
|
template size_t UNICODE::utf8_length(const jchar* base, int length);
|
||||||
template char* UNICODE::as_utf8(const jbyte* base, int& length);
|
template int UNICODE::utf8_length_as_int(const jbyte* base, int length);
|
||||||
template char* UNICODE::as_utf8(const jchar* base, int& length);
|
template int UNICODE::utf8_length_as_int(const jchar* base, int length);
|
||||||
template int UNICODE::quoted_ascii_length<jbyte>(const jbyte* base, int length);
|
template char* UNICODE::as_utf8(const jbyte* base, size_t& length);
|
||||||
template int UNICODE::quoted_ascii_length<jchar>(const jchar* base, int length);
|
template char* UNICODE::as_utf8(const jchar* base, size_t& length);
|
||||||
template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen);
|
template size_t UNICODE::quoted_ascii_length<jbyte>(const jbyte* base, int length);
|
||||||
template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen);
|
template size_t UNICODE::quoted_ascii_length<jchar>(const jchar* base, int length);
|
||||||
|
template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, size_t buflen);
|
||||||
|
template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, size_t buflen);
|
||||||
|
@ -29,6 +29,45 @@
|
|||||||
#include "memory/allStatic.hpp"
|
#include "memory/allStatic.hpp"
|
||||||
#include "utilities/debug.hpp"
|
#include "utilities/debug.hpp"
|
||||||
|
|
||||||
|
/**
|
||||||
|
|
||||||
|
String handling within Java and the VM requires a bit of explanation.
|
||||||
|
|
||||||
|
Logically a java.lang.String is a sequence of 16-bit Unicode characters
|
||||||
|
encoded in UTF-16. In the past a String contained a Java char[] and so
|
||||||
|
could theoretically contain INT_MAX 16-bit characters. Then came JEP 254:
|
||||||
|
Compact Strings.
|
||||||
|
|
||||||
|
With Compact Strings the Java char[] becomes a Java byte[], and that byte[]
|
||||||
|
contains either latin-1 characters all of which fit in 8-bits, or else each
|
||||||
|
pair of bytes represents a UTF-16 character. Consequently the maximum length
|
||||||
|
in characters of a latin-1 string is INT_MAX, whilst for non-latin-1 it is INT_MAX/2.
|
||||||
|
|
||||||
|
In the code below if we have latin-1 content then we treat the String's data
|
||||||
|
array as a jbyte[], else a jchar[]. The lengths of these arrays are specified
|
||||||
|
as an int value, with a nominal maximum of INT_MAX.
|
||||||
|
|
||||||
|
The modified UTF-8 encoding specified for the VM nominally encodes characters
|
||||||
|
in 1, 2, 3 or 6 bytes. The 6-byte representation is actually two 3-byte representations
|
||||||
|
for two UTF-16 characters forming a surrogate pair. If we are dealing with
|
||||||
|
a latin-1 string then each character will be encoded as either 1 or 2 bytes and so the
|
||||||
|
maximum UTF8 length is 2*INT_MAX. This can't be stored in an int so utf8 buffers must
|
||||||
|
use a size_t length. For non-latin-1 strings each UTF-16 character will encode as either
|
||||||
|
2 or 3 bytes, so the maximum UTF8 length in that case is 3 * INT_MAX/2 i.e. 1.5*INT_MAX.
|
||||||
|
|
||||||
|
The "quoted ascii" form of a unicode string is at worst 6 times longer than its
|
||||||
|
regular form, and so these lengths must always be size_t - though if we know we only
|
||||||
|
ever do this to symbols (or small symbol combinations) then we could use int.
|
||||||
|
|
||||||
|
There is an additional assumption/expectation that our UTF8 APIs are never dealing with
|
||||||
|
invalid UTF8, and more generally that all UTF8 sequences could form valid Strings.
|
||||||
|
Consequently the Unicode length of a UTF8 sequence is assumed to always be representable
|
||||||
|
by an int. However, there are APIs, such as JNI NewStringUTF, that do deal with such input
|
||||||
|
and could potentially have an unrepresentable string. The long-standing position with JNI
|
||||||
|
is that the user must supply valid input so we do not try to account for these cases.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
// Low-level interface for UTF8 strings
|
// Low-level interface for UTF8 strings
|
||||||
|
|
||||||
class UTF8 : AllStatic {
|
class UTF8 : AllStatic {
|
||||||
@ -41,20 +80,20 @@ class UTF8 : AllStatic {
|
|||||||
static int unicode_length(const char* utf8_str, bool& is_latin1, bool& has_multibyte);
|
static int unicode_length(const char* utf8_str, bool& is_latin1, bool& has_multibyte);
|
||||||
|
|
||||||
// returns the unicode length of a non-0-terminated utf8 string
|
// returns the unicode length of a non-0-terminated utf8 string
|
||||||
static int unicode_length(const char* utf8_str, int len) {
|
static int unicode_length(const char* utf8_str, size_t len) {
|
||||||
bool is_latin1, has_multibyte;
|
bool is_latin1, has_multibyte;
|
||||||
return unicode_length(utf8_str, len, is_latin1, has_multibyte);
|
return unicode_length(utf8_str, len, is_latin1, has_multibyte);
|
||||||
}
|
}
|
||||||
static int unicode_length(const char* utf8_str, int len, bool& is_latin1, bool& has_multibyte);
|
static int unicode_length(const char* utf8_str, size_t len, bool& is_latin1, bool& has_multibyte);
|
||||||
|
|
||||||
// converts a utf8 string to a unicode string
|
// converts a utf8 string to a unicode string
|
||||||
template<typename T> static void convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length);
|
template<typename T> static void convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length);
|
||||||
|
|
||||||
// returns the quoted ascii length of a utf8 string
|
// returns the quoted ascii length of a utf8 string
|
||||||
static int quoted_ascii_length(const char* utf8_str, int utf8_length);
|
static size_t quoted_ascii_length(const char* utf8_str, size_t utf8_length);
|
||||||
|
|
||||||
// converts a utf8 string to quoted ascii
|
// converts a utf8 string to quoted ascii
|
||||||
static void as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen);
|
static void as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen);
|
||||||
|
|
||||||
#ifndef PRODUCT
|
#ifndef PRODUCT
|
||||||
// converts a quoted ascii string to utf8 string. returns the original
|
// converts a quoted ascii string to utf8 string. returns the original
|
||||||
@ -82,13 +121,13 @@ class UTF8 : AllStatic {
|
|||||||
while(--length >= 0 && base[length] != c);
|
while(--length >= 0 && base[length] != c);
|
||||||
return (length < 0) ? nullptr : &base[length];
|
return (length < 0) ? nullptr : &base[length];
|
||||||
}
|
}
|
||||||
static bool equal(const jbyte* base1, int length1, const jbyte* base2,int length2);
|
static bool equal(const jbyte* base1, int length1, const jbyte* base2, int length2);
|
||||||
static bool is_supplementary_character(const unsigned char* str);
|
static bool is_supplementary_character(const unsigned char* str);
|
||||||
static jint get_supplementary_character(const unsigned char* str);
|
static jint get_supplementary_character(const unsigned char* str);
|
||||||
|
|
||||||
static bool is_legal_utf8(const unsigned char* buffer, int length,
|
static bool is_legal_utf8(const unsigned char* buffer, size_t length,
|
||||||
bool version_leq_47);
|
bool version_leq_47);
|
||||||
static void truncate_to_legal_utf8(unsigned char* buffer, int length);
|
static void truncate_to_legal_utf8(unsigned char* buffer, size_t length);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -99,6 +138,12 @@ class UTF8 : AllStatic {
|
|||||||
// units, so a supplementary character uses two positions in a unicode string.
|
// units, so a supplementary character uses two positions in a unicode string.
|
||||||
|
|
||||||
class UNICODE : AllStatic {
|
class UNICODE : AllStatic {
|
||||||
|
|
||||||
|
// returns the utf8 size of a unicode character
|
||||||
|
// uses size_t for convenience in overflow checks
|
||||||
|
static size_t utf8_size(jchar c);
|
||||||
|
static size_t utf8_size(jbyte c);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// checks if the given unicode character can be encoded as latin1
|
// checks if the given unicode character can be encoded as latin1
|
||||||
static bool is_latin1(jchar c);
|
static bool is_latin1(jchar c);
|
||||||
@ -106,28 +151,27 @@ class UNICODE : AllStatic {
|
|||||||
// checks if the given string can be encoded as latin1
|
// checks if the given string can be encoded as latin1
|
||||||
static bool is_latin1(const jchar* base, int length);
|
static bool is_latin1(const jchar* base, int length);
|
||||||
|
|
||||||
// returns the utf8 size of a unicode character
|
|
||||||
static int utf8_size(jchar c);
|
|
||||||
static int utf8_size(jbyte c);
|
|
||||||
|
|
||||||
// returns the utf8 length of a unicode string
|
// returns the utf8 length of a unicode string
|
||||||
template<typename T> static int utf8_length(const T* base, int length);
|
template<typename T> static size_t utf8_length(const T* base, int length);
|
||||||
|
|
||||||
|
// returns the utf8 length of a unicode string as an int - truncated if needed
|
||||||
|
template<typename T> static int utf8_length_as_int(const T* base, int length);
|
||||||
|
|
||||||
// converts a unicode string to utf8 string
|
// converts a unicode string to utf8 string
|
||||||
static void convert_to_utf8(const jchar* base, int length, char* utf8_buffer);
|
static void convert_to_utf8(const jchar* base, int length, char* utf8_buffer);
|
||||||
|
|
||||||
// converts a unicode string to a utf8 string; result is allocated
|
// converts a unicode string to a utf8 string; result is allocated
|
||||||
// in resource area unless a buffer is provided. The unicode 'length'
|
// in resource area unless a buffer is provided. The unicode 'length'
|
||||||
// parameter is set to the length of the result utf8 string.
|
// parameter is set to the length of the resulting utf8 string.
|
||||||
template<typename T> static char* as_utf8(const T* base, int& length);
|
template<typename T> static char* as_utf8(const T* base, size_t& length);
|
||||||
static char* as_utf8(const jchar* base, int length, char* buf, int buflen);
|
static char* as_utf8(const jchar* base, int length, char* buf, size_t buflen);
|
||||||
static char* as_utf8(const jbyte* base, int length, char* buf, int buflen);
|
static char* as_utf8(const jbyte* base, int length, char* buf, size_t buflen);
|
||||||
|
|
||||||
// returns the quoted ascii length of a unicode string
|
// returns the quoted ascii length of a unicode string
|
||||||
template<typename T> static int quoted_ascii_length(const T* base, int length);
|
template<typename T> static size_t quoted_ascii_length(const T* base, int length);
|
||||||
|
|
||||||
// converts a unicode string to quoted ascii
|
// converts a unicode string to quoted ascii
|
||||||
template<typename T> static void as_quoted_ascii(const T* base, int length, char* buf, int buflen);
|
template<typename T> static void as_quoted_ascii(const T* base, int length, char* buf, size_t buflen);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // SHARE_UTILITIES_UTF8_HPP
|
#endif // SHARE_UTILITIES_UTF8_HPP
|
||||||
|
Loading…
x
Reference in New Issue
Block a user