8338257: UTF8 lengths should be size_t not int

Reviewed-by: stuefe, coleenp, dlong
This commit is contained in:
David Holmes 2024-08-29 20:38:52 +00:00
parent 777ed2b5d2
commit a4962ace4d
16 changed files with 229 additions and 122 deletions

View File

@ -431,10 +431,10 @@ void HashtableTextDump::get_utf8(char* utf8_buffer, int utf8_length) {
}
// NOTE: the content is NOT the same as
// UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen).
// UTF8::as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen).
// We want to escape \r\n\t so that output [1] is more readable; [2] can be more easily
// parsed by scripts; [3] quickly processed by HashtableTextDump::get_utf8()
void HashtableTextDump::put_utf8(outputStream* st, const char* utf8_string, int utf8_length) {
void HashtableTextDump::put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length) {
const char *c = utf8_string;
const char *end = c + utf8_length;
for (; c < end; c++) {

View File

@ -431,7 +431,7 @@ public:
int unescape(const char* from, const char* end, int count);
void get_utf8(char* utf8_buffer, int utf8_length);
static void put_utf8(outputStream* st, const char* utf8_string, int utf8_length);
static void put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length);
};
#endif // SHARE_CLASSFILE_COMPACTHASHTABLE_HPP

View File

@ -304,7 +304,8 @@ Handle java_lang_String::create_from_unicode(const jchar* unicode, int length, T
#ifdef ASSERT
{
ResourceMark rm;
char* expected = UNICODE::as_utf8(unicode, length);
size_t utf8_len = static_cast<size_t>(length);
char* expected = UNICODE::as_utf8(unicode, utf8_len);
char* actual = as_utf8_string(h_obj());
if (strcmp(expected, actual) != 0) {
fatal("Unicode conversion failure: %s --> %s", expected, actual);
@ -346,7 +347,7 @@ Handle java_lang_String::create_from_str(const char* utf8_str, TRAPS) {
#ifdef ASSERT
// This check is too strict when the input string is not a valid UTF8.
// For example, it may be created with arbitrary content via jni_NewStringUTF.
if (UTF8::is_legal_utf8((const unsigned char*)utf8_str, (int)strlen(utf8_str), false)) {
if (UTF8::is_legal_utf8((const unsigned char*)utf8_str, strlen(utf8_str), false)) {
ResourceMark rm;
const char* expected = utf8_str;
char* actual = as_utf8_string(h_obj());
@ -554,7 +555,7 @@ char* java_lang_String::as_quoted_ascii(oop java_string) {
if (length == 0) return nullptr;
char* result;
int result_length;
size_t result_length;
if (!is_latin1) {
jchar* base = value->char_at_addr(0);
result_length = UNICODE::quoted_ascii_length(base, length) + 1;
@ -566,8 +567,8 @@ char* java_lang_String::as_quoted_ascii(oop java_string) {
result = NEW_RESOURCE_ARRAY(char, result_length);
UNICODE::as_quoted_ascii(base, length, result, result_length);
}
assert(result_length >= length + 1, "must not be shorter");
assert(result_length == (int)strlen(result) + 1, "must match");
assert(result_length >= (size_t)length + 1, "must not be shorter");
assert(result_length == strlen(result) + 1, "must match");
return result;
}
@ -582,8 +583,9 @@ Symbol* java_lang_String::as_symbol(oop java_string) {
} else {
ResourceMark rm;
jbyte* position = (length == 0) ? nullptr : value->byte_at_addr(0);
const char* base = UNICODE::as_utf8(position, length);
Symbol* sym = SymbolTable::new_symbol(base, length);
size_t utf8_len = static_cast<size_t>(length);
const char* base = UNICODE::as_utf8(position, utf8_len);
Symbol* sym = SymbolTable::new_symbol(base, checked_cast<int>(utf8_len));
return sym;
}
}
@ -598,12 +600,13 @@ Symbol* java_lang_String::as_symbol_or_null(oop java_string) {
} else {
ResourceMark rm;
jbyte* position = (length == 0) ? nullptr : value->byte_at_addr(0);
const char* base = UNICODE::as_utf8(position, length);
return SymbolTable::probe(base, length);
size_t utf8_len = static_cast<size_t>(length);
const char* base = UNICODE::as_utf8(position, utf8_len);
return SymbolTable::probe(base, checked_cast<int>(utf8_len));
}
}
int java_lang_String::utf8_length(oop java_string, typeArrayOop value) {
size_t java_lang_String::utf8_length(oop java_string, typeArrayOop value) {
assert(value_equals(value, java_lang_String::value(java_string)),
"value must be same as java_lang_String::value(java_string)");
int length = java_lang_String::length(java_string, value);
@ -617,18 +620,39 @@ int java_lang_String::utf8_length(oop java_string, typeArrayOop value) {
}
}
int java_lang_String::utf8_length(oop java_string) {
size_t java_lang_String::utf8_length(oop java_string) {
typeArrayOop value = java_lang_String::value(java_string);
return utf8_length(java_string, value);
}
// Convenience overload: looks up the string's backing value array and
// forwards to the two-argument utf8_length_as_int().
int java_lang_String::utf8_length_as_int(oop java_string) {
  return utf8_length_as_int(java_string, java_lang_String::value(java_string));
}
// Returns the UTF8 length of `java_string` as an int.
// `value` must be the string's own value array (checked by the assert).
// The counting itself is delegated to UNICODE::utf8_length_as_int, applied
// to the jbyte view of the array for latin-1 strings and the jchar view
// otherwise.
int java_lang_String::utf8_length_as_int(oop java_string, typeArrayOop value) {
  assert(value_equals(value, java_lang_String::value(java_string)),
         "value must be same as java_lang_String::value(java_string)");
  const int num_chars = java_lang_String::length(java_string, value);
  if (num_chars == 0) {
    // Empty string: no element address is taken.
    return 0;
  }
  const bool latin1 = java_lang_String::is_latin1(java_string);
  return latin1 ? UNICODE::utf8_length_as_int(value->byte_at_addr(0), num_chars)
                : UNICODE::utf8_length_as_int(value->char_at_addr(0), num_chars);
}
char* java_lang_String::as_utf8_string(oop java_string) {
int length;
size_t length;
return as_utf8_string(java_string, length);
}
char* java_lang_String::as_utf8_string(oop java_string, int& length) {
char* java_lang_String::as_utf8_string(oop java_string, size_t& length) {
typeArrayOop value = java_lang_String::value(java_string);
// `length` is used as the incoming number of characters to
// convert, and then set as the number of bytes in the UTF8 sequence.
length = java_lang_String::length(java_string, value);
bool is_latin1 = java_lang_String::is_latin1(java_string);
if (!is_latin1) {
@ -642,7 +666,7 @@ char* java_lang_String::as_utf8_string(oop java_string, int& length) {
// Uses a provided buffer if it's sufficiently large, otherwise allocates
// a resource array to fit
char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, int buflen, int& utf8_len) {
char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, size_t buflen, size_t& utf8_len) {
typeArrayOop value = java_lang_String::value(java_string);
int len = java_lang_String::length(java_string, value);
bool is_latin1 = java_lang_String::is_latin1(java_string);
@ -663,7 +687,7 @@ char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, int bufl
}
}
char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char* buf, int buflen) {
char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char* buf, size_t buflen) {
assert(value_equals(value, java_lang_String::value(java_string)),
"value must be same as java_lang_String::value(java_string)");
int length = java_lang_String::length(java_string, value);
@ -677,25 +701,28 @@ char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char
}
}
char* java_lang_String::as_utf8_string(oop java_string, char* buf, int buflen) {
char* java_lang_String::as_utf8_string(oop java_string, char* buf, size_t buflen) {
typeArrayOop value = java_lang_String::value(java_string);
return as_utf8_string(java_string, value, buf, buflen);
}
char* java_lang_String::as_utf8_string(oop java_string, int start, int len) {
// `length` is used as the incoming number of characters to
// convert, and then set as the number of bytes in the UTF8 sequence.
size_t length = static_cast<size_t>(len);
typeArrayOop value = java_lang_String::value(java_string);
bool is_latin1 = java_lang_String::is_latin1(java_string);
assert(start + len <= java_lang_String::length(java_string), "just checking");
if (!is_latin1) {
jchar* position = value->char_at_addr(start);
return UNICODE::as_utf8(position, len);
return UNICODE::as_utf8(position, length);
} else {
jbyte* position = value->byte_at_addr(start);
return UNICODE::as_utf8(position, len);
return UNICODE::as_utf8(position, length);
}
}
char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, int buflen) {
char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, size_t buflen) {
assert(value_equals(value, java_lang_String::value(java_string)),
"value must be same as java_lang_String::value(java_string)");
assert(start + len <= java_lang_String::length(java_string), "just checking");

View File

@ -131,17 +131,21 @@ class java_lang_String : AllStatic {
static inline bool deduplication_requested(oop java_string);
static inline int length(oop java_string);
static inline int length(oop java_string, typeArrayOop string_value);
static int utf8_length(oop java_string);
static int utf8_length(oop java_string, typeArrayOop string_value);
static size_t utf8_length(oop java_string);
static size_t utf8_length(oop java_string, typeArrayOop string_value);
// Legacy variants that truncate the length if needed
static int utf8_length_as_int(oop java_string);
static int utf8_length_as_int(oop java_string, typeArrayOop string_value);
// String converters
static char* as_utf8_string(oop java_string);
static char* as_utf8_string(oop java_string, int& length);
static char* as_utf8_string_full(oop java_string, char* buf, int buflen, int& length);
static char* as_utf8_string(oop java_string, char* buf, int buflen);
// `length` is set to the length of the utf8 sequence.
static char* as_utf8_string(oop java_string, size_t& length);
static char* as_utf8_string_full(oop java_string, char* buf, size_t buflen, size_t& length);
static char* as_utf8_string(oop java_string, char* buf, size_t buflen);
static char* as_utf8_string(oop java_string, int start, int len);
static char* as_utf8_string(oop java_string, typeArrayOop value, char* buf, int buflen);
static char* as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, int buflen);
static char* as_utf8_string(oop java_string, typeArrayOop value, char* buf, size_t buflen);
static char* as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, size_t buflen);
static char* as_platform_dependent_str(Handle java_string, TRAPS);
static jchar* as_unicode_string(oop java_string, int& length, TRAPS);
static jchar* as_unicode_string_or_null(oop java_string, int& length);

View File

@ -72,7 +72,9 @@ static char* get_module_name(oop module, int& len, TRAPS) {
if (name_oop == nullptr) {
THROW_MSG_NULL(vmSymbols::java_lang_NullPointerException(), "Null module name");
}
char* module_name = java_lang_String::as_utf8_string(name_oop, len);
size_t utf8_len;
char* module_name = java_lang_String::as_utf8_string(name_oop, utf8_len);
len = checked_cast<int>(utf8_len); // module names are < 64K
if (!verify_module_name(module_name, len)) {
THROW_MSG_NULL(vmSymbols::java_lang_IllegalArgumentException(),
err_msg("Invalid module name: %s", module_name));
@ -84,9 +86,9 @@ static Symbol* as_symbol(jstring str_object) {
if (str_object == nullptr) {
return nullptr;
}
int len;
size_t len;
char* str = java_lang_String::as_utf8_string(JNIHandles::resolve_non_null(str_object), len);
return SymbolTable::new_symbol(str, len);
return SymbolTable::new_symbol(str, checked_cast<int>(len));
}
ModuleEntryTable* Modules::get_module_entry_table(Handle h_loader) {
@ -142,8 +144,10 @@ bool Modules::is_package_defined(Symbol* package, Handle h_loader) {
// Will use the provided buffer if it's sufficiently large, otherwise allocates
// a resource array
// The length of the resulting string will be assigned to utf8_len
static const char* as_internal_package(oop package_string, char* buf, int buflen, int& utf8_len) {
char* package_name = java_lang_String::as_utf8_string_full(package_string, buf, buflen, utf8_len);
static const char* as_internal_package(oop package_string, char* buf, size_t buflen, int& utf8_len) {
size_t full_utf8_len;
char* package_name = java_lang_String::as_utf8_string_full(package_string, buf, buflen, full_utf8_len);
utf8_len = checked_cast<int>(full_utf8_len); // package names are < 64K
// Turn all '/'s into '.'s
for (int index = 0; index < utf8_len; index++) {

View File

@ -686,7 +686,7 @@ static void print_string(Thread* current, outputStream* st, oop s) {
st->print("%d: ", length);
} else {
ResourceMark rm(current);
int utf8_length = length;
size_t utf8_length = length;
char* utf8_string;
if (!is_latin1) {
@ -697,7 +697,7 @@ static void print_string(Thread* current, outputStream* st, oop s) {
utf8_string = UNICODE::as_utf8(bytes, utf8_length);
}
st->print("%d: ", utf8_length);
st->print("%zu: ", utf8_length);
HashtableTextDump::put_utf8(st, utf8_string, utf8_length);
}
st->cr();

View File

@ -349,6 +349,7 @@ Symbol* SymbolTable::lookup_common(const char* name,
// to be used for arbitrary strings. For debug builds we will assert if
// a string is too long, whereas product builds will truncate it.
static int check_length(const char* name, int len) {
assert(len >= 0, "negative length %d suggests integer overflow in the caller", len);
assert(len <= Symbol::max_length(),
"String length %d exceeds the maximum Symbol length of %d", len, Symbol::max_length());
if (len > Symbol::max_length()) {
@ -461,33 +462,33 @@ Symbol* SymbolTable::lookup_only(const char* name, int len, unsigned int& hash)
// and probing logic, so there is no need for convert_to_utf8 until
// an actual new Symbol* is created.
Symbol* SymbolTable::new_symbol(const jchar* name, int utf16_length) {
int utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
size_t utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
char stack_buf[ON_STACK_BUFFER_LENGTH];
if (utf8_length < (int) sizeof(stack_buf)) {
if (utf8_length < sizeof(stack_buf)) {
char* chars = stack_buf;
UNICODE::convert_to_utf8(name, utf16_length, chars);
return new_symbol(chars, utf8_length);
return new_symbol(chars, checked_cast<int>(utf8_length));
} else {
ResourceMark rm;
char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1);
UNICODE::convert_to_utf8(name, utf16_length, chars);
return new_symbol(chars, utf8_length);
return new_symbol(chars, checked_cast<int>(utf8_length));
}
}
Symbol* SymbolTable::lookup_only_unicode(const jchar* name, int utf16_length,
unsigned int& hash) {
int utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
size_t utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
char stack_buf[ON_STACK_BUFFER_LENGTH];
if (utf8_length < (int) sizeof(stack_buf)) {
if (utf8_length < sizeof(stack_buf)) {
char* chars = stack_buf;
UNICODE::convert_to_utf8(name, utf16_length, chars);
return lookup_only(chars, utf8_length, hash);
return lookup_only(chars, checked_cast<int>(utf8_length), hash);
} else {
ResourceMark rm;
char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1);
UNICODE::convert_to_utf8(name, utf16_length, chars);
return lookup_only(chars, utf8_length, hash);
return lookup_only(chars, checked_cast<int>(utf8_length), hash);
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2012, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -292,10 +292,10 @@ static const char* get_as_dcmd_arena_string(oop string) {
char* str = nullptr;
const typeArrayOop value = java_lang_String::value(string);
if (value != nullptr) {
const size_t length = static_cast<size_t>(java_lang_String::utf8_length(string, value)) + 1;
const size_t length = java_lang_String::utf8_length(string, value) + 1;
str = dcmd_arena_allocate(length);
assert(str != nullptr, "invariant");
java_lang_String::as_utf8_string(string, value, str, static_cast<int>(length));
java_lang_String::as_utf8_string(string, value, str, length);
}
return str;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -502,7 +502,7 @@ Klass* JfrJavaSupport::klass(const jobject handle) {
return obj->klass();
}
static char* allocate_string(bool c_heap, int length, Thread* thread) {
static char* allocate_string(bool c_heap, size_t length, Thread* thread) {
return c_heap ? NEW_C_HEAP_ARRAY(char, length, mtTracing) :
NEW_RESOURCE_ARRAY_IN_THREAD(thread, char, length);
}
@ -511,7 +511,7 @@ const char* JfrJavaSupport::c_str(oop string, Thread* thread, bool c_heap /* fal
char* str = nullptr;
const typeArrayOop value = java_lang_String::value(string);
if (value != nullptr) {
const int length = java_lang_String::utf8_length(string, value);
const size_t length = java_lang_String::utf8_length(string, value);
str = allocate_string(c_heap, length + 1, thread);
if (str == nullptr) {
return nullptr;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -121,7 +121,10 @@ static const char* get_java_thread_name(const JavaThread* jt, int& length, oop v
}
assert(thread_obj != nullptr, "invariant");
const oop name = java_lang_Thread::name(thread_obj);
return name != nullptr ? java_lang_String::as_utf8_string(name, length) : nullptr;
size_t utf8_len;
const char* ret = name != nullptr ? java_lang_String::as_utf8_string(name, utf8_len) : nullptr;
length = checked_cast<int>(utf8_len); // Thread names should be short
return ret;
}
const char* JfrThreadName::name(const Thread* t, int& length, oop vthread) {

View File

@ -166,7 +166,7 @@ void Symbol::print_symbol_on(outputStream* st) const {
char* Symbol::as_quoted_ascii() const {
const char *ptr = (const char *)&_body[0];
int quoted_length = UTF8::quoted_ascii_length(ptr, utf8_length());
size_t quoted_length = UTF8::quoted_ascii_length(ptr, utf8_length());
char* result = NEW_RESOURCE_ARRAY(char, quoted_length + 1);
UTF8::as_quoted_ascii(ptr, utf8_length(), result, quoted_length + 1);
return result;

View File

@ -2223,7 +2223,7 @@ JNI_END
JNI_ENTRY(jsize, jni_GetStringUTFLength(JNIEnv *env, jstring string))
HOTSPOT_JNI_GETSTRINGUTFLENGTH_ENTRY(env, string);
oop java_string = JNIHandles::resolve_non_null(string);
jsize ret = java_lang_String::utf8_length(java_string);
jsize ret = java_lang_String::utf8_length_as_int(java_string);
HOTSPOT_JNI_GETSTRINGUTFLENGTH_RETURN(ret);
return ret;
JNI_END
@ -2236,10 +2236,11 @@ JNI_ENTRY(const char*, jni_GetStringUTFChars(JNIEnv *env, jstring string, jboole
typeArrayOop s_value = java_lang_String::value(java_string);
if (s_value != nullptr) {
size_t length = java_lang_String::utf8_length(java_string, s_value);
/* JNI Specification states return null on OOM */
// JNI Specification states return null on OOM.
// The resulting sequence doesn't have to be NUL-terminated, but we terminate it anyway.
result = AllocateHeap(length + 1, mtInternal, AllocFailStrategy::RETURN_NULL);
if (result != nullptr) {
java_lang_String::as_utf8_string(java_string, s_value, result, (int) length + 1);
java_lang_String::as_utf8_string(java_string, s_value, result, length + 1);
if (isCopy != nullptr) {
*isCopy = JNI_TRUE;
}

View File

@ -1321,7 +1321,7 @@ JvmtiEnv::GetThreadInfo(jthread thread, jvmtiThreadInfo* info_ptr) {
if (name() != nullptr) {
n = java_lang_String::as_utf8_string(name());
} else {
int utf8_length = 0;
size_t utf8_length = 0;
n = UNICODE::as_utf8((jchar*) nullptr, utf8_length);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -45,7 +45,7 @@ static const char* allocate(oop string) {
char* str = nullptr;
const typeArrayOop value = java_lang_String::value(string);
if (value != nullptr) {
const int length = java_lang_String::utf8_length(string, value);
const size_t length = java_lang_String::utf8_length(string, value);
str = NEW_C_HEAP_ARRAY(char, length + 1, mtServiceability);
java_lang_String::as_utf8_string(string, value, str, length + 1);
}

View File

@ -98,15 +98,21 @@ char* UTF8::next_character(const char* str, jint* value) {
return next_ch;
}
// Count bytes of the form 10xxxxxx and deduct this count
// The number of unicode characters in a utf8 sequence can be easily
// determined by noting that bytes of the form 10xxxxxx are part of
// a 2 or 3-byte multi-byte sequence, all others are either characters
// themselves or else the start of a multi-byte character.
// Calculate the unicode length of a utf8 string of known size
// by counting bytes of the form 10xxxxxx and deducting this count
// from the total byte count. The utf8 string must be in
// legal form which has been verified in the format checker.
int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) {
int num_chars = len;
int UTF8::unicode_length(const char* str, size_t len, bool& is_latin1, bool& has_multibyte) {
size_t num_chars = len;
has_multibyte = false;
is_latin1 = true;
unsigned char prev = 0;
for (int i = 0; i < len; i++) {
for (size_t i = 0; i < len; i++) {
unsigned char c = str[i];
if ((c & 0xC0) == 0x80) {
// Multibyte, check if valid latin1 character.
@ -118,12 +124,12 @@ int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_mu
}
prev = c;
}
return num_chars;
return checked_cast<int>(num_chars);
}
// Count bytes of the utf8 string except those in form
// 10xxxxxx which only appear in multibyte characters.
// The utf8 string must be in legal form and has been
// Calculate the unicode length of a nul-terminated utf8 string
// by counting bytes of the utf8 string except those in the form
// 10xxxxxx. The utf8 string must be in legal form and has been
// verified in the format checker.
int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) {
int num_chars = 0;
@ -195,10 +201,10 @@ template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unico
template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length);
// returns the quoted ascii length of a 0-terminated utf8 string
int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
size_t UTF8::quoted_ascii_length(const char* utf8_str, size_t utf8_length) {
const char *ptr = utf8_str;
const char* end = ptr + utf8_length;
int result = 0;
size_t result = 0;
while (ptr < end) {
jchar c;
ptr = UTF8::next(ptr, &c);
@ -212,7 +218,7 @@ int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
}
// converts a utf8 string to quoted ascii
void UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen) {
void UTF8::as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen) {
const char *ptr = utf8_str;
const char *utf8_end = ptr + utf8_length;
char* p = buf;
@ -248,7 +254,7 @@ const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) {
return quoted_ascii_str;
}
// everything up to this point was ok.
int length = ptr - quoted_ascii_str;
size_t length = ptr - quoted_ascii_str;
char* buffer = nullptr;
for (int round = 0; round < 2; round++) {
while (*ptr != '\0') {
@ -330,11 +336,11 @@ jint UTF8::get_supplementary_character(const unsigned char* str) {
+ ((str[4] & 0x0f) << 6) + (str[5] & 0x3f);
}
bool UTF8::is_legal_utf8(const unsigned char* buffer, int length,
bool UTF8::is_legal_utf8(const unsigned char* buffer, size_t length,
bool version_leq_47) {
int i = 0;
int count = length >> 2;
for (int k=0; k<count; k++) {
size_t i = 0;
size_t count = length >> 2;
for (size_t k = 0; k < count; k++) {
unsigned char b0 = buffer[i];
unsigned char b1 = buffer[i+1];
unsigned char b2 = buffer[i+2];
@ -405,7 +411,7 @@ static bool is_starting_byte(unsigned char b) {
// To avoid that the caller can choose to check for validity first.
// The incoming buffer is still expected to be NUL-terminated.
// The incoming buffer is expected to be a realistic size - we assert if it is too small.
void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
void UTF8::truncate_to_legal_utf8(unsigned char* buffer, size_t length) {
assert(length > 5, "invalid length");
assert(buffer[length - 1] == '\0', "Buffer should be NUL-terminated");
@ -433,7 +439,7 @@ void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
// then we insert NUL at that location to terminate the buffer. There is an added complexity with 6 byte
// encodings as the first and fourth bytes are the same and overlap with the 3 byte encoding.
for (int index = length - 2; index > 0; index--) {
for (size_t index = length - 2; index > 0; index--) {
if (is_starting_byte(buffer[index])) {
if (buffer[index] == 0xED) {
// Could be first byte of 3 or 6, or fourth byte of 6.
@ -441,7 +447,7 @@ void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
// surrogate value in the range EDA080 to EDAFBF. We only
// need to check for EDA to establish this as the "missing"
// values in EDAxxx would not be valid 3 byte encodings.
if ((index - 3) >= 0 &&
if (index >= 3 &&
(buffer[index - 3] == 0xED) &&
((buffer[index - 2] & 0xF0) == 0xA0)) {
assert(buffer[index - 1] >= 0x80 && buffer[index - 1] <= 0xBF, "sanity check");
@ -470,7 +476,7 @@ bool UNICODE::is_latin1(const jchar* base, int length) {
return true;
}
int UNICODE::utf8_size(jchar c) {
size_t UNICODE::utf8_size(jchar c) {
if ((0x0001 <= c) && (c <= 0x007F)) {
// ASCII character
return 1;
@ -481,7 +487,7 @@ int UNICODE::utf8_size(jchar c) {
}
}
int UNICODE::utf8_size(jbyte c) {
size_t UNICODE::utf8_size(jbyte c) {
if (c >= 0x01) {
// ASCII character. Check is equivalent to
// (0x01 <= c) && (c <= 0x7F) because c is signed.
@ -494,11 +500,23 @@ int UNICODE::utf8_size(jbyte c) {
}
template<typename T>
int UNICODE::utf8_length(const T* base, int length) {
size_t UNICODE::utf8_length(const T* base, int length) {
size_t result = 0;
for (int index = 0; index < length; index++) {
result += utf8_size(base[index]);
}
return result;
}
template<typename T>
int UNICODE::utf8_length_as_int(const T* base, int length) {
size_t result = 0;
for (int index = 0; index < length; index++) {
T c = base[index];
int sz = utf8_size(c);
size_t sz = utf8_size(c);
// If the length is > INT_MAX-1 we truncate at a completed
// modified-UTF8 encoding. This allows for +1 to be added
// by the caller for NUL-termination, without overflow.
if (result + sz > INT_MAX-1) {
break;
}
@ -508,41 +526,44 @@ int UNICODE::utf8_length(const T* base, int length) {
}
template<typename T>
char* UNICODE::as_utf8(const T* base, int& length) {
int utf8_len = utf8_length(base, length);
char* UNICODE::as_utf8(const T* base, size_t& length) {
// Incoming length must be <= INT_MAX
size_t utf8_len = utf8_length(base, static_cast<int>(length));
u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
char* result = as_utf8(base, length, (char*) buf, utf8_len + 1);
assert((int) strlen(result) == utf8_len, "length prediction must be correct");
// Set string length to uft8 length
char* result = as_utf8(base, static_cast<int>(length), (char*) buf, utf8_len + 1);
assert(strlen(result) == utf8_len, "length prediction must be correct");
// Set outgoing string length to utf8 length
length = utf8_len;
return (char*) result;
}
char* UNICODE::as_utf8(const jchar* base, int length, char* buf, int buflen) {
char* UNICODE::as_utf8(const jchar* base, int length, char* buf, size_t buflen) {
assert(buflen > 0, "zero length output buffer");
u_char* p = (u_char*)buf;
for (int index = 0; index < length; index++) {
jchar c = base[index];
buflen -= utf8_size(c);
if (buflen <= 0) break; // string is truncated
size_t sz = utf8_size(c);
if (sz >= buflen) break; // string is truncated
buflen -= sz;
p = utf8_write(p, c);
}
*p = '\0';
return buf;
}
char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, int buflen) {
char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, size_t buflen) {
assert(buflen > 0, "zero length output buffer");
u_char* p = (u_char*)buf;
for (int index = 0; index < length; index++) {
jbyte c = base[index];
int sz = utf8_size(c);
size_t sz = utf8_size(c);
if (sz >= buflen) break; // string is truncated
buflen -= sz;
if (buflen <= 0) break; // string is truncated
if (sz == 1) {
// Copy ASCII characters (UTF-8 is ASCII compatible)
*p++ = c;
} else {
assert(sz == 2, "must be!");
// Non-ASCII character or 0x00 which should
// be encoded as 0xC080 in "modified" UTF8.
p = utf8_write(p, ((jchar) c) & 0xff);
@ -561,8 +582,8 @@ void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer)
// returns the quoted ascii length of a unicode string
template<typename T>
int UNICODE::quoted_ascii_length(const T* base, int length) {
int result = 0;
size_t UNICODE::quoted_ascii_length(const T* base, int length) {
size_t result = 0;
for (int i = 0; i < length; i++) {
T c = base[i];
if (c >= 32 && c < 127) {
@ -576,7 +597,7 @@ int UNICODE::quoted_ascii_length(const T* base, int length) {
// converts a unicode string to quoted ascii
template<typename T>
void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) {
void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, size_t buflen) {
char* p = buf;
char* end = buf + buflen;
for (int index = 0; index < length; index++) {
@ -594,11 +615,13 @@ void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen)
}
// Explicit instantiation for all supported types.
template int UNICODE::utf8_length(const jbyte* base, int length);
template int UNICODE::utf8_length(const jchar* base, int length);
template char* UNICODE::as_utf8(const jbyte* base, int& length);
template char* UNICODE::as_utf8(const jchar* base, int& length);
template int UNICODE::quoted_ascii_length<jbyte>(const jbyte* base, int length);
template int UNICODE::quoted_ascii_length<jchar>(const jchar* base, int length);
template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen);
template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen);
template size_t UNICODE::utf8_length(const jbyte* base, int length);
template size_t UNICODE::utf8_length(const jchar* base, int length);
template int UNICODE::utf8_length_as_int(const jbyte* base, int length);
template int UNICODE::utf8_length_as_int(const jchar* base, int length);
template char* UNICODE::as_utf8(const jbyte* base, size_t& length);
template char* UNICODE::as_utf8(const jchar* base, size_t& length);
template size_t UNICODE::quoted_ascii_length<jbyte>(const jbyte* base, int length);
template size_t UNICODE::quoted_ascii_length<jchar>(const jchar* base, int length);
template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, size_t buflen);
template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, size_t buflen);

View File

@ -29,6 +29,45 @@
#include "memory/allStatic.hpp"
#include "utilities/debug.hpp"
/**
String handling within Java and the VM requires a bit of explanation.
Logically a java.lang.String is a sequence of 16-bit Unicode characters
encoded in UTF-16. In the past a String contained a Java char[] and so
could theoretically contain INT_MAX 16-bit characters. Then came JEP 254:
Compact Strings.
With Compact Strings the Java char[] becomes a Java byte[], and that byte[]
contains either latin-1 characters all of which fit in 8-bits, or else each
pair of bytes represents a UTF-16 character. Consequently the maximum length
in characters of a latin-1 string is INT_MAX, whilst for non-latin-1 it is INT_MAX/2.
In the code below if we have latin-1 content then we treat the String's data
array as a jbyte[], else a jchar[]. The lengths of these arrays are specified
as an int value, with a nominal maximum of INT_MAX.
The modified UTF-8 encoding specified for the VM nominally encodes characters
in 1, 2, 3 or 6 bytes. The 6-byte representation is actually two 3-byte representations
for two UTF-16 characters forming a surrogate pair. If we are dealing with
a latin-1 string then each character will be encoded as either 1 or 2 bytes and so the
maximum UTF8 length is 2*INT_MAX. This can't be stored in an int so utf8 buffers must
use a size_t length. For non-latin-1 strings each UTF-16 character will encode as at most
3 bytes, so the maximum UTF8 length in that case is 3 * INT_MAX/2, i.e. 1.5 * INT_MAX.
The "quoted ascii" form of a unicode string is at worst 6 times longer than its
regular form, and so these lengths must always be size_t - though if we know we only
ever do this to symbols (or small symbol combinations) then we could use int.
There is an additional assumption/expectation that our UTF8 APIs are never dealing with
invalid UTF8, and more generally that all UTF8 sequences could form valid Strings.
Consequently the Unicode length of a UTF8 sequence is assumed to always be representable
by an int. However, there are APIs, such as JNI NewStringUTF, that do deal with such input
and could potentially have an unrepresentable string. The long-standing position with JNI
is that the user must supply valid input so we do not try to account for these cases.
*/
// Low-level interface for UTF8 strings
// NOTE(review): this listing is a rendered diff with the +/- markers stripped.
// Where a declaration appears twice in a row, the first copy is the pre-change
// line and the second is its replacement. Lines of the form "@ -a,b +c,d @ ..."
// are hunk headers, and the code between hunks is not shown.
class UTF8 : AllStatic {
@ -41,20 +80,20 @@ class UTF8 : AllStatic {
// is_latin1 and has_multibyte are taken by non-const reference; the inline
// overload below hands in uninitialized locals, so they act as outputs.
static int unicode_length(const char* utf8_str, bool& is_latin1, bool& has_multibyte);
// returns the unicode length of a non-0-terminated utf8 string
static int unicode_length(const char* utf8_str, int len) {
static int unicode_length(const char* utf8_str, size_t len) {
// Convenience form: forwards to the four-argument overload and discards the
// is_latin1 / has_multibyte results.
bool is_latin1, has_multibyte;
return unicode_length(utf8_str, len, is_latin1, has_multibyte);
}
static int unicode_length(const char* utf8_str, int len, bool& is_latin1, bool& has_multibyte);
static int unicode_length(const char* utf8_str, size_t len, bool& is_latin1, bool& has_multibyte);
// converts a utf8 string to a unicode string
// (T is the destination element type - presumably jbyte or jchar; confirm
// against the instantiations in the .cpp file)
template<typename T> static void convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length);
// returns the quoted ascii length of a utf8 string
static int quoted_ascii_length(const char* utf8_str, int utf8_length);
static size_t quoted_ascii_length(const char* utf8_str, size_t utf8_length);
// converts a utf8 string to quoted ascii
// (buf/buflen: destination buffer and, presumably, its capacity in bytes)
static void as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen);
static void as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen);
#ifndef PRODUCT
// converts a quoted ascii string to utf8 string. returns the original
@ -82,13 +121,13 @@ class UTF8 : AllStatic {
// Tail of a function whose signature lies in the elided part of the diff:
// steps 'length' downwards until base[length] == c or the index drops below
// zero, i.e. it locates the last occurrence of c.
while(--length >= 0 && base[length] != c);
// nullptr when c was not found, otherwise a pointer to the matching element.
return (length < 0) ? nullptr : &base[length];
}
// NOTE(review): the two copies of equal() differ only in the space after the comma.
static bool equal(const jbyte* base1, int length1, const jbyte* base2,int length2);
static bool equal(const jbyte* base1, int length1, const jbyte* base2, int length2);
// Supplementary-character helpers; presumably 'str' points at the start of an
// encoded character - implementation not visible here, confirm in the .cpp file.
static bool is_supplementary_character(const unsigned char* str);
static jint get_supplementary_character(const unsigned char* str);
// Legality check over 'length' bytes of 'buffer'. version_leq_47 presumably
// selects rules for class file versions <= 47 - TODO confirm.
static bool is_legal_utf8(const unsigned char* buffer, int length,
static bool is_legal_utf8(const unsigned char* buffer, size_t length,
bool version_leq_47);
// Takes a non-const buffer, so presumably modifies it in place - TODO confirm.
static void truncate_to_legal_utf8(unsigned char* buffer, int length);
static void truncate_to_legal_utf8(unsigned char* buffer, size_t length);
};
@ -99,6 +138,12 @@ class UTF8 : AllStatic {
// units, so a supplementary character uses two positions in a unicode string.
// NOTE(review): this listing is a rendered diff with the +/- markers stripped.
// Where a declaration appears twice in a row, the first copy is the pre-change
// line and the second is its replacement. Lines of the form "@ -a,b +c,d @ ..."
// are hunk headers.
class UNICODE : AllStatic {
// Declared ahead of 'public:', so the two utf8_size overloads are private.
// returns the utf8 size of a unicode character
// uses size_t for convenience in overflow checks
static size_t utf8_size(jchar c);
static size_t utf8_size(jbyte c);
public:
// checks if the given unicode character can be encoded as latin1
static bool is_latin1(jchar c);
@ -106,28 +151,27 @@ class UNICODE : AllStatic {
// checks if the given string can be encoded as latin1
static bool is_latin1(const jchar* base, int length);
// NOTE(review): the next three lines are the pre-change (int-returning, public)
// copy of utf8_size; the size_t declarations near the top of the class replace them.
// returns the utf8 size of a unicode character
static int utf8_size(jchar c);
static int utf8_size(jbyte c);
// returns the utf8 length of a unicode string
template<typename T> static int utf8_length(const T* base, int length);
template<typename T> static size_t utf8_length(const T* base, int length);
// returns the utf8 length of a unicode string as an int - truncated if needed
template<typename T> static int utf8_length_as_int(const T* base, int length);
// converts a unicode string to utf8 string
// NOTE(review): no buffer size is passed, so the caller presumably has to size
// utf8_buffer beforehand (e.g. from utf8_length) - confirm in the .cpp file.
static void convert_to_utf8(const jchar* base, int length, char* utf8_buffer);
// converts a unicode string to a utf8 string; result is allocated
// in resource area unless a buffer is provided. The unicode 'length'
// parameter is set to the length of the result utf8 string.
template<typename T> static char* as_utf8(const T* base, int& length);
static char* as_utf8(const jchar* base, int length, char* buf, int buflen);
static char* as_utf8(const jbyte* base, int length, char* buf, int buflen);
// parameter is set to the length of the resulting utf8 string.
// In the template form 'length' is in/out: it carries the unicode length in
// and is overwritten with the utf8 length, hence the size_t reference.
template<typename T> static char* as_utf8(const T* base, size_t& length);
// Buffer-taking forms: 'length' is the unicode length (by value); buf/buflen
// describe the caller-supplied destination (buflen presumably its capacity in bytes).
static char* as_utf8(const jchar* base, int length, char* buf, size_t buflen);
static char* as_utf8(const jbyte* base, int length, char* buf, size_t buflen);
// returns the quoted ascii length of a unicode string
template<typename T> static int quoted_ascii_length(const T* base, int length);
template<typename T> static size_t quoted_ascii_length(const T* base, int length);
// converts a unicode string to quoted ascii
// (buf/buflen: destination buffer and, presumably, its capacity in bytes)
template<typename T> static void as_quoted_ascii(const T* base, int length, char* buf, int buflen);
template<typename T> static void as_quoted_ascii(const T* base, int length, char* buf, size_t buflen);
};
#endif // SHARE_UTILITIES_UTF8_HPP