From a4962ace4d3afb36e9d6822a4f02a1515fac40ed Mon Sep 17 00:00:00 2001 From: David Holmes Date: Thu, 29 Aug 2024 20:38:52 +0000 Subject: [PATCH] 8338257: UTF8 lengths should be size_t not int Reviewed-by: stuefe, coleenp, dlong --- .../share/classfile/compactHashtable.cpp | 4 +- .../share/classfile/compactHashtable.hpp | 2 +- src/hotspot/share/classfile/javaClasses.cpp | 65 +++++++--- src/hotspot/share/classfile/javaClasses.hpp | 18 +-- src/hotspot/share/classfile/modules.cpp | 14 ++- src/hotspot/share/classfile/stringTable.cpp | 4 +- src/hotspot/share/classfile/symbolTable.cpp | 17 +-- src/hotspot/share/jfr/dcmd/jfrDcmds.cpp | 6 +- src/hotspot/share/jfr/jni/jfrJavaSupport.cpp | 6 +- .../checkpoint/types/jfrThreadState.cpp | 7 +- src/hotspot/share/oops/symbol.cpp | 2 +- src/hotspot/share/prims/jni.cpp | 7 +- src/hotspot/share/prims/jvmtiEnv.cpp | 2 +- .../share/services/finalizerService.cpp | 4 +- src/hotspot/share/utilities/utf8.cpp | 113 +++++++++++------- src/hotspot/share/utilities/utf8.hpp | 80 ++++++++++--- 16 files changed, 229 insertions(+), 122 deletions(-) diff --git a/src/hotspot/share/classfile/compactHashtable.cpp b/src/hotspot/share/classfile/compactHashtable.cpp index d4657e35a84..57991589fdc 100644 --- a/src/hotspot/share/classfile/compactHashtable.cpp +++ b/src/hotspot/share/classfile/compactHashtable.cpp @@ -431,10 +431,10 @@ void HashtableTextDump::get_utf8(char* utf8_buffer, int utf8_length) { } // NOTE: the content is NOT the same as -// UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen). +// UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, size_t buflen). 
// We want to escape \r\n\t so that output [1] is more readable; [2] can be more easily // parsed by scripts; [3] quickly processed by HashtableTextDump::get_utf8() -void HashtableTextDump::put_utf8(outputStream* st, const char* utf8_string, int utf8_length) { +void HashtableTextDump::put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length) { const char *c = utf8_string; const char *end = c + utf8_length; for (; c < end; c++) { diff --git a/src/hotspot/share/classfile/compactHashtable.hpp b/src/hotspot/share/classfile/compactHashtable.hpp index 6cb689ad20d..73e9f7fc092 100644 --- a/src/hotspot/share/classfile/compactHashtable.hpp +++ b/src/hotspot/share/classfile/compactHashtable.hpp @@ -431,7 +431,7 @@ public: int unescape(const char* from, const char* end, int count); void get_utf8(char* utf8_buffer, int utf8_length); - static void put_utf8(outputStream* st, const char* utf8_string, int utf8_length); + static void put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length); }; #endif // SHARE_CLASSFILE_COMPACTHASHTABLE_HPP diff --git a/src/hotspot/share/classfile/javaClasses.cpp b/src/hotspot/share/classfile/javaClasses.cpp index 8a6c8a8ce0b..b6ef682ae09 100644 --- a/src/hotspot/share/classfile/javaClasses.cpp +++ b/src/hotspot/share/classfile/javaClasses.cpp @@ -304,7 +304,8 @@ Handle java_lang_String::create_from_unicode(const jchar* unicode, int length, T #ifdef ASSERT { ResourceMark rm; - char* expected = UNICODE::as_utf8(unicode, length); + size_t utf8_len = static_cast<size_t>(length); + char* expected = UNICODE::as_utf8(unicode, utf8_len); char* actual = as_utf8_string(h_obj()); if (strcmp(expected, actual) != 0) { fatal("Unicode conversion failure: %s --> %s", expected, actual); @@ -346,7 +347,7 @@ Handle java_lang_String::create_from_str(const char* utf8_str, TRAPS) { #ifdef ASSERT // This check is too strict when the input string is not a valid UTF8. // For example, it may be created with arbitrary content via jni_NewStringUTF.
- if (UTF8::is_legal_utf8((const unsigned char*)utf8_str, (int)strlen(utf8_str), false)) { + if (UTF8::is_legal_utf8((const unsigned char*)utf8_str, strlen(utf8_str), false)) { ResourceMark rm; const char* expected = utf8_str; char* actual = as_utf8_string(h_obj()); @@ -554,7 +555,7 @@ char* java_lang_String::as_quoted_ascii(oop java_string) { if (length == 0) return nullptr; char* result; - int result_length; + size_t result_length; if (!is_latin1) { jchar* base = value->char_at_addr(0); result_length = UNICODE::quoted_ascii_length(base, length) + 1; @@ -566,8 +567,8 @@ char* java_lang_String::as_quoted_ascii(oop java_string) { result = NEW_RESOURCE_ARRAY(char, result_length); UNICODE::as_quoted_ascii(base, length, result, result_length); } - assert(result_length >= length + 1, "must not be shorter"); - assert(result_length == (int)strlen(result) + 1, "must match"); + assert(result_length >= (size_t)length + 1, "must not be shorter"); + assert(result_length == strlen(result) + 1, "must match"); return result; } @@ -582,8 +583,9 @@ Symbol* java_lang_String::as_symbol(oop java_string) { } else { ResourceMark rm; jbyte* position = (length == 0) ? nullptr : value->byte_at_addr(0); - const char* base = UNICODE::as_utf8(position, length); - Symbol* sym = SymbolTable::new_symbol(base, length); + size_t utf8_len = static_cast<size_t>(length); + const char* base = UNICODE::as_utf8(position, utf8_len); + Symbol* sym = SymbolTable::new_symbol(base, checked_cast<int>(utf8_len)); return sym; } } @@ -598,12 +600,13 @@ Symbol* java_lang_String::as_symbol_or_null(oop java_string) { } else { ResourceMark rm; jbyte* position = (length == 0) ?
nullptr : value->byte_at_addr(0); - const char* base = UNICODE::as_utf8(position, length); - return SymbolTable::probe(base, length); + size_t utf8_len = static_cast<size_t>(length); + const char* base = UNICODE::as_utf8(position, utf8_len); + return SymbolTable::probe(base, checked_cast<int>(utf8_len)); } } -int java_lang_String::utf8_length(oop java_string, typeArrayOop value) { +size_t java_lang_String::utf8_length(oop java_string, typeArrayOop value) { assert(value_equals(value, java_lang_String::value(java_string)), "value must be same as java_lang_String::value(java_string)"); int length = java_lang_String::length(java_string, value); @@ -617,18 +620,39 @@ int java_lang_String::utf8_length(oop java_string, typeArrayOop value) { } } -int java_lang_String::utf8_length(oop java_string) { +size_t java_lang_String::utf8_length(oop java_string) { typeArrayOop value = java_lang_String::value(java_string); return utf8_length(java_string, value); } +int java_lang_String::utf8_length_as_int(oop java_string) { + typeArrayOop value = java_lang_String::value(java_string); + return utf8_length_as_int(java_string, value); +} + +int java_lang_String::utf8_length_as_int(oop java_string, typeArrayOop value) { + assert(value_equals(value, java_lang_String::value(java_string)), + "value must be same as java_lang_String::value(java_string)"); + int length = java_lang_String::length(java_string, value); + if (length == 0) { + return 0; + } + if (!java_lang_String::is_latin1(java_string)) { + return UNICODE::utf8_length_as_int(value->char_at_addr(0), length); + } else { + return UNICODE::utf8_length_as_int(value->byte_at_addr(0), length); + } +} + char* java_lang_String::as_utf8_string(oop java_string) { - int length; + size_t length; return as_utf8_string(java_string, length); } -char* java_lang_String::as_utf8_string(oop java_string, int& length) { +char* java_lang_String::as_utf8_string(oop java_string, size_t& length) { typeArrayOop value = java_lang_String::value(java_string); + //
`length` is used as the incoming number of characters to + // convert, and then set as the number of bytes in the UTF8 sequence. length = java_lang_String::length(java_string, value); bool is_latin1 = java_lang_String::is_latin1(java_string); if (!is_latin1) { @@ -642,7 +666,7 @@ char* java_lang_String::as_utf8_string(oop java_string, int& length) { // Uses a provided buffer if it's sufficiently large, otherwise allocates // a resource array to fit -char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, int buflen, int& utf8_len) { +char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, size_t buflen, size_t& utf8_len) { typeArrayOop value = java_lang_String::value(java_string); int len = java_lang_String::length(java_string, value); bool is_latin1 = java_lang_String::is_latin1(java_string); @@ -663,7 +687,7 @@ char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, int bufl } } -char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char* buf, int buflen) { +char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char* buf, size_t buflen) { assert(value_equals(value, java_lang_String::value(java_string)), "value must be same as java_lang_String::value(java_string)"); int length = java_lang_String::length(java_string, value); @@ -677,25 +701,28 @@ char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char } } -char* java_lang_String::as_utf8_string(oop java_string, char* buf, int buflen) { +char* java_lang_String::as_utf8_string(oop java_string, char* buf, size_t buflen) { typeArrayOop value = java_lang_String::value(java_string); return as_utf8_string(java_string, value, buf, buflen); } char* java_lang_String::as_utf8_string(oop java_string, int start, int len) { + // `length` is used as the incoming number of characters to + // convert, and then set as the number of bytes in the UTF8 sequence. 
+ size_t length = static_cast<size_t>(len); typeArrayOop value = java_lang_String::value(java_string); bool is_latin1 = java_lang_String::is_latin1(java_string); assert(start + len <= java_lang_String::length(java_string), "just checking"); if (!is_latin1) { jchar* position = value->char_at_addr(start); - return UNICODE::as_utf8(position, len); + return UNICODE::as_utf8(position, length); } else { jbyte* position = value->byte_at_addr(start); - return UNICODE::as_utf8(position, len); + return UNICODE::as_utf8(position, length); } } -char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, int buflen) { +char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, size_t buflen) { assert(value_equals(value, java_lang_String::value(java_string)), "value must be same as java_lang_String::value(java_string)"); assert(start + len <= java_lang_String::length(java_string), "just checking"); diff --git a/src/hotspot/share/classfile/javaClasses.hpp b/src/hotspot/share/classfile/javaClasses.hpp index eb3c5e29fdb..6f82a75e8ff 100644 --- a/src/hotspot/share/classfile/javaClasses.hpp +++ b/src/hotspot/share/classfile/javaClasses.hpp @@ -131,17 +131,21 @@ class java_lang_String : AllStatic { static inline bool deduplication_requested(oop java_string); static inline int length(oop java_string); static inline int length(oop java_string, typeArrayOop string_value); - static int utf8_length(oop java_string); - static int utf8_length(oop java_string, typeArrayOop string_value); + static size_t utf8_length(oop java_string); + static size_t utf8_length(oop java_string, typeArrayOop string_value); + // Legacy variants that truncate the length if needed + static int utf8_length_as_int(oop java_string); + static int utf8_length_as_int(oop java_string, typeArrayOop string_value); // String converters static char* as_utf8_string(oop java_string); - static char* as_utf8_string(oop java_string, int&
length); - static char* as_utf8_string_full(oop java_string, char* buf, int buflen, int& length); - static char* as_utf8_string(oop java_string, char* buf, int buflen); + // `length` is set to the length of the utf8 sequence. + static char* as_utf8_string(oop java_string, size_t& length); + static char* as_utf8_string_full(oop java_string, char* buf, size_t buflen, size_t& length); + static char* as_utf8_string(oop java_string, char* buf, size_t buflen); static char* as_utf8_string(oop java_string, int start, int len); - static char* as_utf8_string(oop java_string, typeArrayOop value, char* buf, int buflen); - static char* as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, int buflen); + static char* as_utf8_string(oop java_string, typeArrayOop value, char* buf, size_t buflen); + static char* as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, size_t buflen); static char* as_platform_dependent_str(Handle java_string, TRAPS); static jchar* as_unicode_string(oop java_string, int& length, TRAPS); static jchar* as_unicode_string_or_null(oop java_string, int& length); diff --git a/src/hotspot/share/classfile/modules.cpp b/src/hotspot/share/classfile/modules.cpp index ddbb84db3be..14e730f7a33 100644 --- a/src/hotspot/share/classfile/modules.cpp +++ b/src/hotspot/share/classfile/modules.cpp @@ -72,7 +72,9 @@ static char* get_module_name(oop module, int& len, TRAPS) { if (name_oop == nullptr) { THROW_MSG_NULL(vmSymbols::java_lang_NullPointerException(), "Null module name"); } - char* module_name = java_lang_String::as_utf8_string(name_oop, len); + size_t utf8_len; + char* module_name = java_lang_String::as_utf8_string(name_oop, utf8_len); + len = checked_cast<int>(utf8_len); // module names are < 64K if (!verify_module_name(module_name, len)) { THROW_MSG_NULL(vmSymbols::java_lang_IllegalArgumentException(), err_msg("Invalid module name: %s", module_name)); @@ -84,9 +86,9 @@ static Symbol* as_symbol(jstring
str_object) { if (str_object == nullptr) { return nullptr; } - int len; + size_t len; char* str = java_lang_String::as_utf8_string(JNIHandles::resolve_non_null(str_object), len); - return SymbolTable::new_symbol(str, len); + return SymbolTable::new_symbol(str, checked_cast<int>(len)); } ModuleEntryTable* Modules::get_module_entry_table(Handle h_loader) { @@ -142,8 +144,10 @@ bool Modules::is_package_defined(Symbol* package, Handle h_loader) { // Will use the provided buffer if it's sufficiently large, otherwise allocates // a resource array // The length of the resulting string will be assigned to utf8_len -static const char* as_internal_package(oop package_string, char* buf, int buflen, int& utf8_len) { - char* package_name = java_lang_String::as_utf8_string_full(package_string, buf, buflen, utf8_len); +static const char* as_internal_package(oop package_string, char* buf, size_t buflen, int& utf8_len) { + size_t full_utf8_len; + char* package_name = java_lang_String::as_utf8_string_full(package_string, buf, buflen, full_utf8_len); + utf8_len = checked_cast<int>(full_utf8_len); // package names are < 64K // Turn all '/'s into '.'s for (int index = 0; index < utf8_len; index++) { diff --git a/src/hotspot/share/classfile/stringTable.cpp b/src/hotspot/share/classfile/stringTable.cpp index b01ecb24ac9..3a6cf166ff5 100644 --- a/src/hotspot/share/classfile/stringTable.cpp +++ b/src/hotspot/share/classfile/stringTable.cpp @@ -686,7 +686,7 @@ static void print_string(Thread* current, outputStream* st, oop s) { st->print("%d: ", length); } else { ResourceMark rm(current); - int utf8_length = length; + size_t utf8_length = length; char* utf8_string; if (!is_latin1) { @@ -697,7 +697,7 @@ static void print_string(Thread* current, outputStream* st, oop s) { utf8_string = UNICODE::as_utf8(bytes, utf8_length); } - st->print("%d: ", utf8_length); + st->print("%zu: ", utf8_length); HashtableTextDump::put_utf8(st, utf8_string, utf8_length); } st->cr(); diff --git
a/src/hotspot/share/classfile/symbolTable.cpp b/src/hotspot/share/classfile/symbolTable.cpp index 95094238946..19306a2a9db 100644 --- a/src/hotspot/share/classfile/symbolTable.cpp +++ b/src/hotspot/share/classfile/symbolTable.cpp @@ -349,6 +349,7 @@ Symbol* SymbolTable::lookup_common(const char* name, // to be used for arbitrary strings. For debug builds we will assert if // a string is too long, whereas product builds will truncate it. static int check_length(const char* name, int len) { + assert(len >= 0, "negative length %d suggests integer overflow in the caller", len); assert(len <= Symbol::max_length(), "String length %d exceeds the maximum Symbol length of %d", len, Symbol::max_length()); if (len > Symbol::max_length()) { @@ -461,33 +462,33 @@ Symbol* SymbolTable::lookup_only(const char* name, int len, unsigned int& hash) // and probing logic, so there is no need for convert_to_utf8 until // an actual new Symbol* is created. Symbol* SymbolTable::new_symbol(const jchar* name, int utf16_length) { - int utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length); + size_t utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length); char stack_buf[ON_STACK_BUFFER_LENGTH]; - if (utf8_length < (int) sizeof(stack_buf)) { + if (utf8_length < sizeof(stack_buf)) { char* chars = stack_buf; UNICODE::convert_to_utf8(name, utf16_length, chars); - return new_symbol(chars, utf8_length); + return new_symbol(chars, checked_cast<int>(utf8_length)); } else { ResourceMark rm; char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1); UNICODE::convert_to_utf8(name, utf16_length, chars); - return new_symbol(chars, utf8_length); + return new_symbol(chars, checked_cast<int>(utf8_length)); } } Symbol* SymbolTable::lookup_only_unicode(const jchar* name, int utf16_length, unsigned int& hash) { - int utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length); + size_t utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length); char stack_buf[ON_STACK_BUFFER_LENGTH]; - if
(utf8_length < (int) sizeof(stack_buf)) { + if (utf8_length < sizeof(stack_buf)) { char* chars = stack_buf; UNICODE::convert_to_utf8(name, utf16_length, chars); - return lookup_only(chars, utf8_length, hash); + return lookup_only(chars, checked_cast<int>(utf8_length), hash); } else { ResourceMark rm; char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1); UNICODE::convert_to_utf8(name, utf16_length, chars); - return lookup_only(chars, utf8_length, hash); + return lookup_only(chars, checked_cast<int>(utf8_length), hash); } } diff --git a/src/hotspot/share/jfr/dcmd/jfrDcmds.cpp b/src/hotspot/share/jfr/dcmd/jfrDcmds.cpp index c6944cfa219..56e006ab25c 100644 --- a/src/hotspot/share/jfr/dcmd/jfrDcmds.cpp +++ b/src/hotspot/share/jfr/dcmd/jfrDcmds.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -292,10 +292,10 @@ static const char* get_as_dcmd_arena_string(oop string) { char* str = nullptr; const typeArrayOop value = java_lang_String::value(string); if (value != nullptr) { - const size_t length = static_cast<size_t>(java_lang_String::utf8_length(string, value)) + 1; + const size_t length = java_lang_String::utf8_length(string, value) + 1; str = dcmd_arena_allocate(length); assert(str != nullptr, "invariant"); - java_lang_String::as_utf8_string(string, value, str, static_cast<int>(length)); + java_lang_String::as_utf8_string(string, value, str, length); } return str; } diff --git a/src/hotspot/share/jfr/jni/jfrJavaSupport.cpp b/src/hotspot/share/jfr/jni/jfrJavaSupport.cpp index 0b3097ca1fa..4e2493fc251 100644 --- a/src/hotspot/share/jfr/jni/jfrJavaSupport.cpp +++ b/src/hotspot/share/jfr/jni/jfrJavaSupport.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -502,7 +502,7 @@ Klass* JfrJavaSupport::klass(const jobject handle) { return obj->klass(); } -static char* allocate_string(bool c_heap, int length, Thread* thread) { +static char* allocate_string(bool c_heap, size_t length, Thread* thread) { return c_heap ? NEW_C_HEAP_ARRAY(char, length, mtTracing) : NEW_RESOURCE_ARRAY_IN_THREAD(thread, char, length); } @@ -511,7 +511,7 @@ const char* JfrJavaSupport::c_str(oop string, Thread* thread, bool c_heap /* fal char* str = nullptr; const typeArrayOop value = java_lang_String::value(string); if (value != nullptr) { - const int length = java_lang_String::utf8_length(string, value); + const size_t length = java_lang_String::utf8_length(string, value); str = allocate_string(c_heap, length + 1, thread); if (str == nullptr) { return nullptr; diff --git a/src/hotspot/share/jfr/recorder/checkpoint/types/jfrThreadState.cpp b/src/hotspot/share/jfr/recorder/checkpoint/types/jfrThreadState.cpp index bbc14327801..f6bf3e685b6 100644 --- a/src/hotspot/share/jfr/recorder/checkpoint/types/jfrThreadState.cpp +++ b/src/hotspot/share/jfr/recorder/checkpoint/types/jfrThreadState.cpp @@ -1,5 +1,5 @@ /* -* Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved. +* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -121,7 +121,10 @@ static const char* get_java_thread_name(const JavaThread* jt, int& length, oop v } assert(thread_obj != nullptr, "invariant"); const oop name = java_lang_Thread::name(thread_obj); - return name != nullptr ? 
java_lang_String::as_utf8_string(name, length) : nullptr; + size_t utf8_len; + const char* ret = name != nullptr ? java_lang_String::as_utf8_string(name, utf8_len) : nullptr; + length = checked_cast<int>(utf8_len); // Thread names should be short + return ret; } const char* JfrThreadName::name(const Thread* t, int& length, oop vthread) { diff --git a/src/hotspot/share/oops/symbol.cpp b/src/hotspot/share/oops/symbol.cpp index 8fe7c2aadbf..276a2018241 100644 --- a/src/hotspot/share/oops/symbol.cpp +++ b/src/hotspot/share/oops/symbol.cpp @@ -166,7 +166,7 @@ void Symbol::print_symbol_on(outputStream* st) const { char* Symbol::as_quoted_ascii() const { const char *ptr = (const char *)&_body[0]; - int quoted_length = UTF8::quoted_ascii_length(ptr, utf8_length()); + size_t quoted_length = UTF8::quoted_ascii_length(ptr, utf8_length()); char* result = NEW_RESOURCE_ARRAY(char, quoted_length + 1); UTF8::as_quoted_ascii(ptr, utf8_length(), result, quoted_length + 1); return result; diff --git a/src/hotspot/share/prims/jni.cpp b/src/hotspot/share/prims/jni.cpp index c9357fe0216..8a20d5f85b0 100644 --- a/src/hotspot/share/prims/jni.cpp +++ b/src/hotspot/share/prims/jni.cpp @@ -2223,7 +2223,7 @@ JNI_END JNI_ENTRY(jsize, jni_GetStringUTFLength(JNIEnv *env, jstring string)) HOTSPOT_JNI_GETSTRINGUTFLENGTH_ENTRY(env, string); oop java_string = JNIHandles::resolve_non_null(string); - jsize ret = java_lang_String::utf8_length(java_string); + jsize ret = java_lang_String::utf8_length_as_int(java_string); HOTSPOT_JNI_GETSTRINGUTFLENGTH_RETURN(ret); return ret; JNI_END @@ -2236,10 +2236,11 @@ JNI_ENTRY(const char*, jni_GetStringUTFChars(JNIEnv *env, jstring string, jboole typeArrayOop s_value = java_lang_String::value(java_string); if (s_value != nullptr) { size_t length = java_lang_String::utf8_length(java_string, s_value); - /* JNI Specification states return null on OOM */ + // JNI Specification states return null on OOM.
+ // The resulting sequence doesn't have to be NUL-terminated but we do. result = AllocateHeap(length + 1, mtInternal, AllocFailStrategy::RETURN_NULL); if (result != nullptr) { - java_lang_String::as_utf8_string(java_string, s_value, result, (int) length + 1); + java_lang_String::as_utf8_string(java_string, s_value, result, length + 1); if (isCopy != nullptr) { *isCopy = JNI_TRUE; } diff --git a/src/hotspot/share/prims/jvmtiEnv.cpp b/src/hotspot/share/prims/jvmtiEnv.cpp index ccccb5f1bda..63cc33222ec 100644 --- a/src/hotspot/share/prims/jvmtiEnv.cpp +++ b/src/hotspot/share/prims/jvmtiEnv.cpp @@ -1321,7 +1321,7 @@ JvmtiEnv::GetThreadInfo(jthread thread, jvmtiThreadInfo* info_ptr) { if (name() != nullptr) { n = java_lang_String::as_utf8_string(name()); } else { - int utf8_length = 0; + size_t utf8_length = 0; n = UNICODE::as_utf8((jchar*) nullptr, utf8_length); } diff --git a/src/hotspot/share/services/finalizerService.cpp b/src/hotspot/share/services/finalizerService.cpp index ecd9168cd65..fd46827ee00 100644 --- a/src/hotspot/share/services/finalizerService.cpp +++ b/src/hotspot/share/services/finalizerService.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -45,7 +45,7 @@ static const char* allocate(oop string) { char* str = nullptr; const typeArrayOop value = java_lang_String::value(string); if (value != nullptr) { - const int length = java_lang_String::utf8_length(string, value); + const size_t length = java_lang_String::utf8_length(string, value); str = NEW_C_HEAP_ARRAY(char, length + 1, mtServiceability); java_lang_String::as_utf8_string(string, value, str, length + 1); } diff --git a/src/hotspot/share/utilities/utf8.cpp b/src/hotspot/share/utilities/utf8.cpp index 47cbb04da4b..cd28e715009 100644 --- a/src/hotspot/share/utilities/utf8.cpp +++ b/src/hotspot/share/utilities/utf8.cpp @@ -98,15 +98,21 @@ char* UTF8::next_character(const char* str, jint* value) { return next_ch; } -// Count bytes of the form 10xxxxxx and deduct this count +// The number of unicode characters in a utf8 sequence can be easily +// determined by noting that bytes of the form 10xxxxxx are part of +// a 2 or 3-byte multi-byte sequence, all others are either characters +// themselves or else the start of a multi-byte character. + +// Calculate the unicode length of a utf8 string of known size +// by counting bytes of the form 10xxxxxx and deducting this count // from the total byte count. The utf8 string must be in // legal form which has been verified in the format checker. -int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) { - int num_chars = len; +int UTF8::unicode_length(const char* str, size_t len, bool& is_latin1, bool& has_multibyte) { + size_t num_chars = len; has_multibyte = false; is_latin1 = true; unsigned char prev = 0; - for (int i = 0; i < len; i++) { + for (size_t i = 0; i < len; i++) { unsigned char c = str[i]; if ((c & 0xC0) == 0x80) { // Multibyte, check if valid latin1 character. 
@@ -118,12 +124,12 @@ int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_mu } prev = c; } - return num_chars; + return checked_cast<int>(num_chars); } -// Count bytes of the utf8 string except those in form -// 10xxxxxx which only appear in multibyte characters. -// The utf8 string must be in legal form and has been +// Calculate the unicode length of a nul-terminated utf8 string +// by counting bytes of the utf8 string except those in the form +// 10xxxxxx. The utf8 string must be in legal form and has been // verified in the format checker. int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) { int num_chars = 0; @@ -195,10 +201,10 @@ template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unico template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length); // returns the quoted ascii length of a 0-terminated utf8 string -int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) { +size_t UTF8::quoted_ascii_length(const char* utf8_str, size_t utf8_length) { const char *ptr = utf8_str; const char* end = ptr + utf8_length; - int result = 0; + size_t result = 0; while (ptr < end) { jchar c; ptr = UTF8::next(ptr, &c); @@ -212,7 +218,7 @@ int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) { } // converts a utf8 string to quoted ascii -void UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen) { +void UTF8::as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen) { const char *ptr = utf8_str; const char *utf8_end = ptr + utf8_length; char* p = buf; @@ -248,7 +254,7 @@ const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) { return quoted_ascii_str; } // everything up to this point was ok.
- int length = ptr - quoted_ascii_str; + size_t length = ptr - quoted_ascii_str; char* buffer = nullptr; for (int round = 0; round < 2; round++) { while (*ptr != '\0') { @@ -330,11 +336,11 @@ jint UTF8::get_supplementary_character(const unsigned char* str) { + ((str[4] & 0x0f) << 6) + (str[5] & 0x3f); } -bool UTF8::is_legal_utf8(const unsigned char* buffer, int length, +bool UTF8::is_legal_utf8(const unsigned char* buffer, size_t length, bool version_leq_47) { - int i = 0; - int count = length >> 2; - for (int k=0; k<count; k++) { + size_t i = 0; + size_t count = length >> 2; + for (size_t k = 0; k < count; k++) { unsigned char b0 = buffer[i]; unsigned char b1 = buffer[i+1]; unsigned char b2 = buffer[i+2]; @@ -405,7 +411,7 @@ static bool is_starting_byte(unsigned char b) { // To avoid that the caller can choose to check for validity first. // The incoming buffer is still expected to be NUL-terminated. // The incoming buffer is expected to be a realistic size - we assert if it is too small. -void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) { +void UTF8::truncate_to_legal_utf8(unsigned char* buffer, size_t length) { assert(length > 5, "invalid length"); assert(buffer[length - 1] == '\0', "Buffer should be NUL-terminated"); @@ -433,7 +439,7 @@ void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) { // then we insert NUL at that location to terminate the buffer. There is an added complexity with 6 byte // encodings as the first and fourth bytes are the same and overlap with the 3 byte encoding. - for (int index = length - 2; index > 0; index--) { + for (size_t index = length - 2; index > 0; index--) { if (is_starting_byte(buffer[index])) { if (buffer[index] == 0xED) { // Could be first byte of 3 or 6, or fourth byte of 6. @@ -441,7 +447,7 @@ void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) { // surrogate value in the range EDA080 to EDAFBF. We only // need to check for EDA to establish this as the "missing" // values in EDAxxx would not be valid 3 byte encodings.
- if ((index - 3) >= 0 && + if (index >= 3 && (buffer[index - 3] == 0xED) && ((buffer[index - 2] & 0xF0) == 0xA0)) { assert(buffer[index - 1] >= 0x80 && buffer[index - 1] <= 0xBF, "sanity check"); @@ -470,7 +476,7 @@ bool UNICODE::is_latin1(const jchar* base, int length) { return true; } -int UNICODE::utf8_size(jchar c) { +size_t UNICODE::utf8_size(jchar c) { if ((0x0001 <= c) && (c <= 0x007F)) { // ASCII character return 1; @@ -481,7 +487,7 @@ int UNICODE::utf8_size(jchar c) { } } -int UNICODE::utf8_size(jbyte c) { +size_t UNICODE::utf8_size(jbyte c) { if (c >= 0x01) { // ASCII character. Check is equivalent to // (0x01 <= c) && (c <= 0x7F) because c is signed. @@ -494,11 +500,23 @@ int UNICODE::utf8_size(jbyte c) { } template<typename T> -int UNICODE::utf8_length(const T* base, int length) { +size_t UNICODE::utf8_length(const T* base, int length) { + size_t result = 0; + for (int index = 0; index < length; index++) { + result += utf8_size(base[index]); + } + return result; +} + +template<typename T> +int UNICODE::utf8_length_as_int(const T* base, int length) { size_t result = 0; for (int index = 0; index < length; index++) { T c = base[index]; - int sz = utf8_size(c); + size_t sz = utf8_size(c); + // If the length is > INT_MAX-1 we truncate at a completed + // modified-UTF8 encoding. This allows for +1 to be added + // by the caller for NUL-termination, without overflow.
if (result + sz > INT_MAX-1) { break; } @@ -508,41 +526,44 @@ int UNICODE::utf8_length(const T* base, int length) { } template<typename T> -char* UNICODE::as_utf8(const T* base, int& length) { - int utf8_len = utf8_length(base, length); +char* UNICODE::as_utf8(const T* base, size_t& length) { + // Incoming length must be <= INT_MAX + size_t utf8_len = utf8_length(base, static_cast<int>(length)); u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1); - char* result = as_utf8(base, length, (char*) buf, utf8_len + 1); - assert((int) strlen(result) == utf8_len, "length prediction must be correct"); - // Set string length to uft8 length + char* result = as_utf8(base, static_cast<int>(length), (char*) buf, utf8_len + 1); + assert(strlen(result) == utf8_len, "length prediction must be correct"); + // Set outgoing string length to uft8 length length = utf8_len; return (char*) result; } -char* UNICODE::as_utf8(const jchar* base, int length, char* buf, int buflen) { +char* UNICODE::as_utf8(const jchar* base, int length, char* buf, size_t buflen) { assert(buflen > 0, "zero length output buffer"); u_char* p = (u_char*)buf; for (int index = 0; index < length; index++) { jchar c = base[index]; - buflen -= utf8_size(c); - if (buflen <= 0) break; // string is truncated + size_t sz = utf8_size(c); + if (sz >= buflen) break; // string is truncated + buflen -= sz; p = utf8_write(p, c); } *p = '\0'; return buf; } -char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, int buflen) { +char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, size_t buflen) { assert(buflen > 0, "zero length output buffer"); u_char* p = (u_char*)buf; for (int index = 0; index < length; index++) { jbyte c = base[index]; - int sz = utf8_size(c); + size_t sz = utf8_size(c); + if (sz >= buflen) break; // string is truncated buflen -= sz; - if (buflen <= 0) break; // string is truncated if (sz == 1) { // Copy ASCII characters (UTF-8 is ASCII compatible) *p++ = c; } else { + assert(sz == 2, "must be!"); //
Non-ASCII character or 0x00 which should // be encoded as 0xC080 in "modified" UTF8. p = utf8_write(p, ((jchar) c) & 0xff); @@ -561,8 +582,8 @@ void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) // returns the quoted ascii length of a unicode string template -int UNICODE::quoted_ascii_length(const T* base, int length) { - int result = 0; +size_t UNICODE::quoted_ascii_length(const T* base, int length) { + size_t result = 0; for (int i = 0; i < length; i++) { T c = base[i]; if (c >= 32 && c < 127) { @@ -576,7 +597,7 @@ int UNICODE::quoted_ascii_length(const T* base, int length) { // converts a unicode string to quoted ascii template -void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) { +void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, size_t buflen) { char* p = buf; char* end = buf + buflen; for (int index = 0; index < length; index++) { @@ -594,11 +615,13 @@ void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) } // Explicit instantiation for all supported types. 
-template int UNICODE::utf8_length(const jbyte* base, int length); -template int UNICODE::utf8_length(const jchar* base, int length); -template char* UNICODE::as_utf8(const jbyte* base, int& length); -template char* UNICODE::as_utf8(const jchar* base, int& length); -template int UNICODE::quoted_ascii_length(const jbyte* base, int length); -template int UNICODE::quoted_ascii_length(const jchar* base, int length); -template void UNICODE::as_quoted_ascii(const jbyte* base, int length, char* buf, int buflen); -template void UNICODE::as_quoted_ascii(const jchar* base, int length, char* buf, int buflen); +template size_t UNICODE::utf8_length(const jbyte* base, int length); +template size_t UNICODE::utf8_length(const jchar* base, int length); +template int UNICODE::utf8_length_as_int(const jbyte* base, int length); +template int UNICODE::utf8_length_as_int(const jchar* base, int length); +template char* UNICODE::as_utf8(const jbyte* base, size_t& length); +template char* UNICODE::as_utf8(const jchar* base, size_t& length); +template size_t UNICODE::quoted_ascii_length(const jbyte* base, int length); +template size_t UNICODE::quoted_ascii_length(const jchar* base, int length); +template void UNICODE::as_quoted_ascii(const jbyte* base, int length, char* buf, size_t buflen); +template void UNICODE::as_quoted_ascii(const jchar* base, int length, char* buf, size_t buflen); diff --git a/src/hotspot/share/utilities/utf8.hpp b/src/hotspot/share/utilities/utf8.hpp index 9a18dd0ff93..3b4ff30e3c3 100644 --- a/src/hotspot/share/utilities/utf8.hpp +++ b/src/hotspot/share/utilities/utf8.hpp @@ -29,6 +29,45 @@ #include "memory/allStatic.hpp" #include "utilities/debug.hpp" +/** + +String handling within Java and the VM requires a bit of explanation. + +Logically a java.lang.String is a sequence of 16-bit Unicode characters +encoded in UTF-16. In the past a String contained a Java char[] and so +could theoretically contain INT_MAX 16-bit characters. Then came JEP 254: +Compact Strings. 
+ +With Compact Strings the Java char[] becomes a Java byte[], and that byte[] +contains either latin-1 characters all of which fit in 8 bits, or else each +pair of bytes represents a UTF-16 character. Consequently the maximum length +in characters of a latin-1 string is INT_MAX, whilst for non-latin-1 it is INT_MAX/2. + +In the code below if we have latin-1 content then we treat the String's data +array as a jbyte[], else a jchar[]. The lengths of these arrays are specified +as an int value, with a nominal maximum of INT_MAX. + +The modified UTF-8 encoding specified for the VM nominally encodes characters +in 1, 2, 3 or 6 bytes. The 6-byte representation is actually two 3-byte representations +for two UTF-16 characters forming a surrogate pair. If we are dealing with +a latin-1 string then each character will be encoded as either 1 or 2 bytes and so the +maximum UTF8 length is 2*INT_MAX. This can't be stored in an int so utf8 buffers must +use a size_t length. For non-latin-1 strings each UTF-16 character will encode as at most +3 bytes, so the maximum UTF8 length in that case is 3 * INT_MAX/2 i.e. 1.5*INT_MAX. + +The "quoted ascii" form of a unicode string is at worst 6 times longer than its +regular form, and so these lengths must always be size_t - though if we know we only +ever do this to symbols (or small symbol combinations) then we could use int. + +There is an additional assumption/expectation that our UTF8 APIs are never dealing with +invalid UTF8, and more generally that all UTF8 sequences could form valid Strings. +Consequently the Unicode length of a UTF8 sequence is assumed to always be representable +by an int. However, there are APIs, such as JNI NewStringUTF, that do deal with such input +and could potentially have an unrepresentable string. The long-standing position with JNI +is that the user must supply valid input so we do not try to account for these cases. 
+ +*/ + // Low-level interface for UTF8 strings class UTF8 : AllStatic { @@ -41,20 +80,20 @@ class UTF8 : AllStatic { static int unicode_length(const char* utf8_str, bool& is_latin1, bool& has_multibyte); // returns the unicode length of a non-0-terminated utf8 string - static int unicode_length(const char* utf8_str, int len) { + static int unicode_length(const char* utf8_str, size_t len) { bool is_latin1, has_multibyte; return unicode_length(utf8_str, len, is_latin1, has_multibyte); } - static int unicode_length(const char* utf8_str, int len, bool& is_latin1, bool& has_multibyte); + static int unicode_length(const char* utf8_str, size_t len, bool& is_latin1, bool& has_multibyte); // converts a utf8 string to a unicode string template static void convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length); // returns the quoted ascii length of a utf8 string - static int quoted_ascii_length(const char* utf8_str, int utf8_length); + static size_t quoted_ascii_length(const char* utf8_str, size_t utf8_length); // converts a utf8 string to quoted ascii - static void as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen); + static void as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen); #ifndef PRODUCT // converts a quoted ascii string to utf8 string. returns the original @@ -82,13 +121,13 @@ class UTF8 : AllStatic { while(--length >= 0 && base[length] != c); return (length < 0) ? 
nullptr : &base[length]; } - static bool equal(const jbyte* base1, int length1, const jbyte* base2,int length2); + static bool equal(const jbyte* base1, int length1, const jbyte* base2, int length2); static bool is_supplementary_character(const unsigned char* str); static jint get_supplementary_character(const unsigned char* str); - static bool is_legal_utf8(const unsigned char* buffer, int length, + static bool is_legal_utf8(const unsigned char* buffer, size_t length, bool version_leq_47); - static void truncate_to_legal_utf8(unsigned char* buffer, int length); + static void truncate_to_legal_utf8(unsigned char* buffer, size_t length); }; @@ -99,6 +138,12 @@ class UTF8 : AllStatic { // units, so a supplementary character uses two positions in a unicode string. class UNICODE : AllStatic { + + // returns the utf8 size of a unicode character + // uses size_t for convenience in overflow checks + static size_t utf8_size(jchar c); + static size_t utf8_size(jbyte c); + public: // checks if the given unicode character can be encoded as latin1 static bool is_latin1(jchar c); @@ -106,28 +151,27 @@ class UNICODE : AllStatic { // checks if the given string can be encoded as latin1 static bool is_latin1(const jchar* base, int length); - // returns the utf8 size of a unicode character - static int utf8_size(jchar c); - static int utf8_size(jbyte c); - // returns the utf8 length of a unicode string - template static int utf8_length(const T* base, int length); + template static size_t utf8_length(const T* base, int length); + + // returns the utf8 length of a unicode string as an int - truncated if needed + template static int utf8_length_as_int(const T* base, int length); // converts a unicode string to utf8 string static void convert_to_utf8(const jchar* base, int length, char* utf8_buffer); // converts a unicode string to a utf8 string; result is allocated // in resource area unless a buffer is provided. 
The unicode 'length' - // parameter is set to the length of the result utf8 string. - template static char* as_utf8(const T* base, int& length); - static char* as_utf8(const jchar* base, int length, char* buf, int buflen); - static char* as_utf8(const jbyte* base, int length, char* buf, int buflen); + // parameter is set to the length of the resulting utf8 string. + template static char* as_utf8(const T* base, size_t& length); + static char* as_utf8(const jchar* base, int length, char* buf, size_t buflen); + static char* as_utf8(const jbyte* base, int length, char* buf, size_t buflen); // returns the quoted ascii length of a unicode string - template static int quoted_ascii_length(const T* base, int length); + template static size_t quoted_ascii_length(const T* base, int length); // converts a unicode string to quoted ascii - template static void as_quoted_ascii(const T* base, int length, char* buf, int buflen); + template static void as_quoted_ascii(const T* base, int length, char* buf, size_t buflen); }; #endif // SHARE_UTILITIES_UTF8_HPP