8338257: UTF8 lengths should be size_t not int

Reviewed-by: stuefe, coleenp, dlong
2024-08-29 20:38:52 +00:00 · 2024-08-29 20:38:52 +00:00 · a4962ace4d
commit a4962ace4d
parent 777ed2b5d2
16 changed files with 229 additions and 122 deletions
--- a/src/hotspot/share/classfile/compactHashtable.cpp
+++ b/src/hotspot/share/classfile/compactHashtable.cpp
@ -431,10 +431,10 @@ void HashtableTextDump::get_utf8(char* utf8_buffer, int utf8_length) {
 }

 // NOTE: the content is NOT the same as
-// UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen).
+// UTF8::as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen).
 // We want to escape \r\n\t so that output [1] is more readable; [2] can be more easily
 // parsed by scripts; [3] quickly processed by HashtableTextDump::get_utf8()
-void HashtableTextDump::put_utf8(outputStream* st, const char* utf8_string, int utf8_length) {
+void HashtableTextDump::put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length) {
  const char *c = utf8_string;
  const char *end = c + utf8_length;
  for (; c < end; c++) {
--- a/src/hotspot/share/classfile/compactHashtable.hpp
+++ b/src/hotspot/share/classfile/compactHashtable.hpp
@ -431,7 +431,7 @@ public:

  int unescape(const char* from, const char* end, int count);
  void get_utf8(char* utf8_buffer, int utf8_length);
-  static void put_utf8(outputStream* st, const char* utf8_string, int utf8_length);
+  static void put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length);
 };

 #endif // SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
--- a/src/hotspot/share/classfile/javaClasses.cpp
+++ b/src/hotspot/share/classfile/javaClasses.cpp
@ -304,7 +304,8 @@ Handle java_lang_String::create_from_unicode(const jchar* unicode, int length, T
 #ifdef ASSERT
  {
    ResourceMark rm;
-    char* expected = UNICODE::as_utf8(unicode, length);
+    size_t utf8_len = static_cast<size_t>(length);
+    char* expected = UNICODE::as_utf8(unicode, utf8_len);
    char* actual = as_utf8_string(h_obj());
    if (strcmp(expected, actual) != 0) {
      fatal("Unicode conversion failure: %s --> %s", expected, actual);
@ -346,7 +347,7 @@ Handle java_lang_String::create_from_str(const char* utf8_str, TRAPS) {
 #ifdef ASSERT
  // This check is too strict when the input string is not a valid UTF8.
  // For example, it may be created with arbitrary content via jni_NewStringUTF.
-  if (UTF8::is_legal_utf8((const unsigned char*)utf8_str, (int)strlen(utf8_str), false)) {
+  if (UTF8::is_legal_utf8((const unsigned char*)utf8_str, strlen(utf8_str), false)) {
    ResourceMark rm;
    const char* expected = utf8_str;
    char* actual = as_utf8_string(h_obj());
@ -554,7 +555,7 @@ char* java_lang_String::as_quoted_ascii(oop java_string) {
  if (length == 0) return nullptr;

  char* result;
-  int result_length;
+  size_t result_length;
  if (!is_latin1) {
    jchar* base = value->char_at_addr(0);
    result_length = UNICODE::quoted_ascii_length(base, length) + 1;
@ -566,8 +567,8 @@ char* java_lang_String::as_quoted_ascii(oop java_string) {
    result = NEW_RESOURCE_ARRAY(char, result_length);
    UNICODE::as_quoted_ascii(base, length, result, result_length);
  }
-  assert(result_length >= length + 1, "must not be shorter");
-  assert(result_length == (int)strlen(result) + 1, "must match");
+  assert(result_length >= (size_t)length + 1, "must not be shorter");
+  assert(result_length == strlen(result) + 1, "must match");
  return result;
 }

@ -582,8 +583,9 @@ Symbol* java_lang_String::as_symbol(oop java_string) {
  } else {
    ResourceMark rm;
    jbyte* position = (length == 0) ? nullptr : value->byte_at_addr(0);
-    const char* base = UNICODE::as_utf8(position, length);
-    Symbol* sym = SymbolTable::new_symbol(base, length);
+    size_t utf8_len = static_cast<size_t>(length);
+    const char* base = UNICODE::as_utf8(position, utf8_len);
+    Symbol* sym = SymbolTable::new_symbol(base, checked_cast<int>(utf8_len));
    return sym;
  }
 }
@ -598,12 +600,13 @@ Symbol* java_lang_String::as_symbol_or_null(oop java_string) {
  } else {
    ResourceMark rm;
    jbyte* position = (length == 0) ? nullptr : value->byte_at_addr(0);
-    const char* base = UNICODE::as_utf8(position, length);
-    return SymbolTable::probe(base, length);
+    size_t utf8_len = static_cast<size_t>(length);
+    const char* base = UNICODE::as_utf8(position, utf8_len);
+    return SymbolTable::probe(base, checked_cast<int>(utf8_len));
  }
 }

-int java_lang_String::utf8_length(oop java_string, typeArrayOop value) {
+size_t java_lang_String::utf8_length(oop java_string, typeArrayOop value) {
  assert(value_equals(value, java_lang_String::value(java_string)),
         "value must be same as java_lang_String::value(java_string)");
  int length = java_lang_String::length(java_string, value);
@ -617,18 +620,39 @@ int java_lang_String::utf8_length(oop java_string, typeArrayOop value) {
  }
 }

-int java_lang_String::utf8_length(oop java_string) {
+size_t java_lang_String::utf8_length(oop java_string) {
  typeArrayOop value = java_lang_String::value(java_string);
  return utf8_length(java_string, value);
 }

+int java_lang_String::utf8_length_as_int(oop java_string) {
+  typeArrayOop value = java_lang_String::value(java_string);
+  return utf8_length_as_int(java_string, value);
+}
+
+int java_lang_String::utf8_length_as_int(oop java_string, typeArrayOop value) {
+  assert(value_equals(value, java_lang_String::value(java_string)),
+         "value must be same as java_lang_String::value(java_string)");
+  int length = java_lang_String::length(java_string, value);
+  if (length == 0) {
+    return 0;
+  }
+  if (!java_lang_String::is_latin1(java_string)) {
+    return UNICODE::utf8_length_as_int(value->char_at_addr(0), length);
+  } else {
+    return UNICODE::utf8_length_as_int(value->byte_at_addr(0), length);
+  }
+}
+
 char* java_lang_String::as_utf8_string(oop java_string) {
-  int length;
+  size_t length;
  return as_utf8_string(java_string, length);
 }

-char* java_lang_String::as_utf8_string(oop java_string, int& length) {
+char* java_lang_String::as_utf8_string(oop java_string, size_t& length) {
  typeArrayOop value = java_lang_String::value(java_string);
+  // `length` is used as the incoming number of characters to
+  // convert, and then set as the number of bytes in the UTF8 sequence.
  length             = java_lang_String::length(java_string, value);
  bool     is_latin1 = java_lang_String::is_latin1(java_string);
  if (!is_latin1) {
@ -642,7 +666,7 @@ char* java_lang_String::as_utf8_string(oop java_string, int& length) {

 // Uses a provided buffer if it's sufficiently large, otherwise allocates
 // a resource array to fit
-char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, int buflen, int& utf8_len) {
+char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, size_t buflen, size_t& utf8_len) {
  typeArrayOop value = java_lang_String::value(java_string);
  int            len = java_lang_String::length(java_string, value);
  bool     is_latin1 = java_lang_String::is_latin1(java_string);
@ -663,7 +687,7 @@ char* java_lang_String::as_utf8_string_full(oop java_string, char* buf, int bufl
  }
 }

-char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char* buf, int buflen) {
+char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char* buf, size_t buflen) {
  assert(value_equals(value, java_lang_String::value(java_string)),
         "value must be same as java_lang_String::value(java_string)");
  int     length = java_lang_String::length(java_string, value);
@ -677,25 +701,28 @@ char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, char
  }
 }

-char* java_lang_String::as_utf8_string(oop java_string, char* buf, int buflen) {
+char* java_lang_String::as_utf8_string(oop java_string, char* buf, size_t buflen) {
  typeArrayOop value = java_lang_String::value(java_string);
  return as_utf8_string(java_string, value, buf, buflen);
 }

 char* java_lang_String::as_utf8_string(oop java_string, int start, int len) {
+  // `length` is used as the incoming number of characters to
+  // convert, and then set as the number of bytes in the UTF8 sequence.
+  size_t  length = static_cast<size_t>(len);
  typeArrayOop value  = java_lang_String::value(java_string);
  bool      is_latin1 = java_lang_String::is_latin1(java_string);
  assert(start + len <= java_lang_String::length(java_string), "just checking");
  if (!is_latin1) {
    jchar* position = value->char_at_addr(start);
-    return UNICODE::as_utf8(position, len);
+    return UNICODE::as_utf8(position, length);
  } else {
    jbyte* position = value->byte_at_addr(start);
-    return UNICODE::as_utf8(position, len);
+    return UNICODE::as_utf8(position, length);
  }
 }

-char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, int buflen) {
+char* java_lang_String::as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, size_t buflen) {
  assert(value_equals(value, java_lang_String::value(java_string)),
         "value must be same as java_lang_String::value(java_string)");
  assert(start + len <= java_lang_String::length(java_string), "just checking");
--- a/src/hotspot/share/classfile/javaClasses.hpp
+++ b/src/hotspot/share/classfile/javaClasses.hpp
@ -131,17 +131,21 @@ class java_lang_String : AllStatic {
  static inline bool deduplication_requested(oop java_string);
  static inline int length(oop java_string);
  static inline int length(oop java_string, typeArrayOop string_value);
-  static int utf8_length(oop java_string);
-  static int utf8_length(oop java_string, typeArrayOop string_value);
+  static size_t utf8_length(oop java_string);
+  static size_t utf8_length(oop java_string, typeArrayOop string_value);
+  // Legacy variants that truncate the length if needed
+  static int    utf8_length_as_int(oop java_string);
+  static int    utf8_length_as_int(oop java_string, typeArrayOop string_value);

  // String converters
  static char*  as_utf8_string(oop java_string);
-  static char*  as_utf8_string(oop java_string, int& length);
-  static char*  as_utf8_string_full(oop java_string, char* buf, int buflen, int& length);
-  static char*  as_utf8_string(oop java_string, char* buf, int buflen);
+  // `length` is set to the length of the utf8 sequence.
+  static char*  as_utf8_string(oop java_string, size_t& length);
+  static char*  as_utf8_string_full(oop java_string, char* buf, size_t buflen, size_t& length);
+  static char*  as_utf8_string(oop java_string, char* buf, size_t buflen);
  static char*  as_utf8_string(oop java_string, int start, int len);
-  static char*  as_utf8_string(oop java_string, typeArrayOop value, char* buf, int buflen);
-  static char*  as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, int buflen);
+  static char*  as_utf8_string(oop java_string, typeArrayOop value, char* buf, size_t buflen);
+  static char*  as_utf8_string(oop java_string, typeArrayOop value, int start, int len, char* buf, size_t buflen);
  static char*  as_platform_dependent_str(Handle java_string, TRAPS);
  static jchar* as_unicode_string(oop java_string, int& length, TRAPS);
  static jchar* as_unicode_string_or_null(oop java_string, int& length);
--- a/src/hotspot/share/classfile/modules.cpp
+++ b/src/hotspot/share/classfile/modules.cpp
@ -72,7 +72,9 @@ static char* get_module_name(oop module, int& len, TRAPS) {
  if (name_oop == nullptr) {
    THROW_MSG_NULL(vmSymbols::java_lang_NullPointerException(), "Null module name");
  }
-  char* module_name = java_lang_String::as_utf8_string(name_oop, len);
+  size_t utf8_len;
+  char* module_name = java_lang_String::as_utf8_string(name_oop, utf8_len);
+  len = checked_cast<int>(utf8_len); // module names are < 64K
  if (!verify_module_name(module_name, len)) {
    THROW_MSG_NULL(vmSymbols::java_lang_IllegalArgumentException(),
                   err_msg("Invalid module name: %s", module_name));
@ -84,9 +86,9 @@ static Symbol* as_symbol(jstring str_object) {
  if (str_object == nullptr) {
    return nullptr;
  }
-  int len;
+  size_t len;
  char* str = java_lang_String::as_utf8_string(JNIHandles::resolve_non_null(str_object), len);
-  return SymbolTable::new_symbol(str, len);
+  return SymbolTable::new_symbol(str, checked_cast<int>(len));
 }

 ModuleEntryTable* Modules::get_module_entry_table(Handle h_loader) {
@ -142,8 +144,10 @@ bool Modules::is_package_defined(Symbol* package, Handle h_loader) {
 // Will use the provided buffer if it's sufficiently large, otherwise allocates
 // a resource array
 // The length of the resulting string will be assigned to utf8_len
-static const char* as_internal_package(oop package_string, char* buf, int buflen, int& utf8_len) {
-  char* package_name = java_lang_String::as_utf8_string_full(package_string, buf, buflen, utf8_len);
+static const char* as_internal_package(oop package_string, char* buf, size_t buflen, int& utf8_len) {
+  size_t full_utf8_len;
+  char* package_name = java_lang_String::as_utf8_string_full(package_string, buf, buflen, full_utf8_len);
+  utf8_len = checked_cast<int>(full_utf8_len); // package names are < 64K

  // Turn all '/'s into '.'s
  for (int index = 0; index < utf8_len; index++) {
--- a/src/hotspot/share/classfile/stringTable.cpp
+++ b/src/hotspot/share/classfile/stringTable.cpp
@ -686,7 +686,7 @@ static void print_string(Thread* current, outputStream* st, oop s) {
    st->print("%d: ", length);
  } else {
    ResourceMark rm(current);
-    int utf8_length = length;
+    size_t utf8_length = length;
    char* utf8_string;

    if (!is_latin1) {
@ -697,7 +697,7 @@ static void print_string(Thread* current, outputStream* st, oop s) {
      utf8_string = UNICODE::as_utf8(bytes, utf8_length);
    }

-    st->print("%d: ", utf8_length);
+    st->print("%zu: ", utf8_length);
    HashtableTextDump::put_utf8(st, utf8_string, utf8_length);
  }
  st->cr();
--- a/src/hotspot/share/classfile/symbolTable.cpp
+++ b/src/hotspot/share/classfile/symbolTable.cpp
@ -349,6 +349,7 @@ Symbol* SymbolTable::lookup_common(const char* name,
 // to be used for arbitrary strings. For debug builds we will assert if
 // a string is too long, whereas product builds will truncate it.
 static int check_length(const char* name, int len) {
+  assert(len >= 0, "negative length %d suggests integer overflow in the caller", len);
  assert(len <= Symbol::max_length(),
         "String length %d exceeds the maximum Symbol length of %d", len, Symbol::max_length());
  if (len > Symbol::max_length()) {
@ -461,33 +462,33 @@ Symbol* SymbolTable::lookup_only(const char* name, int len, unsigned int& hash)
 // and probing logic, so there is no need for convert_to_utf8 until
 // an actual new Symbol* is created.
 Symbol* SymbolTable::new_symbol(const jchar* name, int utf16_length) {
-  int utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
+  size_t utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
  char stack_buf[ON_STACK_BUFFER_LENGTH];
-  if (utf8_length < (int) sizeof(stack_buf)) {
+  if (utf8_length < sizeof(stack_buf)) {
    char* chars = stack_buf;
    UNICODE::convert_to_utf8(name, utf16_length, chars);
-    return new_symbol(chars, utf8_length);
+    return new_symbol(chars, checked_cast<int>(utf8_length));
  } else {
    ResourceMark rm;
    char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1);
    UNICODE::convert_to_utf8(name, utf16_length, chars);
-    return new_symbol(chars, utf8_length);
+    return new_symbol(chars, checked_cast<int>(utf8_length));
  }
 }

 Symbol* SymbolTable::lookup_only_unicode(const jchar* name, int utf16_length,
                                         unsigned int& hash) {
-  int utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
+  size_t utf8_length = UNICODE::utf8_length((jchar*) name, utf16_length);
  char stack_buf[ON_STACK_BUFFER_LENGTH];
-  if (utf8_length < (int) sizeof(stack_buf)) {
+  if (utf8_length < sizeof(stack_buf)) {
    char* chars = stack_buf;
    UNICODE::convert_to_utf8(name, utf16_length, chars);
-    return lookup_only(chars, utf8_length, hash);
+    return lookup_only(chars, checked_cast<int>(utf8_length), hash);
  } else {
    ResourceMark rm;
    char* chars = NEW_RESOURCE_ARRAY(char, utf8_length + 1);
    UNICODE::convert_to_utf8(name, utf16_length, chars);
-    return lookup_only(chars, utf8_length, hash);
+    return lookup_only(chars, checked_cast<int>(utf8_length), hash);
  }
 }

--- a/src/hotspot/share/jfr/dcmd/jfrDcmds.cpp
+++ b/src/hotspot/share/jfr/dcmd/jfrDcmds.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -292,10 +292,10 @@ static const char* get_as_dcmd_arena_string(oop string) {
  char* str = nullptr;
  const typeArrayOop value = java_lang_String::value(string);
  if (value != nullptr) {
-    const size_t length = static_cast<size_t>(java_lang_String::utf8_length(string, value)) + 1;
+    const size_t length = java_lang_String::utf8_length(string, value) + 1;
    str = dcmd_arena_allocate(length);
    assert(str != nullptr, "invariant");
-    java_lang_String::as_utf8_string(string, value, str, static_cast<int>(length));
+    java_lang_String::as_utf8_string(string, value, str, length);
  }
  return str;
 }
--- a/src/hotspot/share/jfr/jni/jfrJavaSupport.cpp
+++ b/src/hotspot/share/jfr/jni/jfrJavaSupport.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -502,7 +502,7 @@ Klass* JfrJavaSupport::klass(const jobject handle) {
  return obj->klass();
 }

-static char* allocate_string(bool c_heap, int length, Thread* thread) {
+static char* allocate_string(bool c_heap, size_t length, Thread* thread) {
  return c_heap ? NEW_C_HEAP_ARRAY(char, length, mtTracing) :
                  NEW_RESOURCE_ARRAY_IN_THREAD(thread, char, length);
 }
@ -511,7 +511,7 @@ const char* JfrJavaSupport::c_str(oop string, Thread* thread, bool c_heap /* fal
  char* str = nullptr;
  const typeArrayOop value = java_lang_String::value(string);
  if (value != nullptr) {
-    const int length = java_lang_String::utf8_length(string, value);
+    const size_t length = java_lang_String::utf8_length(string, value);
    str = allocate_string(c_heap, length + 1, thread);
    if (str == nullptr) {
      return nullptr;
--- a/src/hotspot/share/jfr/recorder/checkpoint/types/jfrThreadState.cpp
+++ b/src/hotspot/share/jfr/recorder/checkpoint/types/jfrThreadState.cpp
@ -1,5 +1,5 @@
 /*
-* Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -121,7 +121,10 @@ static const char* get_java_thread_name(const JavaThread* jt, int& length, oop v
  }
  assert(thread_obj != nullptr, "invariant");
  const oop name = java_lang_Thread::name(thread_obj);
-  return name != nullptr ? java_lang_String::as_utf8_string(name, length) : nullptr;
+  size_t utf8_len;
+  const char* ret = name != nullptr ? java_lang_String::as_utf8_string(name, utf8_len) : nullptr;
+  length = checked_cast<int>(utf8_len); // Thread names should be short. NOTE(review): utf8_len is read uninitialized when name == nullptr.
+  return ret;
 }

 const char* JfrThreadName::name(const Thread* t, int& length, oop vthread) {
--- a/src/hotspot/share/oops/symbol.cpp
+++ b/src/hotspot/share/oops/symbol.cpp
@ -166,7 +166,7 @@ void Symbol::print_symbol_on(outputStream* st) const {

 char* Symbol::as_quoted_ascii() const {
  const char *ptr = (const char *)&_body[0];
-  int quoted_length = UTF8::quoted_ascii_length(ptr, utf8_length());
+  size_t quoted_length = UTF8::quoted_ascii_length(ptr, utf8_length());
  char* result = NEW_RESOURCE_ARRAY(char, quoted_length + 1);
  UTF8::as_quoted_ascii(ptr, utf8_length(), result, quoted_length + 1);
  return result;
--- a/src/hotspot/share/prims/jni.cpp
+++ b/src/hotspot/share/prims/jni.cpp
@ -2223,7 +2223,7 @@ JNI_END
 JNI_ENTRY(jsize, jni_GetStringUTFLength(JNIEnv *env, jstring string))
 HOTSPOT_JNI_GETSTRINGUTFLENGTH_ENTRY(env, string);
  oop java_string = JNIHandles::resolve_non_null(string);
-  jsize ret = java_lang_String::utf8_length(java_string);
+  jsize ret = java_lang_String::utf8_length_as_int(java_string);
  HOTSPOT_JNI_GETSTRINGUTFLENGTH_RETURN(ret);
  return ret;
 JNI_END
@ -2236,10 +2236,11 @@ JNI_ENTRY(const char*, jni_GetStringUTFChars(JNIEnv *env, jstring string, jboole
  typeArrayOop s_value = java_lang_String::value(java_string);
  if (s_value != nullptr) {
    size_t length = java_lang_String::utf8_length(java_string, s_value);
-    /* JNI Specification states return null on OOM */
+    // JNI Specification states return null on OOM.
+    // The resulting sequence doesn't have to be NUL-terminated, but we NUL-terminate it anyway.
    result = AllocateHeap(length + 1, mtInternal, AllocFailStrategy::RETURN_NULL);
    if (result != nullptr) {
-      java_lang_String::as_utf8_string(java_string, s_value, result, (int) length + 1);
+      java_lang_String::as_utf8_string(java_string, s_value, result, length + 1);
      if (isCopy != nullptr) {
        *isCopy = JNI_TRUE;
      }
--- a/src/hotspot/share/prims/jvmtiEnv.cpp
+++ b/src/hotspot/share/prims/jvmtiEnv.cpp
@ -1321,7 +1321,7 @@ JvmtiEnv::GetThreadInfo(jthread thread, jvmtiThreadInfo* info_ptr) {
    if (name() != nullptr) {
      n = java_lang_String::as_utf8_string(name());
    } else {
-      int utf8_length = 0;
+      size_t utf8_length = 0;
      n = UNICODE::as_utf8((jchar*) nullptr, utf8_length);
    }

--- a/src/hotspot/share/services/finalizerService.cpp
+++ b/src/hotspot/share/services/finalizerService.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -45,7 +45,7 @@ static const char* allocate(oop string) {
  char* str = nullptr;
  const typeArrayOop value = java_lang_String::value(string);
  if (value != nullptr) {
-    const int length = java_lang_String::utf8_length(string, value);
+    const size_t length = java_lang_String::utf8_length(string, value);
    str = NEW_C_HEAP_ARRAY(char, length + 1, mtServiceability);
    java_lang_String::as_utf8_string(string, value, str, length + 1);
  }
--- a/src/hotspot/share/utilities/utf8.cpp
+++ b/src/hotspot/share/utilities/utf8.cpp
@ -98,15 +98,21 @@ char* UTF8::next_character(const char* str, jint* value) {
  return next_ch;
 }

-// Count bytes of the form 10xxxxxx and deduct this count
+// The number of unicode characters in a utf8 sequence can be easily
+// determined by noting that bytes of the form 10xxxxxx are continuation
+// bytes of a 2 or 3-byte sequence; all other bytes are either single-byte
+// characters or the start of a multi-byte character.
+
+// Calculate the unicode length of a utf8 string of known size
+// by counting bytes of the form 10xxxxxx and deducting this count
 // from the total byte count.  The utf8 string must be in
 // legal form which has been verified in the format checker.
-int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) {
-  int num_chars = len;
+int UTF8::unicode_length(const char* str, size_t len, bool& is_latin1, bool& has_multibyte) {
+  size_t num_chars = len;
  has_multibyte = false;
  is_latin1 = true;
  unsigned char prev = 0;
-  for (int i = 0; i < len; i++) {
+  for (size_t i = 0; i < len; i++) {
    unsigned char c = str[i];
    if ((c & 0xC0) == 0x80) {
      // Multibyte, check if valid latin1 character.
@ -118,12 +124,12 @@ int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_mu
    }
    prev = c;
  }
-  return num_chars;
+  return checked_cast<int>(num_chars);
 }

-// Count bytes of the utf8 string except those in form
-// 10xxxxxx which only appear in multibyte characters.
-// The utf8 string must be in legal form and has been
+// Calculate the unicode length of a nul-terminated utf8 string
+// by counting bytes of the utf8 string except those in the form
+// 10xxxxxx. The utf8 string must be in legal form and has been
 // verified in the format checker.
 int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) {
  int num_chars = 0;
@ -195,10 +201,10 @@ template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unico
 template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length);

 // returns the quoted ascii length of a 0-terminated utf8 string
-int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
+size_t UTF8::quoted_ascii_length(const char* utf8_str, size_t utf8_length) {
  const char *ptr = utf8_str;
  const char* end = ptr + utf8_length;
-  int result = 0;
+  size_t result = 0;
  while (ptr < end) {
    jchar c;
    ptr = UTF8::next(ptr, &c);
@ -212,7 +218,7 @@ int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
 }

 // converts a utf8 string to quoted ascii
-void UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen) {
+void UTF8::as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen) {
  const char *ptr = utf8_str;
  const char *utf8_end = ptr + utf8_length;
  char* p = buf;
@ -248,7 +254,7 @@ const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) {
    return quoted_ascii_str;
  }
  // everything up to this point was ok.
-  int length = ptr - quoted_ascii_str;
+  size_t length = ptr - quoted_ascii_str;
  char* buffer = nullptr;
  for (int round = 0; round < 2; round++) {
    while (*ptr != '\0') {
@ -330,11 +336,11 @@ jint UTF8::get_supplementary_character(const unsigned char* str) {
                 + ((str[4] & 0x0f) << 6)  + (str[5] & 0x3f);
 }

-bool UTF8::is_legal_utf8(const unsigned char* buffer, int length,
+bool UTF8::is_legal_utf8(const unsigned char* buffer, size_t length,
                         bool version_leq_47) {
-  int i = 0;
-  int count = length >> 2;
-  for (int k=0; k<count; k++) {
+  size_t i = 0;
+  size_t count = length >> 2;
+  for (size_t k = 0; k < count; k++) {
    unsigned char b0 = buffer[i];
    unsigned char b1 = buffer[i+1];
    unsigned char b2 = buffer[i+2];
@ -405,7 +411,7 @@ static bool is_starting_byte(unsigned char b) {
 // To avoid that the caller can choose to check for validity first.
 // The incoming buffer is still expected to be NUL-terminated.
 // The incoming buffer is expected to be a realistic size - we assert if it is too small.
-void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
+void UTF8::truncate_to_legal_utf8(unsigned char* buffer, size_t length) {
  assert(length > 5, "invalid length");
  assert(buffer[length - 1] == '\0', "Buffer should be NUL-terminated");

@ -433,7 +439,7 @@ void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
  // then we insert NUL at that location to terminate the buffer. There is an added complexity with 6 byte
  // encodings as the first and fourth bytes are the same and overlap with the 3 byte encoding.

-  for (int index = length - 2; index > 0; index--) {
+  for (size_t index = length - 2; index > 0; index--) {
    if (is_starting_byte(buffer[index])) {
      if (buffer[index] == 0xED) {
        // Could be first byte of 3 or 6, or fourth byte of 6.
@ -441,7 +447,7 @@ void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) {
        // surrogate value in the range EDA080 to EDAFBF. We only
        // need to check for EDA to establish this as the "missing"
        // values in EDAxxx would not be valid 3 byte encodings.
-        if ((index - 3) >= 0 &&
+        if (index >= 3 &&
            (buffer[index - 3] == 0xED) &&
            ((buffer[index - 2] & 0xF0) == 0xA0)) {
          assert(buffer[index - 1] >= 0x80 && buffer[index - 1] <= 0xBF, "sanity check");
@ -470,7 +476,7 @@ bool UNICODE::is_latin1(const jchar* base, int length) {
  return true;
 }

-int UNICODE::utf8_size(jchar c) {
+size_t UNICODE::utf8_size(jchar c) {
  if ((0x0001 <= c) && (c <= 0x007F)) {
    // ASCII character
    return 1;
@ -481,7 +487,7 @@ int UNICODE::utf8_size(jchar c) {
  }
 }

-int UNICODE::utf8_size(jbyte c) {
+size_t UNICODE::utf8_size(jbyte c) {
  if (c >= 0x01) {
    // ASCII character. Check is equivalent to
    // (0x01 <= c) && (c <= 0x7F) because c is signed.
@ -494,11 +500,23 @@ int UNICODE::utf8_size(jbyte c) {
 }

 template<typename T>
-int UNICODE::utf8_length(const T* base, int length) {
+size_t UNICODE::utf8_length(const T* base, int length) {
+  size_t result = 0;
+  for (int index = 0; index < length; index++) {
+    result += utf8_size(base[index]);
+  }
+  return result;
+}
+
+template<typename T>
+int UNICODE::utf8_length_as_int(const T* base, int length) {
  size_t result = 0;
  for (int index = 0; index < length; index++) {
    T c = base[index];
-    int sz = utf8_size(c);
+    size_t sz = utf8_size(c);
+    // If the length is > INT_MAX-1 we truncate at a completed
+    // modified-UTF8 encoding. This allows for +1 to be added
+    // by the caller for NUL-termination, without overflow.
    if (result + sz > INT_MAX-1) {
      break;
    }
@ -508,41 +526,44 @@ int UNICODE::utf8_length(const T* base, int length) {
 }

 template<typename T>
-char* UNICODE::as_utf8(const T* base, int& length) {
-  int utf8_len = utf8_length(base, length);
+char* UNICODE::as_utf8(const T* base, size_t& length) {
+  // Incoming length must be <= INT_MAX
+  size_t utf8_len = utf8_length(base, static_cast<int>(length));
  u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
-  char* result = as_utf8(base, length, (char*) buf, utf8_len + 1);
-  assert((int) strlen(result) == utf8_len, "length prediction must be correct");
-  // Set string length to uft8 length
+  char* result = as_utf8(base, static_cast<int>(length), (char*) buf, utf8_len + 1);
+  assert(strlen(result) == utf8_len, "length prediction must be correct");
+  // Set outgoing string length to utf8 length
  length = utf8_len;
  return (char*) result;
 }

-char* UNICODE::as_utf8(const jchar* base, int length, char* buf, int buflen) {
+char* UNICODE::as_utf8(const jchar* base, int length, char* buf, size_t buflen) {
  assert(buflen > 0, "zero length output buffer");
  u_char* p = (u_char*)buf;
  for (int index = 0; index < length; index++) {
    jchar c = base[index];
-    buflen -= utf8_size(c);
-    if (buflen <= 0) break; // string is truncated
+    size_t sz = utf8_size(c);
+    if (sz >= buflen) break; // string is truncated
+    buflen -= sz;
    p = utf8_write(p, c);
  }
  *p = '\0';
  return buf;
 }

-char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, int buflen) {
+char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, size_t buflen) {
  assert(buflen > 0, "zero length output buffer");
  u_char* p = (u_char*)buf;
  for (int index = 0; index < length; index++) {
    jbyte c = base[index];
-    int sz = utf8_size(c);
+    size_t sz = utf8_size(c);
+    if (sz >= buflen) break; // string is truncated
    buflen -= sz;
-    if (buflen <= 0) break; // string is truncated
    if (sz == 1) {
      // Copy ASCII characters (UTF-8 is ASCII compatible)
      *p++ = c;
    } else {
+      assert(sz == 2, "must be!");
      // Non-ASCII character or 0x00 which should
      // be encoded as 0xC080 in "modified" UTF8.
      p = utf8_write(p, ((jchar) c) & 0xff);
@ -561,8 +582,8 @@ void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer)

 // returns the quoted ascii length of a unicode string
 template<typename T>
-int UNICODE::quoted_ascii_length(const T* base, int length) {
-  int result = 0;
+size_t UNICODE::quoted_ascii_length(const T* base, int length) {
+  size_t result = 0;
  for (int i = 0; i < length; i++) {
    T c = base[i];
    if (c >= 32 && c < 127) {
@ -576,7 +597,7 @@ int UNICODE::quoted_ascii_length(const T* base, int length) {

 // converts a unicode string to quoted ascii
 template<typename T>
-void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) {
+void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, size_t buflen) {
  char* p = buf;
  char* end = buf + buflen;
  for (int index = 0; index < length; index++) {
@ -594,11 +615,13 @@ void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen)
 }

 // Explicit instantiation for all supported types.
-template int UNICODE::utf8_length(const jbyte* base, int length);
-template int UNICODE::utf8_length(const jchar* base, int length);
-template char* UNICODE::as_utf8(const jbyte* base, int& length);
-template char* UNICODE::as_utf8(const jchar* base, int& length);
-template int UNICODE::quoted_ascii_length<jbyte>(const jbyte* base, int length);
-template int UNICODE::quoted_ascii_length<jchar>(const jchar* base, int length);
-template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen);
-template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen);
+template size_t UNICODE::utf8_length(const jbyte* base, int length);
+template size_t UNICODE::utf8_length(const jchar* base, int length);
+template int UNICODE::utf8_length_as_int(const jbyte* base, int length);
+template int UNICODE::utf8_length_as_int(const jchar* base, int length);
+template char* UNICODE::as_utf8(const jbyte* base, size_t& length);
+template char* UNICODE::as_utf8(const jchar* base, size_t& length);
+template size_t UNICODE::quoted_ascii_length<jbyte>(const jbyte* base, int length);
+template size_t UNICODE::quoted_ascii_length<jchar>(const jchar* base, int length);
+template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, size_t buflen);
+template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, size_t buflen);
--- a/src/hotspot/share/utilities/utf8.hpp
+++ b/src/hotspot/share/utilities/utf8.hpp
@ -29,6 +29,45 @@
 #include "memory/allStatic.hpp"
 #include "utilities/debug.hpp"

+/**
+
+String handling within Java and the VM requires a bit of explanation.
+
+Logically a java.lang.String is a sequence of 16-bit Unicode characters
+encoded in UTF-16. In the past a String contained a Java char[] and so
+could theoretically contain INT_MAX 16-bit characters. Then came JEP 254:
+Compact Strings.
+
+With Compact Strings the Java char[] becomes a Java byte[], and that byte[]
+contains either latin-1 characters all of which fit in 8-bits, or else each
+pair of bytes represents a UTF-16 character. Consequently the maximum length
+in characters of a latin-1 string is INT_MAX, whilst for non-latin-1 it is INT_MAX/2.
+
+In the code below if we have latin-1 content then we treat the String's data
+array as a jbyte[], else a jchar[]. The lengths of these arrays are specified
+as an int value, with a nominal maximum of INT_MAX.
+
+The modified UTF-8 encoding specified for the VM nominally encodes characters
+in 1, 2, 3 or 6 bytes. The 6-byte representation is actually two 3-byte representations
+for two UTF-16 characters forming a surrogate pair. If we are dealing with
+a latin-1 string then each character will be encoded as either 1 or 2 bytes and so the
+maximum UTF8 length is 2*INT_MAX. This can't be stored in an int so utf8 buffers must
+use a size_t length. For non-latin-1 strings each UTF-16 character will encode as at most
+3 bytes, so the maximum UTF8 length in that case is 3 * INT_MAX/2, i.e. 1.5*INT_MAX.
+
+The "quoted ascii" form of a unicode string is at worst 6 times longer than its
+regular form, and so these lengths must always be size_t - though if we know we only
+ever do this to symbols (or small symbol combinations) then we could use int.
+
+There is an additional assumption/expectation that our UTF8 APIs are never dealing with
+invalid UTF8, and more generally that all UTF8 sequences could form valid Strings.
+Consequently the Unicode length of a UTF8 sequence is assumed to always be representable
+by an int. However, there are APIs, such as JNI NewStringUTF, that do deal with such input
+and could potentially have an unrepresentable string. The long-standing position with JNI
+is that the user must supply valid input, so we do not try to account for these cases.
+
+*/
+
 // Low-level interface for UTF8 strings

 class UTF8 : AllStatic {
@ -41,20 +80,20 @@ class UTF8 : AllStatic {
  static int unicode_length(const char* utf8_str, bool& is_latin1, bool& has_multibyte);

  // returns the unicode length of a non-0-terminated utf8 string
-  static int unicode_length(const char* utf8_str, int len) {
+  static int unicode_length(const char* utf8_str, size_t len) {
    bool is_latin1, has_multibyte;
    return unicode_length(utf8_str, len, is_latin1, has_multibyte);
  }
-  static int unicode_length(const char* utf8_str, int len, bool& is_latin1, bool& has_multibyte);
+  static int unicode_length(const char* utf8_str, size_t len, bool& is_latin1, bool& has_multibyte);

  // converts a utf8 string to a unicode string
  template<typename T> static void convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length);

  // returns the quoted ascii length of a utf8 string
-  static int quoted_ascii_length(const char* utf8_str, int utf8_length);
+  static size_t quoted_ascii_length(const char* utf8_str, size_t utf8_length);

  // converts a utf8 string to quoted ascii
-  static void as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen);
+  static void as_quoted_ascii(const char* utf8_str, size_t utf8_length, char* buf, size_t buflen);

 #ifndef PRODUCT
  // converts a quoted ascii string to utf8 string.  returns the original
@ -82,13 +121,13 @@ class UTF8 : AllStatic {
    while(--length >= 0 && base[length] != c);
    return (length < 0) ? nullptr : &base[length];
  }
-  static bool   equal(const jbyte* base1, int length1, const jbyte* base2,int length2);
+  static bool   equal(const jbyte* base1, int length1, const jbyte* base2, int length2);
  static bool   is_supplementary_character(const unsigned char* str);
  static jint   get_supplementary_character(const unsigned char* str);

-  static bool   is_legal_utf8(const unsigned char* buffer, int length,
+  static bool   is_legal_utf8(const unsigned char* buffer, size_t length,
                              bool version_leq_47);
-  static void   truncate_to_legal_utf8(unsigned char* buffer, int length);
+  static void   truncate_to_legal_utf8(unsigned char* buffer, size_t length);
 };


@ -99,6 +138,12 @@ class UTF8 : AllStatic {
 // units, so a supplementary character uses two positions in a unicode string.

 class UNICODE : AllStatic {
+
+  // returns the utf8 size of a unicode character
+  // uses size_t for convenience in overflow checks
+  static size_t utf8_size(jchar c);
+  static size_t utf8_size(jbyte c);
+
 public:
  // checks if the given unicode character can be encoded as latin1
  static bool is_latin1(jchar c);
@ -106,28 +151,27 @@ class UNICODE : AllStatic {
  // checks if the given string can be encoded as latin1
  static bool is_latin1(const jchar* base, int length);

-  // returns the utf8 size of a unicode character
-  static int utf8_size(jchar c);
-  static int utf8_size(jbyte c);
-
  // returns the utf8 length of a unicode string
-  template<typename T> static int utf8_length(const T* base, int length);
+  template<typename T> static size_t utf8_length(const T* base, int length);
+
+  // returns the utf8 length of a unicode string as an int - truncated if needed
+  template<typename T> static int utf8_length_as_int(const T* base, int length);

  // converts a unicode string to utf8 string
  static void convert_to_utf8(const jchar* base, int length, char* utf8_buffer);

  // converts a unicode string to a utf8 string; result is allocated
  // in resource area unless a buffer is provided. The unicode 'length'
-  // parameter is set to the length of the result utf8 string.
-  template<typename T> static char* as_utf8(const T* base, int& length);
-  static char* as_utf8(const jchar* base, int length, char* buf, int buflen);
-  static char* as_utf8(const jbyte* base, int length, char* buf, int buflen);
+  // parameter is set to the length of the resulting utf8 string.
+  template<typename T> static char* as_utf8(const T* base, size_t& length);
+  static char* as_utf8(const jchar* base, int length, char* buf, size_t buflen);
+  static char* as_utf8(const jbyte* base, int length, char* buf, size_t buflen);

  // returns the quoted ascii length of a unicode string
-  template<typename T> static int quoted_ascii_length(const T* base, int length);
+  template<typename T> static size_t quoted_ascii_length(const T* base, int length);

  // converts a unicode string to quoted ascii
-  template<typename T> static void as_quoted_ascii(const T* base, int length, char* buf, int buflen);
+  template<typename T> static void as_quoted_ascii(const T* base, int length, char* buf, size_t buflen);
 };

 #endif // SHARE_UTILITIES_UTF8_HPP