8292697: ZGC: Hangs when almost out of metaspace memory

Reviewed-by: eosterlund, coleenp
2022-08-26 13:44:28 +00:00 · 2022-08-26 13:44:28 +00:00 · 3844685be0
commit 3844685be0
parent 2b803493ba
7 changed files with 79 additions and 33 deletions
--- a/src/hotspot/share/classfile/classLoaderDataGraph.cpp
+++ b/src/hotspot/share/classfile/classLoaderDataGraph.cpp
@ -550,10 +550,12 @@ void ClassLoaderDataGraph::purge(bool at_safepoint) {
    delete purge_me;
    classes_unloaded = true;
  }
+
+  Metaspace::purge(classes_unloaded);
  if (classes_unloaded) {
-    Metaspace::purge();
    set_metaspace_oom(false);
  }
+
  DependencyContext::purge_dependency_contexts();

  // If we're purging metadata at a safepoint, clean remaining
--- a/src/hotspot/share/memory/metaspace.cpp
+++ b/src/hotspot/share/memory/metaspace.cpp
@ -995,23 +995,30 @@ const char* Metaspace::metadata_type_name(Metaspace::MetadataType mdtype) {
  }
 }

-void Metaspace::purge() {
+void Metaspace::purge(bool classes_unloaded) {
  // The MetaspaceCritical_lock is used by a concurrent GC to block out concurrent metaspace
  // allocations, that would starve critical metaspace allocations, that are about to throw
  // OOM if they fail; they need precedence for correctness.
  MutexLocker ml(MetaspaceCritical_lock, Mutex::_no_safepoint_check_flag);
-  ChunkManager* cm = ChunkManager::chunkmanager_nonclass();
-  if (cm != NULL) {
-    cm->purge();
-  }
-  if (using_class_space()) {
-    cm = ChunkManager::chunkmanager_class();
+  if (classes_unloaded) {
+    ChunkManager* cm = ChunkManager::chunkmanager_nonclass();
    if (cm != NULL) {
      cm->purge();
    }
+    if (using_class_space()) {
+      cm = ChunkManager::chunkmanager_class();
+      if (cm != NULL) {
+        cm->purge();
+      }
+    }
  }

-  MetaspaceCriticalAllocation::satisfy();
+  // Try to satisfy queued metaspace allocation requests.
+  //
+  // It might seem unnecessary to try to process allocation requests if no
+  // classes have been unloaded. However, this call is required for the code
+  // in MetaspaceCriticalAllocation::try_allocate_critical to work.
+  MetaspaceCriticalAllocation::process();
 }

 bool Metaspace::contains(const void* ptr) {
--- a/src/hotspot/share/memory/metaspace.hpp
+++ b/src/hotspot/share/memory/metaspace.hpp
@ -121,7 +121,7 @@ public:
  static bool contains_non_shared(const void* ptr);

  // Free empty virtualspaces
-  static void purge();
+  static void purge(bool classes_unloaded);

  static void report_metadata_oome(ClassLoaderData* loader_data, size_t word_size,
                                   MetaspaceObj::Type type, MetadataType mdtype, TRAPS);
--- a/src/hotspot/share/memory/metaspaceCriticalAllocation.cpp
+++ b/src/hotspot/share/memory/metaspaceCriticalAllocation.cpp
@ -33,12 +33,12 @@
 #include "runtime/mutexLocker.hpp"

 class MetadataAllocationRequest {
-  ClassLoaderData*           _loader_data;
-  size_t                     _word_size;
-  Metaspace::MetadataType    _type;
-  MetadataAllocationRequest* _next;
-  MetaWord*                  _result;
-  bool                       _has_result;
+  ClassLoaderData* const        _loader_data;
+  const size_t                  _word_size;
+  const Metaspace::MetadataType _type;
+  MetadataAllocationRequest*    _next;
+  MetaWord*                     _result;
+  bool                          _is_processed;

 public:
  MetadataAllocationRequest(ClassLoaderData* loader_data,
@ -49,7 +49,7 @@ public:
      _type(type),
      _next(NULL),
      _result(NULL),
-      _has_result(false) {
+      _is_processed(false) {
    MetaspaceCriticalAllocation::add(this);
  }

@ -57,17 +57,17 @@ public:
    MetaspaceCriticalAllocation::remove(this);
  }

-  ClassLoaderData*           loader_data() const { return _loader_data; }
-  size_t                     word_size() const   { return _word_size; }
-  Metaspace::MetadataType    type() const        { return _type; }
-  MetadataAllocationRequest* next() const        { return _next; }
-  MetaWord*                  result() const      { return _result; }
-  bool                       has_result() const  { return _has_result; }
+  ClassLoaderData*           loader_data() const   { return _loader_data; }
+  size_t                     word_size() const     { return _word_size; }
+  Metaspace::MetadataType    type() const          { return _type; }
+  MetadataAllocationRequest* next() const          { return _next; }
+  MetaWord*                  result() const        { return _result; }
+  bool                       is_processed() const  { return _is_processed; }

  void set_next(MetadataAllocationRequest* next) { _next = next; }
  void set_result(MetaWord* result) {
    _result = result;
-    _has_result = true;
+    _is_processed = true;
  }
 };

@ -113,13 +113,47 @@ void MetaspaceCriticalAllocation::remove(MetadataAllocationRequest* request) {
 }

 bool MetaspaceCriticalAllocation::try_allocate_critical(MetadataAllocationRequest* request) {
+  // This function uses an optimized scheme to limit the number of triggered
+  // GCs. The idea is that only one request in the list is responsible for
+  // triggering a GC, and later requests will try to piggy-back on that
+  // request.
+  //
+  // For this to work, it is important that we can tell which requests were
+  // seen by the GC's call to process(), and which requests were added after
+  // last proccess() call. The property '_is_processed' tells this. Because the
+  // logic below relies on that property, it is important that the GC calls
+  // process() even when the GC didn't unload any classes.
+  //
+  // Note that process() leaves the requests in the queue, so that threads
+  // in wait_for_purge, which had their requests processed, but didn't get any
+  // memory can exit that function and trigger a new GC as a last effort to get
+  // memory before throwing an OOME.
+  //
+  // Requests that have been processed once, will not trigger new GCs, we
+  // therefore filter them out when we determine if the current 'request'
+  // needs to trigger a GC, or if there are earlier requests that will
+  // trigger a GC.
+
  {
    MutexLocker ml(MetaspaceCritical_lock, Mutex::_no_safepoint_check_flag);
-    if (_requests_head == request) {
-      // The first request can't opportunistically ride on a previous GC
+    auto is_first_unprocessed = [&]() {
+      for (MetadataAllocationRequest* curr = _requests_head; curr != NULL; curr = curr->next()) {
+        if (!curr->is_processed()) {
+          // curr is the first not satisfied request
+          return curr == request;
+        }
+      }
+
+      return false;
+    };
+
+    if (is_first_unprocessed()) {
+      // The first non-processed request takes ownership of triggering the GC
+      // on behalf of itself, and all trailing requests in the list.
      return false;
    }
  }
+
  // Try to ride on a previous GC and hope for early satisfaction
  wait_for_purge(request);
  return request->result() != NULL;
@ -129,7 +163,9 @@ void MetaspaceCriticalAllocation::wait_for_purge(MetadataAllocationRequest* requ
  ThreadBlockInVM tbivm(JavaThread::current());
  MutexLocker ml(MetaspaceCritical_lock, Mutex::_no_safepoint_check_flag);
  for (;;) {
-    if (request->has_result()) {
+    if (request->is_processed()) {
+      // The GC has procesed this request during the purge.
+      // Return and check the result, and potentially call a last-effort GC.
      break;
    }
    MetaspaceCritical_lock->wait_without_safepoint_check();
@ -144,12 +180,12 @@ void MetaspaceCriticalAllocation::block_if_concurrent_purge() {
  }
 }

-void MetaspaceCriticalAllocation::satisfy() {
+void MetaspaceCriticalAllocation::process() {
  assert_lock_strong(MetaspaceCritical_lock);
  bool all_satisfied = true;
  for (MetadataAllocationRequest* curr = _requests_head; curr != NULL; curr = curr->next()) {
    if (curr->result() != NULL) {
-      // Don't satisfy twice
+      // Don't satisfy twice (can still be processed twice)
      continue;
    }
    // Try to allocate metadata.
--- a/src/hotspot/share/memory/metaspaceCriticalAllocation.hpp
+++ b/src/hotspot/share/memory/metaspaceCriticalAllocation.hpp
@ -52,7 +52,7 @@ class ClassLoaderData;
 // survived that situation in theory. The motivation is that we are at this point so close
 // to being out of memory, and the VM is not having a good time, so the user really ought
 // to increase the amount of available metaspace anyway, instead of GC:ing around more
-// to satisfy a very small number of additional allocations. But it does solve pathologial
+// to satisfy a very small number of additional allocations. But it does solve pathological
 // unbounded starvation scenarios where OOM can get thrown even though most of metaspace
 // is full of dead metadata.
 //
@ -77,7 +77,7 @@ class MetaspaceCriticalAllocation : public AllStatic {

 public:
  static void block_if_concurrent_purge();
-  static void satisfy();
+  static void process();
  static MetaWord* allocate(ClassLoaderData* loader_data, size_t word_size, Metaspace::MetadataType type);
 };

--- a/test/hotspot/jtreg/vmTestbase/metaspace/shrink_grow/ShrinkGrowMultiJVM/ShrinkGrowMultiJVM.java
+++ b/test/hotspot/jtreg/vmTestbase/metaspace/shrink_grow/ShrinkGrowMultiJVM/ShrinkGrowMultiJVM.java
@ -58,7 +58,7 @@ public class ShrinkGrowMultiJVM {
                 .resolve("java")
                 .toAbsolutePath()
                 .toString(),
-            "-Xlog:gc:gc_$i.log", // LOG_GC_ARG_INDEX
+            "UNSET_LOG_GC_ARG", // LOG_GC_ARG_INDEX
            "-XX:MetaspaceSize=10m",
            "-XX:MaxMetaspaceSize=20m",
            "-cp",
@ -81,7 +81,7 @@ public class ShrinkGrowMultiJVM {
        for (int i = 0; i < 5; i++) {
            // will be used as jvm id
            args[args.length - 1] = "jvm#" + i;
-            args[LOG_GC_ARG_INDEX] = "-Xlog:gc:gc_" + i + ".log";
+            args[LOG_GC_ARG_INDEX] = "-Xlog:gc*:gc_" + i + ".log::filecount=0";
            ProcessBuilder pb = new ProcessBuilder(args);
            try {
                Process p = pb.start();
--- a/test/hotspot/jtreg/vmTestbase/metaspace/shrink_grow/ShrinkGrowTest/ShrinkGrowTest.java
+++ b/test/hotspot/jtreg/vmTestbase/metaspace/shrink_grow/ShrinkGrowTest/ShrinkGrowTest.java
@ -145,6 +145,7 @@ public class ShrinkGrowTest {
        // step 2: try to load one more class
        // it should be impossible
        try {
+            log("and finally, a wafer-thin mint");
            eatALittleMemory();
            throwFault("We haven't cleaned metaspace yet!");
        } catch (OutOfMemoryError error) {