8280087: G1: Handle out-of-mark stack situations during reference processing more gracefully

Reviewed-by: tschatzl, ayang

Author: Ivan Walulya
Date:   2023-12-18 09:43:53 +00:00
Commit: f696796e88 (parent 413dbf8757)

5 changed files with 324 additions and 99 deletions

src/hotspot/share/gc/g1/g1Arguments.cpp

@@ -126,7 +126,6 @@ void G1Arguments::initialize_mark_stack_size() {
     FLAG_SET_ERGO(MarkStackSize, mark_stack_size);
   }
-  log_trace(gc)("MarkStackSize: %uk MarkStackSizeMax: %uk", (uint)(MarkStackSize / K), (uint)(MarkStackSizeMax / K));
 }
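The trace line removed here is not lost: the same MarkStackSize/MarkStackSizeMax trace is now emitted from G1CMMarkStack::initialize() in g1ConcurrentMark.cpp below, once the final ergonomic values of both flags have been settled.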

src/hotspot/share/gc/g1/g1ConcurrentMark.cpp

@@ -75,6 +75,7 @@
 #include "utilities/align.hpp"
 #include "utilities/formatBuffer.hpp"
 #include "utilities/growableArray.hpp"
+#include "utilities/powerOfTwo.hpp"
 
 bool G1CMBitMapClosure::do_addr(HeapWord* const addr) {
   assert(addr < _cm->finger(), "invariant");
@@ -94,80 +95,173 @@ bool G1CMBitMapClosure::do_addr(HeapWord* const addr) {
 }
 
 G1CMMarkStack::G1CMMarkStack() :
-  _max_chunk_capacity(0),
-  _base(nullptr),
-  _chunk_capacity(0) {
+  _chunk_allocator() {
   set_empty();
 }
 
-bool G1CMMarkStack::resize(size_t new_capacity) {
-  assert(is_empty(), "Only resize when stack is empty.");
-  assert(new_capacity <= _max_chunk_capacity,
-         "Trying to resize stack to " SIZE_FORMAT " chunks when the maximum is " SIZE_FORMAT, new_capacity, _max_chunk_capacity);
-
-  TaskQueueEntryChunk* new_base = MmapArrayAllocator<TaskQueueEntryChunk>::allocate_or_null(new_capacity, mtGC);
-
-  if (new_base == nullptr) {
-    log_warning(gc)("Failed to reserve memory for new overflow mark stack with " SIZE_FORMAT " chunks and size " SIZE_FORMAT "B.", new_capacity, new_capacity * sizeof(TaskQueueEntryChunk));
-    return false;
-  }
-  // Release old mapping.
-  if (_base != nullptr) {
-    MmapArrayAllocator<TaskQueueEntryChunk>::free(_base, _chunk_capacity);
-  }
-
-  _base = new_base;
-  _chunk_capacity = new_capacity;
-  set_empty();
-
-  return true;
-}
 
 size_t G1CMMarkStack::capacity_alignment() {
   return (size_t)lcm(os::vm_allocation_granularity(), sizeof(TaskQueueEntryChunk)) / sizeof(G1TaskQueueEntry);
 }
 
-bool G1CMMarkStack::initialize(size_t initial_capacity, size_t max_capacity) {
-  guarantee(_max_chunk_capacity == 0, "G1CMMarkStack already initialized.");
+bool G1CMMarkStack::initialize() {
+  guarantee(_chunk_allocator.capacity() == 0, "G1CMMarkStack already initialized.");
+  size_t initial_capacity = MarkStackSize;
+  size_t max_capacity = MarkStackSizeMax;
 
   size_t const TaskEntryChunkSizeInVoidStar = sizeof(TaskQueueEntryChunk) / sizeof(G1TaskQueueEntry);
 
-  _max_chunk_capacity = align_up(max_capacity, capacity_alignment()) / TaskEntryChunkSizeInVoidStar;
-  size_t initial_chunk_capacity = align_up(initial_capacity, capacity_alignment()) / TaskEntryChunkSizeInVoidStar;
+  size_t max_num_chunks = align_up(max_capacity, capacity_alignment()) / TaskEntryChunkSizeInVoidStar;
+  size_t initial_num_chunks = align_up(initial_capacity, capacity_alignment()) / TaskEntryChunkSizeInVoidStar;
 
-  guarantee(initial_chunk_capacity <= _max_chunk_capacity,
-            "Maximum chunk capacity " SIZE_FORMAT " smaller than initial capacity " SIZE_FORMAT,
-            _max_chunk_capacity,
-            initial_chunk_capacity);
+  initial_num_chunks = round_up_power_of_2(initial_num_chunks);
+  max_num_chunks = MAX2(initial_num_chunks, max_num_chunks);
+
+  size_t limit = (INT_MAX - 1);
+  max_capacity = MIN2((max_num_chunks * TaskEntryChunkSizeInVoidStar), limit);
+  initial_capacity = MIN2((initial_num_chunks * TaskEntryChunkSizeInVoidStar), limit);
+
+  FLAG_SET_ERGO(MarkStackSizeMax, max_capacity);
+  FLAG_SET_ERGO(MarkStackSize, initial_capacity);
+
+  log_trace(gc)("MarkStackSize: %uk MarkStackSizeMax: %uk", (uint)(MarkStackSize / K), (uint)(MarkStackSizeMax / K));
 
   log_debug(gc)("Initialize mark stack with " SIZE_FORMAT " chunks, maximum " SIZE_FORMAT,
-                initial_chunk_capacity, _max_chunk_capacity);
+                initial_num_chunks, max_capacity);
 
-  return resize(initial_chunk_capacity);
+  return _chunk_allocator.initialize(initial_num_chunks, max_num_chunks);
 }
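To make the new rounding concrete: if the flag values translate to, say, 5 initial and 100 maximum chunks (illustrative numbers, not defaults), round_up_power_of_2(5) yields 8 initial chunks, MAX2(8, 100) keeps the maximum at 100, and both chunk counts are converted back to entry counts, clamped to INT_MAX - 1, and written back into MarkStackSize and MarkStackSizeMax so the reported flag values match what the stack can actually hold.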
 
+G1CMMarkStack::TaskQueueEntryChunk* G1CMMarkStack::ChunkAllocator::allocate_new_chunk() {
+  if (_size >= _max_capacity) {
+    return nullptr;
+  }
+
+  size_t cur_idx = Atomic::fetch_then_add(&_size, 1u);
+  if (cur_idx >= _max_capacity) {
+    return nullptr;
+  }
+
+  size_t bucket = get_bucket(cur_idx);
+  if (Atomic::load_acquire(&_buckets[bucket]) == nullptr) {
+    if (!_should_grow) {
+      // Prefer to restart the CM.
+      return nullptr;
+    }
+
+    MutexLocker x(MarkStackChunkList_lock, Mutex::_no_safepoint_check_flag);
+    if (Atomic::load_acquire(&_buckets[bucket]) == nullptr) {
+      if (!expand()) {
+        return nullptr;
+      }
+    }
+  }
+
+  size_t bucket_idx = get_bucket_index(cur_idx);
+  TaskQueueEntryChunk* result = ::new (&_buckets[bucket][bucket_idx]) TaskQueueEntryChunk;
+  result->next = nullptr;
+  return result;
+}
 
+G1CMMarkStack::ChunkAllocator::ChunkAllocator() :
+  _min_capacity(0),
+  _max_capacity(0),
+  _capacity(0),
+  _num_buckets(0),
+  _should_grow(false),
+  _buckets(nullptr),
+  _size(0)
+{ }
+
+bool G1CMMarkStack::ChunkAllocator::initialize(size_t initial_capacity, size_t max_capacity) {
+  guarantee(is_power_of_2(initial_capacity), "Invalid initial_capacity");
+
+  _min_capacity = initial_capacity;
+  _max_capacity = max_capacity;
+  _num_buckets  = get_bucket(_max_capacity) + 1;
+
+  _buckets = NEW_C_HEAP_ARRAY(TaskQueueEntryChunk*, _num_buckets, mtGC);
+
+  for (size_t i = 0; i < _num_buckets; i++) {
+    _buckets[i] = nullptr;
+  }
+
+  size_t new_capacity = bucket_size(0);
+
+  if (!reserve(new_capacity)) {
+    log_warning(gc)("Failed to reserve memory for new overflow mark stack with " SIZE_FORMAT " chunks and size " SIZE_FORMAT "B.", new_capacity, new_capacity * sizeof(TaskQueueEntryChunk));
+    return false;
+  }
+  return true;
+}
+
+bool G1CMMarkStack::ChunkAllocator::expand() {
+  if (_capacity == _max_capacity) {
+    log_debug(gc)("Can not expand overflow mark stack further, already at maximum capacity of " SIZE_FORMAT " chunks.", _capacity);
+    return false;
+  }
+
+  size_t old_capacity = _capacity;
+  // Double capacity if possible.
+  size_t new_capacity = MIN2(old_capacity * 2, _max_capacity);
+
+  if (reserve(new_capacity)) {
+    log_debug(gc)("Expanded the mark stack capacity from " SIZE_FORMAT " to " SIZE_FORMAT " chunks",
+                  old_capacity, new_capacity);
+    return true;
+  }
+  return false;
+}
+
+G1CMMarkStack::ChunkAllocator::~ChunkAllocator() {
+  if (_buckets == nullptr) {
+    return;
+  }
+
+  for (size_t i = 0; i < _num_buckets; i++) {
+    if (_buckets[i] != nullptr) {
+      MmapArrayAllocator<TaskQueueEntryChunk>::free(_buckets[i], bucket_size(i));
+      _buckets[i] = nullptr;
+    }
+  }
+
+  FREE_C_HEAP_ARRAY(TaskQueueEntryChunk*, _buckets);
+}
+
+bool G1CMMarkStack::ChunkAllocator::reserve(size_t new_capacity) {
+  assert(new_capacity <= _max_capacity, "Cannot expand overflow mark stack beyond the max_capacity of " SIZE_FORMAT " chunks.", _max_capacity);
+
+  size_t highest_bucket = get_bucket(new_capacity - 1);
+
+  size_t i = get_bucket(_capacity);
+  for (; i <= highest_bucket; i++) {
+    if (Atomic::load_acquire(&_buckets[i]) != nullptr) {
+      continue; // Skip over already allocated buckets.
+    }
+
+    size_t bucket_capacity = bucket_size(i);
+
+    // Trim bucket size so that we do not exceed the _max_capacity.
+    bucket_capacity = (_capacity + bucket_capacity) <= _max_capacity ?
+                      bucket_capacity :
+                      _max_capacity - _capacity;
+
+    TaskQueueEntryChunk* bucket_base = MmapArrayAllocator<TaskQueueEntryChunk>::allocate_or_null(bucket_capacity, mtGC);
+
+    if (bucket_base == nullptr) {
+      log_warning(gc)("Failed to reserve memory for increasing the overflow mark stack capacity with " SIZE_FORMAT " chunks and size " SIZE_FORMAT "B.",
+                      bucket_capacity, bucket_capacity * sizeof(TaskQueueEntryChunk));
+      return false;
+    }
+
+    _capacity += bucket_capacity;
+    Atomic::release_store(&_buckets[i], bucket_base);
+  }
+  return true;
+}
 
 void G1CMMarkStack::expand() {
-  if (_chunk_capacity == _max_chunk_capacity) {
-    log_debug(gc)("Can not expand overflow mark stack further, already at maximum capacity of " SIZE_FORMAT " chunks.", _chunk_capacity);
-    return;
-  }
-  size_t old_capacity = _chunk_capacity;
-  // Double capacity if possible
-  size_t new_capacity = MIN2(old_capacity * 2, _max_chunk_capacity);
-
-  if (resize(new_capacity)) {
-    log_debug(gc)("Expanded mark stack capacity from " SIZE_FORMAT " to " SIZE_FORMAT " chunks",
-                  old_capacity, new_capacity);
-  } else {
-    log_warning(gc)("Failed to expand mark stack capacity from " SIZE_FORMAT " to " SIZE_FORMAT " chunks",
-                    old_capacity, new_capacity);
-  }
-}
-
-G1CMMarkStack::~G1CMMarkStack() {
-  if (_base != nullptr) {
-    MmapArrayAllocator<TaskQueueEntryChunk>::free(_base, _chunk_capacity);
-  }
+  _chunk_allocator.expand();
 }
 
 void G1CMMarkStack::add_chunk_to_list(TaskQueueEntryChunk* volatile* list, TaskQueueEntryChunk* elem) {
@@ -208,31 +302,13 @@ G1CMMarkStack::TaskQueueEntryChunk* G1CMMarkStack::remove_chunk_from_free_list()
   return remove_chunk_from_list(&_free_list);
 }
 
-G1CMMarkStack::TaskQueueEntryChunk* G1CMMarkStack::allocate_new_chunk() {
-  // This dirty read of _hwm is okay because we only ever increase the _hwm in parallel code.
-  // Further this limits _hwm to a value of _chunk_capacity + #threads, avoiding
-  // wraparound of _hwm.
-  if (_hwm >= _chunk_capacity) {
-    return nullptr;
-  }
-
-  size_t cur_idx = Atomic::fetch_then_add(&_hwm, 1u);
-  if (cur_idx >= _chunk_capacity) {
-    return nullptr;
-  }
-
-  TaskQueueEntryChunk* result = ::new (&_base[cur_idx]) TaskQueueEntryChunk;
-  result->next = nullptr;
-  return result;
-}
-
 bool G1CMMarkStack::par_push_chunk(G1TaskQueueEntry* ptr_arr) {
   // Get a new chunk.
   TaskQueueEntryChunk* new_chunk = remove_chunk_from_free_list();
 
   if (new_chunk == nullptr) {
     // Did not get a chunk from the free list. Allocate from backing memory.
-    new_chunk = allocate_new_chunk();
+    new_chunk = _chunk_allocator.allocate_new_chunk();
 
     if (new_chunk == nullptr) {
       return false;
@@ -261,9 +337,9 @@ bool G1CMMarkStack::par_pop_chunk(G1TaskQueueEntry* ptr_arr) {
 void G1CMMarkStack::set_empty() {
   _chunks_in_chunk_list = 0;
-  _hwm = 0;
   _chunk_list = nullptr;
   _free_list = nullptr;
+  _chunk_allocator.reset();
 }
 
 G1CMRootMemRegions::G1CMRootMemRegions(uint const max_regions) :
@@ -440,7 +516,7 @@ G1ConcurrentMark::G1ConcurrentMark(G1CollectedHeap* g1h,
   _concurrent_workers->initialize_workers();
   _num_concurrent_workers = _concurrent_workers->active_workers();
 
-  if (!_global_mark_stack.initialize(MarkStackSize, MarkStackSizeMax)) {
+  if (!_global_mark_stack.initialize()) {
     vm_exit_during_initialization("Failed to allocate initial concurrent mark overflow mark stack.");
   }
@@ -1635,6 +1711,9 @@ void G1ConcurrentMark::weak_refs_work() {
   assert(_global_mark_stack.is_empty(), "mark stack should be empty");
 
+  // Prefer to grow the stack until the max capacity.
+  _global_mark_stack.set_should_grow();
+
   // We need at least one active thread. If reference processing
   // is not multi-threaded we use the current (VMThread) thread,
   // otherwise we use the workers from the G1CollectedHeap and
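The heart of the new code is ChunkAllocator::allocate_new_chunk(): a lock-free Atomic::fetch_then_add claims a slot index, and the backing bucket is materialized lazily under MarkStackChunkList_lock with a double-checked acquire/release protocol. A minimal standalone sketch of the same shape in plain C++ follows; the class name, constants, and the use of std::atomic/std::mutex are illustrative assumptions, not HotSpot code:

    #include <atomic>
    #include <cstddef>
    #include <cstdio>
    #include <mutex>

    // Stand-in for TaskQueueEntryChunk; the real chunk also carries a payload array.
    struct Chunk { Chunk* next = nullptr; };

    class BucketAllocatorSketch {
      static const size_t kMinCapacity = 8;  // size of bucket 0; must be a power of two
      static const size_t kNumBuckets  = 8;  // buckets 0..7 -> sizes 8, 8, 16, 32, ...

      std::atomic<Chunk*> _buckets[kNumBuckets] = {};
      std::atomic<size_t> _size{0};
      std::mutex _expand_lock;

      static size_t bucket_size(size_t b) {
        return (b == 0) ? kMinCapacity : kMinCapacity << (b - 1);
      }

      // Walk the doubling bucket sizes to map a flat index to (bucket, offset);
      // the real code does this with bit tricks instead of a loop.
      static void locate(size_t idx, size_t* bucket, size_t* offset) {
        size_t start = 0;
        for (size_t b = 0; ; b++) {
          if (idx < start + bucket_size(b)) {
            *bucket = b;
            *offset = idx - start;
            return;
          }
          start += bucket_size(b);
        }
      }

     public:
      static const size_t kMaxCapacity = kMinCapacity << (kNumBuckets - 1);

      Chunk* allocate() {
        if (_size.load(std::memory_order_relaxed) >= kMaxCapacity) {
          return nullptr;                  // cheap early-out before touching the counter
        }
        size_t idx = _size.fetch_add(1);   // lock-free claim of a slot index
        if (idx >= kMaxCapacity) {
          return nullptr;
        }
        size_t bucket, offset;
        locate(idx, &bucket, &offset);

        // Double-checked locking: the common case is a single acquire load; only
        // threads that find the bucket missing serialize on the lock, and the
        // second check keeps them from allocating the bucket twice.
        Chunk* base = _buckets[bucket].load(std::memory_order_acquire);
        if (base == nullptr) {
          std::lock_guard<std::mutex> guard(_expand_lock);
          base = _buckets[bucket].load(std::memory_order_acquire);
          if (base == nullptr) {
            base = new Chunk[bucket_size(bucket)];  // leaked in this sketch; the real allocator frees buckets in its destructor
            _buckets[bucket].store(base, std::memory_order_release);
          }
        }
        return &base[offset];
      }
    };

    int main() {
      BucketAllocatorSketch allocator;
      for (int i = 0; i < 20; i++) {
        std::printf("chunk %2d -> %p\n", i, (void*)allocator.allocate());
      }
      return 0;
    }

Note how the design keeps existing chunks valid across growth: old buckets are never moved or copied, so concurrent readers holding a chunk pointer are unaffected by an expansion, which is what lets reference processing grow the stack in place instead of restarting marking.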

src/hotspot/share/gc/g1/g1ConcurrentMark.hpp

@@ -136,10 +136,101 @@ private:
     G1TaskQueueEntry data[EntriesPerChunk];
   };
 
-  size_t _max_chunk_capacity; // Maximum number of TaskQueueEntryChunk elements on the stack.
+  class ChunkAllocator {
+    // The chunk allocator relies on a growable array data structure that allows resizing without the
+    // need to copy existing items. The basic approach involves organizing the array into chunks,
+    // essentially creating an "array of arrays"; the top-level arrays are referred to as buckets in
+    // this implementation. To facilitate efficient indexing, the size of the first bucket is set to a
+    // power of 2. This choice allows for quick conversion of an array index into a bucket index and
+    // the corresponding offset within the bucket. Additionally, each new bucket added to the growable
+    // array doubles the capacity of the growable array.
+    //
+    // Illustration of the growable array data structure.
+    //
+    //        +----+        +----+----+
+    //        |    |------->|    |    |
+    //        |    |        +----+----+
+    //        +----+        +----+----+
+    //        |    |------->|    |    |
+    //        |    |        +----+----+
+    //        +----+        +-----+-----+-----+-----+
+    //        |    |------->|     |     |     |     |
+    //        |    |        +-----+-----+-----+-----+
+    //        +----+        +-----+-----+-----+-----+-----+-----+-----+----+
+    //        |    |------->|     |     |     |     |     |     |     |    |
+    //        |    |        +-----+-----+-----+-----+-----+-----+-----+----+
+    //        +----+
+    //
+    size_t _min_capacity;
+    size_t _max_capacity;
+    size_t _capacity;
+    size_t _num_buckets;
+    bool _should_grow;
+    TaskQueueEntryChunk* volatile* _buckets;
+    char _pad0[DEFAULT_CACHE_LINE_SIZE];
+    volatile size_t _size;
+    char _pad4[DEFAULT_CACHE_LINE_SIZE - sizeof(size_t)];
 
-  TaskQueueEntryChunk* _base;    // Bottom address of allocated memory area.
-  size_t _chunk_capacity;        // Current maximum number of TaskQueueEntryChunk elements.
+
+    size_t bucket_size(size_t bucket) {
+      return (bucket == 0) ?
+             _min_capacity :
+             _min_capacity * (1ULL << (bucket - 1));
+    }
+
+    static unsigned int find_highest_bit(uintptr_t mask) {
+      return count_leading_zeros(mask) ^ (BitsPerWord - 1U);
+    }
+
+    size_t get_bucket(size_t array_idx) {
+      if (array_idx < _min_capacity) {
+        return 0;
+      }
+      return find_highest_bit(array_idx) - find_highest_bit(_min_capacity) + 1;
+    }
+
+    size_t get_bucket_index(size_t array_idx) {
+      if (array_idx < _min_capacity) {
+        return array_idx;
+      }
+      return array_idx - (1ULL << find_highest_bit(array_idx));
+    }
+
+    bool reserve(size_t new_capacity);
+
+  public:
+    ChunkAllocator();
+    ~ChunkAllocator();
+
+    bool initialize(size_t initial_capacity, size_t max_capacity);
+
+    void reset() {
+      _size = 0;
+      _should_grow = false;
+    }
+
+    // During G1CMConcurrentMarkingTask or finalize_marking phases, we prefer to restart the marking when
+    // the G1CMMarkStack overflows. Attempts to expand the G1CMMarkStack should be followed with a restart
+    // of the marking. On failure to allocate a new chunk, the caller just returns and forces a restart.
+    // This approach offers better memory utilization for the G1CMMarkStack, as each iteration of the
+    // marking potentially involves traversing fewer unmarked nodes in the graph.
+    // However, during the reference processing phase, instead of restarting the marking process, the
+    // G1CMMarkStack is expanded upon failure to allocate a new chunk. The decision between these two
+    // modes of expansion is determined by the _should_grow parameter.
+    void set_should_grow() {
+      _should_grow = true;
+    }
+
+    size_t capacity() const { return _capacity; }
+
+    bool expand();
+
+    TaskQueueEntryChunk* allocate_new_chunk();
+  };
+
+  ChunkAllocator _chunk_allocator;
 
   char _pad0[DEFAULT_CACHE_LINE_SIZE];
   TaskQueueEntryChunk* volatile _free_list;  // Linked list of free chunks that can be allocated by users.
@@ -148,13 +239,6 @@ private:
   volatile size_t _chunks_in_chunk_list;
   char _pad2[DEFAULT_CACHE_LINE_SIZE - sizeof(TaskQueueEntryChunk*) - sizeof(size_t)];
 
-  volatile size_t _hwm;          // High water mark within the reserved space.
-  char _pad4[DEFAULT_CACHE_LINE_SIZE - sizeof(size_t)];
-
-  // Allocate a new chunk from the reserved memory, using the high water mark. Returns
-  // null if out of memory.
-  TaskQueueEntryChunk* allocate_new_chunk();
-
   // Atomically add the given chunk to the list.
   void add_chunk_to_list(TaskQueueEntryChunk* volatile* list, TaskQueueEntryChunk* elem);
 
   // Atomically remove and return a chunk from the given list. Returns null if the
@@ -167,19 +251,15 @@ private:
   TaskQueueEntryChunk* remove_chunk_from_chunk_list();
   TaskQueueEntryChunk* remove_chunk_from_free_list();
 
-  // Resizes the mark stack to the given new capacity. Releases any previous
-  // memory if successful.
-  bool resize(size_t new_capacity);
-
  public:
   G1CMMarkStack();
-  ~G1CMMarkStack();
+  ~G1CMMarkStack() = default;
 
   // Alignment and minimum capacity of this mark stack in number of oops.
   static size_t capacity_alignment();
 
-  // Allocate and initialize the mark stack with the given number of oops.
-  bool initialize(size_t initial_capacity, size_t max_capacity);
+  // Allocate and initialize the mark stack.
+  bool initialize();
 
   // Pushes the given buffer containing at most EntriesPerChunk elements on the mark
   // stack. If less than EntriesPerChunk elements are to be pushed, the array must
@@ -197,7 +277,11 @@
   // _chunk_list.
   bool is_empty() const { return _chunk_list == nullptr; }
 
-  size_t capacity() const { return _chunk_capacity; }
+  size_t capacity() const { return _chunk_allocator.capacity(); }
+
+  void set_should_grow() {
+    _chunk_allocator.set_should_grow();
+  }
 
   // Expand the stack, typically in response to an overflow condition
   void expand();
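The indexing arithmetic above is compact, so a worked check helps. Assuming a _min_capacity of 8 (an illustrative value; the real one comes from ChunkAllocator::initialize()), bucket sizes are 8, 8, 16, 32, ..., so indices 0-7 land in bucket 0, 8-15 in bucket 1, 16-31 in bucket 2, and 32-63 in bucket 3, each new bucket doubling total capacity. The small standalone program below (plain C++, with the GCC/Clang __builtin_clzll standing in for HotSpot's count_leading_zeros) reproduces the get_bucket/get_bucket_index math:

    #include <cstdio>
    #include <initializer_list>

    // Highest set bit of a non-zero 64-bit mask, mirroring the
    // count_leading_zeros(mask) ^ (BitsPerWord - 1U) trick above.
    static unsigned find_highest_bit(unsigned long long mask) {
      return 63u - (unsigned)__builtin_clzll(mask);
    }

    int main() {
      const unsigned long long min_capacity = 8;  // illustrative _min_capacity (a power of two)
      for (unsigned long long idx : {0ULL, 7ULL, 8ULL, 15ULL, 16ULL, 20ULL, 31ULL, 32ULL}) {
        unsigned long long bucket = (idx < min_capacity)
            ? 0
            : find_highest_bit(idx) - find_highest_bit(min_capacity) + 1;
        unsigned long long offset = (idx < min_capacity)
            ? idx
            : idx - (1ULL << find_highest_bit(idx));
        std::printf("array_idx %2llu -> bucket %llu, offset %2llu\n", idx, bucket, offset);
      }
      return 0;
    }

For example, get_bucket(20) = find_highest_bit(20) - find_highest_bit(8) + 1 = 4 - 3 + 1 = 2, and get_bucket_index(20) = 20 - (1 << 4) = 4, i.e. slot 4 of bucket 2.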

test/hotspot/jtreg/gc/g1/TestMarkStackOverflow.java

@@ -0,0 +1,65 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package gc.g1;

import java.util.LinkedHashMap;

/* @test
 * @bug 8313212
 * @summary Finalizing objects may create new concurrent marking work during reference processing.
 *          If the marking work overflows the global mark stack, we should resize the global mark stack
 *          until MarkStackSizeMax if possible.
 * @requires vm.gc.G1
 * @run main/othervm -XX:ActiveProcessorCount=2 -XX:MarkStackSize=1 -Xmx250m gc.g1.TestMarkStackOverflow
 */

public class TestMarkStackOverflow {
    public static void main(String[] args) throws Exception {
        for (int i = 0; i < 10; i++) {
            Finalizable holder1 = new Finalizable();
            System.out.printf("Used mem %.2f MB\n", getUsedMem());
        }
    }

    private static double getUsedMem() {
        return (Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / (double) (1024 * 1024);
    }

    private static class Finalizable {
        public static final int NUM_OBJECTS = 200_000;
        private final LinkedHashMap<Object, Object> list = new LinkedHashMap<>();

        public Finalizable() {
            for (int i = 0; i < NUM_OBJECTS; i++) {
                Object entry = new Object();
                list.put(entry, entry);
            }
        }

        @SuppressWarnings("removal")
        protected void finalize() {
            System.out.print("");
        }
    }
}
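A note on the flag choices: -XX:MarkStackSize=1 requests the smallest possible stack, which G1CMMarkStack::initialize() above rounds up to a single chunk, so the finalizable object graphs built by this test overflow the global mark stack almost immediately during reference processing. Before this change such an overflow tripped the "Overflow during reference processing" assert that the ProblemList entries below refer to; with it, the stack instead grows on demand toward MarkStackSizeMax.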

test/langtools/ProblemList.txt

@@ -66,8 +66,6 @@ tools/javac/annotations/typeAnnotations/referenceinfos/Lambda.java
 tools/javac/annotations/typeAnnotations/referenceinfos/NestedTypes.java 8057687 generic-all emit correct byte code an attributes for type annotations
 tools/javac/warnings/suppress/TypeAnnotations.java 8057683 generic-all improve ordering of errors with type annotations
 tools/javac/modules/SourceInSymlinkTest.java 8180263 windows-all fails when run on a subst drive
-tools/javac/lambda/bytecode/TestLambdaBytecodeTargetRelease14.java 8312534 linux-i586 fails with assert "g1ConcurrentMark.cpp: Overflow during reference processing"
-tools/javac/varargs/warning/Warn5.java 8312534 linux-i586 fails with assert "g1ConcurrentMark.cpp: Overflow during reference processing"
 
 ###########################################################################
 #