8087198: G1 card refinement: batching, sorting

Reviewed-by: tschatzl, kbarrett
2019-11-22 17:03:55 -08:00 · 2019-11-22 17:03:55 -08:00 · 5f2ac35cd0
commit 5f2ac35cd0
parent 6025207be8
3 changed files with 168 additions and 37 deletions
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
@ -41,6 +41,7 @@
 #include "runtime/safepoint.hpp"
 #include "runtime/thread.inline.hpp"
 #include "runtime/threadSMR.hpp"
+#include "utilities/quickSort.hpp"

 G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
  // Dirty card queues are always active, so we create them with their
@ -226,21 +227,127 @@ G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() {
  return result;
 }

+class G1RefineBufferedCards : public StackObj {
+  BufferNode* const _node;
+  CardTable::CardValue** const _node_buffer;
+  const size_t _node_buffer_size;
+  const uint _worker_id;
+  size_t* _total_refined_cards;
+  G1RemSet* const _g1rs;
+
+  static inline int compare_card(const CardTable::CardValue* p1,
+                                 const CardTable::CardValue* p2) {
+    return p2 - p1;
+  }
+
+  // Sorts the cards from start_index to _node_buffer_size in *decreasing*
+  // address order. Tests showed that this order is preferable to not sorting
+  // or increasing address order.
+  void sort_cards(size_t start_index) {
+    QuickSort::sort(&_node_buffer[start_index],
+                    _node_buffer_size - start_index,
+                    compare_card,
+                    false);
+  }
+
+  // Returns the index to the first clean card in the buffer.
+  size_t clean_cards() {
+    const size_t start = _node->index();
+    assert(start <= _node_buffer_size, "invariant");
+
+    // Two-fingered compaction algorithm similar to the filtering mechanism in
+    // SATBMarkQueue. The main difference is that clean_card_before_refine()
+    // could change the buffer element in-place.
+    // We don't check for SuspendibleThreadSet::should_yield(), because
+    // cleaning and redirtying the cards is fast.
+    CardTable::CardValue** src = &_node_buffer[start];
+    CardTable::CardValue** dst = &_node_buffer[_node_buffer_size];
+    assert(src <= dst, "invariant");
+    for ( ; src < dst; ++src) {
+      // Search low to high for a card to keep.
+      if (_g1rs->clean_card_before_refine(src)) {
+        // Found keeper.  Search high to low for a card to discard.
+        while (src < --dst) {
+          if (!_g1rs->clean_card_before_refine(dst)) {
+            *dst = *src;         // Replace discard with keeper.
+            break;
+          }
+        }
+        // If discard search failed (src == dst), the outer loop will also end.
+      }
+    }
+
+    // dst points to the first retained clean card, or the end of the buffer
+    // if all the cards were discarded.
+    const size_t first_clean = dst - _node_buffer;
+    assert(first_clean >= start && first_clean <= _node_buffer_size, "invariant");
+    // Discarded cards are considered as refined.
+    *_total_refined_cards += first_clean - start;
+    return first_clean;
+  }
+
+  bool refine_cleaned_cards(size_t start_index) {
+    bool result = true;
+    size_t i = start_index;
+    for ( ; i < _node_buffer_size; ++i) {
+      if (SuspendibleThreadSet::should_yield()) {
+        redirty_unrefined_cards(i);
+        result = false;
+        break;
+      }
+      _g1rs->refine_card_concurrently(_node_buffer[i], _worker_id);
+    }
+    _node->set_index(i);
+    *_total_refined_cards += i - start_index;
+    return result;
+  }
+
+  void redirty_unrefined_cards(size_t start) {
+    for ( ; start < _node_buffer_size; ++start) {
+      *_node_buffer[start] = G1CardTable::dirty_card_val();
+    }
+  }
+
+public:
+  G1RefineBufferedCards(BufferNode* node,
+                        size_t node_buffer_size,
+                        uint worker_id,
+                        size_t* total_refined_cards) :
+    _node(node),
+    _node_buffer(reinterpret_cast<CardTable::CardValue**>(BufferNode::make_buffer_from_node(node))),
+    _node_buffer_size(node_buffer_size),
+    _worker_id(worker_id),
+    _total_refined_cards(total_refined_cards),
+    _g1rs(G1CollectedHeap::heap()->rem_set()) {}
+
+  bool refine() {
+    size_t first_clean_index = clean_cards();
+    if (first_clean_index == _node_buffer_size) {
+      _node->set_index(first_clean_index);
+      return true;
+    }
+    // This fence serves two purposes. First, the cards must be cleaned
+    // before processing the contents. Second, we can't proceed with
+    // processing a region until after the read of the region's top in
+    // collect_and_clean_cards(), for synchronization with possibly concurrent
+    // humongous object allocation (see comment at the StoreStore fence before
+    // setting the regions' tops in humongous allocation path).
+    // It's okay that reading region's top and reading region's type were racy
+    // wrto each other. We need both set, in any order, to proceed.
+    OrderAccess::fence();
+    sort_cards(first_clean_index);
+    return refine_cleaned_cards(first_clean_index);
+  }
+};
+
 bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
                                        uint worker_id,
                                        size_t* total_refined_cards) {
-  G1RemSet* rem_set = G1CollectedHeap::heap()->rem_set();
-  size_t size = buffer_size();
-  void** buffer = BufferNode::make_buffer_from_node(node);
-  size_t i = node->index();
-  assert(i <= size, "invariant");
-  for ( ; (i < size) && !SuspendibleThreadSet::should_yield(); ++i) {
-    CardTable::CardValue* cp = static_cast<CardTable::CardValue*>(buffer[i]);
-    rem_set->refine_card_concurrently(cp, worker_id);
-  }
-  *total_refined_cards += (i - node->index());
-  node->set_index(i);
-  return i == size;
+  G1RefineBufferedCards buffered_cards(node,
+                                       buffer_size(),
+                                       worker_id,
+                                       total_refined_cards);
+  return buffered_cards.refine();
 }

 #ifndef ASSERT
--- a/src/hotspot/share/gc/g1/g1RemSet.cpp
+++ b/src/hotspot/share/gc/g1/g1RemSet.cpp
@ -1261,25 +1261,27 @@ inline void check_card_ptr(CardTable::CardValue* card_ptr, G1CardTable* ct) {
 #endif
 }

-void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
-                                        uint worker_id) {
+bool G1RemSet::clean_card_before_refine(CardValue** const card_ptr_addr) {
  assert(!_g1h->is_gc_active(), "Only call concurrently");

-  // Construct the region representing the card.
+  CardValue* card_ptr = *card_ptr_addr;
+  // Find the start address represented by the card.
  HeapWord* start = _ct->addr_for(card_ptr);
  // And find the region containing it.
  HeapRegion* r = _g1h->heap_region_containing_or_null(start);

  // If this is a (stale) card into an uncommitted region, exit.
  if (r == NULL) {
-    return;
+    return false;
  }

  check_card_ptr(card_ptr, _ct);

  // If the card is no longer dirty, nothing to do.
+  // We cannot load the card value before the "r == NULL" check, because G1
+  // could uncommit parts of the card table covering uncommitted regions.
  if (*card_ptr != G1CardTable::dirty_card_val()) {
-    return;
+    return false;
  }

  // This check is needed for some uncommon cases where we should
@ -1302,7 +1304,7 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
  // enqueueing of the card and processing it here will have ensured
  // we see the up-to-date region type here.
  if (!r->is_old_or_humongous_or_archive()) {
-    return;
+    return false;
  }

  // The result from the hot card cache insert call is either:
@ -1321,7 +1323,7 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
    card_ptr = _hot_card_cache->insert(card_ptr);
    if (card_ptr == NULL) {
      // There was no eviction. Nothing to do.
-      return;
+      return false;
    } else if (card_ptr != orig_card_ptr) {
      // Original card was inserted and an old card was evicted.
      start = _ct->addr_for(card_ptr);
@ -1331,8 +1333,9 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
      // ignored, as discussed earlier for the original card.  The
      // region could have been freed while in the cache.
      if (!r->is_old_or_humongous_or_archive()) {
-        return;
+        return false;
      }
+      *card_ptr_addr = card_ptr;
    } // Else we still have the original card.
  }

@ -1341,18 +1344,19 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
  // (part of) an object at the end of the allocated space and extend
  // beyond the end of allocation.

-  // Non-humongous objects are only allocated in the old-gen during
-  // GC, so if region is old then top is stable.  Humongous object
-  // allocation sets top last; if top has not yet been set, this is
-  // a stale card and we'll end up with an empty intersection.  If
-  // this is not a stale card, the synchronization between the
+  // Non-humongous objects are either allocated in the old regions during GC,
+  // or mapped in archive regions during startup. So if region is old or
+  // archive then top is stable.
+  // Humongous object allocation sets top last; if top has not yet been set,
+  // this is a stale card and we'll end up with an empty intersection.
+  // If this is not a stale card, the synchronization between the
  // enqueuing of the card and processing it here will have ensured
  // we see the up-to-date top here.
  HeapWord* scan_limit = r->top();

  if (scan_limit <= start) {
    // If the trimmed region is empty, the card must be stale.
-    return;
+    return false;
  }

  // Okay to clean and process the card now.  There are still some
@ -1360,13 +1364,26 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
  // as iteration failure.
  *const_cast<volatile CardValue*>(card_ptr) = G1CardTable::clean_card_val();

-  // This fence serves two purposes.  First, the card must be cleaned
-  // before processing the contents.  Second, we can't proceed with
-  // processing until after the read of top, for synchronization with
-  // possibly concurrent humongous object allocation.  It's okay that
-  // reading top and reading type were racy wrto each other.  We need
-  // both set, in any order, to proceed.
-  OrderAccess::fence();
+  return true;
+}
+
+void G1RemSet::refine_card_concurrently(CardValue* const card_ptr,
+                                        const uint worker_id) {
+  assert(!_g1h->is_gc_active(), "Only call concurrently");
+  check_card_ptr(card_ptr, _ct);
+
+  // Construct the MemRegion representing the card.
+  HeapWord* start = _ct->addr_for(card_ptr);
+  // And find the region containing it.
+  HeapRegion* r = _g1h->heap_region_containing(start);
+  // This reload of the top is safe even though it happens after the full
+  // fence, because top is stable for old, archive and unfiltered humongous
+  // regions, so it must return the same value as the previous load when
+  // cleaning the card. Also cleaning the card and refinement of the card
+  // cannot span across safepoint, so we don't need to worry about top being
+  // changed during safepoint.
+  HeapWord* scan_limit = r->top();
+  assert(scan_limit > start, "sanity");

  // Don't use addr_for(card_ptr + 1) which can ask for
  // a card beyond the heap.
--- a/src/hotspot/share/gc/g1/g1RemSet.hpp
+++ b/src/hotspot/share/gc/g1/g1RemSet.hpp
@ -113,10 +113,17 @@ public:
                                   G1GCPhaseTimes::GCParPhases coderoots_phase,
                                   G1GCPhaseTimes::GCParPhases objcopy_phase);

-  // Refine the card corresponding to "card_ptr". Safe to be called concurrently
-  // to the mutator.
-  void refine_card_concurrently(CardValue* card_ptr,
-                                uint worker_id);
+  // Two methods for concurrent refinement support, executed concurrently to
+  // the mutator:
+  // Cleans the card at "*card_ptr_addr" before refinement, returns true iff the
+  // card needs later refinement. Note that "*card_ptr_addr" could be updated to
+  // a different card due to use of hot card cache.
+  bool clean_card_before_refine(CardValue** const card_ptr_addr);
+  // Refine the region corresponding to "card_ptr". Must be called after
+  // being filtered by clean_card_before_refine(), and after proper
+  // fence/synchronization.
+  void refine_card_concurrently(CardValue* const card_ptr,
+                                const uint worker_id);

  // Print accumulated summary info from the start of the VM.
  void print_summary_info();