8087198: G1 card refinement: batching, sorting

Reviewed-by: tschatzl, kbarrett
This commit is contained in:
Man Cao 2019-11-22 17:03:55 -08:00
parent 6025207be8
commit 5f2ac35cd0
3 changed files with 168 additions and 37 deletions

View File

@ -41,6 +41,7 @@
#include "runtime/safepoint.hpp"
#include "runtime/thread.inline.hpp"
#include "runtime/threadSMR.hpp"
#include "utilities/quickSort.hpp"
G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
// Dirty card queues are always active, so we create them with their
@ -226,21 +227,127 @@ G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() {
return result;
}
// Refines the cards held in one buffer node in three batched steps:
//   1. clean_cards():          clean every card and compact the buffer so that
//                              only cards that still need refinement remain,
//   2. sort_cards():           sort the retained cards by decreasing address,
//   3. refine_cleaned_cards(): refine the retained cards one by one, stopping
//                              early if SuspendibleThreadSet::should_yield().
// Intended to be stack-allocated for the processing of a single node.
class G1RefineBufferedCards : public StackObj {
  // Node whose buffer is being processed; its index is updated as we go.
  BufferNode* const _node;
  // The node's buffer viewed as an array of card pointers.
  CardTable::CardValue** const _node_buffer;
  // One-past-the-end index of _node_buffer.
  const size_t _node_buffer_size;
  // Passed through to G1RemSet::refine_card_concurrently().
  const uint _worker_id;
  // Caller-owned counter; incremented by the number of cards handled here.
  size_t* _total_refined_cards;
  G1RemSet* const _g1rs;

  // Three-way comparison that orders cards by *decreasing* address: the result
  // is positive when p1 is below p2, negative when p1 is above p2, and zero
  // when they are equal.
  // Only the sign of the pointer difference is returned. Returning the raw
  // difference (a ptrdiff_t) through an int would narrow it, which can flip
  // the sign when the two cards are more than INT_MAX bytes apart.
  static inline int compare_card(const CardTable::CardValue* p1,
                                 const CardTable::CardValue* p2) {
    if (p1 == p2) {
      return 0;
    }
    return (p1 < p2) ? 1 : -1;
  }

  // Sorts the cards from start_index to _node_buffer_size in *decreasing*
  // address order. Tests showed that this order is preferable to not sorting
  // or increasing address order.
  void sort_cards(size_t start_index) {
    QuickSort::sort(&_node_buffer[start_index],
                    _node_buffer_size - start_index,
                    compare_card,
                    false);
  }

  // Cleans all cards in [_node->index(), _node_buffer_size) and compacts the
  // cards that need refinement towards the end of the buffer.
  // Returns the index to the first clean card in the buffer; entries below
  // that index were discarded and are counted in *_total_refined_cards.
  size_t clean_cards() {
    const size_t start = _node->index();
    assert(start <= _node_buffer_size, "invariant");

    // Two-fingered compaction algorithm similar to the filtering mechanism in
    // SATBMarkQueue. The main difference is that clean_card_before_refine()
    // could change the buffer element in-place.
    // We don't check for SuspendibleThreadSet::should_yield(), because
    // cleaning and redirtying the cards is fast.
    CardTable::CardValue** src = &_node_buffer[start];
    CardTable::CardValue** dst = &_node_buffer[_node_buffer_size];
    assert(src <= dst, "invariant");
    for ( ; src < dst; ++src) {
      // Search low to high for a card to keep.
      if (_g1rs->clean_card_before_refine(src)) {
        // Found keeper. Search high to low for a card to discard.
        while (src < --dst) {
          if (!_g1rs->clean_card_before_refine(dst)) {
            *dst = *src;         // Replace discard with keeper.
            break;
          }
        }
        // If discard search failed (src == dst), the outer loop will also end.
      }
    }

    // dst points to the first retained clean card, or the end of the buffer
    // if all the cards were discarded.
    const size_t first_clean = dst - _node_buffer;
    assert(first_clean >= start && first_clean <= _node_buffer_size, "invariant");
    // Discarded cards are considered as refined.
    *_total_refined_cards += first_clean - start;
    return first_clean;
  }

  // Refines the already cleaned cards in [start_index, _node_buffer_size).
  // Returns true if every card was refined. Returns false if a yield was
  // requested first; in that case the cards not yet refined are re-dirtied.
  // Either way the node's index is set to the first unprocessed entry and
  // *_total_refined_cards is increased by the number of cards refined.
  bool refine_cleaned_cards(size_t start_index) {
    bool result = true;
    size_t i = start_index;
    for ( ; i < _node_buffer_size; ++i) {
      if (SuspendibleThreadSet::should_yield()) {
        // The remaining cards were cleaned but will not be refined now, so
        // they are marked dirty again before we stop.
        redirty_unrefined_cards(i);
        result = false;
        break;
      }
      _g1rs->refine_card_concurrently(_node_buffer[i], _worker_id);
    }
    _node->set_index(i);
    *_total_refined_cards += i - start_index;
    return result;
  }

  // Marks every card in [start, _node_buffer_size) as dirty again.
  void redirty_unrefined_cards(size_t start) {
    for ( ; start < _node_buffer_size; ++start) {
      *_node_buffer[start] = G1CardTable::dirty_card_val();
    }
  }

public:
  // node:                the buffer node to process.
  // node_buffer_size:    number of entries in the node's buffer.
  // worker_id:           id handed to the remembered set when refining.
  // total_refined_cards: counter to add the number of processed cards to.
  G1RefineBufferedCards(BufferNode* node,
                        size_t node_buffer_size,
                        uint worker_id,
                        size_t* total_refined_cards) :
    _node(node),
    _node_buffer(reinterpret_cast<CardTable::CardValue**>(BufferNode::make_buffer_from_node(node))),
    _node_buffer_size(node_buffer_size),
    _worker_id(worker_id),
    _total_refined_cards(total_refined_cards),
    _g1rs(G1CollectedHeap::heap()->rem_set()) {}

  // Cleans, sorts and refines the cards of the node.
  // Returns true if the whole buffer was processed, false if processing
  // stopped early because of a yield request.
  bool refine() {
    size_t first_clean_index = clean_cards();
    if (first_clean_index == _node_buffer_size) {
      // Every card was discarded while cleaning; nothing left to refine.
      _node->set_index(first_clean_index);
      return true;
    }
    // This fence serves two purposes. First, the cards must be cleaned
    // before processing the contents. Second, we can't proceed with
    // processing a region until after the read of the region's top in
    // clean_cards(), for synchronization with possibly concurrent
    // humongous object allocation (see comment at the StoreStore fence before
    // setting the regions' tops in humongous allocation path).
    // It's okay that reading region's top and reading region's type were racy
    // with respect to each other. We need both set, in any order, to proceed.
    OrderAccess::fence();
    sort_cards(first_clean_index);
    return refine_cleaned_cards(first_clean_index);
  }
};
// Refines the cards of the given buffer node using the batched
// clean / sort / refine steps implemented by G1RefineBufferedCards.
//   node:                the buffer node to process; its index is advanced
//                        past the entries that were handled.
//   worker_id:           id handed to the remembered set when refining.
//   total_refined_cards: counter incremented by the number of cards handled.
// Returns true if the whole buffer was processed, false if processing
// stopped early because SuspendibleThreadSet::should_yield() was set.
bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
                                        uint worker_id,
                                        size_t* total_refined_cards) {
  // All per-card work is delegated; the former card-at-a-time loop returned
  // before this path could run and skipped the cleaning step.
  G1RefineBufferedCards buffered_cards(node,
                                       buffer_size(),
                                       worker_id,
                                       total_refined_cards);
  return buffered_cards.refine();
}
#ifndef ASSERT

View File

@ -1261,25 +1261,27 @@ inline void check_card_ptr(CardTable::CardValue* card_ptr, G1CardTable* ct) {
#endif
}
void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
uint worker_id) {
bool G1RemSet::clean_card_before_refine(CardValue** const card_ptr_addr) {
assert(!_g1h->is_gc_active(), "Only call concurrently");
// Construct the region representing the card.
CardValue* card_ptr = *card_ptr_addr;
// Find the start address represented by the card.
HeapWord* start = _ct->addr_for(card_ptr);
// And find the region containing it.
HeapRegion* r = _g1h->heap_region_containing_or_null(start);
// If this is a (stale) card into an uncommitted region, exit.
if (r == NULL) {
return;
return false;
}
check_card_ptr(card_ptr, _ct);
// If the card is no longer dirty, nothing to do.
// We cannot load the card value before the "r == NULL" check, because G1
// could uncommit parts of the card table covering uncommitted regions.
if (*card_ptr != G1CardTable::dirty_card_val()) {
return;
return false;
}
// This check is needed for some uncommon cases where we should
@ -1302,7 +1304,7 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
// enqueueing of the card and processing it here will have ensured
// we see the up-to-date region type here.
if (!r->is_old_or_humongous_or_archive()) {
return;
return false;
}
// The result from the hot card cache insert call is either:
@ -1321,7 +1323,7 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
card_ptr = _hot_card_cache->insert(card_ptr);
if (card_ptr == NULL) {
// There was no eviction. Nothing to do.
return;
return false;
} else if (card_ptr != orig_card_ptr) {
// Original card was inserted and an old card was evicted.
start = _ct->addr_for(card_ptr);
@ -1331,8 +1333,9 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
// ignored, as discussed earlier for the original card. The
// region could have been freed while in the cache.
if (!r->is_old_or_humongous_or_archive()) {
return;
return false;
}
*card_ptr_addr = card_ptr;
} // Else we still have the original card.
}
@ -1341,18 +1344,19 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
// (part of) an object at the end of the allocated space and extend
// beyond the end of allocation.
// Non-humongous objects are only allocated in the old-gen during
// GC, so if region is old then top is stable. Humongous object
// allocation sets top last; if top has not yet been set, this is
// a stale card and we'll end up with an empty intersection. If
// this is not a stale card, the synchronization between the
// Non-humongous objects are either allocated in the old regions during GC,
// or mapped in archive regions during startup. So if region is old or
// archive then top is stable.
// Humongous object allocation sets top last; if top has not yet been set,
// this is a stale card and we'll end up with an empty intersection.
// If this is not a stale card, the synchronization between the
// enqueuing of the card and processing it here will have ensured
// we see the up-to-date top here.
HeapWord* scan_limit = r->top();
if (scan_limit <= start) {
// If the trimmed region is empty, the card must be stale.
return;
return false;
}
// Okay to clean and process the card now. There are still some
@ -1360,13 +1364,26 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
// as iteration failure.
*const_cast<volatile CardValue*>(card_ptr) = G1CardTable::clean_card_val();
// This fence serves two purposes. First, the card must be cleaned
// before processing the contents. Second, we can't proceed with
// processing until after the read of top, for synchronization with
// possibly concurrent humongous object allocation. It's okay that
// reading top and reading type were racy wrto each other. We need
// both set, in any order, to proceed.
OrderAccess::fence();
return true;
}
void G1RemSet::refine_card_concurrently(CardValue* const card_ptr,
const uint worker_id) {
assert(!_g1h->is_gc_active(), "Only call concurrently");
check_card_ptr(card_ptr, _ct);
// Construct the MemRegion representing the card.
HeapWord* start = _ct->addr_for(card_ptr);
// And find the region containing it.
HeapRegion* r = _g1h->heap_region_containing(start);
// This reload of the top is safe even though it happens after the full
// fence, because top is stable for old, archive and unfiltered humongous
// regions, so it must return the same value as the previous load when
// cleaning the card. Also cleaning the card and refinement of the card
// cannot span across safepoint, so we don't need to worry about top being
// changed during safepoint.
HeapWord* scan_limit = r->top();
assert(scan_limit > start, "sanity");
// Don't use addr_for(card_ptr + 1) which can ask for
// a card beyond the heap.

View File

@ -113,10 +113,17 @@ public:
G1GCPhaseTimes::GCParPhases coderoots_phase,
G1GCPhaseTimes::GCParPhases objcopy_phase);
// Refine the card corresponding to "card_ptr". Safe to be called concurrently
// to the mutator.
void refine_card_concurrently(CardValue* card_ptr,
uint worker_id);
// Two methods for concurrent refinement support, executed concurrently to
// the mutator:
// Cleans the card at "*card_ptr_addr" before refinement, returns true iff the
// card needs later refinement. Note that "*card_ptr_addr" could be updated to
// a different card due to use of hot card cache.
bool clean_card_before_refine(CardValue** const card_ptr_addr);
// Refine the region corresponding to "card_ptr". Must be called after
// being filtered by clean_card_before_refine(), and after proper
// fence/synchronization.
void refine_card_concurrently(CardValue* const card_ptr,
const uint worker_id);
// Print accumulated summary info from the start of the VM.
void print_summary_info();