8327452: G1: Improve scalability of Merge Log Buffers

Reviewed-by: kbarrett, tschatzl
This commit is contained in:
Ivan Walulya 2024-03-13 10:15:11 +00:00
parent 0ae4fa71e4
commit 49d8008947
4 changed files with 79 additions and 11 deletions

@ -174,6 +174,7 @@ void G1GCPhaseTimes::reset() {
_cur_merge_heap_roots_time_ms = 0.0;
_cur_optional_merge_heap_roots_time_ms = 0.0;
_cur_prepare_merge_heap_roots_time_ms = 0.0;
_cur_distribute_log_buffers_time_ms = 0.0;
_cur_optional_prepare_merge_heap_roots_time_ms = 0.0;
_cur_pre_evacuate_prepare_time_ms = 0.0;
_cur_post_evacuate_cleanup_1_time_ms = 0.0;
@ -459,6 +460,7 @@ double G1GCPhaseTimes::print_evacuate_initial_collection_set() const {
debug_time("Prepare Merge Heap Roots", _cur_prepare_merge_heap_roots_time_ms);
debug_phase_merge_remset();
debug_time("Distribute Log Buffers", _cur_distribute_log_buffers_time_ms);
debug_phase(_gc_par_phases[MergeLB]);
info_time("Evacuate Collection Set", _cur_collection_initial_evac_time_ms);

@ -181,6 +181,8 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
double _cur_prepare_merge_heap_roots_time_ms;
double _cur_optional_prepare_merge_heap_roots_time_ms;
double _cur_distribute_log_buffers_time_ms;
double _cur_pre_evacuate_prepare_time_ms;
double _cur_post_evacuate_cleanup_1_time_ms;
@ -304,6 +306,10 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
_cur_prepare_merge_heap_roots_time_ms += ms;
}
// Accumulate (note: +=, not =) the time in ms spent sequentially distributing
// completed dirty-card log buffers to per-worker stacks ahead of the merge phase.
// Reset to 0.0 in G1GCPhaseTimes::reset() alongside the other merge-phase timers.
void record_distribute_log_buffers_time_ms(double ms) {
_cur_distribute_log_buffers_time_ms += ms;
}
// Accumulate the time in ms spent preparing the merge of heap roots for the
// optional collection set ("record_or_add": adds to any previously recorded value).
void record_or_add_optional_prepare_merge_heap_roots_time(double ms) {
_cur_optional_prepare_merge_heap_roots_time_ms += ms;
}
@ -376,6 +382,10 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
return _cur_collection_start_sec;
}
// Accessor for the accumulated log-buffer distribution time (ms); consumed by
// G1Policy when estimating logged-card processing and card-merge costs.
double cur_distribute_log_buffers_time_ms() {
return _cur_distribute_log_buffers_time_ms;
}
double cur_collection_par_time_ms() {
return _cur_collection_initial_evac_time_ms +
_cur_optional_evac_time_ms +

@ -774,6 +774,10 @@ double G1Policy::logged_cards_processing_time() const {
size_t logged_dirty_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards);
size_t scan_heap_roots_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) +
phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards);
double merge_logged_cards_time = average_time_ms(G1GCPhaseTimes::MergeLB) +
phase_times()->cur_distribute_log_buffers_time_ms();
// Approximate the time spent processing cards from log buffers by scaling
// the total processing time by the ratio of logged cards to total cards
// processed. There might be duplicate cards in different log buffers,
@ -783,9 +787,9 @@ double G1Policy::logged_cards_processing_time() const {
// counts are zero, which happens especially during early GCs. So ascribe
// all of the time to the logged cards unless there are more total cards.
if (logged_dirty_cards >= scan_heap_roots_cards) {
return all_cards_processing_time + average_time_ms(G1GCPhaseTimes::MergeLB);
return all_cards_processing_time + merge_logged_cards_time;
}
return (all_cards_processing_time * logged_dirty_cards / scan_heap_roots_cards) + average_time_ms(G1GCPhaseTimes::MergeLB);
return (all_cards_processing_time * logged_dirty_cards / scan_heap_roots_cards) + merge_logged_cards_time;
}
// Anything below that is considered to be zero
@ -874,6 +878,7 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar
double avg_time_merge_cards = average_time_ms(G1GCPhaseTimes::MergeER) +
average_time_ms(G1GCPhaseTimes::MergeRS) +
average_time_ms(G1GCPhaseTimes::MergeLB) +
p->cur_distribute_log_buffers_time_ms() +
average_time_ms(G1GCPhaseTimes::OptMergeRS);
_analytics->report_cost_per_card_merge_ms(avg_time_merge_cards / total_cards_merged, is_young_only_pause);
}

@ -1258,39 +1258,90 @@ class G1MergeHeapRootsTask : public WorkerTask {
size_t cards_skipped() const { return _cards_skipped; }
};
HeapRegionClaimer _hr_claimer;
uint _num_workers;
G1RemSetScanState* _scan_state;
BufferNode::Stack _dirty_card_buffers;
// To mitigate contention due multiple threads accessing and popping BufferNodes from a shared
// G1DirtyCardQueueSet, we implement a sequential distribution phase. Here, BufferNodes are
// distributed to worker threads in a sequential manner utilizing the _dirty_card_buffers. By doing
// so, we effectively alleviate the bottleneck encountered during pop operations on the
// G1DirtyCardQueueSet. Importantly, this approach preserves the helping aspect among worker
// threads, allowing them to assist one another in case of imbalances in work distribution.
BufferNode::Stack* _dirty_card_buffers;
bool _initial_evacuation;
volatile bool _fast_reclaim_handled;
// Apply the given closure to all dirty card buffers, then return each buffer
// to the queue set's allocator.
//
// Defect fixed: this span contained diff residue — the old single-shared-stack
// `while (_dirty_card_buffers.pop())` loop was left in place alongside the new
// per-worker implementation, which cannot compile now that _dirty_card_buffers
// is an array (BufferNode::Stack*). Only the new implementation is kept.
//
// Each worker starts with its own stack (index == worker_id) to avoid
// contention, then wraps around to the other workers' stacks so idle workers
// help drain any imbalance in the distribution.
void apply_closure_to_dirty_card_buffers(G1MergeLogBufferCardsClosure* cl, uint worker_id) {
  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
  for (uint i = 0; i < _num_workers; i++) {
    uint index = (worker_id + i) % _num_workers;
    while (BufferNode* node = _dirty_card_buffers[index].pop()) {
      cl->apply_to_buffer(node, worker_id);
      // Buffers are consumed exactly once; hand them back for reuse.
      dcqs.deallocate_buffer(node);
    }
  }
}
public:
// Construct the merge task. During the initial evacuation we take all
// completed dirty-card buffers from the global queue set and distribute them
// sequentially across per-worker stacks (see the comment at _dirty_card_buffers),
// so workers do not contend on a single shared stack during the merge phase.
//
// Defects fixed in this span (diff residue from the scraped patch):
//  - _dirty_card_buffers was initialized twice in the member-initializer list
//    (old `_dirty_card_buffers(),` and new `_dirty_card_buffers(nullptr),`);
//    only the nullptr initialization of the pointer member is kept.
//  - The old `_dirty_card_buffers.prepend(*buffers._head, *buffers._tail);`
//    (whole-list prepend onto a single stack) was left in alongside the new
//    chunked distribution loop; it is removed.
G1MergeHeapRootsTask(G1RemSetScanState* scan_state, uint num_workers, bool initial_evacuation) :
  WorkerTask("G1 Merge Heap Roots"),
  _hr_claimer(num_workers),
  _num_workers(num_workers),
  _scan_state(scan_state),
  _dirty_card_buffers(nullptr),
  _initial_evacuation(initial_evacuation),
  _fast_reclaim_handled(false)
{
  if (initial_evacuation) {
    Ticks start = Ticks::now();

    // One stack per worker; placement-new each element since the C-heap
    // array macro only allocates raw storage.
    _dirty_card_buffers = NEW_C_HEAP_ARRAY(BufferNode::Stack, num_workers, mtGC);
    for (uint i = 0; i < num_workers; i++) {
      new (&_dirty_card_buffers[i]) BufferNode::Stack();
    }

    G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
    BufferNodeList buffers = dcqs.take_all_completed_buffers();
    if (buffers._entry_count != 0) {
      // Target roughly equal card counts per worker; round up so the sum of
      // per-worker targets covers all entries.
      size_t entries_per_thread = ceil(buffers._entry_count / (double)num_workers);
      BufferNode* head = buffers._head;
      BufferNode* tail = head;
      uint worker = 0;
      // Walk the singly-linked buffer list, cutting off a [head, tail] chunk
      // of at least entries_per_thread cards for each worker in turn.
      while (tail != nullptr) {
        size_t count = tail->size();
        BufferNode* cur = tail->next();
        while (count < entries_per_thread && cur != nullptr) {
          tail = cur;
          count += tail->size();
          cur = tail->next();
        }
        // Terminate the chunk before handing it to a worker stack.
        tail->set_next(nullptr);
        _dirty_card_buffers[worker++ % num_workers].prepend(*head, *tail);
        // If we stopped before the list end, cur continues the list;
        // otherwise we must have consumed up to the original tail.
        assert(cur != nullptr || tail == buffers._tail, "Must be");
        head = cur;
        tail = cur;
      }
    }

    // Record the sequential distribution cost so G1Policy can account for it
    // when estimating card-merge time.
    Tickspan total = Ticks::now() - start;
    G1CollectedHeap::heap()->phase_times()->record_distribute_log_buffers_time_ms(total.seconds() * 1000.0);
  }
}
// Tear down the per-worker buffer stacks created for the initial evacuation.
// _dirty_card_buffers is nullptr when this task ran for an optional evacuation,
// in which case there is nothing to free.
~G1MergeHeapRootsTask() {
  if (_dirty_card_buffers == nullptr) {
    return;
  }
  using Stack = BufferNode::Stack;
  // The stacks were placement-new'ed into a raw C-heap array, so destroy each
  // element explicitly before releasing the storage.
  for (uint idx = 0; idx < _num_workers; ++idx) {
    _dirty_card_buffers[idx].~Stack();
  }
  FREE_C_HEAP_ARRAY(Stack, _dirty_card_buffers);
}
virtual void work(uint worker_id) {
G1CollectedHeap* g1h = G1CollectedHeap::heap();
G1GCPhaseTimes* p = g1h->phase_times();