8327452: G1: Improve scalability of Merge Log Buffers
Reviewed-by: kbarrett, tschatzl
This commit is contained in:
parent
0ae4fa71e4
commit
49d8008947
src/hotspot/share/gc/g1
@ -174,6 +174,7 @@ void G1GCPhaseTimes::reset() {
|
||||
_cur_merge_heap_roots_time_ms = 0.0;
|
||||
_cur_optional_merge_heap_roots_time_ms = 0.0;
|
||||
_cur_prepare_merge_heap_roots_time_ms = 0.0;
|
||||
_cur_distribute_log_buffers_time_ms = 0.0;
|
||||
_cur_optional_prepare_merge_heap_roots_time_ms = 0.0;
|
||||
_cur_pre_evacuate_prepare_time_ms = 0.0;
|
||||
_cur_post_evacuate_cleanup_1_time_ms = 0.0;
|
||||
@ -459,6 +460,7 @@ double G1GCPhaseTimes::print_evacuate_initial_collection_set() const {
|
||||
debug_time("Prepare Merge Heap Roots", _cur_prepare_merge_heap_roots_time_ms);
|
||||
debug_phase_merge_remset();
|
||||
|
||||
debug_time("Distribute Log Buffers", _cur_distribute_log_buffers_time_ms);
|
||||
debug_phase(_gc_par_phases[MergeLB]);
|
||||
|
||||
info_time("Evacuate Collection Set", _cur_collection_initial_evac_time_ms);
|
||||
|
@ -181,6 +181,8 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
|
||||
double _cur_prepare_merge_heap_roots_time_ms;
|
||||
double _cur_optional_prepare_merge_heap_roots_time_ms;
|
||||
|
||||
double _cur_distribute_log_buffers_time_ms;
|
||||
|
||||
double _cur_pre_evacuate_prepare_time_ms;
|
||||
|
||||
double _cur_post_evacuate_cleanup_1_time_ms;
|
||||
@ -304,6 +306,10 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
|
||||
_cur_prepare_merge_heap_roots_time_ms += ms;
|
||||
}
|
||||
|
||||
// Adds ms (milliseconds) to the running "distribute log buffers" time.
void record_distribute_log_buffers_time_ms(double ms) {
  _cur_distribute_log_buffers_time_ms = _cur_distribute_log_buffers_time_ms + ms;
}
|
||||
|
||||
// Adds ms (milliseconds) to the running optional "prepare merge heap roots" time.
void record_or_add_optional_prepare_merge_heap_roots_time(double ms) {
  _cur_optional_prepare_merge_heap_roots_time_ms = _cur_optional_prepare_merge_heap_roots_time_ms + ms;
}
|
||||
@ -376,6 +382,10 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
|
||||
return _cur_collection_start_sec;
|
||||
}
|
||||
|
||||
double cur_distribute_log_buffers_time_ms() {
|
||||
return _cur_distribute_log_buffers_time_ms;
|
||||
}
|
||||
|
||||
double cur_collection_par_time_ms() {
|
||||
return _cur_collection_initial_evac_time_ms +
|
||||
_cur_optional_evac_time_ms +
|
||||
|
@ -774,6 +774,10 @@ double G1Policy::logged_cards_processing_time() const {
|
||||
size_t logged_dirty_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards);
|
||||
size_t scan_heap_roots_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) +
|
||||
phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards);
|
||||
|
||||
double merge_logged_cards_time = average_time_ms(G1GCPhaseTimes::MergeLB) +
|
||||
phase_times()->cur_distribute_log_buffers_time_ms();
|
||||
|
||||
// Approximate the time spent processing cards from log buffers by scaling
|
||||
// the total processing time by the ratio of logged cards to total cards
|
||||
// processed. There might be duplicate cards in different log buffers,
|
||||
@ -783,9 +787,9 @@ double G1Policy::logged_cards_processing_time() const {
|
||||
// counts are zero, which happens especially during early GCs. So ascribe
|
||||
// all of the time to the logged cards unless there are more total cards.
|
||||
if (logged_dirty_cards >= scan_heap_roots_cards) {
|
||||
return all_cards_processing_time + average_time_ms(G1GCPhaseTimes::MergeLB);
|
||||
return all_cards_processing_time + merge_logged_cards_time;
|
||||
}
|
||||
return (all_cards_processing_time * logged_dirty_cards / scan_heap_roots_cards) + average_time_ms(G1GCPhaseTimes::MergeLB);
|
||||
return (all_cards_processing_time * logged_dirty_cards / scan_heap_roots_cards) + merge_logged_cards_time;
|
||||
}
|
||||
|
||||
// Anything below that is considered to be zero
|
||||
@ -874,6 +878,7 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar
|
||||
double avg_time_merge_cards = average_time_ms(G1GCPhaseTimes::MergeER) +
|
||||
average_time_ms(G1GCPhaseTimes::MergeRS) +
|
||||
average_time_ms(G1GCPhaseTimes::MergeLB) +
|
||||
p->cur_distribute_log_buffers_time_ms() +
|
||||
average_time_ms(G1GCPhaseTimes::OptMergeRS);
|
||||
_analytics->report_cost_per_card_merge_ms(avg_time_merge_cards / total_cards_merged, is_young_only_pause);
|
||||
}
|
||||
|
@ -1258,39 +1258,90 @@ class G1MergeHeapRootsTask : public WorkerTask {
|
||||
size_t cards_skipped() const { return _cards_skipped; }
|
||||
};
|
||||
|
||||
HeapRegionClaimer _hr_claimer;
|
||||
uint _num_workers;
|
||||
G1RemSetScanState* _scan_state;
|
||||
BufferNode::Stack _dirty_card_buffers;
|
||||
|
||||
// To mitigate contention due to multiple threads accessing and popping BufferNodes from a shared
|
||||
// G1DirtyCardQueueSet, we implement a sequential distribution phase. Here, BufferNodes are
|
||||
// distributed to worker threads in a sequential manner utilizing the _dirty_card_buffers. By doing
|
||||
// so, we effectively alleviate the bottleneck encountered during pop operations on the
|
||||
// G1DirtyCardQueueSet. Importantly, this approach preserves the helping aspect among worker
|
||||
// threads, allowing them to assist one another in case of imbalances in work distribution.
|
||||
BufferNode::Stack* _dirty_card_buffers;
|
||||
|
||||
bool _initial_evacuation;
|
||||
|
||||
volatile bool _fast_reclaim_handled;
|
||||
|
||||
// Applies cl to every BufferNode popped from the dirty card buffer stack(s) and hands each
// processed node back to the G1DirtyCardQueueSet through deallocate_buffer().
//   cl:        closure invoked as cl->apply_to_buffer(node, worker_id) for each popped node.
//   worker_id: id of the calling worker; also selects which per-worker stack is drained first.
// NOTE(review): this text looks like a diff rendering whose +/- markers were lost. The
// single-stack `_dirty_card_buffers.pop()` loop and the indexed per-worker loop appear to be the
// before/after versions of the same code, not two consecutive loops - confirm against the file.
void apply_closure_to_dirty_card_buffers(G1MergeLogBufferCardsClosure* cl, uint worker_id) {
|
||||
G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
|
||||
while (BufferNode* node = _dirty_card_buffers.pop()) {
|
||||
cl->apply_to_buffer(node, worker_id);
|
||||
dcqs.deallocate_buffer(node);
|
||||
// Visit all _num_workers stacks, starting with the one at this worker's own index and wrapping
// around, so a worker that empties its stack goes on to drain the remaining ones.
for (uint i = 0; i < _num_workers; i++) {
|
||||
uint index = (worker_id + i) % _num_workers;
|
||||
// Drain the selected stack until pop() yields a null node.
while (BufferNode* node = _dirty_card_buffers[index].pop()) {
|
||||
cl->apply_to_buffer(node, worker_id);
|
||||
dcqs.deallocate_buffer(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
// Constructs the merge task. When initial_evacuation is true it also allocates one
// BufferNode::Stack per worker, takes all completed buffers from the G1DirtyCardQueueSet,
// splits that list into consecutive runs which are prepended to the per-worker stacks, and
// records the elapsed time through record_distribute_log_buffers_time_ms().
//   scan_state:         stored in _scan_state.
//   num_workers:        stored in _num_workers; sizes _hr_claimer and the stack array.
//   initial_evacuation: stored in _initial_evacuation; gates the distribution work below.
// NOTE(review): this text looks like a diff rendering whose +/- markers were lost. The two
// _dirty_card_buffers initializers, and the `if (buffers._entry_count != 0) {` line with the
// whole-list prepend after it, appear to mix the before and after versions (the braces do not
// balance as shown) - confirm against the actual file.
G1MergeHeapRootsTask(G1RemSetScanState* scan_state, uint num_workers, bool initial_evacuation) :
|
||||
WorkerTask("G1 Merge Heap Roots"),
|
||||
_hr_claimer(num_workers),
|
||||
_num_workers(num_workers),
|
||||
_scan_state(scan_state),
|
||||
_dirty_card_buffers(),
|
||||
_dirty_card_buffers(nullptr),
|
||||
_initial_evacuation(initial_evacuation),
|
||||
_fast_reclaim_handled(false)
|
||||
{
|
||||
if (initial_evacuation) {
|
||||
// Start of the interval reported as "distribute log buffers" time.
Ticks start = Ticks::now();
|
||||
|
||||
// Allocate storage for num_workers stacks, then construct each element in place.
_dirty_card_buffers = NEW_C_HEAP_ARRAY(BufferNode::Stack, num_workers, mtGC);
|
||||
for (uint i = 0; i < num_workers; i++) {
|
||||
new (&_dirty_card_buffers[i]) BufferNode::Stack();
|
||||
}
|
||||
|
||||
G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
|
||||
BufferNodeList buffers = dcqs.take_all_completed_buffers();
|
||||
if (buffers._entry_count != 0) {
|
||||
_dirty_card_buffers.prepend(*buffers._head, *buffers._tail);
|
||||
|
||||
// Per-worker target: total entry count divided by the worker count, rounded up.
size_t entries_per_thread = ceil(buffers._entry_count / (double)num_workers);
|
||||
|
||||
// [head, tail] delimits the run of nodes currently being assembled.
BufferNode* head = buffers._head;
|
||||
BufferNode* tail = head;
|
||||
|
||||
uint worker = 0;
|
||||
// Each iteration cuts one run off the front of the remaining list.
while (tail != nullptr) {
|
||||
size_t count = tail->size();
|
||||
BufferNode* cur = tail->next();
|
||||
|
||||
// Extend the run until its summed size() reaches the target or the list ends.
while (count < entries_per_thread && cur != nullptr) {
|
||||
tail = cur;
|
||||
count += tail->size();
|
||||
cur = tail->next();
|
||||
}
|
||||
|
||||
// Detach the run from the rest of the list and prepend it to the next stack; the
// modulo makes the stack index wrap around.
tail->set_next(nullptr);
|
||||
_dirty_card_buffers[worker++ % num_workers].prepend(*head, *tail);
|
||||
|
||||
// If nothing is left, the run must have ended at the tail of the taken list.
assert(cur != nullptr || tail == buffers._tail, "Must be");
|
||||
head = cur;
|
||||
|
||||
tail = cur;
|
||||
}
|
||||
|
||||
// Report the time spent since `start`, converted from seconds to milliseconds.
Tickspan total = Ticks::now() - start;
|
||||
G1CollectedHeap::heap()->phase_times()->record_distribute_log_buffers_time_ms(total.seconds() * 1000.0);
|
||||
}
|
||||
}
|
||||
|
||||
// Destroys the per-worker buffer stacks and frees their backing array, if one is present.
~G1MergeHeapRootsTask() {
  // Nothing to tear down when no stack array is held.
  if (_dirty_card_buffers == nullptr) {
    return;
  }

  using Stack = BufferNode::Stack;
  // Run each element's destructor explicitly before handing the storage back.
  Stack* const end = _dirty_card_buffers + _num_workers;
  for (Stack* stack = _dirty_card_buffers; stack != end; ++stack) {
    stack->~Stack();
  }
  FREE_C_HEAP_ARRAY(Stack, _dirty_card_buffers);
}
|
||||
virtual void work(uint worker_id) {
|
||||
G1CollectedHeap* g1h = G1CollectedHeap::heap();
|
||||
G1GCPhaseTimes* p = g1h->phase_times();
|
||||
|
Loading…
x
Reference in New Issue
Block a user