8301116: Parallelize TLAB resizing in G1

Reviewed-by: ayang, iwalulya
This commit is contained in:
Thomas Schatzl 2023-02-09 09:17:06 +00:00
parent c72f951529
commit 83e2db6ba3
11 changed files with 109 additions and 35 deletions

View File

@ -1010,7 +1010,7 @@ void G1CollectedHeap::verify_before_full_collection(bool explicit_gc) {
_verifier->verify_bitmap_clear(true /* above_tams_only */);
}
void G1CollectedHeap::prepare_heap_for_mutators() {
void G1CollectedHeap::prepare_for_mutator_after_full_collection() {
// Delete metaspaces for unloaded class loaders and clean up loader_data graph
ClassLoaderDataGraph::purge(/*at_safepoint*/true);
DEBUG_ONLY(MetaspaceUtils::verify();)
@ -1025,9 +1025,8 @@ void G1CollectedHeap::prepare_heap_for_mutators() {
// Rebuild the code root lists for each region
rebuild_code_roots();
// Start a new incremental collection set for the next pause
start_new_collection_set();
allocate_dummy_regions();
_allocator->init_mutator_alloc_regions();
// Post collection state updates.
@ -2642,8 +2641,6 @@ class VerifyRegionRemSetClosure : public HeapRegionClosure {
};
void G1CollectedHeap::start_new_collection_set() {
double start = os::elapsedTime();
collection_set()->start_incremental_building();
clear_region_attr();
@ -2654,8 +2651,6 @@ void G1CollectedHeap::start_new_collection_set() {
// We redo the verification but now wrt to the new CSet which
// has just got initialized after the previous CSet was freed.
_cm->verify_no_collection_set_oops();
phase_times()->record_start_new_cset_time_ms((os::elapsedTime() - start) * 1000.0);
}
G1HeapVerifier::G1VerifyType G1CollectedHeap::young_collection_verify_type() const {
@ -2765,19 +2760,18 @@ G1JFRTracerMark::~G1JFRTracerMark() {
_tracer->report_gc_end(_timer->gc_end(), _timer->time_partitions());
}
void G1CollectedHeap::prepare_tlabs_for_mutator() {
void G1CollectedHeap::prepare_for_mutator_after_young_collection() {
Ticks start = Ticks::now();
_survivor_evac_stats.adjust_desired_plab_size();
_old_evac_stats.adjust_desired_plab_size();
// Start a new incremental collection set for the mutator phase.
start_new_collection_set();
allocate_dummy_regions();
_allocator->init_mutator_alloc_regions();
resize_all_tlabs();
phase_times()->record_resize_tlab_time_ms((Ticks::now() - start).seconds() * 1000.0);
phase_times()->record_prepare_for_mutator_time_ms((Ticks::now() - start).seconds() * 1000.0);
}
void G1CollectedHeap::retire_tlabs() {

View File

@ -57,6 +57,7 @@
#include "memory/iterator.hpp"
#include "memory/memRegion.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/threadSMR.hpp"
#include "utilities/bitMap.hpp"
// A "G1CollectedHeap" is an implementation of a java heap for HotSpot.
@ -120,6 +121,32 @@ class G1RegionMappingChangedListener : public G1MappingChangedListener {
void on_commit(uint start_idx, size_t num_regions, bool zero_filled) override;
};
// Helper to claim contiguous sets of JavaThread for processing by multiple threads.
class G1JavaThreadsListClaimer : public StackObj {
// Snapshot of the JavaThread list to iterate over (presumably kept stable for
// the lifetime of this handle via Thread-SMR — confirm against ThreadsListHandle).
ThreadsListHandle _list;
// Number of JavaThreads handed out per claim() call.
uint _claim_step;
// Index of the next unclaimed JavaThread in _list; advanced atomically by claim().
volatile uint _cur_claim;
// Attempts to claim _claim_step JavaThreads, returning an array of claimed
// JavaThread* with count elements. Returns null (and a zero count) if there
// are no more threads to claim.
JavaThread* const* claim(uint& count);
public:
G1JavaThreadsListClaimer(uint claim_step) : _list(), _claim_step(claim_step), _cur_claim(0) {
assert(claim_step > 0, "must be");
}
// Executes the given closure on the elements of the JavaThread list, chunking the
// JavaThread set in claim_step chunks for each caller to reduce parallelization
// overhead.
void apply(ThreadClosure* cl);
// Total number of JavaThreads that can be claimed.
uint length() const { return _list.length(); }
};
class G1CollectedHeap : public CollectedHeap {
friend class VM_G1CollectForAllocation;
friend class VM_G1CollectFull;
@ -491,7 +518,7 @@ private:
bool abort_concurrent_cycle();
void verify_before_full_collection(bool explicit_gc);
void prepare_heap_for_full_collection();
void prepare_heap_for_mutators();
void prepare_for_mutator_after_full_collection();
void abort_refinement();
void verify_after_full_collection();
void print_heap_after_full_collection();
@ -771,7 +798,7 @@ public:
// Start a concurrent cycle.
void start_concurrent_cycle(bool concurrent_operation_is_full_mark);
void prepare_tlabs_for_mutator();
void prepare_for_mutator_after_young_collection();
void retire_tlabs();

View File

@ -41,6 +41,7 @@
#include "gc/shared/taskqueue.inline.hpp"
#include "oops/stackChunkOop.hpp"
#include "runtime/atomic.hpp"
#include "runtime/threadSMR.inline.hpp"
#include "utilities/bitMap.inline.hpp"
inline bool G1STWIsAliveClosure::do_object_b(oop p) {
@ -49,6 +50,30 @@ inline bool G1STWIsAliveClosure::do_object_b(oop p) {
return !_g1h->is_in_cset(p) || p->is_forwarded();
}
// Atomically claims the next chunk of up to _claim_step JavaThreads from the
// list. Sets count to the number of claimed threads and returns a pointer to
// the first one, or returns nullptr (with count == 0) when the list is exhausted.
inline JavaThread* const* G1JavaThreadsListClaimer::claim(uint& count) {
count = 0;
// Cheap pre-check: avoid bumping _cur_claim further once the list is exhausted.
if (Atomic::load(&_cur_claim) >= _list.length()) {
return nullptr;
}
// Reserve the next _claim_step slots. Another thread may have raced past the
// pre-check above, so the reserved start index must be re-validated.
uint claim = Atomic::fetch_and_add(&_cur_claim, _claim_step);
if (claim >= _list.length()) {
return nullptr;
}
// The final chunk may contain fewer than _claim_step threads.
count = MIN2(_list.length() - claim, _claim_step);
return _list.list()->threads() + claim;
}
// Applies the given closure to every JavaThread in the list, repeatedly
// claiming chunks until no threads remain. Safe to call from multiple
// workers concurrently; each thread is processed exactly once.
inline void G1JavaThreadsListClaimer::apply(ThreadClosure* cl) {
  for (;;) {
    uint num_claimed;
    JavaThread* const* chunk = claim(num_claimed);
    if (chunk == nullptr) {
      // All JavaThreads have been handed out.
      break;
    }
    for (uint i = 0; i < num_claimed; i++) {
      cl->do_thread(chunk[i]);
    }
  }
}
// Accessor forwarding to the policy object, which owns the phase times tracking.
G1GCPhaseTimes* G1CollectedHeap::phase_times() const {
  G1GCPhaseTimes* const times = _policy->phase_times();
  return times;
}

View File

@ -228,7 +228,7 @@ void G1FullCollector::complete_collection() {
// Prepare the bitmap for the next (potentially concurrent) marking.
_heap->concurrent_mark()->clear_bitmap(_heap->workers());
_heap->prepare_heap_for_mutators();
_heap->prepare_for_mutator_after_full_collection();
_heap->resize_all_tlabs();

View File

@ -153,6 +153,8 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) :
_gc_par_phases[RedirtyCards] = new WorkerDataArray<double>("RedirtyCards", "Redirty Logged Cards (ms):", max_gc_threads);
_gc_par_phases[RedirtyCards]->create_thread_work_items("Redirtied Cards:");
_gc_par_phases[ResizeThreadLABs] = new WorkerDataArray<double>("ResizeTLABs", "Resize TLABs (ms):", max_gc_threads);
_gc_par_phases[FreeCollectionSet] = new WorkerDataArray<double>("FreeCSet", "Free Collection Set (ms):", max_gc_threads);
_gc_par_phases[YoungFreeCSet] = new WorkerDataArray<double>("YoungFreeCSet", "Young Free Collection Set (ms):", max_gc_threads);
_gc_par_phases[NonYoungFreeCSet] = new WorkerDataArray<double>("NonYoungFreeCSet", "Non-Young Free Collection Set (ms):", max_gc_threads);
@ -173,7 +175,6 @@ void G1GCPhaseTimes::reset() {
_cur_prepare_merge_heap_roots_time_ms = 0.0;
_cur_optional_prepare_merge_heap_roots_time_ms = 0.0;
_cur_prepare_tlab_time_ms = 0.0;
_cur_resize_tlab_time_ms = 0.0;
_cur_post_evacuate_cleanup_1_time_ms = 0.0;
_cur_post_evacuate_cleanup_2_time_ms = 0.0;
_cur_expand_heap_time_ms = 0.0;
@ -184,7 +185,7 @@ void G1GCPhaseTimes::reset() {
_recorded_prepare_heap_roots_time_ms = 0.0;
_recorded_young_cset_choice_time_ms = 0.0;
_recorded_non_young_cset_choice_time_ms = 0.0;
_recorded_start_new_cset_time_ms = 0.0;
_recorded_prepare_for_mutator_time_ms = 0.0;
_recorded_serial_free_cset_time_ms = 0.0;
_recorded_total_rebuild_freelist_time_ms = 0.0;
_recorded_serial_rebuild_freelist_time_ms = 0.0;
@ -489,7 +490,7 @@ double G1GCPhaseTimes::print_post_evacuate_collection_set(bool evacuation_failed
_cur_post_evacuate_cleanup_1_time_ms +
_cur_post_evacuate_cleanup_2_time_ms +
_recorded_total_rebuild_freelist_time_ms +
_recorded_start_new_cset_time_ms +
_recorded_prepare_for_mutator_time_ms +
_cur_expand_heap_time_ms;
info_time("Post Evacuate Collection Set", sum_ms);
@ -527,6 +528,9 @@ double G1GCPhaseTimes::print_post_evacuate_collection_set(bool evacuation_failed
debug_phase(_gc_par_phases[SampleCollectionSetCandidates], 1);
}
debug_phase(_gc_par_phases[RedirtyCards], 1);
if (UseTLAB && ResizeTLAB) {
debug_phase(_gc_par_phases[ResizeThreadLABs], 1);
}
debug_phase(_gc_par_phases[FreeCollectionSet], 1);
trace_phase(_gc_par_phases[YoungFreeCSet], true, 1);
trace_phase(_gc_par_phases[NonYoungFreeCSet], true, 1);
@ -537,10 +541,7 @@ double G1GCPhaseTimes::print_post_evacuate_collection_set(bool evacuation_failed
trace_time("Serial Rebuild Free List ", _recorded_serial_rebuild_freelist_time_ms);
trace_phase(_gc_par_phases[RebuildFreeList]);
debug_time("Start New Collection Set", _recorded_start_new_cset_time_ms);
if (UseTLAB && ResizeTLAB) {
debug_time("Resize TLABs", _cur_resize_tlab_time_ms);
}
debug_time("Prepare For Mutator", _recorded_prepare_for_mutator_time_ms);
debug_time("Expand Heap After Collection", _cur_expand_heap_time_ms);
return sum_ms;

View File

@ -74,6 +74,7 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
FreeCollectionSet,
YoungFreeCSet,
NonYoungFreeCSet,
ResizeThreadLABs,
RebuildFreeList,
SampleCollectionSetCandidates,
MergePSS,
@ -179,7 +180,6 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
double _cur_optional_prepare_merge_heap_roots_time_ms;
double _cur_prepare_tlab_time_ms;
double _cur_resize_tlab_time_ms;
double _cur_concatenate_dirty_card_logs_time_ms;
@ -199,7 +199,7 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
double _recorded_young_cset_choice_time_ms;
double _recorded_non_young_cset_choice_time_ms;
double _recorded_start_new_cset_time_ms;
double _recorded_prepare_for_mutator_time_ms;
double _recorded_serial_free_cset_time_ms;
@ -276,10 +276,6 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
_cur_prepare_tlab_time_ms = ms;
}
void record_resize_tlab_time_ms(double ms) {
_cur_resize_tlab_time_ms = ms;
}
void record_concatenate_dirty_card_logs_time_ms(double ms) {
_cur_concatenate_dirty_card_logs_time_ms = ms;
}
@ -356,8 +352,8 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
_recorded_non_young_cset_choice_time_ms = time_ms;
}
void record_start_new_cset_time_ms(double time_ms) {
_recorded_start_new_cset_time_ms = time_ms;
void record_prepare_for_mutator_time_ms(double time_ms) {
_recorded_prepare_for_mutator_time_ms = time_ms;
}
void record_cur_collection_start_sec(double time_ms) {

View File

@ -1022,9 +1022,7 @@ void G1YoungCollector::post_evacuate_collection_set(G1EvacInfo* evacuation_info,
evacuation_info->set_bytes_used(_g1h->bytes_used_during_gc());
_g1h->start_new_collection_set();
_g1h->prepare_tlabs_for_mutator();
_g1h->prepare_for_mutator_after_young_collection();
_g1h->gc_epilogue(false);

View File

@ -37,6 +37,8 @@
#include "gc/g1/g1YoungGCPostEvacuateTasks.hpp"
#include "gc/shared/preservedMarks.inline.hpp"
#include "jfr/jfrEvents.hpp"
#include "runtime/threads.hpp"
#include "runtime/threadSMR.hpp"
#include "utilities/ticks.hpp"
class G1PostEvacuateCollectionSetCleanupTask1::MergePssTask : public G1AbstractSubTask {
@ -701,6 +703,31 @@ public:
}
};
// Sub-task that resizes the TLABs of all JavaThreads, parallelized by
// claiming chunks of the thread list per worker.
class G1PostEvacuateCollectionSetCleanupTask2::ResizeTLABsTask : public G1AbstractSubTask {
  G1JavaThreadsListClaimer _claimer;

  // There is not much work per thread so the number of threads per worker is high.
  static const uint ThreadsPerWorker = 250;

public:
  ResizeTLABsTask() : G1AbstractSubTask(G1GCPhaseTimes::ResizeThreadLABs), _claimer(ThreadsPerWorker) { }

  void do_work(uint worker_id) override {
    // Closure resizing the TLAB of a single JavaThread.
    class ResizeTLABClosure : public ThreadClosure {
    public:
      void do_thread(Thread* thread) {
        static_cast<JavaThread*>(thread)->tlab().resize();
      }
    };

    ResizeTLABClosure resize_cl;
    _claimer.apply(&resize_cl);
  }

  double worker_cost() const override {
    // One "unit" of cost per chunk of ThreadsPerWorker threads.
    double num_chunks = (double)_claimer.length() / ThreadsPerWorker;
    return num_chunks;
  }
};
G1PostEvacuateCollectionSetCleanupTask2::G1PostEvacuateCollectionSetCleanupTask2(G1ParScanThreadStateSet* per_thread_states,
G1EvacInfo* evacuation_info,
G1EvacFailureRegions* evac_failure_regions) :
@ -722,6 +749,9 @@ G1PostEvacuateCollectionSetCleanupTask2::G1PostEvacuateCollectionSetCleanupTask2
}
}
add_parallel_task(new RedirtyLoggedCardsTask(per_thread_states->rdcqs(), evac_failure_regions));
if (UseTLAB && ResizeTLAB) {
add_parallel_task(new ResizeTLABsTask());
}
add_parallel_task(new FreeCollectionSetTask(evacuation_info,
per_thread_states->surviving_young_words(),
evac_failure_regions));

View File

@ -60,6 +60,7 @@ public:
// - Redirty Logged Cards
// - Restore Preserved Marks (on evacuation failure)
// - Free Collection Set
// - Resize TLABs
class G1PostEvacuateCollectionSetCleanupTask2 : public G1BatchedTask {
class EagerlyReclaimHumongousObjectsTask;
class ResetHotCardCacheTask;
@ -70,6 +71,7 @@ class G1PostEvacuateCollectionSetCleanupTask2 : public G1BatchedTask {
class ClearRetainedRegionBitmaps;
class RedirtyLoggedCardsTask;
class RestorePreservedMarksTask;
class ResizeTLABsTask;
class FreeCollectionSetTask;
public:

View File

@ -183,6 +183,7 @@ public class TestGCLogMessages {
new LogMessageWithLevelC2OrJVMCIOnly("Update Derived Pointers", Level.DEBUG),
new LogMessageWithLevel("Redirty Logged Cards", Level.DEBUG),
new LogMessageWithLevel("Redirtied Cards", Level.DEBUG),
new LogMessageWithLevel("Resize TLABs", Level.DEBUG),
new LogMessageWithLevel("Free Collection Set", Level.DEBUG),
new LogMessageWithLevel("Serial Free Collection Set", Level.TRACE),
new LogMessageWithLevel("Young Free Collection Set", Level.TRACE),
@ -192,8 +193,7 @@ public class TestGCLogMessages {
new LogMessageWithLevel("Rebuild Free List", Level.DEBUG),
new LogMessageWithLevel("Serial Rebuild Free List", Level.TRACE),
new LogMessageWithLevel("Parallel Rebuild Free List", Level.TRACE),
new LogMessageWithLevel("Start New Collection Set", Level.DEBUG),
new LogMessageWithLevel("Resize TLABs", Level.DEBUG),
new LogMessageWithLevel("Prepare For Mutator", Level.DEBUG),
new LogMessageWithLevel("Expand Heap After Collection", Level.DEBUG),
};

View File

@ -107,6 +107,7 @@ public class TestG1ParallelPhases {
"RedirtyCards",
"RecalculateUsed",
"ResetHotCardCache",
"ResizeTLABs",
"FreeCSet",
"UpdateDerivedPointers",
"EagerlyReclaimHumongousObjects",