diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp
index e4ac20ca7ea..f81c3241a1a 100644
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp
@@ -35,6 +35,7 @@
 #include "gc/g1/g1Trace.hpp"
 #include "gc/g1/g1YoungGCAllocationFailureInjector.inline.hpp"
 #include "gc/shared/continuationGCSupport.inline.hpp"
+#include "gc/shared/partialArrayState.hpp"
 #include "gc/shared/partialArrayTaskStepper.inline.hpp"
 #include "gc/shared/preservedMarks.inline.hpp"
 #include "gc/shared/stringdedup/stringDedup.hpp"
@@ -43,6 +44,7 @@
 #include "oops/access.inline.hpp"
 #include "oops/oop.inline.hpp"
 #include "runtime/atomic.hpp"
+#include "runtime/mutexLocker.hpp"
 #include "runtime/prefetch.inline.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/macros.hpp"
@@ -61,7 +63,8 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h,
                                            uint worker_id,
                                            uint num_workers,
                                            G1CollectionSet* collection_set,
-                                           G1EvacFailureRegions* evac_failure_regions)
+                                           G1EvacFailureRegions* evac_failure_regions,
+                                           PartialArrayStateAllocator* pas_allocator)
   : _g1h(g1h),
     _task_queue(g1h->task_queue(worker_id)),
     _rdc_local_qset(rdcqs),
@@ -80,8 +83,8 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h,
     _surviving_young_words(nullptr),
     _surviving_words_length(collection_set->young_region_length() + 1),
     _old_gen_is_full(false),
-    _partial_objarray_chunk_size(ParGCArrayScanChunk),
-    _partial_array_stepper(num_workers),
+    _partial_array_state_allocator(pas_allocator),
+    _partial_array_stepper(num_workers, ParGCArrayScanChunk),
     _string_dedup_requests(),
     _max_num_optional_regions(collection_set->optional_region_length()),
     _numa(g1h->numa()),
@@ -169,9 +172,9 @@ void G1ParScanThreadState::verify_task(oop* task) const {
          "task=" PTR_FORMAT " p=" PTR_FORMAT, p2i(task), p2i(p));
 }
 
-void G1ParScanThreadState::verify_task(PartialArrayScanTask task) const {
+void G1ParScanThreadState::verify_task(PartialArrayState* task) const {
   // Must be in the collection set--it's already been copied.
-  oop p = task.to_source_array();
+  oop p = task->source();
   assert(_g1h->is_in_cset(p), "p=" PTR_FORMAT, p2i(p));
 }
 
@@ -180,8 +183,8 @@ void G1ParScanThreadState::verify_task(ScannerTask task) const {
     verify_task(task.to_narrow_oop_ptr());
   } else if (task.is_oop_ptr()) {
     verify_task(task.to_oop_ptr());
-  } else if (task.is_partial_array_task()) {
-    verify_task(task.to_partial_array_task());
+  } else if (task.is_partial_array_state()) {
+    verify_task(task.to_partial_array_state());
   } else {
     ShouldNotReachHere();
   }
@@ -223,34 +226,39 @@ void G1ParScanThreadState::do_oop_evac(T* p) {
 }
 
 MAYBE_INLINE_EVACUATION
-void G1ParScanThreadState::do_partial_array(PartialArrayScanTask task) {
-  oop from_obj = task.to_source_array();
+void G1ParScanThreadState::do_partial_array(PartialArrayState* state) {
+  oop to_obj = state->destination();
+#ifdef ASSERT
+  oop from_obj = state->source();
   assert(_g1h->is_in_reserved(from_obj), "must be in heap.");
   assert(from_obj->is_objArray(), "must be obj array");
   assert(from_obj->is_forwarded(), "must be forwarded");
-
-  oop to_obj = from_obj->forwardee();
   assert(from_obj != to_obj, "should not be chunking self-forwarded objects");
   assert(to_obj->is_objArray(), "must be obj array");
+#endif // ASSERT
+
   objArrayOop to_array = objArrayOop(to_obj);
 
-  PartialArrayTaskStepper::Step step
-    = _partial_array_stepper.next(objArrayOop(from_obj),
-                                  to_array,
-                                  _partial_objarray_chunk_size);
-  for (uint i = 0; i < step._ncreate; ++i) {
-    push_on_queue(ScannerTask(PartialArrayScanTask(from_obj)));
+  // Claim a chunk and get number of additional tasks to enqueue.
+  PartialArrayTaskStepper::Step step = _partial_array_stepper.next(state);
+  // Push any additional partial scan tasks needed. Pushed before processing
+  // the claimed chunk to allow other workers to steal while we're processing.
+  if (step._ncreate > 0) {
+    state->add_references(step._ncreate);
+    for (uint i = 0; i < step._ncreate; ++i) {
+      push_on_queue(ScannerTask(state));
+    }
   }
 
   G1HeapRegionAttr dest_attr = _g1h->region_attr(to_array);
   G1SkipCardEnqueueSetter x(&_scanner, dest_attr.is_new_survivor());
-  // Process claimed task. The length of to_array is not correct, but
-  // fortunately the iteration ignores the length field and just relies
-  // on start/end.
+  // Process claimed task.
   to_array->oop_iterate_range(&_scanner,
-                              step._index,
-                              step._index + _partial_objarray_chunk_size);
+                              checked_cast<int>(step._index),
+                              checked_cast<int>(step._index + _partial_array_stepper.chunk_size()));
+  // Release reference to the state, now that we're done with it.
+  _partial_array_state_allocator->release(_worker_id, state);
 }
 
 MAYBE_INLINE_EVACUATION
@@ -260,20 +268,30 @@ void G1ParScanThreadState::start_partial_objarray(G1HeapRegionAttr dest_attr,
   assert(from_obj->is_objArray(), "precondition");
   assert(from_obj->is_forwarded(), "precondition");
   assert(from_obj->forwardee() == to_obj, "precondition");
-  assert(from_obj != to_obj, "should not be scanning self-forwarded objects");
   assert(to_obj->is_objArray(), "precondition");
 
   objArrayOop to_array = objArrayOop(to_obj);
 
-  PartialArrayTaskStepper::Step step
-    = _partial_array_stepper.start(objArrayOop(from_obj),
-                                   to_array,
-                                   _partial_objarray_chunk_size);
+  size_t array_length = to_array->length();
+  PartialArrayTaskStepper::Step step = _partial_array_stepper.start(array_length);
 
   // Push any needed partial scan tasks. Pushed before processing the
   // initial chunk to allow other workers to steal while we're processing.
-  for (uint i = 0; i < step._ncreate; ++i) {
-    push_on_queue(ScannerTask(PartialArrayScanTask(from_obj)));
+  if (step._ncreate > 0) {
+    assert(step._index < array_length, "invariant");
+    assert(((array_length - step._index) % _partial_array_stepper.chunk_size()) == 0,
+           "invariant");
+    PartialArrayState* state =
+      _partial_array_state_allocator->allocate(_worker_id,
+                                               from_obj, to_obj,
+                                               step._index,
+                                               array_length,
+                                               step._ncreate);
+    for (uint i = 0; i < step._ncreate; ++i) {
+      push_on_queue(ScannerTask(state));
+    }
+  } else {
+    assert(step._index == array_length, "invariant");
   }
 
   // Skip the card enqueue iff the object (to_array) is in survivor region.
@@ -284,9 +302,8 @@ void G1ParScanThreadState::start_partial_objarray(G1HeapRegionAttr dest_attr,
   G1SkipCardEnqueueSetter x(&_scanner, dest_attr.is_young());
   // Process the initial chunk. No need to process the type in the
   // klass, as it will already be handled by processing the built-in
-  // module. The length of to_array is not correct, but fortunately
-  // the iteration ignores that length field and relies on start/end.
-  to_array->oop_iterate_range(&_scanner, 0, step._index);
+  // module.
+  to_array->oop_iterate_range(&_scanner, 0, checked_cast<int>(step._index));
 }
 
 MAYBE_INLINE_EVACUATION
@@ -297,7 +314,7 @@ void G1ParScanThreadState::dispatch_task(ScannerTask task) {
   } else if (task.is_oop_ptr()) {
     do_oop_evac(task.to_oop_ptr());
   } else {
-    do_partial_array(task.to_partial_array_task());
+    do_partial_array(task.to_partial_array_state());
   }
 }
 
@@ -582,7 +599,8 @@ G1ParScanThreadState* G1ParScanThreadStateSet::state_for_worker(uint worker_id)
                                              worker_id,
                                              _num_workers,
                                              _collection_set,
-                                             _evac_failure_regions);
+                                             _evac_failure_regions,
+                                             &_partial_array_state_allocator);
   }
   return _states[worker_id];
 }
@@ -715,7 +733,9 @@ G1ParScanThreadStateSet::G1ParScanThreadStateSet(G1CollectedHeap* g1h,
     _surviving_young_words_total(NEW_C_HEAP_ARRAY(size_t, collection_set->young_region_length() + 1, mtGC)),
     _num_workers(num_workers),
     _flushed(false),
-    _evac_failure_regions(evac_failure_regions) {
+    _evac_failure_regions(evac_failure_regions),
+    _partial_array_state_allocator(num_workers)
+{
   _preserved_marks_set.init(num_workers);
   for (uint i = 0; i < num_workers; ++i) {
     _states[i] = nullptr;
diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp
index 24ca682e141..1cfd6fca08a 100644
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp
@@ -32,6 +32,7 @@
 #include "gc/shared/ageTable.hpp"
 #include "gc/shared/copyFailedInfo.hpp"
 #include "gc/shared/gc_globals.hpp"
+#include "gc/shared/partialArrayState.hpp"
 #include "gc/shared/partialArrayTaskStepper.hpp"
 #include "gc/shared/preservedMarks.hpp"
 #include "gc/shared/stringdedup/stringDedup.hpp"
@@ -87,7 +88,8 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {
   // available for allocation.
   bool _old_gen_is_full;
-  // Size (in elements) of a partial objArray task chunk.
-  int _partial_objarray_chunk_size;
+  PartialArrayStateAllocator* _partial_array_state_allocator;
   PartialArrayTaskStepper _partial_array_stepper;
   StringDedup::Requests _string_dedup_requests;
 
@@ -129,7 +131,8 @@ public:
                        uint worker_id,
                        uint num_workers,
                        G1CollectionSet* collection_set,
-                       G1EvacFailureRegions* evac_failure_regions);
+                       G1EvacFailureRegions* evac_failure_regions,
+                       PartialArrayStateAllocator* partial_array_state_allocator);
   virtual ~G1ParScanThreadState();
 
   void set_ref_discoverer(ReferenceDiscoverer* rd) { _scanner.set_ref_discoverer(rd); }
@@ -140,7 +143,7 @@ public:
   void verify_task(narrowOop* task) const NOT_DEBUG_RETURN;
   void verify_task(oop* task) const NOT_DEBUG_RETURN;
-  void verify_task(PartialArrayScanTask task) const NOT_DEBUG_RETURN;
+  void verify_task(PartialArrayState* task) const NOT_DEBUG_RETURN;
   void verify_task(ScannerTask task) const NOT_DEBUG_RETURN;
 
   void push_on_queue(ScannerTask task);
@@ -169,7 +172,7 @@ public:
   size_t flush_stats(size_t* surviving_young_words, uint num_workers, BufferNodeList* buffer_log);
 
 private:
-  void do_partial_array(PartialArrayScanTask task);
+  void do_partial_array(PartialArrayState* state);
   void start_partial_objarray(G1HeapRegionAttr dest_dir, oop from, oop to);
 
   HeapWord* allocate_copy_slow(G1HeapRegionAttr* dest_attr,
@@ -252,6 +255,7 @@ class G1ParScanThreadStateSet : public StackObj {
   uint _num_workers;
   bool _flushed;
   G1EvacFailureRegions* _evac_failure_regions;
+  PartialArrayStateAllocator _partial_array_state_allocator;
 
 public:
   G1ParScanThreadStateSet(G1CollectedHeap* g1h,
diff --git a/src/hotspot/share/gc/shared/partialArrayState.cpp b/src/hotspot/share/gc/shared/partialArrayState.cpp
new file mode 100644
index 00000000000..583c5dede40
--- /dev/null
+++ b/src/hotspot/share/gc/shared/partialArrayState.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "gc/shared/partialArrayState.hpp"
+#include "memory/allocation.inline.hpp"
+#include "memory/arena.hpp"
+#include "nmt/memflags.hpp"
+#include "oops/oopsHierarchy.hpp"
+#include "runtime/atomic.hpp"
+#include "runtime/orderAccess.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/macros.hpp"
+#include <new>
+
+PartialArrayState::PartialArrayState(oop src, oop dst,
+                                     size_t index, size_t length,
+                                     size_t initial_refcount)
+  : _source(src),
+    _destination(dst),
+    _length(length),
+    _index(index),
+    _refcount(initial_refcount)
+{
+  assert(index <= length, "precondition");
+}
+
+void PartialArrayState::add_references(size_t count) {
+  size_t new_count = Atomic::add(&_refcount, count, memory_order_relaxed);
+  assert(new_count >= count, "reference count overflow");
+}
+
+class PartialArrayStateAllocator::Impl : public CHeapObj<mtGC> {
+  struct FreeListEntry;
+
+  Arena* _arenas;
+  FreeListEntry** _free_lists;
+  uint _num_workers;
+
+public:
+  Impl(uint num_workers);
+  ~Impl();
+
+  NONCOPYABLE(Impl);
+
+  PartialArrayState* allocate(uint worker_id,
+                              oop src, oop dst,
+                              size_t index, size_t length,
+                              size_t initial_refcount);
+  void release(uint worker_id, PartialArrayState* state);
+};
+
+struct PartialArrayStateAllocator::Impl::FreeListEntry {
+  FreeListEntry* _next;
+
+  FreeListEntry(FreeListEntry* next) : _next(next) {}
+  ~FreeListEntry() = default;
+
+  NONCOPYABLE(FreeListEntry);
+};
+
+PartialArrayStateAllocator::Impl::Impl(uint num_workers)
+  : _arenas(NEW_C_HEAP_ARRAY(Arena, num_workers, mtGC)),
+    _free_lists(NEW_C_HEAP_ARRAY(FreeListEntry*, num_workers, mtGC)),
+    _num_workers(num_workers)
+{
+  for (uint i = 0; i < _num_workers; ++i) {
+    ::new (&_arenas[i]) Arena(mtGC);
+    _free_lists[i] = nullptr;
+  }
+}
+
+PartialArrayStateAllocator::Impl::~Impl() {
+  // We don't need to clean up the free lists. Deallocating the entries
+  // does nothing, since we're using arena allocation. Instead, leave it
+  // to the arena destructor to release the memory.
+  FREE_C_HEAP_ARRAY(FreeListEntry*, _free_lists);
+  for (uint i = 0; i < _num_workers; ++i) {
+    _arenas[i].~Arena();
+  }
+  FREE_C_HEAP_ARRAY(Arena, _arenas);
+}
+
+PartialArrayState* PartialArrayStateAllocator::Impl::allocate(uint worker_id,
+                                                              oop src, oop dst,
+                                                              size_t index,
+                                                              size_t length,
+                                                              size_t initial_refcount) {
+  void* p;
+  FreeListEntry* head = _free_lists[worker_id];
+  if (head == nullptr) {
+    p = NEW_ARENA_OBJ(&_arenas[worker_id], PartialArrayState);
+  } else {
+    _free_lists[worker_id] = head->_next;
+    head->~FreeListEntry();
+    p = head;
+  }
+  return ::new (p) PartialArrayState(src, dst, index, length, initial_refcount);
+}
+
+void PartialArrayStateAllocator::Impl::release(uint worker_id, PartialArrayState* state) {
+  size_t refcount = Atomic::sub(&state->_refcount, size_t(1), memory_order_release);
+  if (refcount != 0) {
+    assert(refcount + 1 != 0, "refcount underflow");
+  } else {
+    OrderAccess::acquire();
+    state->~PartialArrayState();
+    _free_lists[worker_id] = ::new (state) FreeListEntry(_free_lists[worker_id]);
+  }
+}
+
+PartialArrayStateAllocator::PartialArrayStateAllocator(uint num_workers)
+  : _impl(new Impl(num_workers))
+{}
+
+PartialArrayStateAllocator::~PartialArrayStateAllocator() {
+  delete _impl;
+}
+
+PartialArrayState* PartialArrayStateAllocator::allocate(uint worker_id,
+                                                        oop src, oop dst,
+                                                        size_t index,
+                                                        size_t length,
+                                                        size_t initial_refcount) {
+  return _impl->allocate(worker_id, src, dst, index, length, initial_refcount);
+}
+
+void PartialArrayStateAllocator::release(uint worker_id, PartialArrayState* state) {
+  _impl->release(worker_id, state);
+}
+
diff --git a/src/hotspot/share/gc/shared/partialArrayState.hpp b/src/hotspot/share/gc/shared/partialArrayState.hpp
new file mode 100644
index 00000000000..f3bfc3ed8b8
--- /dev/null
+++ b/src/hotspot/share/gc/shared/partialArrayState.hpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_GC_SHARED_PARTIALARRAYSTATE_HPP
+#define SHARE_GC_SHARED_PARTIALARRAYSTATE_HPP
+
+#include "oops/oopsHierarchy.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/macros.hpp"
+
+class PartialArrayStateAllocator;
+
+// Instances of this class are used to represent processing progress for an
+// array task in a taskqueue. When a sufficiently large array needs to be
+// processed, such that it is desirable to split up the processing into
+// parallelizable subtasks, a state object is allocated for the array.
+// Multiple tasks referring to the state can then be added to the taskqueue
+// for later processing, either by the current thread or by some other thread
+// that steals one of those tasks.
+//
+// Processing a state involves using the state to claim a segment of the
+// array, and processing that segment. Claiming is done by atomically
+// incrementing the index, thereby claiming the segment from the old to new
+// index values. New tasks should also be added as needed to ensure the
+// entire array will be processed. A PartialArrayTaskStepper can be used to
+// help with this.
+//
+// States are allocated and released using a PartialArrayStateAllocator.
+// States are reference counted to aid in that management. Each task
+// referring to a given state that is added to a taskqueue must increase the
+// reference count by one. When the processing of a task referring to a state
+// is complete, the reference count must be decreased by one. When the
+// reference count reaches zero the state should be released to the allocator
+// for later reuse.
+class PartialArrayState {
+  oop _source;
+  oop _destination;
+  size_t _length;
+  volatile size_t _index;
+  volatile size_t _refcount;
+
+  friend class PartialArrayStateAllocator;
+
+  PartialArrayState(oop src, oop dst,
+                    size_t index, size_t length,
+                    size_t initial_refcount);
+  ~PartialArrayState() = default;
+
+  NONCOPYABLE(PartialArrayState);
+
+public:
+  // Add count references, one per referring task being added to a taskqueue.
+  void add_references(size_t count);
+
+  // The source array oop.
+  oop source() const { return _source; }
+
+  // The destination array oop. In some circumstances the source and
+  // destination may be the same.
+  oop destination() const { return _destination; }
+
+  // The length of the array oop.
+  size_t length() const { return _length; }
+
+  // A pointer to the start index for the next segment to process, for atomic
+  // update.
+  volatile size_t* index_addr() { return &_index; }
+};
+
+// This class provides memory management for PartialArrayStates.
+//
+// States are initially allocated from a set of arenas owned by the allocator.
+// This allows the entire set of allocated states to be discarded without the
+// need to keep track of or find them under some circumstances. For example,
+// if G1 concurrent marking is aborted and needs to restart because of a full
+// marking queue, the queue doesn't need to be searched for tasks referring to
+// states to allow releasing them. Instead the queue contents can just be
+// discarded, and the memory for the no longer referenced states will
+// eventually be reclaimed when the arenas are reset.
+//
+// A set of free-lists is placed in front of the arena allocators. This
+// causes the maximum number of allocated states to be based on the number of
+// in-progress arrays, rather than the total number of arrays that need to be
+// processed. The use of free-list allocators is the reason for reference
+// counting states.
+//
+// The arena and free-list to use for an allocation operation is designated by
+// the worker_id used in the operation. This avoids locking and such on those
+// data structures, at the cost of possibly doing more total arena allocation
+// than would be needed with a single shared arena and free-list.
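
Illustration (not part of the patch): a minimal sketch of how a GC worker is expected to drive the protocol described in the comments above. The helpers push_on_queue() and process_range() are hypothetical stand-ins for the real taskqueue push and oop_iterate_range() calls shown in the G1 changes earlier in this diff.

// Sketch only, under the stated assumptions; not upstream code.
void push_on_queue(ScannerTask task);                 // assumed: enqueue a task for stealing
void process_range(oop obj, size_t from, size_t to);  // assumed: scan obj's elements [from, to)

void start_array(uint worker_id,
                 PartialArrayStateAllocator* alloc,
                 const PartialArrayTaskStepper* stepper,
                 oop from, oop to, size_t length) {
  PartialArrayTaskStepper::Step step = stepper->start(length);
  if (step._ncreate > 0) {
    // The initial refcount covers every task pushed here.
    PartialArrayState* state =
      alloc->allocate(worker_id, from, to, step._index, length, step._ncreate);
    // Push before processing, so other workers can steal while we scan.
    for (uint i = 0; i < step._ncreate; ++i) {
      push_on_queue(ScannerTask(state));
    }
  }
  process_range(to, 0, step._index);                  // process the initial chunk
}

void process_partial(uint worker_id,
                     PartialArrayStateAllocator* alloc,
                     const PartialArrayTaskStepper* stepper,
                     PartialArrayState* state) {
  // Claim the next chunk and find out how many more tasks to push.
  PartialArrayTaskStepper::Step step = stepper->next(state);
  if (step._ncreate > 0) {
    state->add_references(step._ncreate);             // one reference per pushed task
    for (uint i = 0; i < step._ncreate; ++i) {
      push_on_queue(ScannerTask(state));
    }
  }
  process_range(state->destination(), step._index, step._index + stepper->chunk_size());
  alloc->release(worker_id, state);                   // drop this task's reference
}

The key points are that the initial refcount equals the number of tasks pushed when the array is started, add_references() is called before pushing further tasks, and each dequeued task drops exactly one reference via release().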
+class PartialArrayStateAllocator {
+  class Impl;
+  Impl* _impl;
+
+public:
+  PartialArrayStateAllocator(uint num_workers);
+  ~PartialArrayStateAllocator();
+
+  NONCOPYABLE(PartialArrayStateAllocator);
+
+  // Create a new state, obtaining the memory for it from the free-list or
+  // arena associated with worker_id.
+  PartialArrayState* allocate(uint worker_id,
+                              oop src, oop dst,
+                              size_t index, size_t length,
+                              size_t initial_refcount);
+
+  // Decrement the state's refcount. If the new refcount is zero, add the
+  // state to the free-list associated with worker_id. The state must have
+  // been allocated by this allocator, but that allocation doesn't need to
+  // have been associated with worker_id.
+  void release(uint worker_id, PartialArrayState* state);
+};
+
+#endif // SHARE_GC_SHARED_PARTIALARRAYSTATE_HPP
diff --git a/src/hotspot/share/gc/shared/partialArrayTaskStepper.cpp b/src/hotspot/share/gc/shared/partialArrayTaskStepper.cpp
index ed52d7abff1..6faa162ac7b 100644
--- a/src/hotspot/share/gc/shared/partialArrayTaskStepper.cpp
+++ b/src/hotspot/share/gc/shared/partialArrayTaskStepper.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "gc/shared/partialArrayTaskStepper.hpp"
 #include "oops/arrayOop.hpp"
+#include "utilities/debug.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/powerOfTwo.hpp"
 
@@ -48,7 +49,8 @@ static uint compute_task_fanout(uint task_limit) {
   return result;
 }
 
-PartialArrayTaskStepper::PartialArrayTaskStepper(uint n_workers) :
+PartialArrayTaskStepper::PartialArrayTaskStepper(uint n_workers, size_t chunk_size) :
+  _chunk_size(chunk_size),
   _task_limit(compute_task_limit(n_workers)),
   _task_fanout(compute_task_fanout(_task_limit))
 {}
diff --git a/src/hotspot/share/gc/shared/partialArrayTaskStepper.hpp b/src/hotspot/share/gc/shared/partialArrayTaskStepper.hpp
index aec993f907c..a68d9bd3612 100644
--- a/src/hotspot/share/gc/shared/partialArrayTaskStepper.hpp
+++ b/src/hotspot/share/gc/shared/partialArrayTaskStepper.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,55 +28,52 @@
 #include "oops/arrayOop.hpp"
 #include "utilities/globalDefinitions.hpp"
 
-// Helper for handling PartialArrayTasks.
+class PartialArrayState;
+
+// Helper for partial array chunking tasks.
 //
 // When an array is large, we want to split it up into chunks that can be
-// processed in parallel. Each task (implicitly) represents such a chunk.
-// We can enqueue multiple tasks at the same time. We want to enqueue
-// enough tasks to benefit from the available parallelism, while not so many
-// as to substantially expand the task queues.
-//
-// A task directly refers to the from-space array. The from-space array's
-// forwarding pointer refers to the associated to-space array, and its
-// length is the actual length. The to-space array's length field is used to
-// indicate processing progress.
-// It is the starting index of the next chunk
-// to process, or equals the actual length when there are no more chunks to
-// be processed.
+// processed in parallel. Each task (implicitly) represents such a chunk. We
+// can enqueue multiple tasks at the same time. We want to enqueue enough
+// tasks to benefit from the available parallelism, while not so many as to
+// substantially expand the task queues.
 class PartialArrayTaskStepper {
 public:
-  PartialArrayTaskStepper(uint n_workers);
+  PartialArrayTaskStepper(uint n_workers, size_t chunk_size);
 
   struct Step {
-    int _index;                 // Array index for the step.
+    size_t _index;              // Array index for the step.
     uint _ncreate;              // Number of new tasks to create.
   };
 
-  // Set to's length to the end of the initial chunk, which is the start of
-  // the first partial task if the array is large enough to need splitting.
-  // Returns a Step with _index being that index and _ncreate being the
-  // initial number of partial tasks to enqueue.
-  inline Step start(arrayOop from, arrayOop to, int chunk_size) const;
+  // Called with the length of the array to be processed. Returns a Step with
+  // _index being the end of the initial chunk, which the caller should
+  // process. This is also the starting index for the next chunk to process.
+  // The _ncreate is the number of tasks to enqueue to continue processing the
+  // array. If _ncreate is zero then _index will be length.
+  inline Step start(size_t length) const;
 
-  // Increment to's length by chunk_size to claim the next chunk. Returns a
-  // Step with _index being the starting index of the claimed chunk and
-  // _ncreate being the number of additional partial tasks to enqueue.
-  // precondition: chunk_size must be the same as used to start the task sequence.
-  inline Step next(arrayOop from, arrayOop to, int chunk_size) const;
+  // Atomically increment state's index by chunk_size() to claim the next
+  // chunk. Returns a Step with _index being the starting index of the
+  // claimed chunk and _ncreate being the number of additional partial tasks
+  // to enqueue.
+  inline Step next(PartialArrayState* state) const;
+
+  // The size of chunks to claim for each task.
+  inline size_t chunk_size() const;
 
   class TestSupport;            // For unit tests
 
 private:
+  // Size (number of elements) of a chunk to process.
+  size_t _chunk_size;
   // Limit on the number of partial array tasks to create for a given array.
   uint _task_limit;
   // Maximum number of new tasks to create when processing an existing task.
   uint _task_fanout;
 
-  // Split start/next into public part dealing with oops and private
-  // impl dealing with lengths and pointers to lengths, for unit testing.
-  // length is the actual length obtained from the from-space object.
-  // to_length_addr is the address of the to-space object's length value.
-  inline Step start_impl(int length, int* to_length_addr, int chunk_size) const;
-  inline Step next_impl(int length, int* to_length_addr, int chunk_size) const;
+  // For unit tests.
+  inline Step next_impl(size_t length, volatile size_t* index_addr) const;
 };
 
 #endif // SHARE_GC_SHARED_PARTIALARRAYTASKSTEPPER_HPP
diff --git a/src/hotspot/share/gc/shared/partialArrayTaskStepper.inline.hpp b/src/hotspot/share/gc/shared/partialArrayTaskStepper.inline.hpp
index aa9a02c4902..1d43578b5fe 100644
--- a/src/hotspot/share/gc/shared/partialArrayTaskStepper.inline.hpp
+++ b/src/hotspot/share/gc/shared/partialArrayTaskStepper.inline.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,66 +25,46 @@
 #ifndef SHARE_GC_SHARED_PARTIALARRAYTASKSTEPPER_INLINE_HPP
 #define SHARE_GC_SHARED_PARTIALARRAYTASKSTEPPER_INLINE_HPP
 
+#include "gc/shared/partialArrayState.hpp"
 #include "gc/shared/partialArrayTaskStepper.hpp"
-
-#include "oops/arrayOop.hpp"
 #include "runtime/atomic.hpp"
+#include "utilities/checkedCast.hpp"
+#include "utilities/debug.hpp"
+
+size_t PartialArrayTaskStepper::chunk_size() const {
+  return _chunk_size;
+}
 
 PartialArrayTaskStepper::Step
-PartialArrayTaskStepper::start_impl(int length,
-                                    int* to_length_addr,
-                                    int chunk_size) const {
-  assert(chunk_size > 0, "precondition");
-
-  int end = length % chunk_size; // End of initial chunk.
-  // Set to's length to end of initial chunk. Partial tasks use that length
-  // field as the start of the next chunk to process. Must be done before
-  // enqueuing partial scan tasks, in case other threads steal any of those
-  // tasks.
-  //
-  // The value of end can be 0, either because of a 0-length array or
-  // because length is a multiple of the chunk size. Both of those are
-  // relatively rare and handled in the normal course of the iteration, so
-  // not worth doing anything special about here.
-  *to_length_addr = end;
-
+PartialArrayTaskStepper::start(size_t length) const {
+  size_t end = length % _chunk_size; // End of initial chunk.
   // If the initial chunk is the complete array, then don't need any partial
   // tasks. Otherwise, start with just one partial task; see new task
   // calculation in next().
-  Step result = { end, (length > end) ? 1u : 0u };
-  return result;
+  return Step{ end, (length > end) ? 1u : 0u };
 }
 
 PartialArrayTaskStepper::Step
-PartialArrayTaskStepper::start(arrayOop from, arrayOop to, int chunk_size) const {
-  return start_impl(from->length(), to->length_addr(), chunk_size);
-}
-
-PartialArrayTaskStepper::Step
-PartialArrayTaskStepper::next_impl(int length,
-                                   int* to_length_addr,
-                                   int chunk_size) const {
-  assert(chunk_size > 0, "precondition");
-
-  // The start of the next task is in the length field of the to-space object.
+PartialArrayTaskStepper::next_impl(size_t length, volatile size_t* index_addr) const {
+  // The start of the next task is in the state's index.
   // Atomically increment by the chunk size to claim the associated chunk.
   // Because we limit the number of enqueued tasks to being no more than the
   // number of remaining chunks to process, we can use an atomic add for the
   // claim, rather than a CAS loop.
-  int start = Atomic::fetch_then_add(to_length_addr,
-                                     chunk_size,
-                                     memory_order_relaxed);
+  size_t start = Atomic::fetch_then_add(index_addr,
+                                        _chunk_size,
+                                        memory_order_relaxed);
 
-  assert(start < length, "invariant: start %d, length %d", start, length);
-  assert(((length - start) % chunk_size) == 0,
-         "invariant: start %d, length %d, chunk size %d",
-         start, length, chunk_size);
+  assert(start < length, "invariant: start %zu, length %zu", start, length);
+  assert(((length - start) % _chunk_size) == 0,
+         "invariant: start %zu, length %zu, chunk size %zu",
+         start, length, _chunk_size);
 
   // Determine the number of new tasks to create.
   // Zero-based index for this partial task. The initial task isn't counted.
-  uint task_num = (start / chunk_size);
+  uint task_num = checked_cast<uint>(start / _chunk_size);
   // Number of tasks left to process, including this one.
-  uint remaining_tasks = (length - start) / chunk_size;
+  uint remaining_tasks = checked_cast<uint>((length - start) / _chunk_size);
   assert(remaining_tasks > 0, "invariant");
 
   // Compute number of pending tasks, including this one. The maximum number
   // of tasks is a function of task_num (N) and _task_fanout (F).
@@ -106,13 +86,12 @@ PartialArrayTaskStepper::next_impl(int length,
   // of tasks to add for this task.
   uint pending = MIN3(max_pending, remaining_tasks, _task_limit);
   uint ncreate = MIN2(_task_fanout, MIN2(remaining_tasks, _task_limit + 1) - pending);
-  Step result = { start, ncreate };
-  return result;
+  return Step{ start, ncreate };
 }
 
 PartialArrayTaskStepper::Step
-PartialArrayTaskStepper::next(arrayOop from, arrayOop to, int chunk_size) const {
-  return next_impl(from->length(), to->length_addr(), chunk_size);
+PartialArrayTaskStepper::next(PartialArrayState* state) const {
+  return next_impl(state->length(), state->index_addr());
 }
 
 #endif // SHARE_GC_SHARED_PARTIALARRAYTASKSTEPPER_INLINE_HPP
diff --git a/src/hotspot/share/gc/shared/taskqueue.hpp b/src/hotspot/share/gc/shared/taskqueue.hpp
index 2e21ba33b0b..2ea75e1457c 100644
--- a/src/hotspot/share/gc/shared/taskqueue.hpp
+++ b/src/hotspot/share/gc/shared/taskqueue.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -576,6 +576,7 @@ private:
 // Wrapper over an oop that is a partially scanned array.
 // Can be converted to a ScannerTask for placement in associated task queues.
 // Refers to the partially copied source array oop.
+// Temporarily retained to support ParallelGC until it adopts PartialArrayState.
 class PartialArrayScanTask {
   oop _src;
 
@@ -586,7 +587,9 @@ public:
   oop to_source_array() const { return _src; }
 };
 
-// Discriminated union over oop*, narrowOop*, and PartialArrayScanTask.
+class PartialArrayState;
+
+// Discriminated union over oop*, narrowOop*, and PartialArrayState.
 // Uses a low tag in the associated pointer to identify the category.
 // Used as a task queue element type.
 class ScannerTask {
@@ -624,9 +627,13 @@ public:
 
   explicit ScannerTask(narrowOop* p) : _p(encode(p, NarrowOopTag)) {}
 
+  // Temporarily retained to support ParallelGC until it adopts PartialArrayState.
   explicit ScannerTask(PartialArrayScanTask t) :
     _p(encode(t.to_source_array(), PartialArrayTag)) {}
 
+  explicit ScannerTask(PartialArrayState* state) :
+    _p(encode(state, PartialArrayTag)) {}
+
   // Trivially copyable.
 
   // Predicate implementations assume OopTag == 0, others are powers of 2.
@@ -639,10 +646,15 @@ public:
     return (raw_value() & NarrowOopTag) != 0;
   }
 
+  // Temporarily retained to support ParallelGC until it adopts PartialArrayState.
   bool is_partial_array_task() const {
    return (raw_value() & PartialArrayTag) != 0;
   }
 
+  bool is_partial_array_state() const {
+    return (raw_value() & PartialArrayTag) != 0;
+  }
+
   oop* to_oop_ptr() const {
     return static_cast<oop*>(decode(OopTag));
   }
@@ -651,9 +663,14 @@ public:
   narrowOop* to_narrow_oop_ptr() const {
     return static_cast<narrowOop*>(decode(NarrowOopTag));
   }
 
+  // Temporarily retained to support ParallelGC until it adopts PartialArrayState.
   PartialArrayScanTask to_partial_array_task() const {
     return PartialArrayScanTask(cast_to_oop(decode(PartialArrayTag)));
   }
+
+  PartialArrayState* to_partial_array_state() const {
+    return static_cast<PartialArrayState*>(decode(PartialArrayTag));
+  }
 };
 
 #endif // SHARE_GC_SHARED_TASKQUEUE_HPP
diff --git a/test/hotspot/gtest/gc/shared/test_partialArrayTaskStepper.cpp b/test/hotspot/gtest/gc/shared/test_partialArrayTaskStepper.cpp
index fb797ba12c1..3bb3a74437c 100644
--- a/test/hotspot/gtest/gc/shared/test_partialArrayTaskStepper.cpp
+++ b/test/hotspot/gtest/gc/shared/test_partialArrayTaskStepper.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -32,50 +32,42 @@ using Stepper = PartialArrayTaskStepper;
 
 class PartialArrayTaskStepper::TestSupport : AllStatic {
 public:
-  static Step start(const Stepper* stepper,
-                    int length,
-                    int* to_length_addr,
-                    uint chunk_size) {
-    return stepper->start_impl(length, to_length_addr, chunk_size);
-  }
-
   static Step next(const Stepper* stepper,
-                   int length,
-                   int* to_length_addr,
-                   uint chunk_size) {
-    return stepper->next_impl(length, to_length_addr, chunk_size);
+                   size_t length,
+                   size_t* to_length_addr) {
+    return stepper->next_impl(length, to_length_addr);
   }
 };
 
 using StepperSupport = PartialArrayTaskStepper::TestSupport;
 
-static int simulate(const Stepper* stepper,
-                    int length,
-                    int* to_length_addr,
-                    uint chunk_size) {
-  Step init = StepperSupport::start(stepper, length, to_length_addr, chunk_size);
+static uint simulate(const Stepper* stepper,
+                     size_t length,
+                     size_t* to_length_addr) {
+  Step init = stepper->start(length);
+  *to_length_addr = init._index;
   uint queue_count = init._ncreate;
-  int task = 0;
+  uint task = 0;
   for ( ; queue_count > 0; ++task) {
     --queue_count;
-    Step step = StepperSupport::next(stepper, length, to_length_addr, chunk_size);
+    Step step = StepperSupport::next(stepper, length, to_length_addr);
    queue_count += step._ncreate;
   }
   return task;
 }
 
-static void run_test(int length, int chunk_size, uint n_workers) {
-  const PartialArrayTaskStepper stepper(n_workers);
-  int to_length;
-  int tasks = simulate(&stepper, length, &to_length, chunk_size);
+static void run_test(size_t length, size_t chunk_size, uint n_workers) {
+  const PartialArrayTaskStepper stepper(n_workers, chunk_size);
+  size_t to_length;
+  uint tasks = simulate(&stepper, length, &to_length);
   ASSERT_EQ(length, to_length);
   ASSERT_EQ(tasks, length / chunk_size);
 }
 
 TEST(PartialArrayTaskStepperTest, doit) {
-  for (int chunk_size = 50; chunk_size <= 500; chunk_size += 50) {
+  for (size_t chunk_size = 50; chunk_size <= 500; chunk_size += 50) {
     for (uint n_workers = 1; n_workers <= 256; n_workers = (n_workers * 3 / 2 + 1)) {
-      for (int length = 0; length <= 1000000; length = (length * 2 + 1)) {
+      for (size_t length = 0; length <= 1000000; length = (length * 2 + 1)) {
         run_test(length, chunk_size, n_workers);
       }
       // Ensure we hit boundary cases for length % chunk_size == 0.
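
Illustration (not part of the patch): a worked example of the arithmetic the test above exercises. With chunk_size = 100 and length = 250, start() returns _index = 250 % 100 = 50 (the initial chunk is [0, 50)) and _ncreate = 1; the subsequent next() calls claim [50, 150) and [150, 250), so exactly length / chunk_size = 2 partial tasks are processed, which is what ASSERT_EQ(tasks, length / chunk_size) checks. A hypothetical standalone check in the same style as the test file, assuming the same includes:

// Sketch only; mirrors the arithmetic in start() shown earlier in this diff.
TEST(PartialArrayTaskStepperTest, worked_example) {
  const PartialArrayTaskStepper stepper(/*n_workers=*/4, /*chunk_size=*/100);
  const PartialArrayTaskStepper::Step init = stepper.start(250);
  ASSERT_EQ(init._index, size_t(50));  // initial chunk is [0, 50)
  ASSERT_EQ(init._ncreate, 1u);        // one partial task to enqueue initially
}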