8243326: Cleanup use of volatile in taskqueue code

Removed volatile from queue elements, cleaned up other uses of volatile, and made atomic accesses explicit.

Reviewed-by: tschatzl, iwalulya
Kim Barrett 2020-04-14 02:25:19 -04:00
parent 313758a57a
commit 478773c102
3 changed files with 166 additions and 145 deletions
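
The pattern throughout the change is to replace direct reads and writes of volatile fields with named accessors that make the atomic operation and its memory ordering explicit at each use. A minimal compilable sketch of the before/after shape, using std::atomic as a stand-in for HotSpot's Atomic wrappers (the accessor names follow the ones introduced below):

#include <atomic>

// Before: a volatile field read and written directly; the intended memory
// ordering is implicit at every call site.
struct OldStyle {
  volatile unsigned _bottom = 0;
  unsigned bottom() const { return _bottom; }
  void set_bottom(unsigned b) { _bottom = b; }
};

// After: a plain field accessed only through accessors that name the
// operation and its ordering. std::atomic stands in for Atomic::load,
// Atomic::load_acquire, Atomic::store and Atomic::release_store.
struct NewStyle {
  std::atomic<unsigned> _bottom{0};
  unsigned bottom_relaxed() const { return _bottom.load(std::memory_order_relaxed); }
  unsigned bottom_acquire() const { return _bottom.load(std::memory_order_acquire); }
  void set_bottom_relaxed(unsigned b) { _bottom.store(b, std::memory_order_relaxed); }
  void release_set_bottom(unsigned b) { _bottom.store(b, std::memory_order_release); }
};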

@@ -65,22 +65,13 @@ private:
}
G1TaskQueueEntry(HeapWord* addr) : _holder((void*)((uintptr_t)addr | ArraySliceBit)) { }
public:
G1TaskQueueEntry(const G1TaskQueueEntry& other) { _holder = other._holder; }
G1TaskQueueEntry() : _holder(NULL) { }
// Trivially copyable, for use in GenericTaskQueue.
static G1TaskQueueEntry from_slice(HeapWord* what) { return G1TaskQueueEntry(what); }
static G1TaskQueueEntry from_oop(oop obj) { return G1TaskQueueEntry(obj); }
G1TaskQueueEntry& operator=(const G1TaskQueueEntry& t) {
_holder = t._holder;
return *this;
}
volatile G1TaskQueueEntry& operator=(const volatile G1TaskQueueEntry& t) volatile {
_holder = t._holder;
return *this;
}
oop obj() const {
assert(!is_array_slice(), "Trying to read array slice " PTR_FORMAT " as oop", p2i(_holder));
return (oop)_holder;

@@ -28,6 +28,8 @@
#include "memory/allocation.hpp"
#include "memory/padded.hpp"
#include "oops/oopsHierarchy.hpp"
#include "runtime/atomic.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/ostream.hpp"
#include "utilities/stack.hpp"
@@ -107,31 +109,23 @@ class TaskQueueSuper: public CHeapObj<F> {
protected:
// Internal type for indexing the queue; also used for the tag.
typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t;
STATIC_ASSERT(N == idx_t(N)); // Ensure N fits in an idx_t.
// The first free element after the last one pushed (mod N).
volatile uint _bottom;
enum { MOD_N_MASK = N - 1 };
// N must be a power of 2 for computing modulo via masking.
// N must be >= 2 for the algorithm to work at all, though larger is better.
// C++11: is_power_of_2 is not (yet) constexpr.
STATIC_ASSERT((N >= 2) && ((N & (N - 1)) == 0));
static const uint MOD_N_MASK = N - 1;
class Age {
friend class TaskQueueSuper;
public:
Age(size_t data = 0) { _data = data; }
Age(const Age& age) { _data = age._data; }
Age(idx_t top, idx_t tag) { _fields._top = top; _fields._tag = tag; }
explicit Age(size_t data = 0) : _data(data) {}
Age(idx_t top, idx_t tag) { _fields._top = top; _fields._tag = tag; }
Age get() const volatile { return _data; }
void set(Age age) volatile { _data = age._data; }
idx_t top() const volatile { return _fields._top; }
idx_t tag() const volatile { return _fields._tag; }
// Increment top; if it wraps, increment tag also.
void increment() {
_fields._top = increment_index(_fields._top);
if (_fields._top == 0) ++_fields._tag;
}
Age cmpxchg(const Age new_age, const Age old_age) volatile;
idx_t top() const { return _fields._top; }
idx_t tag() const { return _fields._tag; }
bool operator ==(const Age& other) const { return _data == other._data; }
@@ -144,9 +138,41 @@ protected:
size_t _data;
fields _fields;
};
STATIC_ASSERT(sizeof(size_t) >= sizeof(fields));
};
volatile Age _age;
uint bottom_relaxed() const {
return Atomic::load(&_bottom);
}
uint bottom_acquire() const {
return Atomic::load_acquire(&_bottom);
}
void set_bottom_relaxed(uint new_bottom) {
Atomic::store(&_bottom, new_bottom);
}
void release_set_bottom(uint new_bottom) {
Atomic::release_store(&_bottom, new_bottom);
}
Age age_relaxed() const {
return Age(Atomic::load(&_age._data));
}
void set_age_relaxed(Age new_age) {
Atomic::store(&_age._data, new_age._data);
}
Age cmpxchg_age(Age old_age, Age new_age) {
return Age(Atomic::cmpxchg(&_age._data, old_age._data, new_age._data));
}
idx_t age_top_relaxed() const {
// Atomically accessing a subfield of an "atomic" member.
return Atomic::load(&_age._fields._top);
}
// These both operate mod N.
static uint increment_index(uint ind) {
@@ -163,7 +189,7 @@ protected:
}
// Returns the size corresponding to the given "bot" and "top".
uint size(uint bot, uint top) const {
uint clean_size(uint bot, uint top) const {
uint sz = dirty_size(bot, top);
// Has the queue "wrapped", so that bottom is less than top? There's a
// complicated special case here. A pair of threads could perform pop_local
@@ -182,27 +208,46 @@ protected:
return (sz == N - 1) ? 0 : sz;
}
private:
DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
// Index of the first free element after the last one pushed (mod N).
volatile uint _bottom;
DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(uint));
// top() is the index of the oldest pushed element (mod N), and tag()
// is the associated epoch, to distinguish different modifications of
// the age. There is no available element if top() == _bottom or
// (_bottom - top()) mod N == N-1; the latter indicates underflow
// during concurrent pop_local/pop_global.
volatile Age _age;
DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(Age));
NONCOPYABLE(TaskQueueSuper);
public:
TaskQueueSuper() : _bottom(0), _age() {}
// Return true if the TaskQueue contains/does not contain any tasks.
bool peek() const { return _bottom != _age.top(); }
bool is_empty() const { return size() == 0; }
// Return true if the TaskQueue contains any tasks.
// Unreliable if there are concurrent pushes or pops.
bool peek() const {
return bottom_relaxed() != age_top_relaxed();
}
bool is_empty() const {
return size() == 0;
}
// Return an estimate of the number of elements in the queue.
// The "careful" version admits the possibility of pop_local/pop_global
// races.
// Treats pop_local/pop_global race that underflows as empty.
uint size() const {
return size(_bottom, _age.top());
}
uint dirty_size() const {
return dirty_size(_bottom, _age.top());
return clean_size(bottom_relaxed(), age_top_relaxed());
}
// Discard the contents of the queue.
void set_empty() {
_bottom = 0;
_age.set(0);
set_bottom_relaxed(0);
set_age_relaxed(Age());
}
// Maximum number of elements allowed in the queue. This is two less
@@ -211,9 +256,6 @@ public:
// in GenericTaskQueue.
uint max_elems() const { return N - 2; }
// Total size of queue.
static const uint total_size() { return N; }
TASKQUEUE_STATS_ONLY(TaskQueueStats stats;)
};
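
For reference, the index arithmetic behind clean_size and dirty_size can be exercised in a self-contained toy (N shrunk to 8 here; the real queue is much larger). The (sz == N - 1) ? 0 : sz adjustment is the racy-underflow case described in the comments above:

#include <cassert>

// Toy TaskQueueSuper index arithmetic; N is a power of two so "mod N"
// is just a mask.
const unsigned N = 8;
const unsigned MOD_N_MASK = N - 1;

unsigned dirty_size(unsigned bot, unsigned top) {
  // May report N-1 after a pop_local/pop_global race ("underflow").
  return (bot - top) & MOD_N_MASK;
}

unsigned clean_size(unsigned bot, unsigned top) {
  unsigned sz = dirty_size(bot, top);
  return (sz == N - 1) ? 0 : sz;  // treat the underflowed case as empty
}

int main() {
  assert(dirty_size(5, 3) == 2);      // ordinary case: two elements
  assert(dirty_size(1, 6) == 3);      // bottom has wrapped around the array
  assert(dirty_size(2, 3) == N - 1);  // bottom "behind" top: racy underflow
  assert(clean_size(2, 3) == 0);      // ...which clean_size reports as empty
  return 0;
}
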
@@ -254,11 +296,23 @@ protected:
typedef typename TaskQueueSuper<N, F>::Age Age;
typedef typename TaskQueueSuper<N, F>::idx_t idx_t;
using TaskQueueSuper<N, F>::_bottom;
using TaskQueueSuper<N, F>::_age;
using TaskQueueSuper<N, F>::MOD_N_MASK;
using TaskQueueSuper<N, F>::bottom_relaxed;
using TaskQueueSuper<N, F>::bottom_acquire;
using TaskQueueSuper<N, F>::set_bottom_relaxed;
using TaskQueueSuper<N, F>::release_set_bottom;
using TaskQueueSuper<N, F>::age_relaxed;
using TaskQueueSuper<N, F>::set_age_relaxed;
using TaskQueueSuper<N, F>::cmpxchg_age;
using TaskQueueSuper<N, F>::age_top_relaxed;
using TaskQueueSuper<N, F>::increment_index;
using TaskQueueSuper<N, F>::decrement_index;
using TaskQueueSuper<N, F>::dirty_size;
using TaskQueueSuper<N, F>::clean_size;
public:
using TaskQueueSuper<N, F>::max_elems;
@@ -285,13 +339,14 @@ public:
// Attempts to claim a task from the "local" end of the queue (the most
// recently pushed) as long as the number of entries exceeds the threshold.
// If successful, returns true and sets t to the task; otherwise, returns false
// (the queue is empty or the number of elements below the threshold).
inline bool pop_local(volatile E& t, uint threshold = 0);
// If successfully claims a task, returns true and sets t to the task;
// otherwise, returns false and t is unspecified. May fail and return
// false because of a successful steal by pop_global.
inline bool pop_local(E& t, uint threshold = 0);
// Like pop_local(), but uses the "global" end of the queue (the least
// recently pushed).
bool pop_global(volatile E& t);
bool pop_global(E& t);
// Delete any resource associated with the queue.
~GenericTaskQueue();
@@ -301,9 +356,10 @@ public:
template<typename Fn> void iterate(Fn fn);
private:
DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
// Base class has trailing padding.
// Element array.
volatile E* _elems;
E* _elems;
DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(E*));
// Queue owner local variables. Not to be accessed by other threads.
@@ -444,12 +500,6 @@ public:
virtual bool should_exit_termination() = 0;
};
#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning(disable:4522)
#endif
// This is a container class for either an oop* or a narrowOop*.
// Both are pushed onto a task queue and the consumer will test is_narrow()
// to determine which should be processed.
@@ -468,20 +518,13 @@ class StarTask {
_holder = (void*)p;
}
StarTask() { _holder = NULL; }
// Trivially copyable, for use in GenericTaskQueue.
operator oop*() { return (oop*)_holder; }
operator narrowOop*() {
return (narrowOop*)((uintptr_t)_holder & ~COMPRESSED_OOP_MASK);
}
StarTask& operator=(const StarTask& t) {
_holder = t._holder;
return *this;
}
volatile StarTask& operator=(const volatile StarTask& t) volatile {
_holder = t._holder;
return *this;
}
bool is_narrow() const {
return (((uintptr_t)_holder & COMPRESSED_OOP_MASK) != 0);
}
@@ -494,19 +537,7 @@ public:
ObjArrayTask(oop o, size_t idx): _obj(o), _index(int(idx)) {
assert(idx <= size_t(max_jint), "too big");
}
ObjArrayTask(const ObjArrayTask& t): _obj(t._obj), _index(t._index) { }
ObjArrayTask& operator =(const ObjArrayTask& t) {
_obj = t._obj;
_index = t._index;
return *this;
}
volatile ObjArrayTask&
operator =(const volatile ObjArrayTask& t) volatile {
(void)const_cast<oop&>(_obj = t._obj);
_index = t._index;
return *this;
}
// Trivially copyable, for use in GenericTaskQueue.
inline oop obj() const { return _obj; }
inline int index() const { return _index; }
@@ -518,8 +549,4 @@ private:
int _index;
};
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#endif // SHARE_GC_SHARED_TASKQUEUE_HPP
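
The element-type cleanup above (G1TaskQueueEntry, StarTask, ObjArrayTask) follows the same idea: once the volatile assignment operators, and the MSVC warning C4522 pragmas they required, are gone, all GenericTaskQueue needs from an element type is that plain assignment copies it safely, i.e. that it is trivially copyable. A hedged sketch of that requirement for a hypothetical one-word entry type (not one of the actual HotSpot classes):

#include <type_traits>

// Hypothetical tagged-pointer task entry: one word, no user-provided copy
// constructor, assignment operator or destructor, so a copy is a plain
// word copy.
class ToyTaskEntry {
  void* _holder = nullptr;
public:
  ToyTaskEntry() = default;
  explicit ToyTaskEntry(void* p) : _holder(p) {}
  void* holder() const { return _holder; }
};

// The property the queue relies on. Note the caveat in the pop_global
// comment further down: oop itself is not trivially copyable in some
// build configurations, so oop-carrying entries need extra care.
static_assert(std::is_trivially_copyable<ToyTaskEntry>::value,
              "queue elements must be trivially copyable");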

@@ -54,14 +54,14 @@ inline void GenericTaskQueue<E, F, N>::initialize() {
template<class E, MEMFLAGS F, unsigned int N>
inline GenericTaskQueue<E, F, N>::~GenericTaskQueue() {
ArrayAllocator<E>::free(const_cast<E*>(_elems), N);
ArrayAllocator<E>::free(_elems, N);
}
template<class E, MEMFLAGS F, unsigned int N> inline bool
GenericTaskQueue<E, F, N>::push(E t) {
uint localBot = _bottom;
uint localBot = bottom_relaxed();
assert(localBot < N, "_bottom out of range.");
idx_t top = _age.top();
idx_t top = age_top_relaxed();
uint dirty_n_elems = dirty_size(localBot, top);
// A dirty_size of N-1 cannot happen in push. Considering only push:
// (1) dirty_n_elems is initially 0.
@@ -75,13 +75,8 @@ GenericTaskQueue<E, F, N>::push(E t) {
// so can't underflow to -1 (== N-1) with push.
assert(dirty_n_elems <= max_elems(), "n_elems out of range.");
if (dirty_n_elems < max_elems()) {
// g++ complains if the volatile result of the assignment is
// unused, so we cast the volatile away. We cannot cast directly
// to void, because gcc treats that as not using the result of the
// assignment. However, casting to E& means that we trigger an
// unused-value warning. So, we cast the E& to void.
(void) const_cast<E&>(_elems[localBot] = t);
Atomic::release_store(&_bottom, increment_index(localBot));
_elems[localBot] = t;
release_set_bottom(increment_index(localBot));
TASKQUEUE_STATS_ONLY(stats.record_push());
return true;
}
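
The release_set_bottom above pairs with bottom_acquire in pop_global: the element store must become visible no later than the new bottom. A toy sketch of just that ordering, with std::atomic standing in for the Atomic wrappers and the age/cmpxchg machinery of the real steal protocol omitted:

#include <atomic>

const unsigned N = 8;            // toy queue size, power of two
int elems[N];
std::atomic<unsigned> bottom{0};

// Owner: write the element first, then release-store the new bottom, so a
// thread that acquire-loads the new bottom also sees the element.
void owner_push(int value) {
  unsigned b = bottom.load(std::memory_order_relaxed);   // bottom_relaxed()
  elems[b & (N - 1)] = value;                             // _elems[localBot] = t
  bottom.store(b + 1, std::memory_order_release);         // release_set_bottom(...)
}

// Thief: acquire-load bottom before reading any slot below it; this is the
// bottom_acquire() side of the pairing. (The real pop_global reads the
// oldest slot, at age.top(), and discards it if its cmpxchg of age fails.)
bool thief_peek_newest(int* out) {
  unsigned b = bottom.load(std::memory_order_acquire);    // bottom_acquire()
  if (b == 0) return false;                                // nothing published yet
  *out = elems[(b - 1) & (N - 1)];                         // visible because of the pairing
  return true;
}
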
@@ -89,8 +84,7 @@ GenericTaskQueue<E, F, N>::push(E t) {
}
template <class E, MEMFLAGS F, unsigned int N>
inline bool OverflowTaskQueue<E, F, N>::push(E t)
{
inline bool OverflowTaskQueue<E, F, N>::push(E t) {
if (!taskqueue_t::push(t)) {
overflow_stack()->push(t);
TASKQUEUE_STATS_ONLY(stats.record_overflow(overflow_stack()->size()));
@@ -114,61 +108,56 @@ bool GenericTaskQueue<E, F, N>::pop_local_slow(uint localBot, Age oldAge) {
// This queue was observed to contain exactly one element; either this
// thread will claim it, or a competing "pop_global". In either case,
// the queue will be logically empty afterwards. Create a new Age value
// that represents the empty queue for the given value of "_bottom". (We
// that represents the empty queue for the given value of "bottom". (We
// must also increment "tag" because of the case where "bottom == 1",
// "top == 0". A pop_global could read the queue element in that case,
// then have the owner thread do a pop followed by another push. Without
// the incrementing of "tag", the pop_global's CAS could succeed,
// allowing it to believe it has claimed the stale element.)
Age newAge((idx_t)localBot, oldAge.tag() + 1);
Age newAge((idx_t)localBot, (idx_t)(oldAge.tag() + 1));
// Perhaps a competing pop_global has already incremented "top", in which
// case it wins the element.
if (localBot == oldAge.top()) {
// No competing pop_global has yet incremented "top"; we'll try to
// install new_age, thus claiming the element.
Age tempAge = _age.cmpxchg(newAge, oldAge);
Age tempAge = cmpxchg_age(oldAge, newAge);
if (tempAge == oldAge) {
// We win.
assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
assert(dirty_size(localBot, age_top_relaxed()) != N - 1, "sanity");
TASKQUEUE_STATS_ONLY(stats.record_pop_slow());
return true;
}
}
// We lose; a completing pop_global gets the element. But the queue is empty
// We lose; a competing pop_global got the element. But the queue is empty
// and top is greater than bottom. Fix this representation of the empty queue
// to become the canonical one.
_age.set(newAge);
assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
set_age_relaxed(newAge);
assert(dirty_size(localBot, age_top_relaxed()) != N - 1, "sanity");
return false;
}
template<class E, MEMFLAGS F, unsigned int N> inline bool
GenericTaskQueue<E, F, N>::pop_local(volatile E& t, uint threshold) {
uint localBot = _bottom;
GenericTaskQueue<E, F, N>::pop_local(E& t, uint threshold) {
uint localBot = bottom_relaxed();
// This value cannot be N-1. That can only occur as a result of
// the assignment to bottom in this method. If it does, this method
// resets the size to 0 before the next call (which is sequential,
// since this is pop_local.)
uint dirty_n_elems = dirty_size(localBot, _age.top());
uint dirty_n_elems = dirty_size(localBot, age_top_relaxed());
assert(dirty_n_elems != N - 1, "Shouldn't be possible...");
if (dirty_n_elems <= threshold) return false;
localBot = decrement_index(localBot);
_bottom = localBot;
set_bottom_relaxed(localBot);
// This is necessary to prevent any read below from being reordered
// before the store just above.
OrderAccess::fence();
// g++ complains if the volatile result of the assignment is
// unused, so we cast the volatile away. We cannot cast directly
// to void, because gcc treats that as not using the result of the
// assignment. However, casting to E& means that we trigger an
// unused-value warning. So, we cast the E& to void.
(void) const_cast<E&>(t = _elems[localBot]);
t = _elems[localBot];
// This is a second read of "age"; the "size()" above is the first.
// If there's still at least one element in the queue, based on the
// "_bottom" and "age" we've read, then there can be no interference with
// a "pop_global" operation, and we're done.
idx_t tp = _age.top(); // XXX
if (size(localBot, tp) > 0) {
idx_t tp = age_top_relaxed();
if (clean_size(localBot, tp) > 0) {
assert(dirty_size(localBot, tp) != N - 1, "sanity");
TASKQUEUE_STATS_ONLY(stats.record_pop());
return true;
@@ -176,11 +165,11 @@ GenericTaskQueue<E, F, N>::pop_local(volatile E& t, uint threshold) {
// Otherwise, the queue contained exactly one element; we take the slow
// path.
// The barrier is required to prevent reordering the two reads of _age:
// one is the _age.get() below, and the other is _age.top() above the if-stmt.
// The algorithm may fail if _age.get() reads an older value than _age.top().
// The barrier is required to prevent reordering the two reads of age:
// one is the age() below, and the other is age_top() above the if-stmt.
// The algorithm may fail if age() reads an older value than age_top().
OrderAccess::loadload();
return pop_local_slow(localBot, _age.get());
return pop_local_slow(localBot, age_relaxed());
}
}
@@ -192,9 +181,30 @@ bool OverflowTaskQueue<E, F, N>::pop_overflow(E& t)
return true;
}
// A pop_global operation may read an element that is being concurrently
// written by a push operation. The pop_global operation will not use
// such an element, returning failure instead. But the concurrent read
// and write places requirements on the element type.
//
// Strictly, such concurrent reads and writes are undefined behavior.
// We ignore that. Instead we require that whatever value tearing may
// occur as a result is benign. A trivially copyable type (C++14 3.9/9)
// satisfies the requirement. But we might use classes such as oop that
// are not trivially copyable (in some build configurations). Such
// classes need to be carefully examined with this requirement in mind.
//
// The sequence where such a read/write collision can arise is as follows.
// Assume there is one value in the queue, so bottom == top+1.
// (1) Thief is doing a pop_global. It has read age and bottom, and its
// captured (localBottom - oldAge.top) == 1.
// (2) Owner does a pop_local and wins the race for that element. It
// decrements bottom and increments the age tag.
// (3) Owner starts a push, writing elems[bottom]. At the same time, Thief
// reads elems[oldAge.top]. The owner's bottom == the thief's oldAge.top.
// (4) Thief will discard the read value, because its cmpxchg of age will fail.
template<class E, MEMFLAGS F, unsigned int N>
bool GenericTaskQueue<E, F, N>::pop_global(volatile E& t) {
Age oldAge = _age.get();
bool GenericTaskQueue<E, F, N>::pop_global(E& t) {
Age oldAge = age_relaxed();
#ifndef CPU_MULTI_COPY_ATOMIC
// Architectures with non-multi-copy-atomic memory model require a
// full fence here to guarantee that bottom is not older than age,
@@ -212,26 +222,24 @@ bool GenericTaskQueue<E, F, N>::pop_global(volatile E& t) {
OrderAccess::fence();
#else
// Everyone else can make do with a LoadLoad barrier to keep reads
// from _age and _bottom in order.
// from age and bottom in order.
OrderAccess::loadload();
#endif
uint localBot = Atomic::load_acquire(&_bottom);
uint n_elems = size(localBot, oldAge.top());
uint localBot = bottom_acquire();
uint n_elems = clean_size(localBot, oldAge.top());
if (n_elems == 0) {
return false;
}
// g++ complains if the volatile result of the assignment is
// unused, so we cast the volatile away. We cannot cast directly
// to void, because gcc treats that as not using the result of the
// assignment. However, casting to E& means that we trigger an
// unused-value warning. So, we cast the E& to void.
(void) const_cast<E&>(t = _elems[oldAge.top()]);
Age newAge(oldAge);
newAge.increment();
Age resAge = _age.cmpxchg(newAge, oldAge);
t = _elems[oldAge.top()];
// Increment top; if it wraps, also increment tag, to distinguish it
// from any recent _age for the same top() index.
idx_t new_top = increment_index(oldAge.top());
idx_t new_tag = oldAge.tag() + ((new_top == 0) ? 1 : 0);
Age newAge(new_top, new_tag);
Age resAge = cmpxchg_age(oldAge, newAge);
// Note that using "_bottom" here might fail, since a pop_local might
// Note that using "bottom" here might fail, since a pop_local might
// have decremented it.
assert(dirty_size(localBot, newAge.top()) != N - 1, "sanity");
return resAge == oldAge;
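
The tag's job is to make an Age whose top() has wrapped back to a previously used index compare unequal to a stale Age holding that same index. A small self-contained sketch of the increment-with-wrap rule used just above (top and tag are plain 32-bit fields here; the real Age packs them into a single word so both can be cmpxchg'd together):

#include <cassert>
#include <cstdint>

const unsigned N = 8;  // toy queue size, power of two

struct ToyAge {
  uint32_t top;
  uint32_t tag;
};

// Advance top mod N; bump the tag whenever top wraps to 0, so a stale Age
// with the same top no longer matches.
ToyAge next_age(ToyAge a) {
  uint32_t new_top = (a.top + 1) & (N - 1);
  uint32_t new_tag = a.tag + ((new_top == 0) ? 1 : 0);
  return ToyAge{new_top, new_tag};
}

int main() {
  ToyAge a = next_age(ToyAge{N - 1, 5});  // top wraps: 7 -> 0, tag 5 -> 6
  assert(a.top == 0 && a.tag == 6);
  ToyAge b = next_age(ToyAge{3, 5});      // no wrap: top 3 -> 4, tag unchanged
  assert(b.top == 4 && b.tag == 5);
  return 0;
}
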
@@ -324,19 +332,14 @@ GenericTaskQueueSet<T, F>::steal(uint queue_num, E& t) {
return false;
}
template <unsigned int N, MEMFLAGS F>
inline typename TaskQueueSuper<N, F>::Age TaskQueueSuper<N, F>::Age::cmpxchg(const Age new_age, const Age old_age) volatile {
return Atomic::cmpxchg(&_data, old_age._data, new_age._data);
}
template<class E, MEMFLAGS F, unsigned int N>
template<class Fn>
inline void GenericTaskQueue<E, F, N>::iterate(Fn fn) {
uint iters = size();
uint index = _bottom;
uint index = bottom_relaxed();
for (uint i = 0; i < iters; ++i) {
index = decrement_index(index);
fn(const_cast<E&>(_elems[index])); // cast away volatility
fn(_elems[index]);
}
}
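
Finally, iterate() above walks the live elements from the newest slot (just below bottom) back toward top, without popping anything. A toy version of that traversal, assuming a small power-of-two N and int elements:

#include <cstdio>

const unsigned N = 8;  // toy queue size, power of two
int elems[N] = {0};

unsigned decrement_index(unsigned ind) { return (ind - 1) & (N - 1); }

// Visit 'size' elements starting just below 'bottom', newest first; mirrors
// the fn(_elems[index]) loop in GenericTaskQueue::iterate.
void iterate(unsigned bottom, unsigned size) {
  unsigned index = bottom;
  for (unsigned i = 0; i < size; ++i) {
    index = decrement_index(index);
    printf("elems[%u] = %d\n", index, elems[index]);
  }
}

int main() {
  // Pretend three elements were pushed at indices 0, 1, 2; bottom is now 3.
  elems[0] = 10; elems[1] = 20; elems[2] = 30;
  iterate(3, 3);  // visits indices 2, 1, 0 in that order
  return 0;
}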