8225255: Make SATB qset lock-free

Refactor PtrQueueSet, use lock-free stack for SATB completed buffers Reviewed-by: tschatzl, shade
2019-06-26 13:18:38 -04:00 · 2019-06-26 13:18:38 -04:00 · 805258f3a9
commit 805258f3a9
parent b6c6f3ce43
15 changed files with 317 additions and 216 deletions
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
@ -1677,7 +1677,6 @@ jint G1CollectedHeap::initialize() {
  _card_table = ct;

  G1BarrierSet::satb_mark_queue_set().initialize(this,
-                                                 SATB_Q_CBL_mon,
                                                 &bs->satb_mark_queue_buffer_allocator(),
                                                 G1SATBProcessCompletedThreshold,
                                                 G1SATBBufferEnqueueingThresholdPercent);
--- a/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp
@ -2419,11 +2419,12 @@ void G1CMTask::drain_satb_buffers() {
    abort_marking_if_regular_check_fail();
  }

-  _draining_satb_buffers = false;
+  // Can't assert qset is empty here, even if not aborted.  If concurrent,
+  // some other thread might be adding to the queue.  If not concurrent,
+  // some other thread might have won the race for the last buffer, but
+  // has not yet decremented the count.

-  assert(has_aborted() ||
-         _cm->concurrent() ||
-         satb_mq_set.completed_buffers_num() == 0, "invariant");
+  _draining_satb_buffers = false;

  // again, this was a potentially expensive operation, decrease the
  // limits to get the regular clock call early
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
@ -78,7 +78,14 @@ void G1DirtyCardQueue::handle_completed_buffer() {
 }

 G1DirtyCardQueueSet::G1DirtyCardQueueSet(bool notify_when_complete) :
-  PtrQueueSet(notify_when_complete),
+  PtrQueueSet(),
+  _cbl_mon(NULL),
+  _completed_buffers_head(NULL),
+  _completed_buffers_tail(NULL),
+  _n_completed_buffers(0),
+  _process_completed_buffers_threshold(ProcessCompletedBuffersThresholdNever),
+  _process_completed_buffers(false),
+  _notify_when_complete(notify_when_complete),
  _max_completed_buffers(MaxCompletedBuffersUnlimited),
  _completed_buffers_padding(0),
  _free_ids(NULL),
@ -90,6 +97,7 @@ G1DirtyCardQueueSet::G1DirtyCardQueueSet(bool notify_when_complete) :
 }

 G1DirtyCardQueueSet::~G1DirtyCardQueueSet() {
+  abandon_completed_buffers();
  delete _free_ids;
 }

@ -101,7 +109,9 @@ uint G1DirtyCardQueueSet::num_par_ids() {
 void G1DirtyCardQueueSet::initialize(Monitor* cbl_mon,
                                     BufferNode::Allocator* allocator,
                                     bool init_free_ids) {
-  PtrQueueSet::initialize(cbl_mon, allocator);
+  PtrQueueSet::initialize(allocator);
+  assert(_cbl_mon == NULL, "Init order issue?");
+  _cbl_mon = cbl_mon;
  if (init_free_ids) {
    _free_ids = new G1FreeIdSet(0, num_par_ids());
  }
@ -111,6 +121,123 @@ void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) {
  G1ThreadLocalData::dirty_card_queue(t).handle_zero_index();
 }

+void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
+  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+  cbn->set_next(NULL);
+  if (_completed_buffers_tail == NULL) {
+    assert(_completed_buffers_head == NULL, "Well-formedness");
+    _completed_buffers_head = cbn;
+    _completed_buffers_tail = cbn;
+  } else {
+    _completed_buffers_tail->set_next(cbn);
+    _completed_buffers_tail = cbn;
+  }
+  _n_completed_buffers++;
+
+  if (!process_completed_buffers() &&
+      (_n_completed_buffers > process_completed_buffers_threshold())) {
+    set_process_completed_buffers(true);
+    if (_notify_when_complete) {
+      _cbl_mon->notify_all();
+    }
+  }
+  assert_completed_buffers_list_len_correct_locked();
+}
+
+BufferNode* G1DirtyCardQueueSet::get_completed_buffer(size_t stop_at) {
+  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+
+  if (_n_completed_buffers <= stop_at) {
+    return NULL;
+  }
+
+  assert(_n_completed_buffers > 0, "invariant");
+  assert(_completed_buffers_head != NULL, "invariant");
+  assert(_completed_buffers_tail != NULL, "invariant");
+
+  BufferNode* bn = _completed_buffers_head;
+  _n_completed_buffers--;
+  _completed_buffers_head = bn->next();
+  if (_completed_buffers_head == NULL) {
+    assert(_n_completed_buffers == 0, "invariant");
+    _completed_buffers_tail = NULL;
+    set_process_completed_buffers(false);
+  }
+  assert_completed_buffers_list_len_correct_locked();
+  bn->set_next(NULL);
+  return bn;
+}
+
+void G1DirtyCardQueueSet::abandon_completed_buffers() {
+  BufferNode* buffers_to_delete = NULL;
+  {
+    MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+    buffers_to_delete = _completed_buffers_head;
+    _completed_buffers_head = NULL;
+    _completed_buffers_tail = NULL;
+    _n_completed_buffers = 0;
+    set_process_completed_buffers(false);
+  }
+  while (buffers_to_delete != NULL) {
+    BufferNode* bn = buffers_to_delete;
+    buffers_to_delete = bn->next();
+    bn->set_next(NULL);
+    deallocate_buffer(bn);
+  }
+}
+
+void G1DirtyCardQueueSet::notify_if_necessary() {
+  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+  if (_n_completed_buffers > process_completed_buffers_threshold()) {
+    set_process_completed_buffers(true);
+    if (_notify_when_complete)
+      _cbl_mon->notify();
+  }
+}
+
+#ifdef ASSERT
+void G1DirtyCardQueueSet::assert_completed_buffers_list_len_correct_locked() {
+  assert_lock_strong(_cbl_mon);
+  size_t n = 0;
+  for (BufferNode* bn = _completed_buffers_head; bn != NULL; bn = bn->next()) {
+    ++n;
+  }
+  assert(n == _n_completed_buffers,
+         "Completed buffer length is wrong: counted: " SIZE_FORMAT
+         ", expected: " SIZE_FORMAT, n, _n_completed_buffers);
+}
+#endif // ASSERT
+
+// Merge lists of buffers. Notify the processing threads.
+// The source queue is emptied as a result. The queues
+// must share the monitor.
+void G1DirtyCardQueueSet::merge_bufferlists(G1DirtyCardQueueSet *src) {
+  assert(_cbl_mon == src->_cbl_mon, "Should share the same lock");
+  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+  if (_completed_buffers_tail == NULL) {
+    assert(_completed_buffers_head == NULL, "Well-formedness");
+    _completed_buffers_head = src->_completed_buffers_head;
+    _completed_buffers_tail = src->_completed_buffers_tail;
+  } else {
+    assert(_completed_buffers_head != NULL, "Well formedness");
+    if (src->_completed_buffers_head != NULL) {
+      _completed_buffers_tail->set_next(src->_completed_buffers_head);
+      _completed_buffers_tail = src->_completed_buffers_tail;
+    }
+  }
+  _n_completed_buffers += src->_n_completed_buffers;
+
+  src->_n_completed_buffers = 0;
+  src->_completed_buffers_head = NULL;
+  src->_completed_buffers_tail = NULL;
+  src->set_process_completed_buffers(false);
+
+  assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL ||
+         _completed_buffers_head != NULL && _completed_buffers_tail != NULL,
+         "Sanity");
+  assert_completed_buffers_list_len_correct_locked();
+}
+
 bool G1DirtyCardQueueSet::apply_closure_to_buffer(G1CardTableEntryClosure* cl,
                                                  BufferNode* node,
                                                  bool consume,
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
@ -76,6 +76,21 @@ public:
 };

 class G1DirtyCardQueueSet: public PtrQueueSet {
+  Monitor* _cbl_mon;  // Protects the fields below.
+  BufferNode* _completed_buffers_head;
+  BufferNode* _completed_buffers_tail;
+  volatile size_t _n_completed_buffers;
+
+  size_t _process_completed_buffers_threshold;
+  volatile bool _process_completed_buffers;
+
+  // If true, notify_all on _cbl_mon when the threshold is reached.
+  bool _notify_when_complete;
+
+  void assert_completed_buffers_list_len_correct_locked() NOT_DEBUG_RETURN;
+
+  void abandon_completed_buffers();
+
  // Apply the closure to the elements of "node" from it's index to
  // buffer_size.  If all closure applications return true, then
  // returns true.  Stops processing after the first closure
@ -111,7 +126,7 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
  // mutator must start doing some of the concurrent refinement work,
  size_t _max_completed_buffers;
  size_t _completed_buffers_padding;
-  static const size_t MaxCompletedBuffersUnlimited = ~size_t(0);
+  static const size_t MaxCompletedBuffersUnlimited = SIZE_MAX;

  G1FreeIdSet* _free_ids;

@ -142,6 +157,34 @@ public:
  // it can be reused in place.
  bool process_or_enqueue_completed_buffer(BufferNode* node);

+  virtual void enqueue_completed_buffer(BufferNode* node);
+
+  // If the number of completed buffers is > stop_at, then remove and
+  // return a completed buffer from the list.  Otherwise, return NULL.
+  BufferNode* get_completed_buffer(size_t stop_at = 0);
+
+  // The number of buffers in the list.  Racy...
+  size_t completed_buffers_num() const { return _n_completed_buffers; }
+
+  bool process_completed_buffers() { return _process_completed_buffers; }
+  void set_process_completed_buffers(bool x) { _process_completed_buffers = x; }
+
+  // Get/Set the number of completed buffers that triggers log processing.
+  // Log processing should be done when the number of buffers exceeds the
+  // threshold.
+  void set_process_completed_buffers_threshold(size_t sz) {
+    _process_completed_buffers_threshold = sz;
+  }
+  size_t process_completed_buffers_threshold() const {
+    return _process_completed_buffers_threshold;
+  }
+  static const size_t ProcessCompletedBuffersThresholdNever = SIZE_MAX;
+
+  // Notify the consumer if the number of buffers crossed the threshold
+  void notify_if_necessary();
+
+  void merge_bufferlists(G1DirtyCardQueueSet* src);
+
  // Apply G1RefineCardConcurrentlyClosure to completed buffers until there are stop_at
  // completed buffers remaining.
  bool refine_completed_buffer_concurrently(uint worker_i, size_t stop_at);
@ -150,13 +193,13 @@ public:
  // must never return false. Must only be called during GC.
  bool apply_closure_during_gc(G1CardTableEntryClosure* cl, uint worker_i);

-  void reset_for_par_iteration() { _cur_par_buffer_node = completed_buffers_head(); }
+  void reset_for_par_iteration() { _cur_par_buffer_node = _completed_buffers_head; }
  // Applies the current closure to all completed buffers, non-consumptively.
  // Can be used in parallel, all callers using the iteration state initialized
  // by reset_for_par_iteration.
  void par_apply_closure_to_all_completed_buffers(G1CardTableEntryClosure* cl);

-  // If a full collection is happening, reset partial logs, and ignore
+  // If a full collection is happening, reset partial logs, and release
  // completed ones: the full collection will make them all irrelevant.
  void abandon_logs();

--- a/src/hotspot/share/gc/g1/g1SATBMarkQueueSet.cpp
+++ b/src/hotspot/share/gc/g1/g1SATBMarkQueueSet.cpp
@ -35,12 +35,10 @@
 G1SATBMarkQueueSet::G1SATBMarkQueueSet() : _g1h(NULL) {}

 void G1SATBMarkQueueSet::initialize(G1CollectedHeap* g1h,
-                                    Monitor* cbl_mon,
                                    BufferNode::Allocator* allocator,
                                    size_t process_completed_buffers_threshold,
                                    uint buffer_enqueue_threshold_percentage) {
-  SATBMarkQueueSet::initialize(cbl_mon,
-                               allocator,
+  SATBMarkQueueSet::initialize(allocator,
                               process_completed_buffers_threshold,
                               buffer_enqueue_threshold_percentage);
  _g1h = g1h;
--- a/src/hotspot/share/gc/g1/g1SATBMarkQueueSet.hpp
+++ b/src/hotspot/share/gc/g1/g1SATBMarkQueueSet.hpp
@ -38,7 +38,6 @@ public:
  G1SATBMarkQueueSet();

  void initialize(G1CollectedHeap* g1h,
-                  Monitor* cbl_mon,
                  BufferNode::Allocator* allocator,
                  size_t process_completed_buffers_threshold,
                  uint buffer_enqueue_threshold_percentage);
--- a/src/hotspot/share/gc/shared/ptrQueue.cpp
+++ b/src/hotspot/share/gc/shared/ptrQueue.cpp
@ -250,28 +250,15 @@ size_t BufferNode::Allocator::reduce_free_list(size_t remove_goal) {
  return removed;
 }

-PtrQueueSet::PtrQueueSet(bool notify_when_complete) :
+PtrQueueSet::PtrQueueSet() :
  _allocator(NULL),
-  _cbl_mon(NULL),
-  _completed_buffers_head(NULL),
-  _completed_buffers_tail(NULL),
-  _n_completed_buffers(0),
-  _process_completed_buffers_threshold(ProcessCompletedBuffersThresholdNever),
-  _process_completed_buffers(false),
-  _notify_when_complete(notify_when_complete),
  _all_active(false)
 {}

-PtrQueueSet::~PtrQueueSet() {
-  // There are presently only a couple (derived) instances ever
-  // created, and they are permanent, so no harm currently done by
-  // doing nothing here.
-}
+PtrQueueSet::~PtrQueueSet() {}

-void PtrQueueSet::initialize(Monitor* cbl_mon,
-                             BufferNode::Allocator* allocator) {
-  assert(cbl_mon != NULL && allocator != NULL, "Init order issue?");
-  _cbl_mon = cbl_mon;
+void PtrQueueSet::initialize(BufferNode::Allocator* allocator) {
+  assert(allocator != NULL, "Init order issue?");
  _allocator = allocator;
 }

@ -284,121 +271,3 @@ void PtrQueueSet::deallocate_buffer(BufferNode* node) {
  _allocator->release(node);
 }

-void PtrQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  cbn->set_next(NULL);
-  if (_completed_buffers_tail == NULL) {
-    assert(_completed_buffers_head == NULL, "Well-formedness");
-    _completed_buffers_head = cbn;
-    _completed_buffers_tail = cbn;
-  } else {
-    _completed_buffers_tail->set_next(cbn);
-    _completed_buffers_tail = cbn;
-  }
-  _n_completed_buffers++;
-
-  if (!_process_completed_buffers &&
-      (_n_completed_buffers > _process_completed_buffers_threshold)) {
-    _process_completed_buffers = true;
-    if (_notify_when_complete) {
-      _cbl_mon->notify();
-    }
-  }
-  assert_completed_buffers_list_len_correct_locked();
-}
-
-BufferNode* PtrQueueSet::get_completed_buffer(size_t stop_at) {
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-
-  if (_n_completed_buffers <= stop_at) {
-    return NULL;
-  }
-
-  assert(_n_completed_buffers > 0, "invariant");
-  assert(_completed_buffers_head != NULL, "invariant");
-  assert(_completed_buffers_tail != NULL, "invariant");
-
-  BufferNode* bn = _completed_buffers_head;
-  _n_completed_buffers--;
-  _completed_buffers_head = bn->next();
-  if (_completed_buffers_head == NULL) {
-    assert(_n_completed_buffers == 0, "invariant");
-    _completed_buffers_tail = NULL;
-    _process_completed_buffers = false;
-  }
-  assert_completed_buffers_list_len_correct_locked();
-  bn->set_next(NULL);
-  return bn;
-}
-
-void PtrQueueSet::abandon_completed_buffers() {
-  BufferNode* buffers_to_delete = NULL;
-  {
-    MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-    buffers_to_delete = _completed_buffers_head;
-    _completed_buffers_head = NULL;
-    _completed_buffers_tail = NULL;
-    _n_completed_buffers = 0;
-    _process_completed_buffers = false;
-  }
-  while (buffers_to_delete != NULL) {
-    BufferNode* bn = buffers_to_delete;
-    buffers_to_delete = bn->next();
-    bn->set_next(NULL);
-    deallocate_buffer(bn);
-  }
-}
-
-#ifdef ASSERT
-
-void PtrQueueSet::assert_completed_buffers_list_len_correct_locked() {
-  assert_lock_strong(_cbl_mon);
-  size_t n = 0;
-  for (BufferNode* bn = _completed_buffers_head; bn != NULL; bn = bn->next()) {
-    ++n;
-  }
-  assert(n == _n_completed_buffers,
-         "Completed buffer length is wrong: counted: " SIZE_FORMAT
-         ", expected: " SIZE_FORMAT, n, _n_completed_buffers);
-}
-
-#endif // ASSERT
-
-// Merge lists of buffers. Notify the processing threads.
-// The source queue is emptied as a result. The queues
-// must share the monitor.
-void PtrQueueSet::merge_bufferlists(PtrQueueSet *src) {
-  assert(_cbl_mon == src->_cbl_mon, "Should share the same lock");
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  if (_completed_buffers_tail == NULL) {
-    assert(_completed_buffers_head == NULL, "Well-formedness");
-    _completed_buffers_head = src->_completed_buffers_head;
-    _completed_buffers_tail = src->_completed_buffers_tail;
-  } else {
-    assert(_completed_buffers_head != NULL, "Well formedness");
-    if (src->_completed_buffers_head != NULL) {
-      _completed_buffers_tail->set_next(src->_completed_buffers_head);
-      _completed_buffers_tail = src->_completed_buffers_tail;
-    }
-  }
-  _n_completed_buffers += src->_n_completed_buffers;
-
-  src->_n_completed_buffers = 0;
-  src->_completed_buffers_head = NULL;
-  src->_completed_buffers_tail = NULL;
-  src->_process_completed_buffers = false;
-
-  assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL ||
-         _completed_buffers_head != NULL && _completed_buffers_tail != NULL,
-         "Sanity");
-  assert_completed_buffers_list_len_correct_locked();
-}
-
-void PtrQueueSet::notify_if_necessary() {
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  if (_n_completed_buffers > _process_completed_buffers_threshold) {
-    _process_completed_buffers = true;
-    if (_notify_when_complete)
-      _cbl_mon->notify();
-  }
-}
--- a/src/hotspot/share/gc/shared/ptrQueue.hpp
+++ b/src/hotspot/share/gc/shared/ptrQueue.hpp
@ -296,35 +296,16 @@ public:
 class PtrQueueSet {
  BufferNode::Allocator* _allocator;

-  Monitor* _cbl_mon;  // Protects the fields below.
-  BufferNode* _completed_buffers_head;
-  BufferNode* _completed_buffers_tail;
-  volatile size_t _n_completed_buffers;
-
-  size_t _process_completed_buffers_threshold;
-  volatile bool _process_completed_buffers;
-
-  // If true, notify_all on _cbl_mon when the threshold is reached.
-  bool _notify_when_complete;
-
-  void assert_completed_buffers_list_len_correct_locked() NOT_DEBUG_RETURN;
-
 protected:
  bool _all_active;

  // Create an empty ptr queue set.
-  PtrQueueSet(bool notify_when_complete = false);
+  PtrQueueSet();
  ~PtrQueueSet();

  // Because of init-order concerns, we can't pass these as constructor
  // arguments.
-  void initialize(Monitor* cbl_mon, BufferNode::Allocator* allocator);
-
-  // For (unlocked!) iteration over the completed buffers.
-  BufferNode* completed_buffers_head() const { return _completed_buffers_head; }
-
-  // Deallocate all of the completed buffers.
-  void abandon_completed_buffers();
+  void initialize(BufferNode::Allocator* allocator);

 public:

@ -339,38 +320,13 @@ public:
  // is ready to be processed by the collector.  It need not be full.

  // Adds node to the completed buffer list.
-  void enqueue_completed_buffer(BufferNode* node);
-
-  // If the number of completed buffers is > stop_at, then remove and
-  // return a completed buffer from the list.  Otherwise, return NULL.
-  BufferNode* get_completed_buffer(size_t stop_at = 0);
-
-  bool process_completed_buffers() { return _process_completed_buffers; }
-  void set_process_completed_buffers(bool x) { _process_completed_buffers = x; }
+  virtual void enqueue_completed_buffer(BufferNode* node) = 0;

  bool is_active() { return _all_active; }

  size_t buffer_size() const {
    return _allocator->buffer_size();
  }
-
-  // Get/Set the number of completed buffers that triggers log processing.
-  // Log processing should be done when the number of buffers exceeds the
-  // threshold.
-  void set_process_completed_buffers_threshold(size_t sz) {
-    _process_completed_buffers_threshold = sz;
-  }
-  size_t process_completed_buffers_threshold() const {
-    return _process_completed_buffers_threshold;
-  }
-  static const size_t ProcessCompletedBuffersThresholdNever = ~size_t(0);
-
-  size_t completed_buffers_num() const { return _n_completed_buffers; }
-
-  void merge_bufferlists(PtrQueueSet* src);
-
-  // Notify the consumer if the number of buffers crossed the threshold
-  void notify_if_necessary();
 };

 #endif // SHARE_GC_SHARED_PTRQUEUE_HPP
--- a/src/hotspot/share/gc/shared/satbMarkQueue.cpp
+++ b/src/hotspot/share/gc/shared/satbMarkQueue.cpp
@ -28,12 +28,15 @@
 #include "logging/log.hpp"
 #include "memory/allocation.inline.hpp"
 #include "oops/oop.inline.hpp"
+#include "runtime/atomic.hpp"
 #include "runtime/mutexLocker.hpp"
+#include "runtime/orderAccess.hpp"
 #include "runtime/os.hpp"
 #include "runtime/safepoint.hpp"
 #include "runtime/thread.hpp"
 #include "runtime/threadSMR.hpp"
 #include "runtime/vmThread.hpp"
+#include "utilities/globalCounter.inline.hpp"

 SATBMarkQueue::SATBMarkQueue(SATBMarkQueueSet* qset) :
  // SATB queues are only active during marking cycles. We create
@ -107,15 +110,66 @@ void SATBMarkQueue::print(const char* name) {

 SATBMarkQueueSet::SATBMarkQueueSet() :
  PtrQueueSet(),
+  _list(),
+  _count_and_process_flag(0),
+  _process_completed_buffers_threshold(SIZE_MAX),
  _buffer_enqueue_threshold(0)
 {}

-void SATBMarkQueueSet::initialize(Monitor* cbl_mon,
-                                  BufferNode::Allocator* allocator,
+SATBMarkQueueSet::~SATBMarkQueueSet() {
+  abandon_completed_buffers();
+}
+
+// _count_and_process_flag has flag in least significant bit, count in
+// remaining bits.  _process_completed_buffers_threshold is scaled
+// accordingly, with the lsbit set, so a _count_and_process_flag value
+// is directly comparable with the recorded threshold value.  The
+// process flag is set whenever the count exceeds the threshold, and
+// remains set until the count is reduced to zero.
+
+// Increment count.  If count > threshold, set flag, else maintain flag.
+static void increment_count(volatile size_t* cfptr, size_t threshold) {
+  size_t old;
+  size_t value = Atomic::load(cfptr);
+  do {
+    old = value;
+    value += 2;
+    assert(value > old, "overflow");
+    if (value > threshold) value |= 1;
+    value = Atomic::cmpxchg(value, cfptr, old);
+  } while (value != old);
+}
+
+// Decrement count.  If count == 0, clear flag, else maintain flag.
+static void decrement_count(volatile size_t* cfptr) {
+  size_t old;
+  size_t value = Atomic::load(cfptr);
+  do {
+    assert((value >> 1) != 0, "underflow");
+    old = value;
+    value -= 2;
+    if (value <= 1) value = 0;
+    value = Atomic::cmpxchg(value, cfptr, old);
+  } while (value != old);
+}
+
+// Scale requested threshold to align with count field.  If scaling
+// overflows, just use max value.  Set process flag field to make
+// comparison in increment_count exact.
+static size_t scale_threshold(size_t value) {
+  size_t scaled_value = value << 1;
+  if ((scaled_value >> 1) != value) {
+    scaled_value = SIZE_MAX;
+  }
+  return scaled_value | 1;
+}
+
+void SATBMarkQueueSet::initialize(BufferNode::Allocator* allocator,
                                  size_t process_completed_buffers_threshold,
                                  uint buffer_enqueue_threshold_percentage) {
-  PtrQueueSet::initialize(cbl_mon, allocator);
-  set_process_completed_buffers_threshold(process_completed_buffers_threshold);
+  PtrQueueSet::initialize(allocator);
+  _process_completed_buffers_threshold =
+    scale_threshold(process_completed_buffers_threshold);
  assert(buffer_size() != 0, "buffer size not initialized");
  // Minimum threshold of 1 ensures enqueuing of completely full buffers.
  size_t size = buffer_size();
@ -207,6 +261,38 @@ bool SATBMarkQueueSet::apply_closure_to_completed_buffer(SATBBufferClosure* cl)
  }
 }

+// SATB buffer life-cycle - Per-thread queues obtain buffers from the
+// qset's buffer allocator, fill them, and push them onto the qset's
+// list.  The GC concurrently pops buffers from the qset, processes
+// them, and returns them to the buffer allocator for re-use.  Both
+// the allocator and the qset use lock-free stacks.  The ABA problem
+// is solved by having both allocation pops and GC pops performed
+// within GlobalCounter critical sections, while the return of buffers
+// to the allocator performs a GlobalCounter synchronize before
+// pushing onto the allocator's list.
+
+void SATBMarkQueueSet::enqueue_completed_buffer(BufferNode* node) {
+  assert(node != NULL, "precondition");
+  // Increment count and update flag appropriately.  Done before
+  // pushing buffer so count is always at least the actual number in
+  // the list, and decrement never underflows.
+  increment_count(&_count_and_process_flag, _process_completed_buffers_threshold);
+  _list.push(*node);
+}
+
+BufferNode* SATBMarkQueueSet::get_completed_buffer() {
+  BufferNode* node;
+  {
+    GlobalCounter::CriticalSection cs(Thread::current());
+    node = _list.pop();
+  }
+  if (node != NULL) {
+    // Got a buffer so decrement count and update flag appropriately.
+    decrement_count(&_count_and_process_flag);
+  }
+  return node;
+}
+
 #ifndef PRODUCT
 // Helpful for debugging

@ -219,7 +305,7 @@ void SATBMarkQueueSet::print_all(const char* msg) {
  tty->cr();
  tty->print_cr("SATB BUFFERS [%s]", msg);

-  BufferNode* nd = completed_buffers_head();
+  BufferNode* nd = _list.top();
  int i = 0;
  while (nd != NULL) {
    void** buf = BufferNode::make_buffer_from_node(nd);
@ -248,6 +334,17 @@ void SATBMarkQueueSet::print_all(const char* msg) {
 }
 #endif // PRODUCT

+void SATBMarkQueueSet::abandon_completed_buffers() {
+  Atomic::store(size_t(0), &_count_and_process_flag);
+  BufferNode* buffers_to_delete = _list.pop_all();
+  while (buffers_to_delete != NULL) {
+    BufferNode* bn = buffers_to_delete;
+    buffers_to_delete = bn->next();
+    bn->set_next(NULL);
+    deallocate_buffer(bn);
+  }
+}
+
 void SATBMarkQueueSet::abandon_partial_marking() {
  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
  abandon_completed_buffers();
--- a/src/hotspot/share/gc/shared/satbMarkQueue.hpp
+++ b/src/hotspot/share/gc/shared/satbMarkQueue.hpp
@ -27,6 +27,7 @@

 #include "gc/shared/ptrQueue.hpp"
 #include "memory/allocation.hpp"
+#include "memory/padded.hpp"

 class Thread;
 class Monitor;
@ -93,7 +94,17 @@ public:
 };

 class SATBMarkQueueSet: public PtrQueueSet {
+
+  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, 0);
+  PaddedEnd<BufferNode::Stack> _list;
+  volatile size_t _count_and_process_flag;
+  // These are rarely (if ever) changed, so same cache line as count.
+  size_t _process_completed_buffers_threshold;
  size_t _buffer_enqueue_threshold;
+  DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, 3 * sizeof(size_t));
+
+  BufferNode* get_completed_buffer();
+  void abandon_completed_buffers();

 #ifdef ASSERT
  void dump_active_states(bool expected_active);
@ -102,15 +113,14 @@ class SATBMarkQueueSet: public PtrQueueSet {

 protected:
  SATBMarkQueueSet();
-  ~SATBMarkQueueSet() {}
+  ~SATBMarkQueueSet();

  template<typename Filter>
  void apply_filter(Filter filter, SATBMarkQueue* queue) {
    queue->apply_filter(filter);
  }

-  void initialize(Monitor* cbl_mon,
-                  BufferNode::Allocator* allocator,
+  void initialize(BufferNode::Allocator* allocator,
                  size_t process_completed_buffers_threshold,
                  uint buffer_enqueue_threshold_percentage);

@ -132,6 +142,19 @@ public:
  // buffer; the leading entries may be excluded due to filtering.
  bool apply_closure_to_completed_buffer(SATBBufferClosure* cl);

+  virtual void enqueue_completed_buffer(BufferNode* node);
+
+  // The number of buffers in the list.  Racy and not updated atomically
+  // with the set of completed buffers.
+  size_t completed_buffers_num() const {
+    return _count_and_process_flag >> 1;
+  }
+
+  // Return true if completed buffers should be processed.
+  bool process_completed_buffers() const {
+    return (_count_and_process_flag & 1) != 0;
+  }
+
 #ifndef PRODUCT
  // Helpful for debugging
  void print_all(const char* msg);
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
@ -348,7 +348,6 @@ jint ShenandoahHeap::initialize() {
  // The call below uses stuff (the SATB* things) that are in G1, but probably
  // belong into a shared location.
  ShenandoahBarrierSet::satb_mark_queue_set().initialize(this,
-                                                         SATB_Q_CBL_mon,
                                                         20 /* G1SATBProcessCompletedThreshold */,
                                                         60 /* G1SATBBufferEnqueueingThresholdPercent */);

--- a/src/hotspot/share/gc/shenandoah/shenandoahSATBMarkQueueSet.cpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahSATBMarkQueueSet.cpp
@ -33,11 +33,9 @@ ShenandoahSATBMarkQueueSet::ShenandoahSATBMarkQueueSet() :
 {}

 void ShenandoahSATBMarkQueueSet::initialize(ShenandoahHeap* const heap,
-                                            Monitor* cbl_mon,
                                            int process_completed_threshold,
                                            uint buffer_enqueue_threshold_percentage) {
-  SATBMarkQueueSet::initialize(cbl_mon,
-                               &_satb_mark_queue_buffer_allocator,
+  SATBMarkQueueSet::initialize(&_satb_mark_queue_buffer_allocator,
                               process_completed_threshold,
                               buffer_enqueue_threshold_percentage);
  _heap = heap;
--- a/src/hotspot/share/gc/shenandoah/shenandoahSATBMarkQueueSet.hpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahSATBMarkQueueSet.hpp
@ -44,7 +44,6 @@ public:
  ShenandoahSATBMarkQueueSet();

  void initialize(ShenandoahHeap* const heap,
-                  Monitor* cbl_mon,
                  int process_completed_threshold,
                  uint buffer_enqueue_threshold_percentage);

--- a/src/hotspot/share/runtime/mutexLocker.cpp
+++ b/src/hotspot/share/runtime/mutexLocker.cpp
@ -82,7 +82,6 @@ Mutex*   NonJavaThreadsListSync_lock  = NULL;
 Monitor* CGC_lock                     = NULL;
 Monitor* STS_lock                     = NULL;
 Monitor* FullGCCount_lock             = NULL;
-Monitor* SATB_Q_CBL_mon               = NULL;
 Monitor* DirtyCardQ_CBL_mon           = NULL;
 Mutex*   Shared_DirtyCardQ_lock       = NULL;
 Mutex*   MarkStackFreeList_lock       = NULL;
@ -228,8 +227,6 @@ void mutex_init() {

  def(FullGCCount_lock             , PaddedMonitor, leaf,        true,  Monitor::_safepoint_check_never);      // in support of ExplicitGCInvokesConcurrent
  if (UseG1GC) {
-    def(SATB_Q_CBL_mon             , PaddedMonitor, access,      true,  Monitor::_safepoint_check_never);
-
    def(DirtyCardQ_CBL_mon         , PaddedMonitor, access,      true,  Monitor::_safepoint_check_never);
    def(Shared_DirtyCardQ_lock     , PaddedMutex  , access + 1,  true,  Monitor::_safepoint_check_never);

@ -246,8 +243,6 @@ void mutex_init() {
    def(MonitoringSupport_lock     , PaddedMutex  , native   ,   true,  Monitor::_safepoint_check_never);      // used for serviceability monitoring support
  }
  if (UseShenandoahGC) {
-    def(SATB_Q_CBL_mon             , PaddedMonitor, access,      true,  Monitor::_safepoint_check_never);
-
    def(StringDedupQueue_lock      , PaddedMonitor, leaf,        true,  Monitor::_safepoint_check_never);
    def(StringDedupTable_lock      , PaddedMutex  , leaf,        true,  Monitor::_safepoint_check_never);
  }
--- a/src/hotspot/share/runtime/mutexLocker.hpp
+++ b/src/hotspot/share/runtime/mutexLocker.hpp
@ -77,8 +77,6 @@ extern Monitor* CGC_lock;                        // used for coordination betwee
                                                 // fore- & background GC threads.
 extern Monitor* STS_lock;                        // used for joining/leaving SuspendibleThreadSet.
 extern Monitor* FullGCCount_lock;                // in support of "concurrent" full gc
-extern Monitor* SATB_Q_CBL_mon;                  // Protects SATB Q
-                                                 // completed buffer queue.
 extern Monitor* DirtyCardQ_CBL_mon;              // Protects dirty card Q
                                                 // completed buffer queue.
 extern Mutex*   Shared_DirtyCardQ_lock;          // Lock protecting dirty card