From 0c693e2f03b1adef0e946ebc32827ac09192f5f0 Mon Sep 17 00:00:00 2001
From: Kim Barrett <kbarrett@openjdk.org>
Date: Tue, 22 Jun 2021 17:43:59 +0000
Subject: [PATCH] 8268290: Improve LockFreeQueue<> utility

Reviewed-by: iwalulya, tschatzl
---
 src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp  |  27 +--
 src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp  |   8 +-
 src/hotspot/share/utilities/lockFreeQueue.hpp | 120 -----------
 .../share/utilities/lockFreeQueue.inline.hpp  | 165 ---------------
 .../share/utilities/nonblockingQueue.hpp      | 136 ++++++++++++
 .../utilities/nonblockingQueue.inline.hpp     | 197 ++++++++++++++++++
 ...reeQueue.cpp => test_nonblockingQueue.cpp} |  95 ++++-----
 7 files changed, 383 insertions(+), 365 deletions(-)
 delete mode 100644 src/hotspot/share/utilities/lockFreeQueue.hpp
 delete mode 100644 src/hotspot/share/utilities/lockFreeQueue.inline.hpp
 create mode 100644 src/hotspot/share/utilities/nonblockingQueue.hpp
 create mode 100644 src/hotspot/share/utilities/nonblockingQueue.inline.hpp
 rename test/hotspot/gtest/utilities/{test_lockFreeQueue.cpp => test_nonblockingQueue.cpp} (76%)

diff --git a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
index 6769296346b..0a4a6c40d23 100644
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
@@ -45,8 +45,8 @@
 #include "runtime/thread.inline.hpp"
 #include "runtime/threadSMR.hpp"
 #include "utilities/globalCounter.inline.hpp"
-#include "utilities/lockFreeQueue.inline.hpp"
 #include "utilities/macros.hpp"
+#include "utilities/nonblockingQueue.inline.hpp"
 #include "utilities/pair.hpp"
 #include "utilities/quickSort.hpp"
 #include "utilities/ticks.hpp"
@@ -131,12 +131,12 @@ void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
 }
 
 // Thread-safe attempt to remove and return the first buffer from
-// the _completed queue, using the LockFreeQueue::try_pop() underneath.
-// It has a restriction that it may return NULL when there are objects
+// the _completed queue, using the NonblockingQueue::try_pop() underneath.
+// It has a limitation that it may return NULL when there are objects
 // in the queue if there is a concurrent push/append operation.
 BufferNode* G1DirtyCardQueueSet::dequeue_completed_buffer() {
-  using Status = LockFreeQueuePopStatus;
   Thread* current_thread = Thread::current();
+  BufferNode* result = NULL;
   while (true) {
     // Use GlobalCounter critical section to avoid ABA problem.
     // The release of a buffer to its allocator's free list uses
@@ -147,19 +147,7 @@ BufferNode* G1DirtyCardQueueSet::dequeue_completed_buffer() {
     // one CS could defer releasing buffer to the free list for reuse,
     // leading to excessive allocations.
     GlobalCounter::CriticalSection cs(current_thread);
-    Pair<Status, BufferNode*> pop_result = _completed.try_pop();
-    switch (pop_result.first) {
-      case Status::success:
-        return pop_result.second;
-      case Status::operation_in_progress:
-        // Returning NULL instead retrying, in order to mitigate the
-        // chance of spinning for a long time. In the case of getting a
-        // buffer to refine, it is also OK to return NULL when there is
-        // an interfering concurrent push/append operation.
-        return NULL;
-      case Status::lost_race:
-        break;  // Try again.
-    }
+    if (_completed.try_pop(&result)) return result;
   }
 }
 
@@ -177,8 +165,9 @@ BufferNode* G1DirtyCardQueueSet::get_completed_buffer() {
 #ifdef ASSERT
 void G1DirtyCardQueueSet::verify_num_cards() const {
   size_t actual = 0;
-  BufferNode* cur = _completed.top();
-  for ( ; cur != NULL; cur = cur->next()) {
+  for (BufferNode* cur = _completed.first();
+       !_completed.is_end(cur);
+       cur = cur->next()) {
     actual += buffer_size() - cur->index();
   }
   assert(actual == Atomic::load(&_num_cards),
diff --git a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
index e07f9740a01..9e9ad0ee80c 100644
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
@@ -32,7 +32,7 @@
 #include "gc/shared/ptrQueue.hpp"
 #include "memory/allocation.hpp"
 #include "memory/padded.hpp"
-#include "utilities/lockFreeQueue.hpp"
+#include "utilities/nonblockingQueue.hpp"
 
 class G1ConcurrentRefineThread;
 class G1DirtyCardQueueSet;
@@ -164,9 +164,9 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
   volatile size_t _num_cards;
   DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(size_t));
   // Buffers ready for refinement.
-  // LockFreeQueue has inner padding of one cache line.
-  LockFreeQueue<BufferNode, &BufferNode::next_ptr> _completed;
-  // Add a trailer padding after LockFreeQueue.
+  // NonblockingQueue has inner padding of one cache line.
+  NonblockingQueue<BufferNode, &BufferNode::next_ptr> _completed;
+  // Add a trailer padding after NonblockingQueue.
   DEFINE_PAD_MINUS_SIZE(3, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode*));
   // Buffers for which refinement is temporarily paused.
   // PausedBuffers has inner padding, including trailer.
diff --git a/src/hotspot/share/utilities/lockFreeQueue.hpp b/src/hotspot/share/utilities/lockFreeQueue.hpp
deleted file mode 100644
index 9322787f2c1..00000000000
--- a/src/hotspot/share/utilities/lockFreeQueue.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_UTILITIES_LOCKFREEQUEUE_HPP
-#define SHARE_UTILITIES_LOCKFREEQUEUE_HPP
-
-#include "memory/padded.hpp"
-#include "utilities/globalDefinitions.hpp"
-#include "utilities/pair.hpp"
-
-// Return status of a LockFreeQueue::try_pop() call.
-// See description for try_pop() below.
-enum class LockFreeQueuePopStatus {
-  success,
-  lost_race,
-  operation_in_progress
-};
-
-// The LockFreeQueue template provides a lock-free FIFO. Its structure
-// and usage is similar to LockFreeStack. It provides a try_pop() function
-// for the client to implement pop() according to its need (e.g., whether
-// or not to retry or prevent ABA problem). It has inner padding of one
-// cache line between its two internal pointer fields.
-//
-// \tparam T is the class of the elements in the queue.
-//
-// \tparam next_ptr is a function pointer.  Applying this function to
-// an object of type T must return a pointer to the list entry member
-// of the object associated with the LockFreeQueue type.
-template<typename T, T* volatile* (*next_ptr)(T&)>
-class LockFreeQueue {
-  T* volatile _head;
-  // Padding of one cache line to avoid false sharing.
-  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(T*));
-  T* volatile _tail;
-
-  NONCOPYABLE(LockFreeQueue);
-
-  // Return the entry following node in the list used by the
-  // specialized LockFreeQueue class.
-  static inline T* next(const T& node);
-
-  // Set the entry following node to new_next in the list used by the
-  // specialized LockFreeQueue class. Not thread-safe, as it cannot
-  // concurrently run with push or try_pop operations that modify this
-  // node.
-  static inline void set_next(T& node, T* new_next);
-
-public:
-  inline LockFreeQueue();
-  DEBUG_ONLY(~LockFreeQueue();)
-
-  // Return the first object in the queue.
-  // Thread-safe, but the result may change immediately.
-  inline T* top() const;
-
-  // Return true if the queue is empty.
-  inline bool empty() const { return top() == NULL; }
-
-  // Return the number of objects in the queue.
-  // Not thread-safe. There must be no concurrent modification
-  // while the length is being determined.
-  inline size_t length() const;
-
-  // Thread-safe add the object to the end of the queue.
-  inline void push(T& node) { append(node, node); }
-
-  // Thread-safe add the objects from first to last to the end of the queue.
-  inline void append(T& first, T& last);
-
-  // Thread-safe attempt to remove and return the first object in the queue.
-  // Returns a <LockFreeQueuePopStatus, T*> pair for the caller to determine
-  // further operation. 3 possible cases depending on pair.first:
-  // - success:
-  //   The operation succeeded. If pair.second is NULL, the queue is empty;
-  //   otherwise caller can assume ownership of the object pointed by
-  //   pair.second. Note that this case is still subject to ABA behavior;
-  //   callers must ensure usage is safe.
-  // - lost_race:
-  //   An atomic operation failed. pair.second is NULL.
-  //   The caller can typically retry in this case.
-  // - operation_in_progress:
-  //   An in-progress concurrent operation interfered with taking what had been
-  //   the only remaining element in the queue. pair.second is NULL.
-  //   A concurrent try_pop may have already claimed it, but not completely
-  //   updated the queue. Alternatively, a concurrent push/append may have not
-  //   yet linked the new entry(s) to the former sole entry. Retrying the try_pop
-  //   will continue to fail in this way until that other thread has updated the
-  //   queue's internal structure.
-  inline Pair<LockFreeQueuePopStatus, T*> try_pop();
-
-  // Take all the objects from the queue, leaving the queue empty.
-  // Not thread-safe. It should only be used when there is no concurrent
-  // push/append/try_pop operation.
-  // Returns a pair of <head, tail> pointers to the current queue.
-  inline Pair<T*, T*> take_all();
-};
-
-#endif // SHARE_UTILITIES_LOCKFREEQUEUE_HPP
diff --git a/src/hotspot/share/utilities/lockFreeQueue.inline.hpp b/src/hotspot/share/utilities/lockFreeQueue.inline.hpp
deleted file mode 100644
index 1cbf1518e6c..00000000000
--- a/src/hotspot/share/utilities/lockFreeQueue.inline.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_UTILITIES_LOCKFREEQUEUE_INLINE_HPP
-#define SHARE_UTILITIES_LOCKFREEQUEUE_INLINE_HPP
-
-#include "utilities/lockFreeQueue.hpp"
-
-#include "runtime/atomic.hpp"
-
-template<typename T, T* volatile* (*next_ptr)(T&)>
-T* LockFreeQueue<T, next_ptr>::next(const T& node) {
-  return Atomic::load(next_ptr(const_cast<T&>(node)));
-}
-
-template<typename T, T* volatile* (*next_ptr)(T&)>
-void LockFreeQueue<T, next_ptr>::set_next(T& node, T* new_next) {
-    Atomic::store(next_ptr(node), new_next);
-}
-
-template<typename T, T* volatile* (*next_ptr)(T&)>
-LockFreeQueue<T, next_ptr>::LockFreeQueue() : _head(NULL), _tail(NULL) {}
-
-#ifdef ASSERT
-template<typename T, T* volatile* (*next_ptr)(T&)>
-LockFreeQueue<T, next_ptr>::~LockFreeQueue() {
-  assert(_head == NULL, "precondition");
-  assert(_tail == NULL, "precondition");
-}
-#endif
-
-template<typename T, T* volatile* (*next_ptr)(T&)>
-T* LockFreeQueue<T, next_ptr>::top() const {
-  return Atomic::load(&_head);
-}
-
-template<typename T, T* volatile* (*next_ptr)(T&)>
-size_t LockFreeQueue<T, next_ptr>::length() const {
-  size_t result = 0;
-  for (const T* current = top(); current != NULL; current = next(*current)) {
-    ++result;
-  }
-  return result;
-}
-
-// An append operation atomically exchanges the new tail with the queue tail.
-// It then sets the "next" value of the old tail to the head of the list being
-// appended; it is an invariant that the old tail's "next" value is NULL.
-// But if the old tail is NULL then the queue was empty.  In this case the
-// head of the list being appended is instead stored in the queue head; it is
-// an invariant that the queue head is NULL in this case.
-//
-// This means there is a period between the exchange and the old tail update
-// where the queue sequence is split into two parts, the list from the queue
-// head to the old tail, and the list being appended.  If there are concurrent
-// push/append operations, each may introduce another such segment.  But they
-// all eventually get resolved by their respective updates of their old tail's
-// "next" value.  This also means that try_pop operation must handle an object
-// with a NULL "next" value specially.
-//
-// A push operation is just a degenerate append, where the object being pushed
-// is both the head and the tail of the list being appended.
-template<typename T, T* volatile* (*next_ptr)(T&)>
-void LockFreeQueue<T, next_ptr>::append(T& first, T& last) {
-  assert(next(last) == NULL, "precondition");
-  T* old_tail = Atomic::xchg(&_tail, &last);
-  if (old_tail == NULL) {       // Was empty.
-    Atomic::store(&_head, &first);
-  } else {
-    assert(next(*old_tail) == NULL, "invariant");
-    set_next(*old_tail, &first);
-  }
-}
-
-template<typename T, T* volatile* (*next_ptr)(T&)>
-Pair<LockFreeQueuePopStatus, T*> LockFreeQueue<T, next_ptr>::try_pop() {
-  typedef Pair<LockFreeQueuePopStatus, T*> StatusPair;
-  // We only need memory_order_consume. Upgrade it to "load_acquire"
-  // as the memory_order_consume API is not ready for use yet.
-  T* result = Atomic::load_acquire(&_head);
-  if (result == NULL) {
-    // Queue is empty.
-    return StatusPair(LockFreeQueuePopStatus::success, NULL);
-  }
-
-  // This relaxed load is always followed by a cmpxchg(), thus it
-  // is OK as the reader-side of the release-acquire ordering.
-  T* next_node = Atomic::load(next_ptr(*result));
-  if (next_node != NULL) {
-    // The "usual" lock-free pop from the head of a singly linked list.
-    if (result == Atomic::cmpxchg(&_head, result, next_node)) {
-      // Former head successfully taken; it is not the last.
-      assert(Atomic::load(&_tail) != result, "invariant");
-      assert(next(*result) != NULL, "invariant");
-      set_next(*result, NULL);
-      return StatusPair(LockFreeQueuePopStatus::success, result);
-    }
-    // Lost the race; the caller should try again.
-    return StatusPair(LockFreeQueuePopStatus::lost_race, NULL);
-  }
-
-  // next is NULL.  This case is handled differently from the "usual"
-  // lock-free pop from the head of a singly linked list.
-
-  // If _tail == result then result is the only element in the list. We can
-  // remove it from the list by first setting _tail to NULL and then setting
-  // _head to NULL, the order being important.  We set _tail with cmpxchg in
-  // case of a concurrent push/append/try_pop also changing _tail.  If we win
-  // then we've claimed result.
-  if (Atomic::cmpxchg(&_tail, result, (T*)NULL) == result) {
-    assert(next(*result) == NULL, "invariant");
-    // Now that we've claimed result, also set _head to NULL.  But we must
-    // be careful of a concurrent push/append after we NULLed _tail, since
-    // it may have already performed its list-was-empty update of _head,
-    // which we must not overwrite.
-    Atomic::cmpxchg(&_head, result, (T*)NULL);
-    return StatusPair(LockFreeQueuePopStatus::success, result);
-  }
-
-  // If _head != result then we lost the race to take result;
-  // the caller should try again.
-  if (result != Atomic::load_acquire(&_head)) {
-    return StatusPair(LockFreeQueuePopStatus::lost_race, NULL);
-  }
-
-  // An in-progress concurrent operation interfered with taking the head
-  // element when it was the only element.  A concurrent try_pop may have won
-  // the race to clear the tail but not yet cleared the head. Alternatively,
-  // a concurrent push/append may have changed the tail but not yet linked
-  // result->next(). This case slightly differs from the "lost_race" case,
-  // because the caller could wait for a long time for the other concurrent
-  // operation to finish.
-  return StatusPair(LockFreeQueuePopStatus::operation_in_progress, NULL);
-}
-
-template<typename T, T* volatile* (*next_ptr)(T&)>
-Pair<T*, T*> LockFreeQueue<T, next_ptr>::take_all() {
-  Pair<T*, T*> result(Atomic::load(&_head), Atomic::load(&_tail));
-  Atomic::store(&_head, (T*)NULL);
-  Atomic::store(&_tail, (T*)NULL);
-  return result;
-}
-
-#endif // SHARE_UTILITIES_LOCKFREEQUEUE_INLINE_HPP
diff --git a/src/hotspot/share/utilities/nonblockingQueue.hpp b/src/hotspot/share/utilities/nonblockingQueue.hpp
new file mode 100644
index 00000000000..499dca8721c
--- /dev/null
+++ b/src/hotspot/share/utilities/nonblockingQueue.hpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_UTILITIES_NONBLOCKINGQUEUE_HPP
+#define SHARE_UTILITIES_NONBLOCKINGQUEUE_HPP
+
+#include "memory/padded.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/pair.hpp"
+
+// The NonblockingQueue template provides a non-blocking FIFO. It provides a
+// try_pop() function for the client to implement pop() according to its
+// need (e.g., whether or not to retry or prevent ABA problem). It has inner
+// padding of one cache line between its two internal pointer fields.
+//
+// The queue is internally represented by a linked list of elements, with
+// the link to the next element provided by a member of each element.
+// Access to this member is provided by the next_ptr function.
+//
+// The queue has a special pseudo-element that marks the end of the list.
+// Each queue has its own unique special element.  A pointer to this element
+// can be recognized using the is_end() function.  Such a pointer must never
+// be dereferenced.  This end marker is the value of the next member of the
+// last element in the queue, and possibly other elements while modifying
+// the queue.
+//
+// A queue may temporarily appear to be empty even though elements have been
+// added and not removed.  For example, after running the following program,
+// the value of r may be NULL.
+//
+// thread1: q.push(a); r = q.pop();
+// thread2: q.push(b);
+//
+// This can occur if the push of b started before the push of a, but didn't
+// complete until after the pop.
+//
+// \tparam T is the class of the elements in the queue.
+//
+// \tparam next_ptr is a function pointer.  Applying this function to
+// an object of type T must return a pointer to the list entry member
+// of the object associated with the NonblockingQueue type.
+template<typename T, T* volatile* (*next_ptr)(T&)>
+class NonblockingQueue {
+  T* volatile _head;
+  // Padding of one cache line to avoid false sharing.
+  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(T*));
+  T* volatile _tail;
+
+  NONCOPYABLE(NonblockingQueue);
+
+  // Return the entry following node in the list used by the
+  // specialized NonblockingQueue class.
+  static inline T* next(const T& node);
+
+  // Set the entry following node to new_next in the list used by the
+  // specialized NonblockingQueue class. Not thread-safe, as it cannot
+  // concurrently run with push or try_pop operations that modify this
+  // node.
+  static inline void set_next(T& node, T* new_next);
+
+  // A unique pseudo-object pointer associated with this specific queue.
+  // The resulting pointer must not be dereferenced.
+  inline T* end_marker() const;
+
+public:
+  inline NonblockingQueue();
+  inline ~NonblockingQueue() NOT_DEBUG(= default);
+
+  // Return true if the queue is empty.
+  // Not thread-safe.  There must be no concurrent modification while the
+  // queue is being tested.
+  inline bool empty() const;
+
+  // Return the number of objects in the queue.
+  // Not thread-safe. There must be no concurrent modification while the
+  // length is being determined.
+  inline size_t length() const;
+
+  // Thread-safe add the object to the end of the queue.
+  inline void push(T& node) { append(node, node); }
+
+  // Thread-safe add the objects from first to last to the end of the queue.
+  inline void append(T& first, T& last);
+
+  // Thread-safe attempt to remove and return the first object in the queue.
+  // Returns true if successful.  If successful then *node_ptr is the former
+  // first object, or NULL if the queue was empty.  If unsuccessful, because
+  // of contention with a concurrent modification, then returns false with
+  // the value of *node_ptr unspecified.  Subject to ABA behavior; callers
+  // must ensure usage is safe.
+  inline bool try_pop(T** node_ptr);
+
+  // Thread-safe remove and return the first object in the queue, or NULL if
+  // the queue was empty.  This just iterates on try_pop() until it
+  // succeeds, returning the (possibly NULL) element obtained from that.
+  // Subject to ABA behavior; callers must ensure usage is safe.
+  inline T* pop();
+
+  // Take all the objects from the queue, leaving the queue empty.
+  // Not thread-safe.  There must be no concurrent operations.
+  // Returns a pair of <head, tail> pointers to the current queue.
+  inline Pair<T*, T*> take_all();
+
+  // Iteration support is provided by first() and is_end().  The queue must
+  // not be modified while iterating over its elements.
+
+  // Return the first object in the queue, or an end marker (a pointer p for
+  // which is_end(p) is true) if the queue is empty.
+  inline T* first() const;
+
+  // Test whether entry is an end marker for this queue.
+  inline bool is_end(const T* entry) const;
+};
+
+#endif // SHARE_UTILITIES_NONBLOCKINGQUEUE_HPP
diff --git a/src/hotspot/share/utilities/nonblockingQueue.inline.hpp b/src/hotspot/share/utilities/nonblockingQueue.inline.hpp
new file mode 100644
index 00000000000..d16f5a9692a
--- /dev/null
+++ b/src/hotspot/share/utilities/nonblockingQueue.inline.hpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_UTILITIES_NONBLOCKINGQUEUE_INLINE_HPP
+#define SHARE_UTILITIES_NONBLOCKINGQUEUE_INLINE_HPP
+
+#include "utilities/nonblockingQueue.hpp"
+
+#include "runtime/atomic.hpp"
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+T* NonblockingQueue<T, next_ptr>::next(const T& node) {
+  return Atomic::load(next_ptr(const_cast<T&>(node)));
+}
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+void NonblockingQueue<T, next_ptr>::set_next(T& node, T* new_next) {
+  Atomic::store(next_ptr(node), new_next);
+}
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+NonblockingQueue<T, next_ptr>::NonblockingQueue() : _head(NULL), _tail(NULL) {}
+
+#ifdef ASSERT
+template<typename T, T* volatile* (*next_ptr)(T&)>
+NonblockingQueue<T, next_ptr>::~NonblockingQueue() {
+  assert(_head == NULL, "precondition");
+  assert(_tail == NULL, "precondition");
+}
+#endif
+
+// The end_marker must be uniquely associated with the specific queue, in
+// case queue elements can make their way through multiple queues.  A
+// pointer to the queue itself (after casting) satisfies that requirement.
+template<typename T, T* volatile* (*next_ptr)(T&)>
+T* NonblockingQueue<T, next_ptr>::end_marker() const {
+  return const_cast<T*>(reinterpret_cast<const T*>(this));
+}
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+T* NonblockingQueue<T, next_ptr>::first() const {
+  T* head = Atomic::load(&_head);
+  return head == NULL ? end_marker() : head;
+}
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+bool NonblockingQueue<T, next_ptr>::is_end(const T* entry) const {
+  return entry == end_marker();
+}
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+bool NonblockingQueue<T, next_ptr>::empty() const {
+  return Atomic::load(&_head) == NULL;
+}
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+size_t NonblockingQueue<T, next_ptr>::length() const {
+  size_t result = 0;
+  for (T* cur = first(); !is_end(cur); cur = next(*cur)) {
+    ++result;
+  }
+  return result;
+}
+
+// An append operation atomically exchanges the new tail with the queue tail.
+// It then sets the "next" value of the old tail to the head of the list being
+// appended; it is an invariant that the old tail's "next" value is NULL.
+// But if the old tail is NULL then the queue was empty.  In this case the
+// head of the list being appended is instead stored in the queue head; it is
+// an invariant that the queue head is NULL in this case.
+//
+// This means there is a period between the exchange and the old tail update
+// where the queue sequence is split into two parts, the list from the queue
+// head to the old tail, and the list being appended.  If there are concurrent
+// push/append operations, each may introduce another such segment.  But they
+// all eventually get resolved by their respective updates of their old tail's
+// "next" value.  This also means that try_pop operation must handle an object
+// with a NULL "next" value specially.
+//
+// A push operation is just a degenerate append, where the object being pushed
+// is both the head and the tail of the list being appended.
+template<typename T, T* volatile* (*next_ptr)(T&)>
+void NonblockingQueue<T, next_ptr>::append(T& first, T& last) {
+  assert(next(last) == NULL, "precondition");
+  set_next(last, end_marker());
+  T* old_tail = Atomic::xchg(&_tail, &last);
+  if ((old_tail == NULL) ||
+      // Try to install first as old_tail's next.
+      !is_end(Atomic::cmpxchg(next_ptr(*old_tail), end_marker(), &first))) {
+    // Install first as the new head if either
+    // (1) the list was empty, or
+    // (2) a concurrent try_pop claimed old_tail, so it is no longer in the list.
+    Atomic::store(&_head, &first);
+  }
+}
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+bool NonblockingQueue<T, next_ptr>::try_pop(T** node_ptr) {
+  // We only need memory_order_consume. Upgrade it to "load_acquire"
+  // as the memory_order_consume API is not ready for use yet.
+  T* result = Atomic::load_acquire(&_head);
+  if (result == NULL) {
+    *node_ptr = NULL;
+    return true;                // Queue is empty.
+  }
+
+  T* next_node = Atomic::load_acquire(next_ptr(*result));
+  if (next_node == NULL) {
+    // A concurrent try_pop already claimed what was the last entry.  That
+    // operation may not have cleared queue head yet, but we should still
+    // treat the queue as empty until a push/append operation changes head
+    // to an entry with a non-NULL next value.
+    *node_ptr = NULL;
+    return true;
+
+  } else if (!is_end(next_node)) {
+    // The next_node is not at the end of the queue's list.  Use the "usual"
+    // lock-free pop from the head of a singly linked list to try to take it.
+    if (result == Atomic::cmpxchg(&_head, result, next_node)) {
+      // Former head successfully taken.
+      set_next(*result, NULL);
+      *node_ptr = result;
+      return true;
+    } else {
+      // Lost race to take result from the head of the list.
+      return false;
+    }
+
+  } else if (is_end(Atomic::cmpxchg(next_ptr(*result), end_marker(), (T*)NULL))) {
+    // Result was the last entry and we've claimed it by setting its next
+    // value to NULL.  However, this leaves the queue in disarray.  Fix up
+    // the queue, possibly in conjunction with other concurrent operations.
+    // Any further try_pops will consider the queue empty until a
+    // push/append completes by installing a new head.
+
+    // Attempt to change the queue tail from result to NULL.  Failure of the
+    // cmpxchg indicates that a concurrent push/append updated the tail first.
+    // That operation will eventually recognize the old tail (our result) is
+    // no longer in the list and update head from the list being appended.
+    Atomic::cmpxchg(&_tail, result, (T*)NULL);
+
+    // Attempt to change the queue head from result to NULL.  Failure of the
+    // cmpxchg indicates a concurrent push/append updated the head first.
+    Atomic::cmpxchg(&_head, result, (T*)NULL);
+
+    // The queue has been restored to order, and we can return the result.
+    *node_ptr = result;
+    return true;
+
+  } else {
+    // Result was the last entry in the list, but either a concurrent pop
+    // claimed it first or a concurrent push/append extended the list from
+    // it.  Either way, we lost the race.
+    return false;
+  }
+}
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+T* NonblockingQueue<T, next_ptr>::pop() {
+  T* result = NULL;
+  while (!try_pop(&result)) {}
+  return result;
+}
+
+
+template<typename T, T* volatile* (*next_ptr)(T&)>
+Pair<T*, T*> NonblockingQueue<T, next_ptr>::take_all() {
+  T* tail = Atomic::load(&_tail);
+  if (tail != NULL) set_next(*tail, NULL); // Clear end marker.
+  Pair<T*, T*> result(Atomic::load(&_head), tail);
+  Atomic::store(&_head, (T*)NULL);
+  Atomic::store(&_tail, (T*)NULL);
+  return result;
+}
+
+#endif // SHARE_UTILITIES_NONBLOCKINGQUEUE_INLINE_HPP
diff --git a/test/hotspot/gtest/utilities/test_lockFreeQueue.cpp b/test/hotspot/gtest/utilities/test_nonblockingQueue.cpp
similarity index 76%
rename from test/hotspot/gtest/utilities/test_lockFreeQueue.cpp
rename to test/hotspot/gtest/utilities/test_nonblockingQueue.cpp
index e5eabc103d0..88f2bf85ee8 100644
--- a/test/hotspot/gtest/utilities/test_lockFreeQueue.cpp
+++ b/test/hotspot/gtest/utilities/test_nonblockingQueue.cpp
@@ -25,14 +25,14 @@
 #include "memory/allocation.inline.hpp"
 #include "runtime/atomic.hpp"
 #include "utilities/globalDefinitions.hpp"
-#include "utilities/lockFreeQueue.inline.hpp"
+#include "utilities/nonblockingQueue.inline.hpp"
 #include "utilities/pair.hpp"
 #include "threadHelper.inline.hpp"
 #include "unittest.hpp"
 #include <new>
 
-class LockFreeQueueTestElement {
-  typedef LockFreeQueueTestElement Element;
+class NonblockingQueueTestElement {
+  typedef NonblockingQueueTestElement Element;
 
   Element* volatile _entry;
   Element* volatile _entry1;
@@ -42,41 +42,17 @@ class LockFreeQueueTestElement {
   static Element* volatile* entry1_ptr(Element& e) { return &e._entry1; }
 
 public:
-  class TestQueue: public LockFreeQueue<Element, &entry_ptr> {
-  public:
-    Element* pop() {
-      using Status = LockFreeQueuePopStatus;
-      while (true) {
-        Pair<Status, Element*> pop_result = try_pop();
-        if (pop_result.first == Status::success) {
-          return pop_result.second;
-        }
-        // Retry until success.
-      }
-    }
-  };
-  class TestQueue1: public LockFreeQueue<Element, &entry1_ptr> {
-  public:
-    Element* pop() {
-      using Status = LockFreeQueuePopStatus;
-      while (true) {
-        Pair<Status, Element*> pop_result = try_pop();
-        if (pop_result.first == Status::success) {
-          return pop_result.second;
-        }
-        // Retry until success.
-      }
-    }
-  };
+  using TestQueue = NonblockingQueue<Element, &entry_ptr>;
+  using TestQueue1 = NonblockingQueue<Element, &entry1_ptr>;
 
-  LockFreeQueueTestElement(size_t id = 0) : _entry(), _entry1(), _id(id) {}
+  NonblockingQueueTestElement(size_t id = 0) : _entry(), _entry1(), _id(id) {}
   size_t id() const { return _id; }
   void set_id(size_t value) { _id = value; }
   Element* next() { return _entry; }
   Element* next1() { return _entry1; }
 };
 
-typedef LockFreeQueueTestElement Element;
+typedef NonblockingQueueTestElement Element;
 typedef Element::TestQueue TestQueue;
 typedef Element::TestQueue1 TestQueue1;
 
@@ -86,8 +62,8 @@ static void initialize(Element* elements, size_t size, TestQueue* queue) {
   }
   ASSERT_TRUE(queue->empty());
   ASSERT_EQ(0u, queue->length());
+  ASSERT_TRUE(queue->is_end(queue->first()));
   ASSERT_TRUE(queue->pop() == NULL);
-  ASSERT_TRUE(queue->top() == NULL);
 
   for (size_t id = 0; id < size; ++id) {
     ASSERT_EQ(id, queue->length());
@@ -95,27 +71,27 @@ static void initialize(Element* elements, size_t size, TestQueue* queue) {
     ASSERT_EQ(id, e->id());
     queue->push(*e);
     ASSERT_FALSE(queue->empty());
-    // top() is always the oldest element.
-    ASSERT_EQ(&elements[0], queue->top());
+    // first() is always the oldest element.
+    ASSERT_EQ(&elements[0], queue->first());
   }
 }
 
-class LockFreeQueueTestBasics : public ::testing::Test {
+class NonblockingQueueTestBasics : public ::testing::Test {
 public:
-  LockFreeQueueTestBasics();
+  NonblockingQueueTestBasics();
 
   static const size_t nelements = 10;
   Element elements[nelements];
   TestQueue queue;
 };
 
-const size_t LockFreeQueueTestBasics::nelements;
+const size_t NonblockingQueueTestBasics::nelements;
 
-LockFreeQueueTestBasics::LockFreeQueueTestBasics() : queue() {
+NonblockingQueueTestBasics::NonblockingQueueTestBasics() : queue() {
   initialize(elements, nelements, &queue);
 }
 
-TEST_F(LockFreeQueueTestBasics, pop) {
+TEST_F(NonblockingQueueTestBasics, pop) {
   for (size_t i = 0; i < nelements; ++i) {
     ASSERT_FALSE(queue.empty());
     ASSERT_EQ(nelements - i, queue.length());
@@ -129,11 +105,11 @@ TEST_F(LockFreeQueueTestBasics, pop) {
   ASSERT_TRUE(queue.pop() == NULL);
 }
 
-TEST_F(LockFreeQueueTestBasics, append) {
+TEST_F(NonblockingQueueTestBasics, append) {
   TestQueue other_queue;
   ASSERT_TRUE(other_queue.empty());
   ASSERT_EQ(0u, other_queue.length());
-  ASSERT_TRUE(other_queue.top() == NULL);
+  ASSERT_TRUE(other_queue.is_end(other_queue.first()));
   ASSERT_TRUE(other_queue.pop() == NULL);
 
   Pair<Element*, Element*> pair = queue.take_all();
@@ -141,8 +117,8 @@ TEST_F(LockFreeQueueTestBasics, append) {
   ASSERT_EQ(nelements, other_queue.length());
   ASSERT_TRUE(queue.empty());
   ASSERT_EQ(0u, queue.length());
+  ASSERT_TRUE(queue.is_end(queue.first()));
   ASSERT_TRUE(queue.pop() == NULL);
-  ASSERT_TRUE(queue.top() == NULL);
 
   for (size_t i = 0; i < nelements; ++i) {
     ASSERT_EQ(nelements - i, other_queue.length());
@@ -155,7 +131,7 @@ TEST_F(LockFreeQueueTestBasics, append) {
   ASSERT_TRUE(other_queue.pop() == NULL);
 }
 
-TEST_F(LockFreeQueueTestBasics, two_queues) {
+TEST_F(NonblockingQueueTestBasics, two_queues) {
   TestQueue1 queue1;
   ASSERT_TRUE(queue1.pop() == NULL);
 
@@ -163,14 +139,19 @@ TEST_F(LockFreeQueueTestBasics, two_queues) {
     queue1.push(elements[id]);
   }
   ASSERT_EQ(nelements, queue1.length());
-  Element* e0 = queue.top();
-  Element* e1 = queue1.top();
-  while (true) {
+  Element* e0 = queue.first();
+  Element* e1 = queue1.first();
+  ASSERT_TRUE(e0 != NULL);
+  ASSERT_TRUE(e1 != NULL);
+  ASSERT_FALSE(queue.is_end(e0));
+  ASSERT_FALSE(queue1.is_end(e1));
+  while (!queue.is_end(e0) && !queue1.is_end(e1)) {
     ASSERT_EQ(e0, e1);
-    if (e0 == NULL) break;
     e0 = e0->next();
     e1 = e1->next1();
   }
+  ASSERT_TRUE(queue.is_end(e0));
+  ASSERT_TRUE(queue1.is_end(e1));
 
   for (size_t i = 0; i < nelements; ++i) {
     ASSERT_EQ(nelements - i, queue.length());
@@ -194,7 +175,7 @@ TEST_F(LockFreeQueueTestBasics, two_queues) {
   ASSERT_TRUE(queue1.pop() == NULL);
 }
 
-class LockFreeQueueTestThread : public JavaTestThread {
+class NonblockingQueueTestThread : public JavaTestThread {
   uint _id;
   TestQueue* _from;
   TestQueue* _to;
@@ -204,12 +185,12 @@ class LockFreeQueueTestThread : public JavaTestThread {
   volatile bool _ready;
 
 public:
-  LockFreeQueueTestThread(Semaphore* post,
-                          uint id,
-                          TestQueue* from,
-                          TestQueue* to,
-                          volatile size_t* processed,
-                          size_t process_limit) :
+  NonblockingQueueTestThread(Semaphore* post,
+                             uint id,
+                             TestQueue* from,
+                             TestQueue* to,
+                             volatile size_t* processed,
+                             size_t process_limit) :
     JavaTestThread(post),
     _id(id),
     _from(from),
@@ -238,7 +219,7 @@ public:
   bool ready() const { return Atomic::load_acquire(&_ready); }
 };
 
-TEST_VM(LockFreeQueueTest, stress) {
+TEST_VM(NonblockingQueueTest, stress) {
   Semaphore post;
   TestQueue initial_queue;
   TestQueue start_queue;
@@ -263,7 +244,7 @@ TEST_VM(LockFreeQueueTest, stress) {
   const uint stage1_threads = 2;
   const uint stage2_threads = 2;
   const uint nthreads = stage1_threads + stage2_threads;
-  LockFreeQueueTestThread* threads[nthreads] = {};
+  NonblockingQueueTestThread* threads[nthreads] = {};
 
   for (uint i = 0; i < ARRAY_SIZE(threads); ++i) {
     TestQueue* from = &start_queue;
@@ -275,7 +256,7 @@ TEST_VM(LockFreeQueueTest, stress) {
       processed = &stage2_processed;
     }
     threads[i] =
-      new LockFreeQueueTestThread(&post, i, from, to, processed, nelements);
+      new NonblockingQueueTestThread(&post, i, from, to, processed, nelements);
     threads[i]->doit();
     while (!threads[i]->ready()) {} // Wait until ready to start test.
   }