From 1001452ba9338d90bf279e38d2dd0a8a1e65f31a Mon Sep 17 00:00:00 2001
From: Doug Lea
Date: Wed, 3 Jul 2013 11:58:10 +0200
Subject: [PATCH] 8019481: Sync misc j.u.c classes from 166 to tl
Reviewed-by: martin
---
.../concurrent/BrokenBarrierException.java | 4 +-
.../java/util/concurrent/CountDownLatch.java | 26 +-
.../java/util/concurrent/CyclicBarrier.java | 59 +-
.../java/util/concurrent/Exchanger.java | 867 +++++++++---------
.../classes/java/util/concurrent/Phaser.java | 143 +--
.../java/util/concurrent/TimeUnit.java | 93 +-
.../util/concurrent/TimeoutException.java | 6 +-
.../java/util/concurrent/package-info.java | 14 +-
8 files changed, 600 insertions(+), 612 deletions(-)
diff --git a/jdk/src/share/classes/java/util/concurrent/BrokenBarrierException.java b/jdk/src/share/classes/java/util/concurrent/BrokenBarrierException.java
index 2c8f7e3396d..11f126e015a 100644
--- a/jdk/src/share/classes/java/util/concurrent/BrokenBarrierException.java
+++ b/jdk/src/share/classes/java/util/concurrent/BrokenBarrierException.java
@@ -49,13 +49,13 @@ public class BrokenBarrierException extends Exception {
private static final long serialVersionUID = 7117394618823254244L;
/**
- * Constructs a BrokenBarrierException with no specified detail
+ * Constructs a {@code BrokenBarrierException} with no specified detail
* message.
*/
public BrokenBarrierException() {}
/**
- * Constructs a BrokenBarrierException with the specified
+ * Constructs a {@code BrokenBarrierException} with the specified
* detail message.
*
* @param message the detail message
diff --git a/jdk/src/share/classes/java/util/concurrent/CountDownLatch.java b/jdk/src/share/classes/java/util/concurrent/CountDownLatch.java
index 055eb113727..a8b50ca3907 100644
--- a/jdk/src/share/classes/java/util/concurrent/CountDownLatch.java
+++ b/jdk/src/share/classes/java/util/concurrent/CountDownLatch.java
@@ -92,15 +92,15 @@ import java.util.concurrent.locks.AbstractQueuedSynchronizer;
* private final CountDownLatch startSignal;
* private final CountDownLatch doneSignal;
* Worker(CountDownLatch startSignal, CountDownLatch doneSignal) {
- * this.startSignal = startSignal;
- * this.doneSignal = doneSignal;
+ * this.startSignal = startSignal;
+ * this.doneSignal = doneSignal;
* }
* public void run() {
- * try {
- * startSignal.await();
- * doWork();
- * doneSignal.countDown();
- * } catch (InterruptedException ex) {} // return;
+ * try {
+ * startSignal.await();
+ * doWork();
+ * doneSignal.countDown();
+ * } catch (InterruptedException ex) {} // return;
* }
*
* void doWork() { ... }
@@ -130,14 +130,14 @@ import java.util.concurrent.locks.AbstractQueuedSynchronizer;
* private final CountDownLatch doneSignal;
* private final int i;
* WorkerRunnable(CountDownLatch doneSignal, int i) {
- * this.doneSignal = doneSignal;
- * this.i = i;
+ * this.doneSignal = doneSignal;
+ * this.i = i;
* }
* public void run() {
- * try {
- * doWork(i);
- * doneSignal.countDown();
- * } catch (InterruptedException ex) {} // return;
+ * try {
+ * doWork(i);
+ * doneSignal.countDown();
+ * } catch (InterruptedException ex) {} // return;
* }
*
* void doWork() { ... }
diff --git a/jdk/src/share/classes/java/util/concurrent/CyclicBarrier.java b/jdk/src/share/classes/java/util/concurrent/CyclicBarrier.java
index eb25879dbcf..d1186d8eb4f 100644
--- a/jdk/src/share/classes/java/util/concurrent/CyclicBarrier.java
+++ b/jdk/src/share/classes/java/util/concurrent/CyclicBarrier.java
@@ -45,14 +45,14 @@ import java.util.concurrent.locks.ReentrantLock;
* cyclic because it can be re-used after the waiting threads
* are released.
*
- * <p>A <tt>CyclicBarrier</tt> supports an optional {@link Runnable} command
+ * <p>A {@code CyclicBarrier} supports an optional {@link Runnable} command
* that is run once per barrier point, after the last thread in the party
* arrives, but before any threads are released.
* This barrier action is useful
* for updating shared-state before any of the parties continue.
*
- * <p><b>Sample usage:</b> Here is an example of
- * using a barrier in a parallel decomposition design:
+ * <p><b>Sample usage:</b> Here is an example of using a barrier in a
+ * parallel decomposition design:
*
*
{@code
* class Solver {
@@ -81,16 +81,20 @@ import java.util.concurrent.locks.ReentrantLock;
* public Solver(float[][] matrix) {
* data = matrix;
* N = matrix.length;
- * barrier = new CyclicBarrier(N,
- * new Runnable() {
- * public void run() {
- * mergeRows(...);
- * }
- * });
- * for (int i = 0; i < N; ++i)
- * new Thread(new Worker(i)).start();
+ * Runnable barrierAction =
+ * new Runnable() { public void run() { mergeRows(...); }};
+ * barrier = new CyclicBarrier(N, barrierAction);
*
- * waitUntilDone();
+ * List<Thread> threads = new ArrayList<Thread>(N);
+ * for (int i = 0; i < N; i++) {
+ * Thread thread = new Thread(new Worker(i));
+ * threads.add(thread);
+ * thread.start();
+ * }
+ *
+ * // wait until done
+ * for (Thread thread : threads)
+ * thread.join();
* }
* }}
*
@@ -98,8 +102,8 @@ import java.util.concurrent.locks.ReentrantLock;
* barrier until all rows have been processed. When all rows are processed
* the supplied {@link Runnable} barrier action is executed and merges the
* rows. If the merger
- * determines that a solution has been found then done() will return
- * true and each worker will terminate.
+ * determines that a solution has been found then {@code done()} will return
+ * {@code true} and each worker will terminate.
*
*
If the barrier action does not rely on the parties being suspended when
* it is executed, then any of the threads in the party could execute that
@@ -112,7 +116,7 @@ import java.util.concurrent.locks.ReentrantLock;
* // log the completion of this iteration
* }}
*
- * <p>The <tt>CyclicBarrier</tt> uses an all-or-none breakage model
+ * <p>The {@code CyclicBarrier} uses an all-or-none breakage model
* for failed synchronization attempts: If a thread leaves a barrier
* point prematurely because of interruption, failure, or timeout, all
* other threads waiting at that barrier point will also leave
@@ -139,7 +143,7 @@ public class CyclicBarrier {
* is reset. There can be many generations associated with threads
* using the barrier - due to the non-deterministic way the lock
* may be allocated to waiting threads - but only one of these
- * can be active at a time (the one to which count applies)
+ * can be active at a time (the one to which {@code count} applies)
* and all the rest are either broken or tripped.
* There need not be an active generation if there has been a break
* but no subsequent reset.
@@ -259,7 +263,7 @@ public class CyclicBarrier {
}
/**
- * Creates a new CyclicBarrier that will trip when the
+ * Creates a new {@code CyclicBarrier} that will trip when the
* given number of parties (threads) are waiting upon it, and which
* will execute the given barrier action when the barrier is tripped,
* performed by the last thread entering the barrier.
@@ -278,7 +282,7 @@ public class CyclicBarrier {
}
/**
- * Creates a new CyclicBarrier that will trip when the
+ * Creates a new {@code CyclicBarrier} that will trip when the
* given number of parties (threads) are waiting upon it, and
* does not perform a predefined action when the barrier is tripped.
*
@@ -301,7 +305,7 @@ public class CyclicBarrier {
/**
* Waits until all {@linkplain #getParties parties} have invoked
- * await on this barrier.
+ * {@code await} on this barrier.
*
*
If the current thread is not the last to arrive then it is
* disabled for thread scheduling purposes and lies dormant until
@@ -326,7 +330,7 @@ public class CyclicBarrier {
*
*
If the barrier is {@link #reset} while any thread is waiting,
* or if the barrier {@linkplain #isBroken is broken} when
- * await is invoked, or while any thread is waiting, then
+ * {@code await} is invoked, or while any thread is waiting, then
* {@link BrokenBarrierException} is thrown.
*
*
If any thread is {@linkplain Thread#interrupt interrupted} while waiting,
@@ -343,7 +347,7 @@ public class CyclicBarrier {
* the broken state.
*
* @return the arrival index of the current thread, where index
- * {@link #getParties()} - 1 indicates the first
+ * {@code getParties() - 1} indicates the first
* to arrive and zero indicates the last to arrive
* @throws InterruptedException if the current thread was interrupted
* while waiting
@@ -351,7 +355,7 @@ public class CyclicBarrier {
* interrupted or timed out while the current thread was
* waiting, or the barrier was reset, or the barrier was
* broken when {@code await} was called, or the barrier
- * action (if present) failed due an exception.
+ * action (if present) failed due to an exception
*/
public int await() throws InterruptedException, BrokenBarrierException {
try {
@@ -363,7 +367,7 @@ public class CyclicBarrier {
/**
* Waits until all {@linkplain #getParties parties} have invoked
- * await on this barrier, or the specified waiting time elapses.
+ * {@code await} on this barrier, or the specified waiting time elapses.
*
*
If the current thread is not the last to arrive then it is
* disabled for thread scheduling purposes and lies dormant until
@@ -393,7 +397,7 @@ public class CyclicBarrier {
*
*
If the barrier is {@link #reset} while any thread is waiting,
* or if the barrier {@linkplain #isBroken is broken} when
- * await is invoked, or while any thread is waiting, then
+ * {@code await} is invoked, or while any thread is waiting, then
* {@link BrokenBarrierException} is thrown.
*
*
If any thread is {@linkplain Thread#interrupt interrupted} while
@@ -412,16 +416,17 @@ public class CyclicBarrier {
* @param timeout the time to wait for the barrier
* @param unit the time unit of the timeout parameter
* @return the arrival index of the current thread, where index
- * {@link #getParties()} - 1 indicates the first
+ * {@code getParties() - 1} indicates the first
* to arrive and zero indicates the last to arrive
* @throws InterruptedException if the current thread was interrupted
* while waiting
- * @throws TimeoutException if the specified timeout elapses
+ * @throws TimeoutException if the specified timeout elapses.
+ * In this case the barrier will be broken.
* @throws BrokenBarrierException if another thread was
* interrupted or timed out while the current thread was
* waiting, or the barrier was reset, or the barrier was broken
* when {@code await} was called, or the barrier action (if
- * present) failed due an exception
+ * present) failed due to an exception
*/
public int await(long timeout, TimeUnit unit)
throws InterruptedException,
diff --git a/jdk/src/share/classes/java/util/concurrent/Exchanger.java b/jdk/src/share/classes/java/util/concurrent/Exchanger.java
index 5accdb1ce58..980b0e187a4 100644
--- a/jdk/src/share/classes/java/util/concurrent/Exchanger.java
+++ b/jdk/src/share/classes/java/util/concurrent/Exchanger.java
@@ -35,7 +35,8 @@
*/
package java.util.concurrent;
-import java.util.concurrent.atomic.*;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.LockSupport;
/**
@@ -52,7 +53,7 @@ import java.util.concurrent.locks.LockSupport;
* to swap buffers between threads so that the thread filling the
* buffer gets a freshly emptied one when it needs it, handing off the
* filled one to the thread emptying the buffer.
- * <pre>{@code
+ * <pre> {@code
* class FillAndEmpty {
* Exchanger<DataBuffer> exchanger = new Exchanger<DataBuffer>();
* DataBuffer initialEmptyBuffer = ... a made-up type
@@ -88,8 +89,7 @@ import java.util.concurrent.locks.LockSupport;
* new Thread(new FillingLoop()).start();
* new Thread(new EmptyingLoop()).start();
* }
- * }
- * }
+ * }}
*
*
Memory consistency effects: For each pair of threads that
* successfully exchange objects via an {@code Exchanger}, actions
@@ -103,486 +103,425 @@ import java.util.concurrent.locks.LockSupport;
* @param <V> The type of objects that may be exchanged
*/
public class Exchanger {
+
/*
- * Algorithm Description:
+ * Overview: The core algorithm is, for an exchange "slot",
+ * and a participant (caller) with an item:
*
- * The basic idea is to maintain a "slot", which is a reference to
- * a Node containing both an Item to offer and a "hole" waiting to
- * get filled in. If an incoming "occupying" thread sees that the
- * slot is null, it CAS'es (compareAndSets) a Node there and waits
- * for another to invoke exchange. That second "fulfilling" thread
- * sees that the slot is non-null, and so CASes it back to null,
- * also exchanging items by CASing the hole, plus waking up the
- * occupying thread if it is blocked. In each case CAS'es may
- * fail because a slot at first appears non-null but is null upon
- * CAS, or vice-versa. So threads may need to retry these
- * actions.
+ * for (;;) {
+ * if (slot is empty) { // offer
+ * place item in a Node;
+ * if (can CAS slot from empty to node) {
+ * wait for release;
+ * return matching item in node;
+ * }
+ * }
+ * else if (can CAS slot from node to empty) { // release
+ * get the item in node;
+ * set matching item in node;
+ * release waiting thread;
+ * }
+ * // else retry on CAS failure
+ * }
*
- * This simple approach works great when there are only a few
- * threads using an Exchanger, but performance rapidly
- * deteriorates due to CAS contention on the single slot when
- * there are lots of threads using an exchanger. So instead we use
- * an "arena"; basically a kind of hash table with a dynamically
- * varying number of slots, any one of which can be used by
- * threads performing an exchange. Incoming threads pick slots
- * based on a hash of their Thread ids. If an incoming thread
- * fails to CAS in its chosen slot, it picks an alternative slot
- * instead. And similarly from there. If a thread successfully
- * CASes into a slot but no other thread arrives, it tries
- * another, heading toward the zero slot, which always exists even
- * if the table shrinks. The particular mechanics controlling this
- * are as follows:
+ * This is among the simplest forms of a "dual data structure" --
+ * see Scott and Scherer's DISC 04 paper and
+ * http://www.cs.rochester.edu/research/synchronization/pseudocode/duals.html
*
- * Waiting: Slot zero is special in that it is the only slot that
- * exists when there is no contention. A thread occupying slot
- * zero will block if no thread fulfills it after a short spin.
- * In other cases, occupying threads eventually give up and try
- * another slot. Waiting threads spin for a while (a period that
- * should be a little less than a typical context-switch time)
- * before either blocking (if slot zero) or giving up (if other
- * slots) and restarting. There is no reason for threads to block
- * unless there are unlikely to be any other threads present.
- * Occupants are mainly avoiding memory contention so sit there
- * quietly polling for a shorter period than it would take to
- * block and then unblock them. Non-slot-zero waits that elapse
- * because of lack of other threads waste around one extra
- * context-switch time per try, which is still on average much
- * faster than alternative approaches.
+ * This works great in principle. But in practice, like many
+ * algorithms centered on atomic updates to a single location, it
+ * scales horribly when there are more than a few participants
+ * using the same Exchanger. So the implementation instead uses a
+ * form of elimination arena, that spreads out this contention by
+ * arranging that some threads typically use different slots,
+ * while still ensuring that eventually, any two parties will be
+ * able to exchange items. That is, we cannot completely partition
+ * across threads, but instead give threads arena indices that
+ * will on average grow under contention and shrink under lack of
+ * contention. We approach this by defining the Nodes that we need
+ * anyway as ThreadLocals, and include in them per-thread index
+ * and related bookkeeping state. (We can safely reuse per-thread
+ * nodes rather than creating them fresh each time because slots
+ * alternate between pointing to a node vs null, so cannot
+ * encounter ABA problems. However, we do need some care in
+ * resetting them between uses.)
*
- * Sizing: Usually, using only a few slots suffices to reduce
- * contention. Especially with small numbers of threads, using
- * too many slots can lead to just as poor performance as using
- * too few of them, and there's not much room for error. The
- * variable "max" maintains the number of slots actually in
- * use. It is increased when a thread sees too many CAS
- * failures. (This is analogous to resizing a regular hash table
- * based on a target load factor, except here, growth steps are
- * just one-by-one rather than proportional.) Growth requires
- * contention failures in each of three tried slots. Requiring
- * multiple failures for expansion copes with the fact that some
- * failed CASes are not due to contention but instead to simple
- * races between two threads or thread pre-emptions occurring
- * between reading and CASing. Also, very transient peak
- * contention can be much higher than the average sustainable
- * levels. An attempt to decrease the max limit is usually made
- * when a non-slot-zero wait elapses without being fulfilled.
- * Threads experiencing elapsed waits move closer to zero, so
- * eventually find existing (or future) threads even if the table
- * has been shrunk due to inactivity. The chosen mechanics and
- * thresholds for growing and shrinking are intrinsically
- * entangled with indexing and hashing inside the exchange code,
- * and can't be nicely abstracted out.
+ * Implementing an effective arena requires allocating a bunch of
+ * space, so we only do so upon detecting contention (except on
+ * uniprocessors, where they wouldn't help, so aren't used).
+ * Otherwise, exchanges use the single-slot slotExchange method.
+ * On contention, not only must the slots be in different
+ * locations, but the locations must not encounter memory
+ * contention due to being on the same cache line (or more
+ * generally, the same coherence unit). Because, as of this
+ * writing, there is no way to determine cacheline size, we define
+ * a value that is enough for common platforms. Additionally,
+ * extra care elsewhere is taken to avoid other false/unintended
+ * sharing and to enhance locality, including adding padding (via
+ * sun.misc.Contended) to Nodes, embedding "bound" as an Exchanger
+ * field, and reworking some park/unpark mechanics compared to
+ * LockSupport versions.
*
- * Hashing: Each thread picks its initial slot to use in accord
- * with a simple hashcode. The sequence is the same on each
- * encounter by any given thread, but effectively random across
- * threads. Using arenas encounters the classic cost vs quality
- * tradeoffs of all hash tables. Here, we use a one-step FNV-1a
- * hash code based on the current thread's Thread.getId(), along
- * with a cheap approximation to a mod operation to select an
- * index. The downside of optimizing index selection in this way
- * is that the code is hardwired to use a maximum table size of
- * 32. But this value more than suffices for known platforms and
- * applications.
+ * The arena starts out with only one used slot. We expand the
+ * effective arena size by tracking collisions; i.e., failed CASes
+ * while trying to exchange. By nature of the above algorithm, the
+ * only kinds of collision that reliably indicate contention are
+ * when two attempted releases collide -- one of two attempted
+ * offers can legitimately fail to CAS without indicating
+ * contention by more than one other thread. (Note: it is possible
+ * but not worthwhile to more precisely detect contention by
+ * reading slot values after CAS failures.) When a thread has
+ * collided at each slot within the current arena bound, it tries
+ * to expand the arena size by one. We track collisions within
+ * bounds by using a version (sequence) number on the "bound"
+ * field, and conservatively reset collision counts when a
+ * participant notices that bound has been updated (in either
+ * direction).
*
- * Probing: On sensed contention of a selected slot, we probe
- * sequentially through the table, analogously to linear probing
- * after collision in a hash table. (We move circularly, in
- * reverse order, to mesh best with table growth and shrinkage
- * rules.) Except that to minimize the effects of false-alarms
- * and cache thrashing, we try the first selected slot twice
- * before moving.
+ * The effective arena size is reduced (when there is more than
+ * one slot) by giving up on waiting after a while and trying to
+ * decrement the arena size on expiration. The value of "a while"
+ * is an empirical matter. We implement by piggybacking on the
+ * use of spin->yield->block that is essential for reasonable
+ * waiting performance anyway -- in a busy exchanger, offers are
+ * usually almost immediately released, in which case context
+ * switching on multiprocessors is extremely slow/wasteful. Arena
+ * waits just omit the blocking part, and instead cancel. The spin
+ * count is empirically chosen to be a value that avoids blocking
+ * 99% of the time under maximum sustained exchange rates on a
+ * range of test machines. Spins and yields entail some limited
+ * randomness (using a cheap xorshift) to avoid regular patterns
+ * that can induce unproductive grow/shrink cycles. (Using a
+ * pseudorandom also helps regularize spin cycle duration by
+ * making branches unpredictable.) Also, during an offer, a
+ * waiter can "know" that it will be released when its slot has
+ * changed, but cannot yet proceed until match is set. In the
+ * mean time it cannot cancel the offer, so instead spins/yields.
+ * Note: It is possible to avoid this secondary check by changing
+ * the linearization point to be a CAS of the match field (as done
+ * in one case in the Scott & Scherer DISC paper), which also
+ * increases asynchrony a bit, at the expense of poorer collision
+ * detection and inability to always reuse per-thread nodes. So
+ * the current scheme is typically a better tradeoff.
*
- * Padding: Even with contention management, slots are heavily
- * contended, so use cache-padding to avoid poor memory
- * performance. Because of this, slots are lazily constructed
- * only when used, to avoid wasting this space unnecessarily.
- * While isolation of locations is not much of an issue at first
- * in an application, as time goes on and garbage-collectors
- * perform compaction, slots are very likely to be moved adjacent
- * to each other, which can cause much thrashing of cache lines on
- * MPs unless padding is employed.
+ * On collisions, indices traverse the arena cyclically in reverse
+ * order, restarting at the maximum index (which will tend to be
+ * sparsest) when bounds change. (On expirations, indices instead
+ * are halved until reaching 0.) It is possible (and has been
+ * tried) to use randomized, prime-value-stepped, or double-hash
+ * style traversal instead of simple cyclic traversal to reduce
+ * bunching. But empirically, whatever benefits these may have
+ * don't overcome their added overhead: We are managing operations
+ * that occur very quickly unless there is sustained contention,
+ * so simpler/faster control policies work better than more
+ * accurate but slower ones.
*
- * This is an improvement of the algorithm described in the paper
- * "A Scalable Elimination-based Exchange Channel" by William
- * Scherer, Doug Lea, and Michael Scott in Proceedings of SCOOL05
- * workshop. Available at: http://hdl.handle.net/1802/2104
+ * Because we use expiration for arena size control, we cannot
+ * throw TimeoutExceptions in the timed version of the public
+ * exchange method until the arena size has shrunken to zero (or
+ * the arena isn't enabled). This may delay response to timeout
+ * but is still within spec.
+ *
+ * Essentially all of the implementation is in methods
+ * slotExchange and arenaExchange. These have similar overall
+ * structure, but differ in too many details to combine. The
+ * slotExchange method uses the single Exchanger field "slot"
+ * rather than arena array elements. However, it still needs
+ * minimal collision detection to trigger arena construction.
+ * (The messiest part is making sure interrupt status and
+ * InterruptedExceptions come out right during transitions when
+ * both methods may be called. This is done by using null return
+ * as a sentinel to recheck interrupt status.)
+ *
+ * As is too common in this sort of code, methods are monolithic
+ * because most of the logic relies on reads of fields that are
+ * maintained as local variables so can't be nicely factored
+ * (mainly, here, bulky spin->yield->block/cancel code), and
+ * heavily dependent on intrinsics (Unsafe) to use inlined
+ * embedded CAS and related memory access operations (that tend
+ * not to be as readily inlined by dynamic compilers when they are
+ * hidden behind other methods that would more nicely name and
+ * encapsulate the intended effects). This includes the use of
+ * putOrderedX to clear fields of the per-thread Nodes between
+ * uses. Note that field Node.item is not declared as volatile
+ * even though it is read by releasing threads, because they only
+ * do so after CAS operations that must precede access, and all
+ * uses by the owning thread are otherwise acceptably ordered by
+ * other operations. (Because the actual points of atomicity are
+ * slot CASes, it would also be legal for the write to Node.match
+ * in a release to be weaker than a full volatile write. However,
+ * this is not done because it could allow further postponement of
+ * the write, delaying progress.)
*/
+ /**
+ * The byte distance (as a shift value) between any two used slots
+ * in the arena. 1 << ASHIFT should be at least cacheline size.
+ */
+ private static final int ASHIFT = 7;
+
+ /**
+ * The maximum supported arena index. The maximum allocatable
+ * arena size is MMASK + 1. Must be a power of two minus one, less
+ * than (1<<(31-ASHIFT)). The cap of 255 (0xff) more than suffices
+ * for the expected scaling limits of the main algorithms.
+ */
+ private static final int MMASK = 0xff;
+
+ /**
+ * Unit for sequence/version bits of bound field. Each successful
+ * change to the bound also adds SEQ.
+ */
+ private static final int SEQ = MMASK + 1;
+
/** The number of CPUs, for sizing and spin control */
private static final int NCPU = Runtime.getRuntime().availableProcessors();
/**
- * The capacity of the arena. Set to a value that provides more
- * than enough space to handle contention. On small machines
- * most slots won't be used, but it is still not wasted because
- * the extra space provides some machine-level address padding
- * to minimize interference with heavily CAS'ed Slot locations.
- * And on very large machines, performance eventually becomes
- * bounded by memory bandwidth, not numbers of threads/CPUs.
- * This constant cannot be changed without also modifying
- * indexing and hashing algorithms.
+ * The maximum slot index of the arena: The number of slots that
+ * can in principle hold all threads without contention, or at
+ * most the maximum indexable value.
*/
- private static final int CAPACITY = 32;
+ static final int FULL = (NCPU >= (MMASK << 1)) ? MMASK : NCPU >>> 1;
/**
- * The value of "max" that will hold all threads without
- * contention. When this value is less than CAPACITY, some
- * otherwise wasted expansion can be avoided.
+ * The bound for spins while waiting for a match. The actual
+ * number of iterations will on average be about twice this value
+ * due to randomization. Note: Spinning is disabled when NCPU==1.
*/
- private static final int FULL =
- Math.max(0, Math.min(CAPACITY, NCPU / 2) - 1);
-
- /**
- * The number of times to spin (doing nothing except polling a
- * memory location) before blocking or giving up while waiting to
- * be fulfilled. Should be zero on uniprocessors. On
- * multiprocessors, this value should be large enough so that two
- * threads exchanging items as fast as possible block only when
- * one of them is stalled (due to GC or preemption), but not much
- * longer, to avoid wasting CPU resources. Seen differently, this
- * value is a little over half the number of cycles of an average
- * context switch time on most systems. The value here is
- * approximately the average of those across a range of tested
- * systems.
- */
- private static final int SPINS = (NCPU == 1) ? 0 : 2000;
-
- /**
- * The number of times to spin before blocking in timed waits.
- * Timed waits spin more slowly because checking the time takes
- * time. The best value relies mainly on the relative rate of
- * System.nanoTime vs memory accesses. The value is empirically
- * derived to work well across a variety of systems.
- */
- private static final int TIMED_SPINS = SPINS / 20;
-
- /**
- * Sentinel item representing cancellation of a wait due to
- * interruption, timeout, or elapsed spin-waits. This value is
- * placed in holes on cancellation, and used as a return value
- * from waiting methods to indicate failure to set or get hole.
- */
- private static final Object CANCEL = new Object();
+ private static final int SPINS = 1 << 10;
/**
* Value representing null arguments/returns from public
- * methods. This disambiguates from internal requirement that
- * holes start out as null to mean they are not yet set.
+ * methods. Needed because the API originally didn't disallow null
+ * arguments, which it should have.
*/
private static final Object NULL_ITEM = new Object();
/**
- * Nodes hold partially exchanged data. This class
- * opportunistically subclasses AtomicReference to represent the
- * hole. So get() returns hole, and compareAndSet CAS'es value
- * into hole. This class cannot be parameterized as "V" because
- * of the use of non-V CANCEL sentinels.
+ * Sentinel value returned by internal exchange methods upon
+ * timeout, to avoid need for separate timed versions of these
+ * methods.
*/
- @SuppressWarnings("serial")
- private static final class Node extends AtomicReference