8047104: cleanup misc issues prior to Contended Locking reorder and cache

Checkpoint misc cleanups for Contended Locking prior to first optimization bucket.

Reviewed-by: dholmes, sspitsyn, dice
Daniel D. Daugherty 2014-07-03 11:07:51 -07:00
parent a145a396fc
commit 2876714328
16 changed files with 173 additions and 200 deletions

View File

@ -4218,22 +4218,12 @@ static struct timespec* compute_abstime(struct timespec* abstime, jlong millis)
return abstime;
}
// Test-and-clear _Event, always leaves _Event set to 0, returns immediately.
// Conceptually TryPark() should be equivalent to park(0).
int os::PlatformEvent::TryPark() {
for (;;) {
const int v = _Event;
guarantee((v == 0) || (v == 1), "invariant");
if (Atomic::cmpxchg(0, &_Event, v) == v) return v;
}
}
void os::PlatformEvent::park() { // AKA "down()"
// Invariant: Only the thread associated with the Event/PlatformEvent
// may call park().
// TODO: assert that _Assoc != NULL or _Assoc == Self
assert(_nParked == 0, "invariant");
int v;
for (;;) {
v = _Event;
@ -4333,8 +4323,7 @@ void os::PlatformEvent::unpark() {
// 1 :=> 1
// -1 :=> either 0 or 1; must signal target thread
// That is, we can safely transition _Event from -1 to either
// 0 or 1. Forcing 1 is slightly more efficient for back-to-back
// unpark() calls.
// 0 or 1.
// See also: "Semaphores in Plan 9" by Mullender & Cox
//
// Note: Forcing a transition from "-1" to "1" on an unpark() means
@ -4541,10 +4530,9 @@ void Parker::park(bool isAbsolute, jlong time) {
}
void Parker::unpark() {
int s, status;
status = pthread_mutex_lock(_mutex);
int status = pthread_mutex_lock(_mutex);
assert(status == 0, "invariant");
s = _counter;
const int s = _counter;
_counter = 1;
if (s < 1) {
if (WorkAroundNPTLTimedWaitHang) {

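The hunks above remove TryPark() and trim the unpark() comment, but the transition table they reference (0 :=> 1, 1 :=> 1, -1 :=> wake the parked thread; park() always leaves _Event at 0) is still the heart of the PlatformEvent contract. Below is a minimal standalone sketch of that protocol using C++11 std::atomic plus a condition variable in place of HotSpot's Atomic and platform primitives; the class name and structure are illustrative, not the actual implementation.

#include <atomic>
#include <condition_variable>
#include <mutex>

// SketchEvent is a hypothetical name; this mirrors the documented protocol,
// not HotSpot's PlatformEvent implementation.
class SketchEvent {
  std::atomic<int> _event{0};   // 1: permit pending, 0: neutral, -1: a thread is parked
  std::mutex _m;
  std::condition_variable _cv;
 public:
  void park() {                 // only the associated thread may call park()
    int expected = 1;
    if (_event.compare_exchange_strong(expected, 0)) return;  // consume a pending permit
    expected = 0;
    if (!_event.compare_exchange_strong(expected, -1)) {      // a permit arrived meanwhile
      _event.store(0);
      return;
    }
    std::unique_lock<std::mutex> lk(_m);
    while (_event.load() == -1) _cv.wait(lk);                 // woken by unpark()
    _event.store(0);                                          // always leave the event at 0
  }
  void unpark() {
    // 0 :=> 1, 1 :=> 1, -1 :=> 1 plus a signal to the parked thread
    if (_event.exchange(1) == -1) {
      std::lock_guard<std::mutex> lk(_m);
      _cv.notify_one();
    }
  }
};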
View File

@ -219,7 +219,6 @@ class PlatformEvent : public CHeapObj<mtInternal> {
int fired() { return _Event; }
void park();
void unpark();
int TryPark();
int park(jlong millis);
void SetAssociation(Thread * a) { _Assoc = a; }
};

View File

@ -5457,22 +5457,12 @@ static struct timespec* compute_abstime(timespec* abstime, jlong millis) {
return abstime;
}
// Test-and-clear _Event, always leaves _Event set to 0, returns immediately.
// Conceptually TryPark() should be equivalent to park(0).
int os::PlatformEvent::TryPark() {
for (;;) {
const int v = _Event;
guarantee((v == 0) || (v == 1), "invariant");
if (Atomic::cmpxchg(0, &_Event, v) == v) return v;
}
}
void os::PlatformEvent::park() { // AKA "down()"
// Invariant: Only the thread associated with the Event/PlatformEvent
// may call park().
// TODO: assert that _Assoc != NULL or _Assoc == Self
assert(_nParked == 0, "invariant");
int v;
for (;;) {
v = _Event;
@ -5572,8 +5562,7 @@ void os::PlatformEvent::unpark() {
// 1 :=> 1
// -1 :=> either 0 or 1; must signal target thread
// That is, we can safely transition _Event from -1 to either
// 0 or 1. Forcing 1 is slightly more efficient for back-to-back
// unpark() calls.
// 0 or 1.
// See also: "Semaphores in Plan 9" by Mullender & Cox
//
// Note: Forcing a transition from "-1" to "1" on an unpark() means
@ -5801,10 +5790,9 @@ void Parker::park(bool isAbsolute, jlong time) {
}
void Parker::unpark() {
int s, status;
status = pthread_mutex_lock(_mutex);
int status = pthread_mutex_lock(_mutex);
assert(status == 0, "invariant");
s = _counter;
const int s = _counter;
_counter = 1;
if (s < 1) {
// thread might be parked

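The Parker::unpark() cleanups above (const-qualifying s and declaring status at its initialization) do not change the underlying pattern: take the mutex, remember the previous counter value, publish a single coalesced permit, and signal only if the parker may be blocked. A standalone sketch of just that unpark() side with raw pthreads follows; the type name is made up, it omits the park() side and the rel/abs condvar pair, and it shows only one of the two unlock/signal orderings the code above chooses between based on WorkAroundNPTLTimedWaitHang.

#include <assert.h>
#include <pthread.h>

// SketchParker is hypothetical; it shows only the unpark() side of the
// counter-plus-condvar pattern used by Parker above.
struct SketchParker {
  volatile int _counter;        // 0: no permit, 1: permit available (permits coalesce)
  pthread_mutex_t _mutex;
  pthread_cond_t _cond;

  SketchParker() : _counter(0) {
    pthread_mutex_init(&_mutex, NULL);
    pthread_cond_init(&_cond, NULL);
  }

  void unpark() {
    int status = pthread_mutex_lock(&_mutex);
    assert(status == 0);
    const int s = _counter;     // remember whether a permit was already pending
    _counter = 1;               // at most one permit is ever stored
    status = pthread_mutex_unlock(&_mutex);
    assert(status == 0);
    if (s < 1) {
      // The parker may be blocked on _cond; wake it after dropping the mutex.
      status = pthread_cond_signal(&_cond);
      assert(status == 0);
    }
  }
};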
View File

@ -315,7 +315,6 @@ class PlatformEvent : public CHeapObj<mtInternal> {
int fired() { return _Event; }
void park();
void unpark();
int TryPark();
int park(jlong millis); // relative timed-wait only
void SetAssociation(Thread * a) { _Assoc = a; }
};

View File

@ -5440,20 +5440,11 @@ static timestruc_t* compute_abstime(timestruc_t* abstime, jlong millis) {
return abstime;
}
// Test-and-clear _Event, always leaves _Event set to 0, returns immediately.
// Conceptually TryPark() should be equivalent to park(0).
int os::PlatformEvent::TryPark() {
for (;;) {
const int v = _Event;
guarantee((v == 0) || (v == 1), "invariant");
if (Atomic::cmpxchg(0, &_Event, v) == v) return v;
}
}
void os::PlatformEvent::park() { // AKA: down()
// Invariant: Only the thread associated with the Event/PlatformEvent
// may call park().
assert(_nParked == 0, "invariant");
int v;
for (;;) {
v = _Event;
@ -5540,8 +5531,7 @@ void os::PlatformEvent::unpark() {
// 1 :=> 1
// -1 :=> either 0 or 1; must signal target thread
// That is, we can safely transition _Event from -1 to either
// 0 or 1. Forcing 1 is slightly more efficient for back-to-back
// unpark() calls.
// 0 or 1.
// See also: "Semaphores in Plan 9" by Mullender & Cox
//
// Note: Forcing a transition from "-1" to "1" on an unpark() means
@ -5745,10 +5735,9 @@ void Parker::park(bool isAbsolute, jlong time) {
}
void Parker::unpark() {
int s, status;
status = os::Solaris::mutex_lock(_mutex);
int status = os::Solaris::mutex_lock(_mutex);
assert(status == 0, "invariant");
s = _counter;
const int s = _counter;
_counter = 1;
status = os::Solaris::mutex_unlock(_mutex);
assert(status == 0, "invariant");

View File

@ -332,7 +332,6 @@ class PlatformEvent : public CHeapObj<mtInternal> {
int fired() { return _Event; }
void park();
int park(jlong millis);
int TryPark();
void unpark();
};

View File

@ -4876,8 +4876,7 @@ void os::PlatformEvent::unpark() {
// 1 :=> 1
// -1 :=> either 0 or 1; must signal target thread
// That is, we can safely transition _Event from -1 to either
// 0 or 1. Forcing 1 is slightly more efficient for back-to-back
// unpark() calls.
// 0 or 1.
// See also: "Semaphores in Plan 9" by Mullender & Cox
//
// Note: Forcing a transition from "-1" to "1" on an unpark() means

View File

@ -1130,29 +1130,30 @@ class CommandLineFlags {
"Use LWP-based instead of libthread-based synchronization " \
"(SPARC only)") \
\
product(ccstr, SyncKnobs, NULL, \
"(Unstable) Various monitor synchronization tunables") \
experimental(ccstr, SyncKnobs, NULL, \
"(Unstable) Various monitor synchronization tunables") \
\
product(intx, EmitSync, 0, \
"(Unsafe, Unstable) " \
"Control emission of inline sync fast-path code") \
experimental(intx, EmitSync, 0, \
"(Unsafe, Unstable) " \
"Control emission of inline sync fast-path code") \
\
product(intx, MonitorBound, 0, "Bound Monitor population") \
\
product(bool, MonitorInUseLists, false, "Track Monitors for Deflation") \
\
product(intx, SyncFlags, 0, "(Unsafe, Unstable) Experimental Sync flags") \
experimental(intx, SyncFlags, 0, "(Unsafe, Unstable) " \
"Experimental Sync flags") \
\
product(intx, SyncVerbose, 0, "(Unstable)") \
experimental(intx, SyncVerbose, 0, "(Unstable)") \
\
product(intx, ClearFPUAtPark, 0, "(Unsafe, Unstable)") \
experimental(intx, ClearFPUAtPark, 0, "(Unsafe, Unstable)") \
\
product(intx, hashCode, 5, \
"(Unstable) select hashCode generation algorithm") \
experimental(intx, hashCode, 5, \
"(Unstable) select hashCode generation algorithm") \
\
product(intx, WorkAroundNPTLTimedWaitHang, 1, \
"(Unstable, Linux-specific) " \
"avoid NPTL-FUTEX hang pthread_cond_timedwait") \
experimental(intx, WorkAroundNPTLTimedWaitHang, 1, \
"(Unstable, Linux-specific) " \
"avoid NPTL-FUTEX hang pthread_cond_timedwait") \
\
product(bool, FilterSpuriousWakeups, true, \
"When true prevents OS-level spurious, or premature, wakeups " \

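With SyncKnobs, EmitSync, SyncFlags, SyncVerbose, ClearFPUAtPark, hashCode and WorkAroundNPTLTimedWaitHang reclassified from product to experimental, the VM now accepts them only after experimental options are unlocked. A hedged example invocation (the specific values are purely illustrative):

java -XX:+UnlockExperimentalVMOptions -XX:hashCode=5 -XX:SyncVerbose=0 -version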
View File

@ -1,4 +1,3 @@
/*
* Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@ -488,7 +487,6 @@ void Monitor::ILock (Thread * Self) {
for (;;) {
assert(_OnDeck == ESelf, "invariant");
if (TrySpin(Self)) break;
// CONSIDER: if ESelf->TryPark() && TryLock() break ...
// It's probably wise to spin only if we *actually* blocked
// CONSIDER: check the lockbyte, if it remains set then
// preemptively drain the cxq into the EntryList.

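The removed CONSIDER line sat inside Monitor::ILock's acquisition loop, whose overall shape is: try a bounded spin (TrySpin) first, and fall back to parking only if the spin fails. A generic sketch of that spin-then-block shape is below; it is not the Monitor/ParkEvent code itself, the spin budget is arbitrary, and a yield stands in for ParkEvent::park().

#include <atomic>
#include <thread>

// Generic spin-then-block acquire loop; illustrative only.
class SpinThenBlockLock {
  std::atomic<int> _locked{0};

  bool try_lock() {
    int expected = 0;
    return _locked.compare_exchange_strong(expected, 1, std::memory_order_acquire);
  }
  bool try_spin(int iterations) {       // cheap bounded spin, as TrySpin() does
    for (int i = 0; i < iterations; i++) {
      if (_locked.load(std::memory_order_relaxed) == 0 && try_lock()) return true;
    }
    return false;
  }
 public:
  void lock() {
    for (;;) {
      if (try_spin(1000)) return;       // arbitrary spin budget
      std::this_thread::yield();        // stand-in for parking on a ParkEvent
    }
  }
  void unlock() { _locked.store(0, std::memory_order_release); }
};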
View File

@ -46,9 +46,9 @@
#if defined(__GNUC__) && !defined(IA64) && !defined(PPC64)
// Need to inhibit inlining for older versions of GCC to avoid build-time failures
#define ATTR __attribute__((noinline))
#define NOINLINE __attribute__((noinline))
#else
#define ATTR
#define NOINLINE
#endif
@ -103,38 +103,39 @@
// The knob* variables are effectively final. Once set they should
// never be modified hence. Consider using __read_mostly with GCC.
int ObjectMonitor::Knob_Verbose = 0;
int ObjectMonitor::Knob_SpinLimit = 5000; // derived by an external tool -
static int Knob_LogSpins = 0; // enable jvmstat tally for spins
static int Knob_HandOff = 0;
static int Knob_ReportSettings = 0;
int ObjectMonitor::Knob_Verbose = 0;
int ObjectMonitor::Knob_VerifyInUse = 0;
int ObjectMonitor::Knob_SpinLimit = 5000; // derived by an external tool -
static int Knob_LogSpins = 0; // enable jvmstat tally for spins
static int Knob_HandOff = 0;
static int Knob_ReportSettings = 0;
static int Knob_SpinBase = 0; // Floor AKA SpinMin
static int Knob_SpinBackOff = 0; // spin-loop backoff
static int Knob_CASPenalty = -1; // Penalty for failed CAS
static int Knob_OXPenalty = -1; // Penalty for observed _owner change
static int Knob_SpinSetSucc = 1; // spinners set the _succ field
static int Knob_SpinEarly = 1;
static int Knob_SuccEnabled = 1; // futile wake throttling
static int Knob_SuccRestrict = 0; // Limit successors + spinners to at-most-one
static int Knob_MaxSpinners = -1; // Should be a function of # CPUs
static int Knob_Bonus = 100; // spin success bonus
static int Knob_BonusB = 100; // spin success bonus
static int Knob_Penalty = 200; // spin failure penalty
static int Knob_Poverty = 1000;
static int Knob_SpinAfterFutile = 1; // Spin after returning from park()
static int Knob_FixedSpin = 0;
static int Knob_OState = 3; // Spinner checks thread state of _owner
static int Knob_UsePause = 1;
static int Knob_ExitPolicy = 0;
static int Knob_PreSpin = 10; // 20-100 likely better
static int Knob_ResetEvent = 0;
static int BackOffMask = 0;
static int Knob_SpinBase = 0; // Floor AKA SpinMin
static int Knob_SpinBackOff = 0; // spin-loop backoff
static int Knob_CASPenalty = -1; // Penalty for failed CAS
static int Knob_OXPenalty = -1; // Penalty for observed _owner change
static int Knob_SpinSetSucc = 1; // spinners set the _succ field
static int Knob_SpinEarly = 1;
static int Knob_SuccEnabled = 1; // futile wake throttling
static int Knob_SuccRestrict = 0; // Limit successors + spinners to at-most-one
static int Knob_MaxSpinners = -1; // Should be a function of # CPUs
static int Knob_Bonus = 100; // spin success bonus
static int Knob_BonusB = 100; // spin success bonus
static int Knob_Penalty = 200; // spin failure penalty
static int Knob_Poverty = 1000;
static int Knob_SpinAfterFutile = 1; // Spin after returning from park()
static int Knob_FixedSpin = 0;
static int Knob_OState = 3; // Spinner checks thread state of _owner
static int Knob_UsePause = 1;
static int Knob_ExitPolicy = 0;
static int Knob_PreSpin = 10; // 20-100 likely better
static int Knob_ResetEvent = 0;
static int BackOffMask = 0;
static int Knob_FastHSSEC = 0;
static int Knob_MoveNotifyee = 2; // notify() - disposition of notifyee
static int Knob_QMode = 0; // EntryList-cxq policy - queue discipline
static volatile int InitDone = 0;
static int Knob_FastHSSEC = 0;
static int Knob_MoveNotifyee = 2; // notify() - disposition of notifyee
static int Knob_QMode = 0; // EntryList-cxq policy - queue discipline
static volatile int InitDone = 0;
#define TrySpin TrySpin_VaryDuration
@ -199,7 +200,7 @@ static volatile int InitDone = 0;
// on EntryList|cxq. That is, spinning relieves contention on the "inner"
// locks and monitor metadata.
//
// Cxq points to the the set of Recently Arrived Threads attempting entry.
// Cxq points to the set of Recently Arrived Threads attempting entry.
// Because we push threads onto _cxq with CAS, the RATs must take the form of
// a singly-linked LIFO. We drain _cxq into EntryList at unlock-time when
// the unlocking thread notices that EntryList is null but _cxq is != null.
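The comment above describes the _cxq push protocol: arriving threads prepend themselves with a CAS, so the list is necessarily a singly-linked LIFO. Detached from ObjectWaiter and the monitor, the idiom looks like the following sketch; the node type and function name are generic.

#include <atomic>

// Minimal sketch of the CAS-push idiom described above: new arrivals prepend
// themselves to a singly-linked LIFO. Node is a generic stand-in, not
// HotSpot's ObjectWaiter.
struct Node {
  Node* next;
};

void cas_push(std::atomic<Node*>& head, Node* n) {
  Node* old = head.load(std::memory_order_relaxed);
  do {
    n->next = old;                      // link to the current head
  } while (!head.compare_exchange_weak(old, n,
                                       std::memory_order_release,
                                       std::memory_order_relaxed));
}

At unlock time the whole chain can be detached in one step, for example with head.exchange(nullptr), which corresponds to the drain of _cxq into the EntryList described above.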
@ -269,13 +270,12 @@ bool ObjectMonitor::try_enter(Thread* THREAD) {
}
}
void ATTR ObjectMonitor::enter(TRAPS) {
void NOINLINE ObjectMonitor::enter(TRAPS) {
// The following code is ordered to check the most common cases first
// and to reduce RTS->RTO cache line upgrades on SPARC and IA32 processors.
Thread * const Self = THREAD;
void * cur;
cur = Atomic::cmpxchg_ptr(Self, &_owner, NULL);
void * cur = Atomic::cmpxchg_ptr (Self, &_owner, NULL);
if (cur == NULL) {
// Either ASSERT _recursions == 0 or explicitly set _recursions = 0.
assert(_recursions == 0 , "invariant");
@ -435,26 +435,24 @@ void ATTR ObjectMonitor::enter(TRAPS) {
// Callers must compensate as needed.
int ObjectMonitor::TryLock (Thread * Self) {
for (;;) {
void * own = _owner;
if (own != NULL) return 0;
if (Atomic::cmpxchg_ptr (Self, &_owner, NULL) == NULL) {
// Either guarantee _recursions == 0 or set _recursions = 0.
assert(_recursions == 0, "invariant");
assert(_owner == Self, "invariant");
// CONSIDER: set or assert that OwnerIsThread == 1
return 1;
}
// The lock had been free momentarily, but we lost the race to the lock.
// Interference -- the CAS failed.
// We can either return -1 or retry.
// Retry doesn't make as much sense because the lock was just acquired.
if (true) return -1;
}
void * own = _owner;
if (own != NULL) return 0;
if (Atomic::cmpxchg_ptr (Self, &_owner, NULL) == NULL) {
// Either guarantee _recursions == 0 or set _recursions = 0.
assert(_recursions == 0, "invariant");
assert(_owner == Self, "invariant");
// CONSIDER: set or assert that OwnerIsThread == 1
return 1;
}
// The lock had been free momentarily, but we lost the race to the lock.
// Interference -- the CAS failed.
// We can either return -1 or retry.
// Retry doesn't make as much sense because the lock was just acquired.
return -1;
}
void ATTR ObjectMonitor::EnterI (TRAPS) {
Thread * Self = THREAD;
void NOINLINE ObjectMonitor::EnterI (TRAPS) {
Thread * const Self = THREAD;
assert(Self->is_Java_thread(), "invariant");
assert(((JavaThread *) Self)->thread_state() == _thread_blocked , "invariant");
@ -550,7 +548,7 @@ void ATTR ObjectMonitor::EnterI (TRAPS) {
Atomic::cmpxchg_ptr(Self, &_Responsible, NULL);
}
// The lock have been released while this thread was occupied queueing
// The lock might have been released while this thread was occupied queueing
// itself onto _cxq. To close the race and avoid "stranding" and
// progress-liveness failure we must resample-retry _owner before parking.
// Note the Dekker/Lamport duality: ST cxq; MEMBAR; LD Owner.
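The Dekker/Lamport note above is the classic store-then-load handshake: the entering thread stores to _cxq and then reloads _owner, while the exiting thread stores to _owner and then reloads _cxq, each with a full barrier between the store and the load. Rendered with generic std::atomic flags (these are not the monitor fields, just the shape of the argument):

#include <atomic>

std::atomic<int> announced{0};   // stands in for "pushed onto _cxq"
std::atomic<int> released{0};    // stands in for "_owner has been cleared"

// Enter side: publish the enqueue, then re-examine the owner (ST cxq; MEMBAR; LD Owner).
bool enter_side_sees_release() {
  announced.store(1, std::memory_order_seq_cst);
  return released.load(std::memory_order_seq_cst) == 1;
}

// Exit side: release the lock, then re-examine the queue (ST Owner; MEMBAR; LD cxq).
bool exit_side_sees_arrival() {
  released.store(1, std::memory_order_seq_cst);
  return announced.load(std::memory_order_seq_cst) == 1;
}
// With sequentially consistent (fully fenced) accesses, both functions cannot
// return false once both stores have executed: either the enqueued thread
// re-samples a free lock or the exiting thread sees the new arrival. That is
// what closes the stranding race described above.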
@ -702,7 +700,7 @@ void ATTR ObjectMonitor::EnterI (TRAPS) {
// Knob_Reset and Knob_SpinAfterFutile support and restructuring the
// loop accordingly.
void ATTR ObjectMonitor::ReenterI (Thread * Self, ObjectWaiter * SelfNode) {
void NOINLINE ObjectMonitor::ReenterI (Thread * Self, ObjectWaiter * SelfNode) {
assert(Self != NULL , "invariant");
assert(SelfNode != NULL , "invariant");
assert(SelfNode->_thread == Self , "invariant");
@ -790,6 +788,7 @@ void ATTR ObjectMonitor::ReenterI (Thread * Self, ObjectWaiter * SelfNode) {
OrderAccess::fence(); // see comments at the end of EnterI()
}
// By convention we unlink a contending thread from EntryList|cxq immediately
// after the thread acquires the lock in ::enter(). Equally, we could defer
// unlinking the thread until ::exit()-time.
@ -810,7 +809,7 @@ void ObjectMonitor::UnlinkAfterAcquire (Thread * Self, ObjectWaiter * SelfNode)
assert(prv == NULL || prv->TState == ObjectWaiter::TS_ENTER, "invariant");
TEVENT(Unlink from EntryList);
} else {
guarantee(SelfNode->TState == ObjectWaiter::TS_CXQ, "invariant");
assert(SelfNode->TState == ObjectWaiter::TS_CXQ, "invariant");
// Inopportune interleaving -- Self is still on the cxq.
// This usually means the enqueue of self raced an exiting thread.
// Normally we'll find Self near the front of the cxq, so
@ -850,10 +849,12 @@ void ObjectMonitor::UnlinkAfterAcquire (Thread * Self, ObjectWaiter * SelfNode)
TEVENT(Unlink from cxq);
}
#ifdef ASSERT
// Diagnostic hygiene ...
SelfNode->_prev = (ObjectWaiter *) 0xBAD;
SelfNode->_next = (ObjectWaiter *) 0xBAD;
SelfNode->TState = ObjectWaiter::TS_RUN;
#endif
}
// -----------------------------------------------------------------------------
@ -906,9 +907,15 @@ void ObjectMonitor::UnlinkAfterAcquire (Thread * Self, ObjectWaiter * SelfNode)
// the integral of the # of active timers at any instant over time).
// Both impinge on OS scalability. Given that, at most one thread parked on
// a monitor will use a timer.
//
// There is also the risk of a futile wake-up. If we drop the lock
// another thread can reacquire the lock immediately, and we can
// then wake a thread unnecessarily. This is benign, and we've
// structured the code so the windows are short and the frequency
// of such futile wakeups is low.
void ATTR ObjectMonitor::exit(bool not_suspended, TRAPS) {
Thread * Self = THREAD;
void NOINLINE ObjectMonitor::exit(bool not_suspended, TRAPS) {
Thread * const Self = THREAD;
if (THREAD != _owner) {
if (THREAD->is_lock_owned((address) _owner)) {
// Transmute _owner from a BasicLock pointer to a Thread address.
@ -920,14 +927,17 @@ void ATTR ObjectMonitor::exit(bool not_suspended, TRAPS) {
_recursions = 0;
OwnerIsThread = 1;
} else {
// NOTE: we need to handle unbalanced monitor enter/exit
// in native code by throwing an exception.
// TODO: Throw an IllegalMonitorStateException ?
// Apparent unbalanced locking ...
// Naively we'd like to throw IllegalMonitorStateException.
// As a practical matter we can neither allocate nor throw an
// exception as ::exit() can be called from leaf routines.
// see x86_32.ad Fast_Unlock() and the I1 and I2 properties.
// Upon deeper reflection, however, in a properly run JVM the only
// way we should encounter this situation is in the presence of
// unbalanced JNI locking. TODO: CheckJNICalls.
// See also: CR4414101
TEVENT(Exit - Throw IMSX);
assert(false, "Non-balanced monitor enter/exit!");
if (false) {
THROW(vmSymbols::java_lang_IllegalMonitorStateException());
}
assert(false, "Non-balanced monitor enter/exit! Likely JNI locking");
return;
}
}
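The rewritten comment block above names unbalanced JNI locking as the realistic way for exit() to run on a thread that does not own the monitor. One plausible shape of such imbalance, sketched as a hypothetical JNI native, is a MonitorExit with no matching MonitorEnter issued against an object the Java caller is holding via synchronized; the caller's own monitorexit can then find a monitor it no longer owns. The method and class names below are made up, and the exact failure mode depends on build and VM flags.

#include <jni.h>

// Hypothetical native illustrating unbalanced JNI locking: it exits a monitor
// it never entered. If the Java caller currently holds 'lockee' through a
// synchronized block, the caller's later monitorexit can run against a monitor
// this thread no longer owns, which is the imbalance discussed above.
extern "C" JNIEXPORT void JNICALL
Java_Example_stealUnlock(JNIEnv* env, jclass clazz, jobject lockee) {
  env->MonitorExit(lockee);   // deliberately missing the matching MonitorEnter
}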
@ -976,6 +986,7 @@ void ATTR ObjectMonitor::exit(bool not_suspended, TRAPS) {
return;
}
TEVENT(Inflated exit - complex egress);
// Other threads are blocked trying to acquire the lock.
// Normally the exiting thread is responsible for ensuring succession,
// but if other successors are ready or other entering threads are spinning
@ -1142,9 +1153,9 @@ void ATTR ObjectMonitor::exit(bool not_suspended, TRAPS) {
if (w != NULL) {
// I'd like to write: guarantee (w->_thread != Self).
// But in practice an exiting thread may find itself on the EntryList.
// Lets say thread T1 calls O.wait(). Wait() enqueues T1 on O's waitset and
// Let's say thread T1 calls O.wait(). Wait() enqueues T1 on O's waitset and
// then calls exit(). Exit release the lock by setting O._owner to NULL.
// Lets say T1 then stalls. T2 acquires O and calls O.notify(). The
// Let's say T1 then stalls. T2 acquires O and calls O.notify(). The
// notify() operation moves T1 from O's waitset to O's EntryList. T2 then
// release the lock "O". T2 resumes immediately after the ST of null into
// _owner, above. T2 notices that the EntryList is populated, so it
@ -1261,10 +1272,13 @@ void ATTR ObjectMonitor::exit(bool not_suspended, TRAPS) {
// MEMBAR
// LD Self_>_suspend_flags
//
// UPDATE 2007-10-6: since I've replaced the native Mutex/Monitor subsystem
// with a more efficient implementation, the need to use "FastHSSEC" has
// decreased. - Dave
bool ObjectMonitor::ExitSuspendEquivalent (JavaThread * jSelf) {
int Mode = Knob_FastHSSEC;
const int Mode = Knob_FastHSSEC;
if (Mode && !jSelf->is_external_suspend()) {
assert(jSelf->is_suspend_equivalent(), "invariant");
jSelf->clear_suspend_equivalent();
@ -1413,7 +1427,7 @@ void ObjectMonitor::post_monitor_wait_event(EventJavaMonitorWait* event,
// Wait/Notify/NotifyAll
//
// Note: a subset of changes to ObjectMonitor::wait()
// will need to be replicated in complete_exit above
// will need to be replicated in complete_exit
void ObjectMonitor::wait(jlong millis, bool interruptible, TRAPS) {
Thread * const Self = THREAD;
assert(Self->is_Java_thread(), "Must be Java thread!");
@ -2268,12 +2282,12 @@ ObjectWaiter::ObjectWaiter(Thread* thread) {
assert(_event != NULL, "invariant");
}
void ObjectWaiter::wait_reenter_begin(ObjectMonitor *mon) {
void ObjectWaiter::wait_reenter_begin(ObjectMonitor * const mon) {
JavaThread *jt = (JavaThread *)this->_thread;
_active = JavaThreadBlockedOnMonitorEnterState::wait_reenter_begin(jt, mon);
}
void ObjectWaiter::wait_reenter_end(ObjectMonitor *mon) {
void ObjectWaiter::wait_reenter_end(ObjectMonitor * const mon) {
JavaThread *jt = (JavaThread *)this->_thread;
JavaThreadBlockedOnMonitorEnterState::wait_reenter_end(jt, _active);
}
@ -2455,6 +2469,7 @@ void ObjectMonitor::DeferredInitialize() {
#define SETKNOB(x) { Knob_##x = kvGetInt (knobs, #x, Knob_##x); }
SETKNOB(ReportSettings);
SETKNOB(Verbose);
SETKNOB(VerifyInUse);
SETKNOB(FixedSpin);
SETKNOB(SpinLimit);
SETKNOB(SpinBase);

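The SETKNOB lines above (now including the new VerifyInUse knob) pull individual integers out of the SyncKnobs string via kvGetInt. To the best of my understanding the knob string is a ':'-separated list of Key=Value pairs, so something like -XX:SyncKnobs=SpinLimit=2000:VerifyInUse=1 (together with -XX:+UnlockExperimentalVMOptions now that the flag is experimental) would feed these lines. A small standalone parser in that assumed format, as a stand-in for kvGetInt rather than HotSpot's actual parser:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Hypothetical stand-in for kvGetInt(): scan a ':'-separated list of Key=Value
// pairs and return the integer value for 'key', or 'dflt' if the key is absent.
static int sketch_kvGetInt(const char* knobs, const char* key, int dflt) {
  if (knobs == NULL) return dflt;
  const size_t klen = strlen(key);
  const char* p = knobs;
  while (*p != '\0') {
    const char* end = strchr(p, ':');
    if (end == NULL) end = p + strlen(p);
    if ((size_t)(end - p) > klen && strncmp(p, key, klen) == 0 && p[klen] == '=') {
      return atoi(p + klen + 1);        // atoi stops at the next ':' on its own
    }
    p = (*end == ':') ? end + 1 : end;
  }
  return dflt;
}

int main() {
  const char* knobs = "SpinLimit=2000:VerifyInUse=1";   // e.g. from -XX:SyncKnobs=...
  printf("SpinLimit=%d VerifyInUse=%d Verbose=%d\n",
         sketch_kvGetInt(knobs, "SpinLimit", 5000),
         sketch_kvGetInt(knobs, "VerifyInUse", 0),
         sketch_kvGetInt(knobs, "Verbose", 0));
  return 0;
}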
View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -311,6 +311,7 @@ public:
public:
static int Knob_Verbose;
static int Knob_VerifyInUse;
static int Knob_SpinLimit;
void* operator new (size_t size) throw() {
return AllocateHeap(size, mtInternal);

View File

@ -1810,14 +1810,8 @@ JRT_END
// Handles the uncommon case in locking, i.e., contention or an inflated lock.
#ifndef PRODUCT
int SharedRuntime::_monitor_enter_ctr=0;
#endif
JRT_ENTRY_NO_ASYNC(void, SharedRuntime::complete_monitor_locking_C(oopDesc* _obj, BasicLock* lock, JavaThread* thread))
oop obj(_obj);
#ifndef PRODUCT
_monitor_enter_ctr++; // monitor enter slow
#endif
if (PrintBiasedLockingStatistics) {
Atomic::inc(BiasedLocking::slow_path_entry_count_addr());
}
@ -1831,15 +1825,9 @@ JRT_ENTRY_NO_ASYNC(void, SharedRuntime::complete_monitor_locking_C(oopDesc* _obj
assert(!HAS_PENDING_EXCEPTION, "Should have no exception here");
JRT_END
#ifndef PRODUCT
int SharedRuntime::_monitor_exit_ctr=0;
#endif
// Handles the uncommon cases of monitor unlocking in compiled code
JRT_LEAF(void, SharedRuntime::complete_monitor_unlocking_C(oopDesc* _obj, BasicLock* lock))
oop obj(_obj);
#ifndef PRODUCT
_monitor_exit_ctr++; // monitor exit slow
#endif
Thread* THREAD = JavaThread::current();
// I'm not convinced we need the code contained by MIGHT_HAVE_PENDING anymore
// testing was unable to ever fire the assert that guarded it so I have removed it.
@ -1879,8 +1867,6 @@ void SharedRuntime::print_statistics() {
ttyLocker ttyl;
if (xtty != NULL) xtty->head("statistics type='SharedRuntime'");
if (_monitor_enter_ctr) tty->print_cr("%5d monitor enter slow", _monitor_enter_ctr);
if (_monitor_exit_ctr) tty->print_cr("%5d monitor exit slow", _monitor_exit_ctr);
if (_throw_null_ctr) tty->print_cr("%5d implicit null throw", _throw_null_ctr);
SharedRuntime::print_ic_miss_histogram();
@ -2464,9 +2450,9 @@ AdapterHandlerEntry* AdapterHandlerLibrary::get_adapter(methodHandle method) {
if (PrintAdapterHandlers || PrintStubCode) {
ttyLocker ttyl;
entry->print_adapter_on(tty);
tty->print_cr("i2c argument handler #%d for: %s %s (%d bytes generated)",
tty->print_cr("i2c argument handler #%d for: %s %s %s (%d bytes generated)",
_adapters->number_of_entries(), (method->is_static() ? "static" : "receiver"),
method->signature()->as_C_string(), insts_size);
method->signature()->as_C_string(), fingerprint->as_string(), insts_size);
tty->print_cr("c2i argument handler starts at %p", entry->get_c2i_entry());
if (Verbose || PrintStubCode) {
address first_pc = entry->base_address();

View File

@ -516,8 +516,6 @@ class SharedRuntime: AllStatic {
static void trace_ic_miss(address at);
public:
static int _monitor_enter_ctr; // monitor enter slow
static int _monitor_exit_ctr; // monitor exit slow
static int _throw_null_ctr; // throwing a null-pointer exception
static int _ic_miss_ctr; // total # of IC misses
static int _wrong_method_ctr;

View File

@ -44,9 +44,9 @@
#if defined(__GNUC__) && !defined(PPC64)
// Need to inhibit inlining for older versions of GCC to avoid build-time failures
#define ATTR __attribute__((noinline))
#define NOINLINE __attribute__((noinline))
#else
#define ATTR
#define NOINLINE
#endif
PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
@ -206,14 +206,6 @@ void ObjectSynchronizer::slow_enter(Handle obj, BasicLock* lock, TRAPS) {
return;
}
#if 0
// The following optimization isn't particularly useful.
if (mark->has_monitor() && mark->monitor()->is_entered(THREAD)) {
lock->set_displaced_header(NULL);
return;
}
#endif
// The object header will never be displaced to this lock,
// so it does not matter what the value is, except that it
// must be non-zero to avoid looking like a re-entrant lock,
@ -573,7 +565,7 @@ intptr_t ObjectSynchronizer::FastHashCode (Thread * Self, oop obj) {
// added check of the bias pattern is to avoid useless calls to
// thread-local storage.
if (obj->mark()->has_bias_pattern()) {
// Box and unbox the raw reference just in case we cause a STW safepoint.
// Handle for oop obj in case of STW safepoint
Handle hobj(Self, obj);
// Relaxing assertion for bug 6320749.
assert(Universe::verify_in_progress() ||
@ -890,23 +882,23 @@ static void InduceScavenge (Thread * Self, const char * Whence) {
}
}
}
/* Too slow for general assert or debug
void ObjectSynchronizer::verifyInUse (Thread *Self) {
ObjectMonitor* mid;
int inusetally = 0;
for (mid = Self->omInUseList; mid != NULL; mid = mid->FreeNext) {
inusetally ++;
inusetally++;
}
assert(inusetally == Self->omInUseCount, "inuse count off");
int freetally = 0;
for (mid = Self->omFreeList; mid != NULL; mid = mid->FreeNext) {
freetally ++;
freetally++;
}
assert(freetally == Self->omFreeCount, "free count off");
}
*/
ObjectMonitor * ATTR ObjectSynchronizer::omAlloc (Thread * Self) {
ObjectMonitor * NOINLINE ObjectSynchronizer::omAlloc (Thread * Self) {
// A large MAXPRIVATE value reduces both list lock contention
// and list coherency traffic, but also tends to increase the
// number of objectMonitors in circulation as well as the STW
@ -932,7 +924,9 @@ ObjectMonitor * ATTR ObjectSynchronizer::omAlloc (Thread * Self) {
m->FreeNext = Self->omInUseList;
Self->omInUseList = m;
Self->omInUseCount++;
// verifyInUse(Self);
if (ObjectMonitor::Knob_VerifyInUse) {
verifyInUse(Self);
}
} else {
m->FreeNext = NULL;
}
@ -1052,7 +1046,9 @@ void ObjectSynchronizer::omRelease (Thread * Self, ObjectMonitor * m, bool fromP
curmidinuse->FreeNext = mid->FreeNext; // maintain the current thread inuselist
}
Self->omInUseCount--;
// verifyInUse(Self);
if (ObjectMonitor::Knob_VerifyInUse) {
verifyInUse(Self);
}
break;
} else {
curmidinuse = mid;
@ -1061,7 +1057,7 @@ void ObjectSynchronizer::omRelease (Thread * Self, ObjectMonitor * m, bool fromP
}
}
// FreeNext is used for both onInUseList and omFreeList, so clear old before setting new
// FreeNext is used for both omInUseList and omFreeList, so clear old before setting new
m->FreeNext = Self->omFreeList;
Self->omFreeList = m;
Self->omFreeCount++;
@ -1074,7 +1070,7 @@ void ObjectSynchronizer::omRelease (Thread * Self, ObjectMonitor * m, bool fromP
// consecutive STW safepoints. Relatedly, we might decay
// omFreeProvision at STW safepoints.
//
// Also return the monitors of a moribund thread"s omInUseList to
// Also return the monitors of a moribund thread's omInUseList to
// a global gOmInUseList under the global list lock so these
// will continue to be scanned.
//
@ -1115,7 +1111,6 @@ void ObjectSynchronizer::omFlush (Thread * Self) {
InUseTail = curom;
InUseTally++;
}
// TODO debug
assert(Self->omInUseCount == InUseTally, "inuse count off");
Self->omInUseCount = 0;
guarantee(InUseTail != NULL && InUseList != NULL, "invariant");
@ -1154,7 +1149,7 @@ ObjectMonitor* ObjectSynchronizer::inflate_helper(oop obj) {
// multiple locks occupy the same $ line. Padding might be appropriate.
ObjectMonitor * ATTR ObjectSynchronizer::inflate (Thread * Self, oop object) {
ObjectMonitor * NOINLINE ObjectSynchronizer::inflate (Thread * Self, oop object) {
// Inflate mutates the heap ...
// Relaxing assertion for bug 6320749.
assert(Universe::verify_in_progress() ||
@ -1385,7 +1380,7 @@ enum ManifestConstants {
// Deflate a single monitor if not in use
// Return true if deflated, false if in use
bool ObjectSynchronizer::deflate_monitor(ObjectMonitor* mid, oop obj,
ObjectMonitor** FreeHeadp, ObjectMonitor** FreeTailp) {
ObjectMonitor** freeHeadp, ObjectMonitor** freeTailp) {
bool deflated;
// Normal case ... The monitor is associated with obj.
guarantee(obj->mark() == markOopDesc::encode(mid), "invariant");
@ -1415,13 +1410,13 @@ bool ObjectSynchronizer::deflate_monitor(ObjectMonitor* mid, oop obj,
assert(mid->object() == NULL, "invariant");
// Move the object to the working free list defined by FreeHead,FreeTail.
if (*FreeHeadp == NULL) *FreeHeadp = mid;
if (*FreeTailp != NULL) {
ObjectMonitor * prevtail = *FreeTailp;
if (*freeHeadp == NULL) *freeHeadp = mid;
if (*freeTailp != NULL) {
ObjectMonitor * prevtail = *freeTailp;
assert(prevtail->FreeNext == NULL, "cleaned up deflated?"); // TODO KK
prevtail->FreeNext = mid;
}
*FreeTailp = mid;
*freeTailp = mid;
deflated = true;
}
return deflated;
@ -1429,7 +1424,7 @@ bool ObjectSynchronizer::deflate_monitor(ObjectMonitor* mid, oop obj,
// Caller acquires ListLock
int ObjectSynchronizer::walk_monitor_list(ObjectMonitor** listheadp,
ObjectMonitor** FreeHeadp, ObjectMonitor** FreeTailp) {
ObjectMonitor** freeHeadp, ObjectMonitor** freeTailp) {
ObjectMonitor* mid;
ObjectMonitor* next;
ObjectMonitor* curmidinuse = NULL;
@ -1439,7 +1434,7 @@ int ObjectSynchronizer::walk_monitor_list(ObjectMonitor** listheadp,
oop obj = (oop) mid->object();
bool deflated = false;
if (obj != NULL) {
deflated = deflate_monitor(mid, obj, FreeHeadp, FreeTailp);
deflated = deflate_monitor(mid, obj, freeHeadp, freeTailp);
}
if (deflated) {
// extract from per-thread in-use-list
@ -1482,7 +1477,9 @@ void ObjectSynchronizer::deflate_idle_monitors() {
nInCirculation+= cur->omInUseCount;
int deflatedcount = walk_monitor_list(cur->omInUseList_addr(), &FreeHead, &FreeTail);
cur->omInUseCount-= deflatedcount;
// verifyInUse(cur);
if (ObjectMonitor::Knob_VerifyInUse) {
verifyInUse(cur);
}
nScavenged += deflatedcount;
nInuse += cur->omInUseCount;
}
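deflate_monitor() above threads each deflated monitor onto a working free list through the freeHeadp/freeTailp out-parameters; the step being performed is an ordinary head/tail append on a singly-linked list. A generic version of that append, with an illustrative node type rather than ObjectMonitor, looks like this:

#include <cstddef>

// Generic tail-append used to build a working list, mirroring the
// freeHeadp/freeTailp handling above; MonNode is a stand-in for
// ObjectMonitor with its FreeNext link.
struct MonNode {
  MonNode* FreeNext;
};

static void append_to_working_free_list(MonNode** freeHeadp, MonNode** freeTailp,
                                        MonNode* mid) {
  mid->FreeNext = NULL;                     // mid becomes the new tail
  if (*freeHeadp == NULL) *freeHeadp = mid; // first element is also the head
  if (*freeTailp != NULL) {
    MonNode* prevtail = *freeTailp;
    prevtail->FreeNext = mid;               // old tail now points at mid
  }
  *freeTailp = mid;
}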

View File

@ -84,7 +84,7 @@ class ObjectSynchronizer : AllStatic {
static void reenter (Handle obj, intptr_t recursion, TRAPS);
// thread-specific and global objectMonitor free list accessors
// static void verifyInUse (Thread * Self) ; too slow for general assert/debug
static void verifyInUse(Thread * Self);
static ObjectMonitor * omAlloc(Thread * Self);
static void omRelease(Thread * Self, ObjectMonitor * m, bool FromPerThreadAlloc);
static void omFlush(Thread * Self);
@ -114,10 +114,10 @@ class ObjectSynchronizer : AllStatic {
// An adaptive profile-based deflation policy could be used if needed
static void deflate_idle_monitors();
static int walk_monitor_list(ObjectMonitor** listheadp,
ObjectMonitor** FreeHeadp,
ObjectMonitor** FreeTailp);
static bool deflate_monitor(ObjectMonitor* mid, oop obj, ObjectMonitor** FreeHeadp,
ObjectMonitor** FreeTailp);
ObjectMonitor** freeHeadp,
ObjectMonitor** freeTailp);
static bool deflate_monitor(ObjectMonitor* mid, oop obj, ObjectMonitor** freeHeadp,
ObjectMonitor** freeTailp);
static void oops_do(OopClosure* f);
// debugging
@ -130,7 +130,10 @@ class ObjectSynchronizer : AllStatic {
enum { _BLOCKSIZE = 128 };
static ObjectMonitor* gBlockList;
static ObjectMonitor * volatile gFreeList;
static ObjectMonitor * volatile gOmInUseList; // for moribund thread, so monitors they inflated still get scanned
// global monitor in use list, for moribund threads,
// monitors they inflated need to be scanned for deflation
static ObjectMonitor * volatile gOmInUseList;
// count of entries in gOmInUseList
static int gOmInUseCount;
};

View File

@ -4392,6 +4392,12 @@ void Thread::SpinRelease (volatile int * adr) {
// It's safe if subsequent LDs and STs float "up" into the critical section,
// but prior LDs and STs within the critical section can't be allowed
// to reorder or float past the ST that releases the lock.
// Loads and stores in the critical section - which appear in program
// order before the store that releases the lock - must also appear
// before the store that releases the lock in memory visibility order.
// Conceptually we need a #loadstore|#storestore "release" MEMBAR before
// the ST of 0 into the lock-word which releases the lock, so fence
// more than covers this on all platforms.
*adr = 0;
}
@ -4573,18 +4579,25 @@ void Thread::muxAcquireW (volatile intptr_t * Lock, ParkEvent * ev) {
// This implementation pops from the head of the list. This is unfair,
// but tends to provide excellent throughput as hot threads remain hot.
// (We wake recently run threads first).
//
// All paths through muxRelease() will execute a CAS.
// Release consistency -- We depend on the CAS in muxRelease() to provide full
// bidirectional fence/MEMBAR semantics, ensuring that all prior memory operations
// executed within the critical section are complete and globally visible before the
// store (CAS) to the lock-word that releases the lock becomes globally visible.
void Thread::muxRelease (volatile intptr_t * Lock) {
for (;;) {
const intptr_t w = Atomic::cmpxchg_ptr(0, Lock, LOCKBIT);
assert(w & LOCKBIT, "invariant");
if (w == LOCKBIT) return;
ParkEvent * List = (ParkEvent *)(w & ~LOCKBIT);
ParkEvent * const List = (ParkEvent *) (w & ~LOCKBIT);
assert(List != NULL, "invariant");
assert(List->OnList == intptr_t(Lock), "invariant");
ParkEvent * nxt = List->ListNext;
ParkEvent * const nxt = List->ListNext;
guarantee((intptr_t(nxt) & LOCKBIT) == 0, "invariant");
// The following CAS() releases the lock and pops the head element.
// The CAS() also ratifies the previously fetched lock-word value.
if (Atomic::cmpxchg_ptr (intptr_t(nxt), Lock, w) != w) {
continue;
}
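The comments added to SpinRelease() and muxRelease() above distinguish two ways of getting the ordering a lock release needs: an explicit release barrier followed by a plain store of zero, versus relying on a fully fenced CAS that clears the lock word (and, in muxRelease, pops the list head) in one step. In std::atomic terms the two idioms look roughly like the sketch below; the lock word is generic, not Thread's Lock/LOCKBIT machinery.

#include <atomic>
#include <cstdint>

static std::atomic<std::intptr_t> lock_word{0};

// SpinRelease flavor: an explicit release barrier, then a plain store of zero.
// This provides the #loadstore|#storestore ordering the comment above asks for.
void release_with_fence() {
  std::atomic_thread_fence(std::memory_order_release);
  lock_word.store(0, std::memory_order_relaxed);
}

// muxRelease flavor: the CAS that clears the lock word is itself fully
// ordered, so no separate barrier is needed before it.
bool release_with_cas(std::intptr_t expected_locked) {
  return lock_word.compare_exchange_strong(expected_locked, 0,
                                           std::memory_order_seq_cst);
}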