8225631: Consider replacing muxAcquire/Release with PlatformMonitor

Reviewed-by: coleenp, dcubed, kbarrett
This commit is contained in:
David Holmes 2020-11-18 22:45:49 +00:00
parent 646c20022c
commit 99eac53580
6 changed files with 26 additions and 183 deletions

View File

@ -1520,7 +1520,7 @@ void os::PlatformEvent::unpark() {
// shake out uses of park() and unpark() without checking state conditions
// properly. This spurious return doesn't manifest itself in any user code
// but only in the correctly written condition checking loops of ObjectMonitor,
// Mutex/Monitor, Thread::muxAcquire and JavaThread::sleep
// Mutex/Monitor, and JavaThread::sleep
if (Atomic::xchg(&_event, 1) >= 0) return;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -125,7 +125,6 @@ class ParkEvent : public os::PlatformEvent {
public:
// MCS-CLH list linkage and Native Mutex/Monitor
ParkEvent * volatile ListNext ;
volatile intptr_t OnList ;
volatile int TState ;
volatile int Notified ; // for native monitor construct
@ -146,7 +145,6 @@ class ParkEvent : public os::PlatformEvent {
AssociatedWith = NULL ;
FreeNext = NULL ;
ListNext = NULL ;
OnList = 0 ;
TState = 0 ;
Notified = 0 ;
}

View File

@ -227,8 +227,14 @@ int dtrace_waited_probe(ObjectMonitor* monitor, Handle obj, Thread* thr) {
return 0;
}
#define NINFLATIONLOCKS 256
static volatile intptr_t gInflationLocks[NINFLATIONLOCKS];
static const int NINFLATIONLOCKS = 256;
static os::PlatformMutex* gInflationLocks[NINFLATIONLOCKS];
// Populate the gInflationLocks table: every one of the NINFLATIONLOCKS slots
// receives its own newly allocated os::PlatformMutex. The slots are
// dereferenced by users of the table, so this must run before the first use.
void ObjectSynchronizer::initialize() {
  int slot = 0;
  while (slot < NINFLATIONLOCKS) {
    gInflationLocks[slot] = new os::PlatformMutex();
    ++slot;
  }
}
static MonitorList _in_use_list;
// The ratio of the current _in_use_list count to the ceiling is used
@ -749,13 +755,7 @@ static markWord read_stable_mark(oop obj) {
// The object is being inflated by some other thread.
// The caller of read_stable_mark() must wait for inflation to complete.
// Avoid live-lock
// TODO: consider calling SafepointSynchronize::do_call_back() while
// spinning to see if there's a safepoint pending. If so, immediately
// yielding or blocking would be appropriate. Avoid spinning while
// there is a safepoint pending.
// TODO: add inflation contention performance counters.
// TODO: restrict the aggregate number of spinners.
// Avoid live-lock.
++its;
if (its > 10000 || !os::is_MP()) {
@ -775,15 +775,15 @@ static markWord read_stable_mark(oop obj) {
// and calling park(). When inflation was complete the thread that accomplished inflation
// would detach the list and set the markword to inflated with a single CAS and
// then for each thread on the list, set the flag and unpark() the thread.
// This is conceptually similar to muxAcquire-muxRelease, except that muxRelease
// wakes at most one thread whereas we need to wake the entire list.
// Index into the lock array based on the current object address.
static_assert(is_power_of_2(NINFLATIONLOCKS), "must be");
int ix = (cast_from_oop<intptr_t>(obj) >> 5) & (NINFLATIONLOCKS-1);
int YieldThenBlock = 0;
assert(ix >= 0 && ix < NINFLATIONLOCKS, "invariant");
assert((NINFLATIONLOCKS & (NINFLATIONLOCKS-1)) == 0, "invariant");
Thread::muxAcquire(gInflationLocks + ix, "gInflationLock");
gInflationLocks[ix]->lock();
while (obj->mark() == markWord::INFLATING()) {
// Beware: NakedYield() is advisory and has almost no effect on some platforms
// Beware: naked_yield() is advisory and has almost no effect on some platforms
// so we periodically call self->_ParkEvent->park(1).
// We use a mixed spin/yield/block mechanism.
if ((YieldThenBlock++) >= 16) {
@ -792,7 +792,7 @@ static markWord read_stable_mark(oop obj) {
os::naked_yield();
}
}
Thread::muxRelease(gInflationLocks + ix);
gInflationLocks[ix]->unlock();
}
} else {
SpinPause(); // SMP-polite spinning

View File

@ -29,6 +29,7 @@
#include "oops/markWord.hpp"
#include "runtime/basicLock.hpp"
#include "runtime/handles.hpp"
#include "runtime/os.hpp"
#include "runtime/perfData.hpp"
class LogStream;
@ -114,6 +115,9 @@ class ObjectSynchronizer : AllStatic {
static void release_monitors_owned_by_thread(TRAPS);
static void monitors_iterate(MonitorClosure* m);
// Initialize the gInflationLocks
static void initialize();
// GC: we currently use an aggressive monitor deflation policy
// Basically we try to deflate all monitors that are not busy.
static size_t deflate_idle_monitors();

View File

@ -291,7 +291,6 @@ Thread::Thread() {
// The stack would act as a cache to avoid calls to ParkEvent::Allocate()
// and ::Release()
_ParkEvent = ParkEvent::Allocate(this);
_MuxEvent = ParkEvent::Allocate(this);
#ifdef CHECK_UNHANDLED_OOPS
if (CheckUnhandledOops) {
@ -439,7 +438,6 @@ Thread::~Thread() {
// It's possible we can encounter a null _ParkEvent, etc., in stillborn threads.
// We NULL out the fields for good hygiene.
ParkEvent::Release(_ParkEvent); _ParkEvent = NULL;
ParkEvent::Release(_MuxEvent); _MuxEvent = NULL;
delete handle_area();
delete metadata_handles();
@ -3560,6 +3558,7 @@ jint Threads::create_vm(JavaVMInitArgs* args, bool* canTryAgain) {
// Initialize Java-Level synchronization subsystem
ObjectMonitor::Initialize();
ObjectSynchronizer::initialize();
// Initialize global modules
jint status = init_globals();
@ -4582,22 +4581,11 @@ void Threads::print_threads_compiling(outputStream* st, char* buf, int buflen, b
}
// Internal SpinLock and Mutex
// Based on ParkEvent
// Ad-hoc mutual exclusion primitives: SpinLock and Mux
// Ad-hoc mutual exclusion primitives: SpinLock
//
// We employ SpinLocks _only for low-contention, fixed-length
// short-duration critical sections where we're concerned
// about native mutex_t or HotSpot Mutex:: latency.
// The mux construct provides a spin-then-block mutual exclusion
// mechanism.
//
// Testing has shown that contention on the ListLock guarding gFreeList
// is common. If we implement ListLock as a simple SpinLock it's common
// for the JVM to devolve to yielding with little progress. This is true
// despite the fact that the critical sections protected by ListLock are
// extremely short.
//
// TODO-FIXME: ListLock should be of type SpinLock.
// We should make this a 1st-class type, integrated into the lock
@ -4650,150 +4638,6 @@ void Thread::SpinRelease(volatile int * adr) {
*adr = 0;
}
// muxAcquire and muxRelease:
//
// * muxAcquire and muxRelease support a single-word lock-word construct.
// The LSB of the word is set IFF the lock is held.
// The remainder of the word points to the head of a singly-linked list
// of threads blocked on the lock.
//
// * The current implementation of muxAcquire-muxRelease uses its own
// dedicated Thread._MuxEvent instance. If we're interested in
// minimizing the peak number of extant ParkEvent instances then
// we could eliminate _MuxEvent and "borrow" _ParkEvent as long
// as certain invariants were satisfied. Specifically, care would need
// to be taken with regards to consuming unpark() "permits".
// A safe rule of thumb is that a thread would never call muxAcquire()
// if it's enqueued (cxq, EntryList, WaitList, etc) and will subsequently
// park(). Otherwise the _ParkEvent park() operation in muxAcquire() could
// consume an unpark() permit intended for monitorenter, for instance.
// One way around this would be to widen the restricted-range semaphore
// implemented in park(). Another alternative would be to provide
// multiple instances of the PlatformEvent() for each thread. One
// instance would be dedicated to muxAcquire-muxRelease, for instance.
//
// * Usage:
// -- Only as leaf locks
// -- for short-term locking only as muxAcquire does not perform
// thread state transitions.
//
// Alternatives:
// * We could implement muxAcquire and muxRelease with MCS or CLH locks
// but with parking or spin-then-park instead of pure spinning.
// * Use Taura-Oyama-Yonezawa locks.
// * It's possible to construct a 1-0 lock if we encode the lockword as
// (List,LockByte). Acquire will CAS the full lockword while Release
// will STB 0 into the LockByte. The 1-0 scheme admits stranding, so
// acquiring threads use timers (ParkTimed) to detect and recover from
// the stranding window. Thread/Node structures must be aligned on 256-byte
// boundaries by using placement-new.
// * Augment MCS with advisory back-link fields maintained with CAS().
// Pictorially: LockWord -> T1 <-> T2 <-> T3 <-> ... <-> Tn <-> Owner.
// The validity of the backlinks must be ratified before we trust the value.
// If the backlinks are invalid the exiting thread must back-track through
// the forward links, which are always trustworthy.
// * Add a successor indication. The LockWord is currently encoded as
// (List, LOCKBIT:1). We could also add a SUCCBIT or an explicit _succ variable
// to provide the usual futile-wakeup optimization.
// See RTStt for details.
//
const intptr_t LOCKBIT = 1;
// Acquire the single-word lock at *Lock, blocking the calling thread if needed.
//
// Lock     - address of the lock word. Bit 0 (LOCKBIT) is set while the lock is
//            held; the remaining bits hold a pointer to the head ParkEvent of
//            the list of waiting threads (or zero when nobody is queued).
// LockName - not referenced anywhere in this body.
//
// Strategy: try an uncontended CAS, then spin a bounded number of times, then
// push the current thread's _MuxEvent onto the list stored in the lock word
// and park until a releasing thread clears OnList. After waking, the whole
// spin/enqueue/park sequence is repeated; the lock is not handed off directly.
void Thread::muxAcquire(volatile intptr_t * Lock, const char * LockName) {
// Fast path: lock word is completely clear (unlocked, no waiters).
// cmpxchg is used here as (dest, compare_value, new_value) and its result is
// compared against the expected old value to detect success.
intptr_t w = Atomic::cmpxchg(Lock, (intptr_t)0, LOCKBIT);
if (w == 0) return;
// Second chance: unlocked but with waiters queued; set LOCKBIT while keeping
// the existing list pointer intact.
if ((w & LOCKBIT) == 0 && Atomic::cmpxchg(Lock, w, w|LOCKBIT) == w) {
return;
}
ParkEvent * const Self = Thread::current()->_MuxEvent;
// The event address shares the word with LOCKBIT, so its low bit must be zero.
assert((intptr_t(Self) & LOCKBIT) == 0, "invariant");
for (;;) {
// 101 attempts when os::is_MP() is true, otherwise a single attempt.
int its = (os::is_MP() ? 100 : 0) + 1;
// Optional spin phase: spin-then-park strategy
while (--its >= 0) {
w = *Lock;
if ((w & LOCKBIT) == 0 && Atomic::cmpxchg(Lock, w, w|LOCKBIT) == w) {
return;
}
}
// NOTE(review): reset() presumably discards any stale unpark permit before
// we enqueue -- confirm against PlatformEvent::reset().
Self->reset();
// Mark this event as enqueued on this particular lock; muxRelease() asserts
// on this value and clears it to signal the wakeup.
Self->OnList = intptr_t(Lock);
// The following fence() isn't _strictly necessary as the subsequent
// CAS() both serializes execution and ratifies the fetched *Lock value.
OrderAccess::fence();
// Enqueue loop: either grab the lock if it became free, or push Self onto
// the front of the waiter list stored in the lock word.
for (;;) {
w = *Lock;
if ((w & LOCKBIT) == 0) {
if (Atomic::cmpxchg(Lock, w, w|LOCKBIT) == w) {
Self->OnList = 0; // hygiene - allows stronger asserts
return;
}
continue; // Interference -- *Lock changed -- Just retry
}
assert(w & LOCKBIT, "invariant");
// Link to the current head, then try to install Self as the new head while
// leaving LOCKBIT set for the current owner.
Self->ListNext = (ParkEvent *) (w & ~LOCKBIT);
if (Atomic::cmpxchg(Lock, w, intptr_t(Self)|LOCKBIT) == w) break;
}
// Block until a releaser zeroes OnList; the loop re-checks the flag after
// every return from park().
while (Self->OnList != 0) {
Self->park();
}
// Woken: fall through to the top of the outer loop and compete again.
}
}
// Release() must extract a successor from the list and then wake that thread.
// It can "pop" the front of the list or use a detach-modify-reattach (DMR) scheme
// similar to that used by ParkEvent::Allocate() and ::Release(). DMR-based
// Release() would :
// (A) CAS() or swap() null to *Lock, releasing the lock and detaching the list.
// (B) Extract a successor from the private list "in-hand"
// (C) attempt to CAS() the residual back into *Lock over null.
// If there were any newly arrived threads then the CAS() would fail.
// In that case Release() would detach the RATs, re-merge the list in-hand
// with the RATs and repeat as needed. Alternately, Release() might
// detach and extract a successor, but then pass the residual list to the wakee.
// The wakee would be responsible for reattaching and remerging before it
// competed for the lock.
//
// Both "pop" and DMR are immune from ABA corruption -- there can be
// multiple concurrent pushers, but only one popper or detacher.
// This implementation pops from the head of the list. This is unfair,
// but tends to provide excellent throughput as hot threads remain hot.
// (We wake recently run threads first).
//
// All paths through muxRelease() will execute a CAS.
// Release consistency -- We depend on the CAS in muxRelease() to provide full
// bidirectional fence/MEMBAR semantics, ensuring that all prior memory operations
// executed within the critical section are complete and globally visible before the
// store (CAS) to the lock-word that releases the lock becomes globally visible.
// Release the single-word lock at *Lock and, if any thread is queued on it,
// pop the head waiter from the list and unpark it.
//
// Lock - address of the lock word (LOCKBIT in bit 0, waiter-list head pointer
//        in the remaining bits). The asserts below require LOCKBIT to be set
//        on entry, i.e. the lock must currently be held.
void Thread::muxRelease(volatile intptr_t * Lock) {
for (;;) {
// Uncontended case: the word is exactly LOCKBIT (held, no waiters), so
// swinging it to zero releases the lock and nothing more is needed.
const intptr_t w = Atomic::cmpxchg(Lock, LOCKBIT, (intptr_t)0);
assert(w & LOCKBIT, "invariant");
if (w == LOCKBIT) return;
// Otherwise the upper bits point at the head of the waiter list.
ParkEvent * const List = (ParkEvent *) (w & ~LOCKBIT);
assert(List != NULL, "invariant");
// The head waiter must have recorded this lock in OnList when it enqueued.
assert(List->OnList == intptr_t(Lock), "invariant");
ParkEvent * const nxt = List->ListNext;
// nxt becomes the new lock word, so its low bit must be clear or the lock
// would still appear held after the CAS below.
guarantee((intptr_t(nxt) & LOCKBIT) == 0, "invariant");
// The following CAS() releases the lock and pops the head element.
// The CAS() also ratifies the previously fetched lock-word value.
if (Atomic::cmpxchg(Lock, w, intptr_t(nxt)) != w) {
// The lock word changed since it was read (e.g. a new waiter was pushed);
// start over with a fresh value.
continue;
}
// Clearing OnList is what lets the waiter leave its park loop in
// muxAcquire(); the fence orders that store before the unpark().
List->OnList = 0;
OrderAccess::fence();
List->unpark();
return;
}
}
void Threads::verify() {
ALL_JAVA_THREADS(p) {

View File

@ -827,8 +827,8 @@ protected:
public:
volatile intptr_t _Stalled;
volatile int _TypeTag;
ParkEvent * _ParkEvent; // for Object monitors and JVMTI raw monitors
ParkEvent * _MuxEvent; // for low-level muxAcquire-muxRelease
ParkEvent * _ParkEvent; // for Object monitors, JVMTI raw monitors,
// and ObjectSynchronizer::read_stable_mark
int NativeSyncRecursion; // diagnostic
volatile int _OnTrap; // Resume-at IP delta
@ -837,13 +837,10 @@ protected:
jint _hashStateY;
jint _hashStateZ;
// Low-level leaf-lock primitives used to implement synchronization
// and native monitor-mutex infrastructure.
// Low-level leaf-lock primitives used to implement synchronization.
// Not for general synchronization use.
static void SpinAcquire(volatile int * Lock, const char * Name);
static void SpinRelease(volatile int * Lock);
static void muxAcquire(volatile intptr_t * Lock, const char * Name);
static void muxRelease(volatile intptr_t * Lock);
};
// Inline implementation of Thread::current()