diff --git a/make/test/JtregNativeHotspot.gmk b/make/test/JtregNativeHotspot.gmk
index 6b5c2770af1..bd0047468f4 100644
--- a/make/test/JtregNativeHotspot.gmk
+++ b/make/test/JtregNativeHotspot.gmk
@@ -874,7 +874,7 @@ BUILD_HOTSPOT_JTREG_EXECUTABLES_LIBS_exesigtest := -ljvm
 
 ifeq ($(call isTargetOs, windows), true)
     BUILD_HOTSPOT_JTREG_EXECUTABLES_CFLAGS_exeFPRegs := -MT
-    BUILD_HOTSPOT_JTREG_EXCLUDE += exesigtest.c libterminatedThread.c libTestJNI.c
+    BUILD_HOTSPOT_JTREG_EXCLUDE += exesigtest.c libterminatedThread.c libTestJNI.c libCompleteExit.c
     BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libatExit := jvm.lib
 else
     BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libbootclssearch_agent += -lpthread
@@ -1512,6 +1512,7 @@ else
     BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libgetphase002 += -lpthread
     BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libterminatedThread += -lpthread
     BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libatExit += -ljvm
+    BUILD_HOTSPOT_JTREG_LIBRARIES_LIBS_libCompleteExit += -lpthread
 endif
 
 # This evaluation is expensive and should only be done if this target was
diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index 66b83c0cb86..2b8ed2e8563 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -3959,7 +3959,7 @@ encode %{
     Register tmp = as_Register($tmp2$$reg);
     Label cont;
     Label object_has_monitor;
-    Label cas_failed;
+    Label no_count;
 
     assert_different_registers(oop, box, tmp, disp_hdr);
 
@@ -3995,9 +3995,6 @@ encode %{
       // If the compare-and-exchange succeeded, then we found an unlocked
       // object, will have now locked it will continue at label cont
 
-      __ bind(cas_failed);
-      // We did not see an unlocked object so try the fast recursive case.
-
       // Check if the owner is self by comparing the value in the
       // markWord of object (disp_hdr) with the stack pointer.
       __ mov(rscratch1, sp);
@@ -4042,6 +4039,11 @@ encode %{
     __ bind(cont);
     // flag == EQ indicates success
     // flag == NE indicates failure
+    __ br(Assembler::NE, no_count);
+
+    __ increment(Address(rthread, JavaThread::held_monitor_count_offset()));
+
+    __ bind(no_count);
   %}
 
   enc_class aarch64_enc_fast_unlock(iRegP object, iRegP box, iRegP tmp, iRegP tmp2) %{
@@ -4052,6 +4054,7 @@ encode %{
     Register tmp = as_Register($tmp2$$reg);
     Label cont;
     Label object_has_monitor;
+    Label no_count;
 
     assert_different_registers(oop, box, tmp, disp_hdr);
 
@@ -4110,6 +4113,11 @@ encode %{
     __ bind(cont);
     // flag == EQ indicates success
     // flag == NE indicates failure
+    __ br(Assembler::NE, no_count);
+
+    __ decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
+
+    __ bind(no_count);
   %}
 
 %}
diff --git a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp
index 3bcf27d9637..764324e4bed 100644
--- a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp
@@ -437,7 +437,6 @@ int LIR_Assembler::emit_unwind_handler() {
       __ unlock_object(r5, r4, r0, *stub->entry());
     }
     __ bind(*stub->continuation());
-    __ dec_held_monitor_count(rthread);
   }
 
   if (compilation()->env()->dtrace_method_probes()) {
@@ -2577,18 +2576,7 @@ void LIR_Assembler::emit_lock(LIR_OpLock* op) {
   } else {
     Unimplemented();
   }
-  if (op->code() == lir_lock) {
-    // If deoptimization happens in Runtime1::monitorenter, inc_held_monitor_count after backing from slowpath
-    // will be skipped. Solution is:
-    // 1. Increase only in fastpath
-    // 2. Runtime1::monitorenter increase count after locking
-    __ inc_held_monitor_count(rthread);
-  }
   __ bind(*op->stub()->continuation());
-  if (op->code() == lir_unlock) {
-    // unlock in slowpath is JRT_Leaf stub, no deoptimization can happen
-    __ dec_held_monitor_count(rthread);
-  }
 }
 
 void LIR_Assembler::emit_load_klass(LIR_OpLoadKlass* op) {
diff --git a/src/hotspot/cpu/aarch64/c1_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c1_MacroAssembler_aarch64.cpp
index fcc219c35eb..afd5be0db46 100644
--- a/src/hotspot/cpu/aarch64/c1_MacroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/c1_MacroAssembler_aarch64.cpp
@@ -116,6 +116,7 @@ int C1_MacroAssembler::lock_object(Register hdr, Register obj, Register disp_hdr
   cbnz(hdr, slow_case);
   // done
   bind(done);
+  increment(Address(rthread, JavaThread::held_monitor_count_offset()));
   return null_check_offset;
 }
 
@@ -147,6 +148,7 @@ void C1_MacroAssembler::unlock_object(Register hdr, Register obj, Register disp_
   }
   // done
   bind(done);
+  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 }
 
 
diff --git a/src/hotspot/cpu/aarch64/globalDefinitions_aarch64.hpp b/src/hotspot/cpu/aarch64/globalDefinitions_aarch64.hpp
index fe88f7b887c..622145ad155 100644
--- a/src/hotspot/cpu/aarch64/globalDefinitions_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/globalDefinitions_aarch64.hpp
@@ -34,6 +34,8 @@ const bool CCallingConventionRequiresIntsAsLongs = false;
 
 #define SUPPORTS_NATIVE_CX8
 
+#define SUPPORT_MONITOR_COUNT
+
 // Aarch64 was not originally defined to be multi-copy-atomic, but now
 // is.  See: "Simplifying ARM Concurrency: Multicopy-atomic Axiomatic
 // and Operational Models for ARMv8"
diff --git a/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp b/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp
index 1a473a23792..cb1b5061ac3 100644
--- a/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp
@@ -606,7 +606,6 @@ void InterpreterMacroAssembler::remove_activation(
 
   bind(unlock);
   unlock_object(c_rarg1);
-  dec_held_monitor_count(rthread);
   pop(state);
 
   // Check that for block-structured locking (i.e., that all locked
@@ -649,7 +648,6 @@ void InterpreterMacroAssembler::remove_activation(
 
       push(state);
       unlock_object(c_rarg1);
-      dec_held_monitor_count(rthread);
       pop(state);
 
       if (install_monitor_exception) {
@@ -732,7 +730,7 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg)
             CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter),
             lock_reg);
   } else {
-    Label done;
+    Label count, done;
 
     const Register swap_reg = r0;
     const Register tmp = c_rarg2;
@@ -766,7 +764,7 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg)
            "displached header must be first word in BasicObjectLock");
 
     Label fail;
-    cmpxchg_obj_header(swap_reg, lock_reg, obj_reg, rscratch1, done, /*fallthrough*/NULL);
+    cmpxchg_obj_header(swap_reg, lock_reg, obj_reg, rscratch1, count, /*fallthrough*/NULL);
 
     // Fast check for recursive lock.
     //
@@ -803,7 +801,7 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg)
 
     // Save the test result, for recursive case, the result is zero
     str(swap_reg, Address(lock_reg, mark_offset));
-    br(Assembler::EQ, done);
+    br(Assembler::EQ, count);
 
     bind(slow_case);
 
@@ -811,6 +809,10 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg)
     call_VM(noreg,
             CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter),
             lock_reg);
+    b(done);
+
+    bind(count);
+    increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 
     bind(done);
   }
@@ -835,7 +837,7 @@ void InterpreterMacroAssembler::unlock_object(Register lock_reg)
   if (UseHeavyMonitors) {
     call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit), lock_reg);
   } else {
-    Label done;
+    Label count, done;
 
     const Register swap_reg   = r0;
     const Register header_reg = c_rarg2;  // Will contain the old oopMark
@@ -858,17 +860,20 @@ void InterpreterMacroAssembler::unlock_object(Register lock_reg)
                             BasicLock::displaced_header_offset_in_bytes()));
 
     // Test for recursion
-    cbz(header_reg, done);
+    cbz(header_reg, count);
 
     // Atomic swap back the old header
-    cmpxchg_obj_header(swap_reg, header_reg, obj_reg, rscratch1, done, /*fallthrough*/NULL);
+    cmpxchg_obj_header(swap_reg, header_reg, obj_reg, rscratch1, count, /*fallthrough*/NULL);
 
     // Call the runtime routine for slow case.
     str(obj_reg, Address(lock_reg, BasicObjectLock::obj_offset_in_bytes())); // restore obj
     call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit), lock_reg);
+    b(done);
+
+    bind(count);
+    decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 
     bind(done);
-
     restore_bcp();
   }
 }
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
index 86784d07b25..16af49b7ebd 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -346,20 +346,6 @@ void MacroAssembler::pop_cont_fastpath(Register java_thread) {
   bind(done);
 }
 
-void MacroAssembler::inc_held_monitor_count(Register java_thread) {
-  if (!Continuations::enabled()) return;
-  incrementw(Address(java_thread, JavaThread::held_monitor_count_offset()));
-}
-
-void MacroAssembler::dec_held_monitor_count(Register java_thread) {
-  if (!Continuations::enabled()) return;
-  decrementw(Address(java_thread, JavaThread::held_monitor_count_offset()));
-}
-
-void MacroAssembler::reset_held_monitor_count(Register java_thread) {
-  strw(zr, Address(java_thread, JavaThread::held_monitor_count_offset()));
-}
-
 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
   // we must set sp to zero to clear frame
   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
index de453084f3b..1c3a57fd494 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -900,9 +900,6 @@ public:
 
   void push_cont_fastpath(Register java_thread);
   void pop_cont_fastpath(Register java_thread);
-  void inc_held_monitor_count(Register java_thread);
-  void dec_held_monitor_count(Register java_thread);
-  void reset_held_monitor_count(Register java_thread);
 
   // Round up to a power of two
   void round_to(Register reg, int modulus);
diff --git a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp
index 7764c791a14..9bef3f55916 100644
--- a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp
@@ -1609,7 +1609,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
   Label lock_done;
 
   if (method->is_synchronized()) {
-
+    Label count;
     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
 
     // Get the handle (the 2nd argument)
@@ -1631,9 +1631,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
       __ str(swap_reg, Address(lock_reg, mark_word_offset));
 
       // src -> dest iff dest == r0 else r0 <- dest
-      { Label here;
-        __ cmpxchg_obj_header(r0, lock_reg, obj_reg, rscratch1, lock_done, /*fallthrough*/NULL);
-      }
+      __ cmpxchg_obj_header(r0, lock_reg, obj_reg, rscratch1, count, /*fallthrough*/NULL);
 
       // Hmm should this move to the slow path code area???
 
@@ -1656,6 +1654,8 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
     } else {
       __ b(slow_path_lock);
     }
+    __ bind(count);
+    __ increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 
     // Slow path will re-enter here
     __ bind(lock_done);
@@ -1757,14 +1757,18 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
     // Get locked oop from the handle we passed to jni
     __ ldr(obj_reg, Address(oop_handle_reg, 0));
 
-    Label done;
+    Label done, not_recursive;
 
     if (!UseHeavyMonitors) {
       // Simple recursive lock?
       __ ldr(rscratch1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
-      __ cbz(rscratch1, done);
+      __ cbnz(rscratch1, not_recursive);
+      __ decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
+      __ b(done);
     }
 
+    __ bind(not_recursive);
+
     // Must save r0 if if it is live now because cmpxchg must use it
     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
       save_native_result(masm, ret_type, stack_slots);
@@ -1777,9 +1781,10 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
       __ ldr(old_hdr, Address(r0, 0));
 
       // Atomic swap old header if oop still contains the stack lock
-      Label succeed;
-      __ cmpxchg_obj_header(r0, old_hdr, obj_reg, rscratch1, succeed, &slow_path_unlock);
-      __ bind(succeed);
+      Label count;
+      __ cmpxchg_obj_header(r0, old_hdr, obj_reg, rscratch1, count, &slow_path_unlock);
+      __ bind(count);
+      __ decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
     } else {
       __ b(slow_path_unlock);
     }
diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
index 941f0bd2dc5..776f2aaa049 100644
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@@ -8092,11 +8092,11 @@ void fill_continuation_entry(MacroAssembler* masm) {
 
   __ ldr(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
   __ str(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
-  __ ldrw(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
-  __ strw(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
+  __ ldr(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
+  __ str(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
 
   __ str(zr, Address(rthread, JavaThread::cont_fastpath_offset()));
-  __ reset_held_monitor_count(rthread);
+  __ str(zr, Address(rthread, JavaThread::held_monitor_count_offset()));
 }
 
 // on entry, sp points to the ContinuationEntry
@@ -8113,8 +8113,8 @@ void continuation_enter_cleanup(MacroAssembler* masm) {
 
   __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
   __ str(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
-  __ ldrw(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
-  __ strw(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
+  __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
+  __ str(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
 
   __ ldr(rscratch2, Address(sp, ContinuationEntry::parent_offset()));
   __ str(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
diff --git a/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp
index fcaf267bbb8..070bdf33d0f 100644
--- a/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp
@@ -777,7 +777,6 @@ void TemplateInterpreterGenerator::lock_method() {
   __ str(r0, Address(esp, BasicObjectLock::obj_offset_in_bytes()));
   __ mov(c_rarg1, esp); // object address
   __ lock_object(c_rarg1);
-  __ inc_held_monitor_count(rthread);
 }
 
 // Generate a fixed interpreter frame. This is identical setup for
@@ -1496,7 +1495,6 @@ address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) {
 
       __ bind(unlock);
       __ unlock_object(c_rarg1);
-      __ dec_held_monitor_count(rthread);
     }
     __ bind(L);
   }
diff --git a/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp b/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp
index c8deb29c1a6..05d3dbc1ff4 100644
--- a/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp
@@ -3866,9 +3866,6 @@ void TemplateTable::monitorenter()
   __ str(r0, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes()));
   __ lock_object(c_rarg1);
 
-  // The object is stored so counter should be increased even if stackoverflow is generated
-  __ inc_held_monitor_count(rthread);
-
   // check to make sure this monitor doesn't cause stack overflow after locking
   __ save_bcp();  // in case of exception
   __ generate_stack_overflow_check(0);
@@ -3927,7 +3924,6 @@ void TemplateTable::monitorexit()
   __ bind(found);
   __ push_ptr(r0); // make sure object is on stack (contract with oopMaps)
   __ unlock_object(c_rarg1);
-  __ dec_held_monitor_count(rthread);
   __ pop_ptr(r0); // discard object
 }
 
diff --git a/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp b/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp
index 28900c9ce07..890c4c12927 100644
--- a/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp
@@ -460,7 +460,6 @@ int LIR_Assembler::emit_unwind_handler() {
       __ unlock_object(rdi, rsi, rax, *stub->entry());
     }
     __ bind(*stub->continuation());
-    __ dec_held_monitor_count();
   }
 
   if (compilation()->env()->dtrace_method_probes()) {
@@ -3514,18 +3513,7 @@ void LIR_Assembler::emit_lock(LIR_OpLock* op) {
   } else {
     Unimplemented();
   }
-  if (op->code() == lir_lock) {
-    // If deoptimization happens in Runtime1::monitorenter, inc_held_monitor_count after backing from slowpath
-    // will be skipped. Solution is
-    // 1. Increase only in fastpath
-    // 2. Runtime1::monitorenter increase count after locking
-    __ inc_held_monitor_count();
-  }
   __ bind(*op->stub()->continuation());
-  if (op->code() == lir_unlock) {
-    // unlock in slowpath is JRT_Leaf stub, no deoptimization can happen
-    __ dec_held_monitor_count();
-  }
 }
 
 void LIR_Assembler::emit_load_klass(LIR_OpLoadKlass* op) {
diff --git a/src/hotspot/cpu/x86/c1_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c1_MacroAssembler_x86.cpp
index 26a1f36c94d..0e5c69b0864 100644
--- a/src/hotspot/cpu/x86/c1_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c1_MacroAssembler_x86.cpp
@@ -95,10 +95,12 @@ int C1_MacroAssembler::lock_object(Register hdr, Register obj, Register disp_hdr
   jcc(Assembler::notZero, slow_case);
   // done
   bind(done);
+
+  inc_held_monitor_count();
+
   return null_check_offset;
 }
 
-
 void C1_MacroAssembler::unlock_object(Register hdr, Register obj, Register disp_hdr, Label& slow_case) {
   const int aligned_mask = BytesPerWord -1;
   const int hdr_offset = oopDesc::mark_offset_in_bytes();
@@ -126,6 +128,8 @@ void C1_MacroAssembler::unlock_object(Register hdr, Register obj, Register disp_
   jcc(Assembler::notEqual, slow_case);
   // done
   bind(done);
+
+  dec_held_monitor_count();
 }
 
 
diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
index 56f480bd1fe..30ece6f7875 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -460,7 +460,7 @@ void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmp
   //    -- by other
   //
 
-  Label IsInflated, DONE_LABEL;
+  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 
   if (DiagnoseSyncOnValueBasedClasses != 0) {
     load_klass(tmpReg, objReg, cx1Reg);
@@ -488,7 +488,7 @@ void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmp
     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
     lock();
     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
-    jcc(Assembler::equal, DONE_LABEL);           // Success
+    jcc(Assembler::equal, COUNT);           // Success
 
     // Recursive locking.
     // The object is stack-locked: markword contains stack pointer to BasicLock.
@@ -544,7 +544,7 @@ void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmp
   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
   // If we weren't able to swing _owner from NULL to the BasicLock
   // then take the slow path.
-  jccb  (Assembler::notZero, DONE_LABEL);
+  jccb  (Assembler::notZero, NO_COUNT);
   // update _owner from BasicLock to thread
   get_thread (scrReg);                    // beware: clobbers ICCs
   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
@@ -567,10 +567,10 @@ void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmp
   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
   // Propagate ICC.ZF from CAS above into DONE_LABEL.
-  jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)
+  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 
-  cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
-  jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
+  cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
+  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 #endif // _LP64
@@ -584,7 +584,24 @@ void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmp
   // Unfortunately none of our alignment mechanisms suffice.
   bind(DONE_LABEL);
 
-  // At DONE_LABEL the icc ZFlag is set as follows ...
+  // ZFlag == 1 -> fast path succeeded; count the monitor here
+  // ZFlag == 0 -> fast path failed; counting is left to the slow path
+  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
+
+  bind(COUNT);
+  // Count monitors in fast path
+#ifndef _LP64
+  get_thread(tmpReg);
+  incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
+#else // _LP64
+  incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
+#endif
+
+  xorl(tmpReg, tmpReg); // Set ZF == 1
+
+  bind(NO_COUNT);
+
+  // At NO_COUNT the icc ZFlag is set as follows ...
   // fast_unlock uses the same protocol.
   // ZFlag == 1 -> Success
   // ZFlag == 0 -> Failure - force control through the slow path
@@ -626,7 +643,7 @@ void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register t
   assert(boxReg == rax, "");
   assert_different_registers(objReg, boxReg, tmpReg);
 
-  Label DONE_LABEL, Stacked, CheckSucc;
+  Label DONE_LABEL, Stacked, CheckSucc, COUNT, NO_COUNT;
 
 #if INCLUDE_RTM_OPT
   if (UseRTMForStackLocks && use_rtm) {
@@ -644,12 +661,12 @@ void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register t
 
   if (!UseHeavyMonitors) {
     cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
-    jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
+    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
   }
-  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
+  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
   if (!UseHeavyMonitors) {
     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
-    jccb  (Assembler::zero, Stacked);
+    jccb   (Assembler::zero, Stacked);
   }
 
   // It's inflated.
@@ -800,6 +817,23 @@ void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register t
   }
 #endif
   bind(DONE_LABEL);
+
+  // ZFlag == 1 -> fast path succeeded; decrement the monitor count here
+  // ZFlag == 0 -> fast path failed; counting is left to the slow path
+  jccb(Assembler::notZero, NO_COUNT);
+
+  bind(COUNT);
+  // Decrement the held monitor count in the fast path
+#ifndef _LP64
+  get_thread(tmpReg);
+  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
+#else // _LP64
+  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
+#endif
+
+  xorl(tmpReg, tmpReg); // Set ZF == 1
+
+  bind(NO_COUNT);
 }
 
 //-------------------------------------------------------------------------------------------
diff --git a/src/hotspot/cpu/x86/globalDefinitions_x86.hpp b/src/hotspot/cpu/x86/globalDefinitions_x86.hpp
index 60e8b0d2232..1d42e4f720f 100644
--- a/src/hotspot/cpu/x86/globalDefinitions_x86.hpp
+++ b/src/hotspot/cpu/x86/globalDefinitions_x86.hpp
@@ -33,6 +33,8 @@ const bool CCallingConventionRequiresIntsAsLongs = false;
 
 #define SUPPORTS_NATIVE_CX8
 
+#define SUPPORT_MONITOR_COUNT
+
 #define CPU_MULTI_COPY_ATOMIC
 
 // The expected size in bytes of a cache line, used to pad data structures.
diff --git a/src/hotspot/cpu/x86/interp_masm_x86.cpp b/src/hotspot/cpu/x86/interp_masm_x86.cpp
index f971142ee58..c0c5c55b483 100644
--- a/src/hotspot/cpu/x86/interp_masm_x86.cpp
+++ b/src/hotspot/cpu/x86/interp_masm_x86.cpp
@@ -1064,8 +1064,6 @@ void InterpreterMacroAssembler::remove_activation(
 
   bind(unlock);
   unlock_object(robj);
-  dec_held_monitor_count();
-
   pop(state);
 
   // Check that for block-structured locking (i.e., that all locked
@@ -1110,7 +1108,6 @@ void InterpreterMacroAssembler::remove_activation(
       push(state);
       mov(robj, rmon);   // nop if robj and rmon are the same
       unlock_object(robj);
-      dec_held_monitor_count();
       pop(state);
 
       if (install_monitor_exception) {
@@ -1205,7 +1202,7 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg) {
             CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter),
             lock_reg);
   } else {
-    Label done;
+    Label count_locking, done, slow_case;
 
     const Register swap_reg = rax; // Must use rax for cmpxchg instruction
     const Register tmp_reg = rbx;
@@ -1217,8 +1214,6 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg) {
     const int mark_offset = lock_offset +
                             BasicLock::displaced_header_offset_in_bytes();
 
-    Label slow_case;
-
     // Load object pointer into obj_reg
     movptr(obj_reg, Address(lock_reg, obj_offset));
 
@@ -1243,7 +1238,7 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg) {
 
     lock();
     cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
-    jcc(Assembler::zero, done);
+    jcc(Assembler::zero, count_locking);
 
     const int zero_bits = LP64_ONLY(7) NOT_LP64(3);
 
@@ -1279,7 +1274,11 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg) {
 
     // Save the test result, for recursive case, the result is zero
     movptr(Address(lock_reg, mark_offset), swap_reg);
-    jcc(Assembler::zero, done);
+    jcc(Assembler::notZero, slow_case);
+
+    bind(count_locking);
+    inc_held_monitor_count();
+    jmp(done);
 
     bind(slow_case);
 
@@ -1312,7 +1311,7 @@ void InterpreterMacroAssembler::unlock_object(Register lock_reg) {
   if (UseHeavyMonitors) {
     call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit), lock_reg);
   } else {
-    Label done;
+    Label count_locking, done, slow_case;
 
     const Register swap_reg   = rax;  // Must use rax for cmpxchg instruction
     const Register header_reg = LP64_ONLY(c_rarg2) NOT_LP64(rbx);  // Will contain the old oopMark
@@ -1338,16 +1337,20 @@ void InterpreterMacroAssembler::unlock_object(Register lock_reg) {
     testptr(header_reg, header_reg);
 
     // zero for recursive case
-    jcc(Assembler::zero, done);
+    jcc(Assembler::zero, count_locking);
 
     // Atomic swap back the old header
     lock();
     cmpxchgptr(header_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 
     // zero for simple unlock of a stack-lock case
-    jcc(Assembler::zero, done);
+    jcc(Assembler::notZero, slow_case);
 
+    bind(count_locking);
+    dec_held_monitor_count();
+    jmp(done);
 
+    bind(slow_case);
     // Call the runtime routine for slow case.
     movptr(Address(lock_reg, BasicObjectLock::obj_offset_in_bytes()), obj_reg); // restore obj
     call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit), lock_reg);
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
index fc0e2cce318..b1c65faaf31 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@@ -2905,56 +2905,26 @@ void MacroAssembler::pop_cont_fastpath() {
 }
 
 void MacroAssembler::inc_held_monitor_count() {
-  if (!Continuations::enabled()) return;
-
 #ifndef _LP64
   Register thread = rax;
   push(thread);
   get_thread(thread);
-#else
-  Register thread = r15_thread;
-#endif
-
   incrementl(Address(thread, JavaThread::held_monitor_count_offset()));
-
-#ifndef _LP64
   pop(thread);
+#else // _LP64
+  incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 #endif
 }
 
 void MacroAssembler::dec_held_monitor_count() {
-  if (!Continuations::enabled()) return;
-
 #ifndef _LP64
   Register thread = rax;
   push(thread);
   get_thread(thread);
-#else
-  Register thread = r15_thread;
-#endif
-
   decrementl(Address(thread, JavaThread::held_monitor_count_offset()));
-
-#ifndef _LP64
-  pop(thread);
-#endif
-}
-
-void MacroAssembler::reset_held_monitor_count() {
-  if (!Continuations::enabled()) return;
-
-#ifndef _LP64
-  Register thread = rax;
-  push(thread);
-  get_thread(thread);
-#else
-  Register thread = r15_thread;
-#endif
-
-  movl(Address(thread, JavaThread::held_monitor_count_offset()), (int32_t)0);
-
-#ifndef _LP64
   pop(thread);
+#else // _LP64
+  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 #endif
 }
 
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
index 7c3d098bfef..7332936dab9 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -526,9 +526,10 @@ class MacroAssembler: public Assembler {
 
   void push_cont_fastpath();
   void pop_cont_fastpath();
+
   void inc_held_monitor_count();
   void dec_held_monitor_count();
-  void reset_held_monitor_count();
+
   DEBUG_ONLY(void stop_if_in_cont(Register cont_reg, const char* name);)
 
   // Round up to a power of two
diff --git a/src/hotspot/cpu/x86/sharedRuntime_x86_32.cpp b/src/hotspot/cpu/x86/sharedRuntime_x86_32.cpp
index f3fe31d3f73..887b36c6117 100644
--- a/src/hotspot/cpu/x86/sharedRuntime_x86_32.cpp
+++ b/src/hotspot/cpu/x86/sharedRuntime_x86_32.cpp
@@ -1692,6 +1692,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
 
   // Lock a synchronized method
   if (method->is_synchronized()) {
+    Label count_mon;
 
     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
 
@@ -1719,7 +1720,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
       // *obj_reg = lock_reg iff *obj_reg == rax, else rax, = *(obj_reg)
       __ lock();
       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
-      __ jcc(Assembler::equal, lock_done);
+      __ jcc(Assembler::equal, count_mon);
 
       // Test if the oopMark is an obvious stack pointer, i.e.,
       //  1) (mark & 3) == 0, and
@@ -1739,6 +1740,8 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
     } else {
       __ jmp(slow_path_lock);
     }
+    __ bind(count_mon);
+    __ inc_held_monitor_count();
 
     // Slow path will re-enter here
     __ bind(lock_done);
@@ -1852,16 +1855,19 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
   Label unlock_done;
   if (method->is_synchronized()) {
 
-    Label done;
+    Label fast_done;
 
     // Get locked oop from the handle we passed to jni
     __ movptr(obj_reg, Address(oop_handle_reg, 0));
 
     if (!UseHeavyMonitors) {
+      Label not_recur;
       // Simple recursive lock?
-
       __ cmpptr(Address(rbp, lock_slot_rbp_offset), (int32_t)NULL_WORD);
-      __ jcc(Assembler::equal, done);
+      __ jcc(Assembler::notEqual, not_recur);
+      __ dec_held_monitor_count();
+      __ jmpb(fast_done);
+      __ bind(not_recur);
     }
 
     // Must save rax, if it is live now because cmpxchg must use it
@@ -1882,6 +1888,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
       __ lock();
       __ cmpxchgptr(rbx, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
       __ jcc(Assembler::notEqual, slow_path_unlock);
+      __ dec_held_monitor_count();
     } else {
       __ jmp(slow_path_unlock);
     }
@@ -1892,8 +1899,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
       restore_native_result(masm, ret_type, stack_slots);
     }
 
-    __ bind(done);
-
+    __ bind(fast_done);
   }
 
   {
diff --git a/src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp b/src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp
index 9b00600096e..2a480bd593a 100644
--- a/src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp
+++ b/src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp
@@ -1955,6 +1955,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
   Label lock_done;
 
   if (method->is_synchronized()) {
+    Label count_mon;
 
     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
 
@@ -1969,6 +1970,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
     __ movptr(obj_reg, Address(oop_handle_reg, 0));
 
     if (!UseHeavyMonitors) {
+
       // Load immediate 1 into swap_reg %rax
       __ movl(swap_reg, 1);
 
@@ -1981,7 +1983,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
       // src -> dest iff dest == rax else rax <- dest
       __ lock();
       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
-      __ jcc(Assembler::equal, lock_done);
+      __ jcc(Assembler::equal, count_mon);
 
       // Hmm should this move to the slow path code area???
 
@@ -2003,6 +2005,8 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
     } else {
       __ jmp(slow_path_lock);
     }
+    __ bind(count_mon);
+    __ inc_held_monitor_count();
 
     // Slow path will re-enter here
     __ bind(lock_done);
@@ -2100,19 +2104,23 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
   // native result if any is live
 
   // Unlock
-  Label unlock_done;
   Label slow_path_unlock;
+  Label unlock_done;
   if (method->is_synchronized()) {
 
+    Label fast_done;
+
     // Get locked oop from the handle we passed to jni
     __ movptr(obj_reg, Address(oop_handle_reg, 0));
 
-    Label done;
-
     if (!UseHeavyMonitors) {
+      Label not_recur;
       // Simple recursive lock?
       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
-      __ jcc(Assembler::equal, done);
+      __ jcc(Assembler::notEqual, not_recur);
+      __ dec_held_monitor_count();
+      __ jmpb(fast_done);
+      __ bind(not_recur);
     }
 
     // Must save rax if it is live now because cmpxchg must use it
@@ -2120,7 +2128,6 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
       save_native_result(masm, ret_type, stack_slots);
     }
 
-
     if (!UseHeavyMonitors) {
       // get address of the stack lock
       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
@@ -2131,6 +2138,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
       __ lock();
       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
       __ jcc(Assembler::notEqual, slow_path_unlock);
+      __ dec_held_monitor_count();
     } else {
       __ jmp(slow_path_unlock);
     }
@@ -2141,7 +2149,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
       restore_native_result(masm, ret_type, stack_slots);
     }
 
-    __ bind(done);
+    __ bind(fast_done);
   }
   {
     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 68a980e0d25..b4befba12fa 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -8334,11 +8334,11 @@ void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Regist
 
   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
-  __ movl(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
-  __ movl(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
+  __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
+  __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
 
   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
-  __ reset_held_monitor_count();
+  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
 }
 
 //---------------------------- continuation_enter_cleanup ---------------------------
@@ -8363,8 +8363,8 @@ void continuation_enter_cleanup(MacroAssembler* masm) {
 
   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
-  __ movl(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
-  __ movl(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
+  __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
+  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
 
   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
diff --git a/src/hotspot/cpu/x86/templateInterpreterGenerator_x86.cpp b/src/hotspot/cpu/x86/templateInterpreterGenerator_x86.cpp
index 90442ef705d..30c5d65619c 100644
--- a/src/hotspot/cpu/x86/templateInterpreterGenerator_x86.cpp
+++ b/src/hotspot/cpu/x86/templateInterpreterGenerator_x86.cpp
@@ -604,8 +604,6 @@ void TemplateInterpreterGenerator::lock_method() {
   const Register lockreg = NOT_LP64(rdx) LP64_ONLY(c_rarg1);
   __ movptr(lockreg, rsp); // object address
   __ lock_object(lockreg);
-
-  __ inc_held_monitor_count();
 }
 
 // Generate a fixed interpreter frame. This is identical setup for
@@ -1264,7 +1262,6 @@ address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) {
 
       __ bind(unlock);
       __ unlock_object(regmon);
-      __ dec_held_monitor_count();
     }
     __ bind(L);
   }
diff --git a/src/hotspot/cpu/x86/templateTable_x86.cpp b/src/hotspot/cpu/x86/templateTable_x86.cpp
index 912757b57df..d575d0f565d 100644
--- a/src/hotspot/cpu/x86/templateTable_x86.cpp
+++ b/src/hotspot/cpu/x86/templateTable_x86.cpp
@@ -4363,9 +4363,6 @@ void TemplateTable::monitorenter() {
   __ movptr(Address(rmon, BasicObjectLock::obj_offset_in_bytes()), rax);
   __ lock_object(rmon);
 
-  // The object is stored so counter should be increased even if stackoverflow is generated
-  __ inc_held_monitor_count();
-
   // check to make sure this monitor doesn't cause stack overflow after locking
   __ save_bcp();  // in case of exception
   __ generate_stack_overflow_check(0);
@@ -4424,9 +4421,6 @@ void TemplateTable::monitorexit() {
   __ bind(found);
   __ push_ptr(rax); // make sure object is on stack (contract with oopMaps)
   __ unlock_object(rtop);
-
-  __ dec_held_monitor_count();
-
   __ pop_ptr(rax); // discard object
 }
 
diff --git a/src/hotspot/cpu/zero/globalDefinitions_zero.hpp b/src/hotspot/cpu/zero/globalDefinitions_zero.hpp
index dcf33d18d37..9db2060b8dd 100644
--- a/src/hotspot/cpu/zero/globalDefinitions_zero.hpp
+++ b/src/hotspot/cpu/zero/globalDefinitions_zero.hpp
@@ -30,6 +30,8 @@
 #define SUPPORTS_NATIVE_CX8
 #endif
 
+#define SUPPORT_MONITOR_COUNT
+
 #ifndef FFI_GO_CLOSURES
 #define FFI_GO_CLOSURES 0
 #endif
diff --git a/src/hotspot/cpu/zero/zeroInterpreter_zero.cpp b/src/hotspot/cpu/zero/zeroInterpreter_zero.cpp
index d840a1e0c30..fc1ee735237 100644
--- a/src/hotspot/cpu/zero/zeroInterpreter_zero.cpp
+++ b/src/hotspot/cpu/zero/zeroInterpreter_zero.cpp
@@ -334,16 +334,21 @@ int ZeroInterpreter::native_entry(Method* method, intptr_t UNUSED, TRAPS) {
     markWord disp = lockee->mark().set_unlocked();
     monitor->lock()->set_displaced_header(disp);
     bool call_vm = UseHeavyMonitors;
+    bool inc_monitor_count = true;
     if (call_vm || lockee->cas_set_mark(markWord::from_pointer(monitor), disp) != disp) {
       // Is it simple recursive case?
       if (!call_vm && thread->is_lock_owned((address) disp.clear_lock_bits().to_pointer())) {
         monitor->lock()->set_displaced_header(markWord::from_pointer(NULL));
       } else {
+        inc_monitor_count = false;
         CALL_VM_NOCHECK(InterpreterRuntime::monitorenter(thread, monitor));
         if (HAS_PENDING_EXCEPTION)
           goto unwind_and_return;
       }
     }
+    if (inc_monitor_count) {
+      THREAD->inc_held_monitor_count();
+    }
   }
 
   // Get the signature handler
@@ -479,13 +484,18 @@ int ZeroInterpreter::native_entry(Method* method, intptr_t UNUSED, TRAPS) {
     oop rcvr = monitor->obj();
     monitor->set_obj(NULL);
 
+    bool dec_monitor_count = true;
     if (header.to_pointer() != NULL) {
       markWord old_header = markWord::encode(lock);
       if (rcvr->cas_set_mark(header, old_header) != old_header) {
         monitor->set_obj(rcvr);
+        dec_monitor_count = false;
         InterpreterRuntime::monitorexit(monitor);
       }
     }
+    if (dec_monitor_count) {
+      THREAD->dec_held_monitor_count();
+    }
   }
 
  unwind_and_return:
diff --git a/src/hotspot/share/c1/c1_Runtime1.cpp b/src/hotspot/share/c1/c1_Runtime1.cpp
index 97d8f9a6288..76edf08aa06 100644
--- a/src/hotspot/share/c1/c1_Runtime1.cpp
+++ b/src/hotspot/share/c1/c1_Runtime1.cpp
@@ -742,7 +742,6 @@ JRT_BLOCK_ENTRY(void, Runtime1::monitorenter(JavaThread* current, oopDesc* obj,
   }
   assert(obj == lock->obj(), "must match");
   SharedRuntime::monitor_enter_helper(obj, lock->lock(), current);
-  current->inc_held_monitor_count();
 JRT_END
 
 
diff --git a/src/hotspot/share/interpreter/zero/bytecodeInterpreter.cpp b/src/hotspot/share/interpreter/zero/bytecodeInterpreter.cpp
index e286d7bd4eb..2e4186f90c3 100644
--- a/src/hotspot/share/interpreter/zero/bytecodeInterpreter.cpp
+++ b/src/hotspot/share/interpreter/zero/bytecodeInterpreter.cpp
@@ -626,14 +626,19 @@ void BytecodeInterpreter::run(interpreterState istate) {
         markWord displaced = rcvr->mark().set_unlocked();
         mon->lock()->set_displaced_header(displaced);
         bool call_vm = UseHeavyMonitors;
+        bool inc_monitor_count = true;
         if (call_vm || rcvr->cas_set_mark(markWord::from_pointer(mon), displaced) != displaced) {
           // Is it simple recursive case?
           if (!call_vm && THREAD->is_lock_owned((address) displaced.clear_lock_bits().to_pointer())) {
             mon->lock()->set_displaced_header(markWord::from_pointer(NULL));
           } else {
+            inc_monitor_count = false;
             CALL_VM(InterpreterRuntime::monitorenter(THREAD, mon), handle_exception);
           }
         }
+        if (inc_monitor_count) {
+          THREAD->inc_held_monitor_count();
+        }
       }
       THREAD->clr_do_not_unlock();
 
@@ -720,14 +725,19 @@ void BytecodeInterpreter::run(interpreterState istate) {
       markWord displaced = lockee->mark().set_unlocked();
       entry->lock()->set_displaced_header(displaced);
       bool call_vm = UseHeavyMonitors;
+      bool inc_monitor_count = true;
       if (call_vm || lockee->cas_set_mark(markWord::from_pointer(entry), displaced) != displaced) {
         // Is it simple recursive case?
         if (!call_vm && THREAD->is_lock_owned((address) displaced.clear_lock_bits().to_pointer())) {
           entry->lock()->set_displaced_header(markWord::from_pointer(NULL));
         } else {
+          inc_monitor_count = false;
           CALL_VM(InterpreterRuntime::monitorenter(THREAD, entry), handle_exception);
         }
       }
+      if (inc_monitor_count) {
+        THREAD->inc_held_monitor_count();
+      }
       UPDATE_PC_AND_TOS(1, -1);
       goto run;
     }
@@ -1628,14 +1638,19 @@ run:
           markWord displaced = lockee->mark().set_unlocked();
           entry->lock()->set_displaced_header(displaced);
           bool call_vm = UseHeavyMonitors;
+          bool inc_monitor_count = true;
           if (call_vm || lockee->cas_set_mark(markWord::from_pointer(entry), displaced) != displaced) {
             // Is it simple recursive case?
             if (!call_vm && THREAD->is_lock_owned((address) displaced.clear_lock_bits().to_pointer())) {
               entry->lock()->set_displaced_header(markWord::from_pointer(NULL));
             } else {
+              inc_monitor_count = false;
               CALL_VM(InterpreterRuntime::monitorenter(THREAD, entry), handle_exception);
             }
           }
+          if (inc_monitor_count) {
+            THREAD->inc_held_monitor_count();
+          }
           UPDATE_PC_AND_TOS_AND_CONTINUE(1, -1);
         } else {
           istate->set_msg(more_monitors);
@@ -1657,15 +1672,20 @@ run:
             most_recent->set_obj(NULL);
 
             // If it isn't recursive we either must swap old header or call the runtime
+            bool dec_monitor_count = true;
             bool call_vm = UseHeavyMonitors;
             if (header.to_pointer() != NULL || call_vm) {
               markWord old_header = markWord::encode(lock);
               if (call_vm || lockee->cas_set_mark(header, old_header) != old_header) {
                 // restore object for the slow case
                 most_recent->set_obj(lockee);
+                dec_monitor_count = false;
                 InterpreterRuntime::monitorexit(most_recent);
               }
             }
+            if (dec_monitor_count) {
+              THREAD->dec_held_monitor_count();
+            }
             UPDATE_PC_AND_TOS_AND_CONTINUE(1, -1);
           }
           most_recent++;
@@ -3081,14 +3101,19 @@ run:
           end->set_obj(NULL);
 
           // If it isn't recursive we either must swap old header or call the runtime
+          bool dec_monitor_count = true;
           if (header.to_pointer() != NULL) {
             markWord old_header = markWord::encode(lock);
             if (lockee->cas_set_mark(header, old_header) != old_header) {
               // restore object for the slow case
               end->set_obj(lockee);
+              dec_monitor_count = false;
               InterpreterRuntime::monitorexit(end);
             }
           }
+          if (dec_monitor_count) {
+            THREAD->dec_held_monitor_count();
+          }
 
           // One error is plenty
           if (illegal_state_oop() == NULL && !suppress_error) {
@@ -3147,11 +3172,13 @@ run:
             base->set_obj(NULL);
 
             // If it isn't recursive we either must swap old header or call the runtime
+            bool dec_monitor_count = true;
             if (header.to_pointer() != NULL) {
               markWord old_header = markWord::encode(lock);
               if (rcvr->cas_set_mark(header, old_header) != old_header) {
                 // restore object for the slow case
                 base->set_obj(rcvr);
+                dec_monitor_count = false;
                 InterpreterRuntime::monitorexit(base);
                 if (THREAD->has_pending_exception()) {
                   if (!suppress_error) illegal_state_oop = Handle(THREAD, THREAD->pending_exception());
@@ -3159,6 +3186,9 @@ run:
                 }
               }
             }
+            if (dec_monitor_count) {
+              THREAD->dec_held_monitor_count();
+            }
           }
         }
       }
diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
index 0fd7f4e16a6..76642cb9526 100644
--- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
+++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
@@ -191,7 +191,7 @@
   nonstatic_field(JavaThread,                  _jni_environment,                              JNIEnv)                                \
   nonstatic_field(JavaThread,                  _poll_data,                                    SafepointMechanism::ThreadData)        \
   nonstatic_field(JavaThread,                  _stack_overflow_state._reserved_stack_activation, address)                            \
-  nonstatic_field(JavaThread,                  _held_monitor_count,                           int)                                   \
+  nonstatic_field(JavaThread,                  _held_monitor_count,                           int64_t)                               \
                                                                                                                                      \
   static_field(java_lang_Class,                _klass_offset,                                 int)                                   \
   static_field(java_lang_Class,                _array_klass_offset,                           int)                                   \
diff --git a/src/hotspot/share/opto/macro.cpp b/src/hotspot/share/opto/macro.cpp
index fd9a2c899b1..609bfdbb6cd 100644
--- a/src/hotspot/share/opto/macro.cpp
+++ b/src/hotspot/share/opto/macro.cpp
@@ -2214,29 +2214,11 @@ void PhaseMacroExpand::expand_lock_node(LockNode *lock) {
 
   Node *memproj = transform_later(new ProjNode(call, TypeFunc::Memory));
 
-  Node* thread = transform_later(new ThreadLocalNode());
-  if (Continuations::enabled()) {
-    // held_monitor_count increased in slowpath (complete_monitor_locking_C_inc_held_monitor_count), need compensate a decreament here
-    // this minimizes control flow changes here and add redundant count updates only in slowpath
-    Node* dec_count = make_load(slow_ctrl, memproj, thread, in_bytes(JavaThread::held_monitor_count_offset()), TypeInt::INT, TypeInt::INT->basic_type());
-    Node* new_dec_count = transform_later(new SubINode(dec_count, intcon(1)));
-    Node *compensate_dec = make_store(slow_ctrl, memproj, thread, in_bytes(JavaThread::held_monitor_count_offset()), new_dec_count, T_INT);
-    mem_phi->init_req(1, compensate_dec);
-  } else {
-    mem_phi->init_req(1, memproj);
-  }
+  mem_phi->init_req(1, memproj);
+
   transform_later(mem_phi);
 
-  if (Continuations::enabled()) {
-    // held_monitor_count increases in all path's post-dominate
-    Node* inc_count = make_load(region, mem_phi, thread, in_bytes(JavaThread::held_monitor_count_offset()), TypeInt::INT, TypeInt::INT->basic_type());
-    Node* new_inc_count = transform_later(new AddINode(inc_count, intcon(1)));
-    Node *store = make_store(region, mem_phi, thread, in_bytes(JavaThread::held_monitor_count_offset()), new_inc_count, T_INT);
-
-    _igvn.replace_node(_callprojs.fallthrough_memproj, store);
-  } else {
-    _igvn.replace_node(_callprojs.fallthrough_memproj, mem_phi);
-  }
+  _igvn.replace_node(_callprojs.fallthrough_memproj, mem_phi);
 }
 
 //------------------------------expand_unlock_node----------------------
@@ -2291,15 +2273,7 @@ void PhaseMacroExpand::expand_unlock_node(UnlockNode *unlock) {
   mem_phi->init_req(2, mem);
   transform_later(mem_phi);
 
-  if (Continuations::enabled()) {
-    Node* count = make_load(region, mem_phi, thread, in_bytes(JavaThread::held_monitor_count_offset()), TypeInt::INT, TypeInt::INT->basic_type());
-    Node* newcount = transform_later(new SubINode(count, intcon(1)));
-    Node *store = make_store(region, mem_phi, thread, in_bytes(JavaThread::held_monitor_count_offset()), newcount, T_INT);
-
-    _igvn.replace_node(_callprojs.fallthrough_memproj, store);
-  } else {
-    _igvn.replace_node(_callprojs.fallthrough_memproj, mem_phi);
-  }
+  _igvn.replace_node(_callprojs.fallthrough_memproj, mem_phi);
 }
 
 void PhaseMacroExpand::expand_subtypecheck_node(SubTypeCheckNode *check) {
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index 825e4c8e7c8..14982f63249 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -144,7 +144,7 @@ bool OptoRuntime::generate(ciEnv* env) {
   gen(env, _multianewarray4_Java           , multianewarray4_Type         , multianewarray4_C               ,    0 , true, false);
   gen(env, _multianewarray5_Java           , multianewarray5_Type         , multianewarray5_C               ,    0 , true, false);
   gen(env, _multianewarrayN_Java           , multianewarrayN_Type         , multianewarrayN_C               ,    0 , true, false);
-  gen(env, _complete_monitor_locking_Java  , complete_monitor_enter_Type  , SharedRuntime::complete_monitor_locking_C_inc_held_monitor_count, 0, false, false);
+  gen(env, _complete_monitor_locking_Java  , complete_monitor_enter_Type  , SharedRuntime::complete_monitor_locking_C, 0, false, false);
   gen(env, _monitor_notify_Java            , monitor_notify_Type          , monitor_notify_C                ,    0 , false, false);
   gen(env, _monitor_notifyAll_Java         , monitor_notify_Type          , monitor_notifyAll_C             ,    0 , false, false);
   gen(env, _rethrow_Java                   , rethrow_Type                 , rethrow_C                       ,    2 , true , true );
diff --git a/src/hotspot/share/prims/jni.cpp b/src/hotspot/share/prims/jni.cpp
index 3030c87341d..7299d124b27 100644
--- a/src/hotspot/share/prims/jni.cpp
+++ b/src/hotspot/share/prims/jni.cpp
@@ -2719,12 +2719,7 @@ JNI_ENTRY(jint, jni_MonitorEnter(JNIEnv *env, jobject jobj))
 
   Handle obj(thread, JNIHandles::resolve_non_null(jobj));
   ObjectSynchronizer::jni_enter(obj, thread);
-  if (!Continuation::pin(thread)) {
-    ObjectSynchronizer::jni_exit(obj(), CHECK_(JNI_ERR));
-    THROW_(vmSymbols::java_lang_VirtualMachineError(), JNI_ERR);
-  }
-  ret = JNI_OK;
-  return ret;
+  return JNI_OK;
 JNI_END
 
 DT_RETURN_MARK_DECL(MonitorExit, jint
@@ -2742,11 +2737,7 @@ JNI_ENTRY(jint, jni_MonitorExit(JNIEnv *env, jobject jobj))
 
   Handle obj(THREAD, JNIHandles::resolve_non_null(jobj));
   ObjectSynchronizer::jni_exit(obj(), CHECK_(JNI_ERR));
-  if (!Continuation::unpin(thread)) {
-    ShouldNotReachHere();
-  }
-  ret = JNI_OK;
-  return ret;
+  return JNI_OK;
 JNI_END
 
 //
diff --git a/src/hotspot/share/runtime/continuationEntry.hpp b/src/hotspot/share/runtime/continuationEntry.hpp
index feab7e8069c..6914a6795fc 100644
--- a/src/hotspot/share/runtime/continuationEntry.hpp
+++ b/src/hotspot/share/runtime/continuationEntry.hpp
@@ -68,7 +68,11 @@ private:
   int _flags;
   int _argsize;
   intptr_t* _parent_cont_fastpath;
-  int _parent_held_monitor_count;
+#ifdef _LP64
+  int64_t   _parent_held_monitor_count;
+#else
+  int32_t   _parent_held_monitor_count;
+#endif
   uint _pin_count;
 
 public:
@@ -87,7 +91,7 @@ public:
   static size_t size() { return align_up((int)sizeof(ContinuationEntry), 2*wordSize); }
 
   ContinuationEntry* parent() const { return _parent; }
-  int parent_held_monitor_count() const { return _parent_held_monitor_count; }
+  int64_t parent_held_monitor_count() const { return (int64_t)_parent_held_monitor_count; }
 
   static address entry_pc() { return _return_pc; }
   intptr_t* entry_sp() const { return (intptr_t*)this; }
diff --git a/src/hotspot/share/runtime/continuationFreezeThaw.cpp b/src/hotspot/share/runtime/continuationFreezeThaw.cpp
index e4e1f6d8d90..bb0fb3969c1 100644
--- a/src/hotspot/share/runtime/continuationFreezeThaw.cpp
+++ b/src/hotspot/share/runtime/continuationFreezeThaw.cpp
@@ -1407,7 +1407,8 @@ static inline int freeze_internal(JavaThread* current, intptr_t* const sp) {
 
   assert(entry->is_virtual_thread() == (entry->scope() == java_lang_VirtualThread::vthread_scope()), "");
 
-  assert(monitors_on_stack(current) == (current->held_monitor_count() > 0), "");
+  assert(monitors_on_stack(current) == ((current->held_monitor_count() - current->jni_monitor_count()) > 0),
+         "Held monitor count and locks on stack invariant: " INT64_FORMAT " JNI: " INT64_FORMAT, (int64_t)current->held_monitor_count(), (int64_t)current->jni_monitor_count());
 
   if (entry->is_pinned() || current->held_monitor_count() > 0) {
     log_develop_debug(continuations)("PINNED due to critical section/hold monitor");
@@ -2253,7 +2254,7 @@ static inline intptr_t* thaw_internal(JavaThread* thread, const Continuation::th
   assert(is_aligned(sp, frame::frame_alignment), "");
 
   // All the frames have been thawed so we know they don't hold any monitors
-  thread->reset_held_monitor_count();
+  assert(thread->held_monitor_count() == 0, "Must be");
 
 #ifdef ASSERT
   intptr_t* sp0 = sp;
diff --git a/src/hotspot/share/runtime/deoptimization.cpp b/src/hotspot/share/runtime/deoptimization.cpp
index d7fe303848b..9c11b930705 100644
--- a/src/hotspot/share/runtime/deoptimization.cpp
+++ b/src/hotspot/share/runtime/deoptimization.cpp
@@ -1522,7 +1522,6 @@ bool Deoptimization::relock_objects(JavaThread* thread, GrowableArray<MonitorInf
         BasicLock* lock = mon_info->lock();
         ObjectSynchronizer::enter(obj, lock, deoptee_thread);
         assert(mon_info->owner()->is_locked(), "object must be locked now");
-        deoptee_thread->inc_held_monitor_count();
       }
     }
   }
@@ -1597,7 +1596,6 @@ void Deoptimization::pop_frames_failed_reallocs(JavaThread* thread, vframeArray*
         BasicObjectLock* src = monitors->at(j);
         if (src->obj() != NULL) {
           ObjectSynchronizer::exit(src->obj(), src->lock(), thread);
-          thread->dec_held_monitor_count();
         }
       }
       array->element(i)->free_monitors(thread);
diff --git a/src/hotspot/share/runtime/javaThread.cpp b/src/hotspot/share/runtime/javaThread.cpp
index 47b8010236f..dbe34aea11e 100644
--- a/src/hotspot/share/runtime/javaThread.cpp
+++ b/src/hotspot/share/runtime/javaThread.cpp
@@ -456,6 +456,7 @@ JavaThread::JavaThread() :
   _cont_fastpath(0),
   _cont_fastpath_thread_state(1),
   _held_monitor_count(0),
+  _jni_monitor_count(0),
 
   _handshake(this),
 
@@ -850,7 +851,18 @@ void JavaThread::exit(bool destroy_vm, ExitType exit_type) {
     assert(!this->has_pending_exception(), "release_monitors should have cleared");
   }
 
-  assert(!Continuations::enabled() || this->held_monitor_count() == 0, "held monitor count should be zero");
+  // The code above may not release JNI monitors, so if someone forgot to do a
+  // JNI MonitorExit, the held count should be equal to the JNI count.
+  // Consider scanning all object monitors for this owner if JNI count > 0 (at least on detach).
+  assert(this->held_monitor_count() == this->jni_monitor_count(),
+         "held monitor count should be equal to jni: " INT64_FORMAT " != " INT64_FORMAT,
+         (int64_t)this->held_monitor_count(), (int64_t)this->jni_monitor_count());
+  if (CheckJNICalls && this->jni_monitor_count() > 0) {
+    // We would like a fatal error here, but because this was never checked before,
+    // a lot of tests would break, even with just an error log.
+    log_debug(jni)("JavaThread %s (tid: " UINTX_FORMAT ") with Objects still locked by JNI MonitorEnter.",
+      exit_type == JavaThread::normal_exit ? "exiting" : "detaching", os::current_thread_id());
+  }
 
   // These things needs to be done while we are still a Java Thread. Make sure that thread
   // is in a consistent state, in case GC happens
@@ -1844,19 +1856,26 @@ void JavaThread::trace_stack() {
 
 #endif // PRODUCT
 
-void JavaThread::inc_held_monitor_count() {
-  if (!Continuations::enabled()) {
-    return;
+void JavaThread::inc_held_monitor_count(int i, bool jni) {
+#ifdef SUPPORT_MONITOR_COUNT
+  assert(_held_monitor_count >= 0, "Must always be greater than 0: " INT64_FORMAT, (int64_t)_held_monitor_count);
+  _held_monitor_count += i;
+  if (jni) {
+    assert(_jni_monitor_count >= 0, "Must always be greater than 0: " INT64_FORMAT, (int64_t)_jni_monitor_count);
+    _jni_monitor_count += i;
   }
-  _held_monitor_count++;
+#endif
 }
 
-void JavaThread::dec_held_monitor_count() {
-  if (!Continuations::enabled()) {
-    return;
+void JavaThread::dec_held_monitor_count(int i, bool jni) {
+#ifdef SUPPORT_MONITOR_COUNT
+  _held_monitor_count -= i;
+  assert(_held_monitor_count >= 0, "Must always be greater than 0: " INT64_FORMAT, (int64_t)_held_monitor_count);
+  if (jni) {
+    _jni_monitor_count -= i;
+    assert(_jni_monitor_count >= 0, "Must always be greater than 0: " INT64_FORMAT, (int64_t)_jni_monitor_count);
   }
-  assert(_held_monitor_count > 0, "");
-  _held_monitor_count--;
+#endif
 }
 
 frame JavaThread::vthread_last_frame() {
diff --git a/src/hotspot/share/runtime/javaThread.hpp b/src/hotspot/share/runtime/javaThread.hpp
index 2bfb295817c..af226b4f672 100644
--- a/src/hotspot/share/runtime/javaThread.hpp
+++ b/src/hotspot/share/runtime/javaThread.hpp
@@ -441,7 +441,15 @@ class JavaThread: public Thread {
   intptr_t* _cont_fastpath; // the sp of the oldest known interpreted/call_stub frame inside the
                             // continuation that we know about
   int _cont_fastpath_thread_state; // whether global thread state allows continuation fastpath (JVMTI)
-  int _held_monitor_count;  // used by continuations for fast lock detection
+  // It's signed for error detection.
+#ifdef _LP64
+  int64_t _held_monitor_count;  // used by continuations for fast lock detection
+  int64_t _jni_monitor_count;
+#else
+  int32_t _held_monitor_count;  // used by continuations for fast lock detection
+  int32_t _jni_monitor_count;
+#endif
+
 private:
 
   friend class VMThread;
@@ -591,10 +599,12 @@ private:
   bool cont_fastpath() const                   { return _cont_fastpath == NULL && _cont_fastpath_thread_state != 0; }
   bool cont_fastpath_thread_state() const      { return _cont_fastpath_thread_state != 0; }
 
-  int held_monitor_count()        { return _held_monitor_count; }
-  void reset_held_monitor_count() { _held_monitor_count = 0; }
-  void inc_held_monitor_count();
-  void dec_held_monitor_count();
+  void inc_held_monitor_count(int i = 1, bool jni = false);
+  void dec_held_monitor_count(int i = 1, bool jni = false);
+
+  int64_t held_monitor_count() { return (int64_t)_held_monitor_count; }
+  int64_t jni_monitor_count()  { return (int64_t)_jni_monitor_count;  }
+  void clear_jni_monitor_count() { _jni_monitor_count = 0;   }
 
   inline bool is_vthread_mounted() const;
   inline const ContinuationEntry* vthread_continuation() const;
diff --git a/src/hotspot/share/runtime/objectMonitor.cpp b/src/hotspot/share/runtime/objectMonitor.cpp
index f391ff0fcf9..fa5b1bc1d14 100644
--- a/src/hotspot/share/runtime/objectMonitor.cpp
+++ b/src/hotspot/share/runtime/objectMonitor.cpp
@@ -1631,8 +1631,10 @@ void ObjectMonitor::wait(jlong millis, bool interruptible, TRAPS) {
   current->set_current_waiting_monitor(NULL);
 
   guarantee(_recursions == 0, "invariant");
-  _recursions = save      // restore the old recursion count
-                + JvmtiDeferredUpdates::get_and_reset_relock_count_after_wait(current); //  increased by the deferred relock count
+  int relock_count = JvmtiDeferredUpdates::get_and_reset_relock_count_after_wait(current);
+  _recursions =   save          // restore the old recursion count
+                + relock_count; //  increased by the deferred relock count
+  current->inc_held_monitor_count(relock_count); // Deopt never entered these counts.
   _waiters--;             // decrement the number of waiters
 
   // Verify a few postconditions
diff --git a/src/hotspot/share/runtime/sharedRuntime.cpp b/src/hotspot/share/runtime/sharedRuntime.cpp
index f6787ccff28..e4a4888b06f 100644
--- a/src/hotspot/share/runtime/sharedRuntime.cpp
+++ b/src/hotspot/share/runtime/sharedRuntime.cpp
@@ -2167,7 +2167,9 @@ void SharedRuntime::monitor_enter_helper(oopDesc* obj, BasicLock* lock, JavaThre
   if (!SafepointSynchronize::is_synchronizing()) {
     // Only try quick_enter() if we're not trying to reach a safepoint
     // so that the calling thread reaches the safepoint more quickly.
-    if (ObjectSynchronizer::quick_enter(obj, current, lock)) return;
+    if (ObjectSynchronizer::quick_enter(obj, current, lock)) {
+      return;
+    }
   }
   // NO_ASYNC required because an async exception on the state transition destructor
   // would leave you with the lock held and it would never be released.
@@ -2185,11 +2187,6 @@ JRT_BLOCK_ENTRY(void, SharedRuntime::complete_monitor_locking_C(oopDesc* obj, Ba
   SharedRuntime::monitor_enter_helper(obj, lock, current);
 JRT_END
 
-JRT_BLOCK_ENTRY(void, SharedRuntime::complete_monitor_locking_C_inc_held_monitor_count(oopDesc* obj, BasicLock* lock, JavaThread* current))
-  SharedRuntime::monitor_enter_helper(obj, lock, current);
-  current->inc_held_monitor_count();
-JRT_END
-
 void SharedRuntime::monitor_exit_helper(oopDesc* obj, BasicLock* lock, JavaThread* current) {
   assert(JavaThread::current() == current, "invariant");
   // Exit must be non-blocking, and therefore no exceptions can be thrown.
diff --git a/src/hotspot/share/runtime/sharedRuntime.hpp b/src/hotspot/share/runtime/sharedRuntime.hpp
index 21cef1c4b89..a8ed33cfa3f 100644
--- a/src/hotspot/share/runtime/sharedRuntime.hpp
+++ b/src/hotspot/share/runtime/sharedRuntime.hpp
@@ -487,7 +487,6 @@ class SharedRuntime: AllStatic {
 
   // Slow-path Locking and Unlocking
   static void complete_monitor_locking_C(oopDesc* obj, BasicLock* lock, JavaThread* current);
-  static void complete_monitor_locking_C_inc_held_monitor_count(oopDesc* obj, BasicLock* lock, JavaThread* current);
   static void complete_monitor_unlocking_C(oopDesc* obj, BasicLock* lock, JavaThread* current);
 
   // Resolving of calls
diff --git a/src/hotspot/share/runtime/synchronizer.cpp b/src/hotspot/share/runtime/synchronizer.cpp
index 748eec7cfdf..aad3d092ac3 100644
--- a/src/hotspot/share/runtime/synchronizer.cpp
+++ b/src/hotspot/share/runtime/synchronizer.cpp
@@ -375,6 +375,7 @@ bool ObjectSynchronizer::quick_enter(oop obj, JavaThread* current,
 
     if (owner == current) {
       m->_recursions++;
+      current->inc_held_monitor_count();
       return true;
     }
 
@@ -391,6 +392,7 @@ bool ObjectSynchronizer::quick_enter(oop obj, JavaThread* current,
 
     if (owner == NULL && m->try_set_owner_from(NULL, current) == NULL) {
       assert(m->_recursions == 0, "invariant");
+      current->inc_held_monitor_count();
       return true;
     }
   }
@@ -472,6 +474,8 @@ void ObjectSynchronizer::enter(Handle obj, BasicLock* lock, JavaThread* current)
     handle_sync_on_value_based_class(obj, current);
   }
 
+  current->inc_held_monitor_count();
+
   if (!useHeavyMonitors()) {
     markWord mark = obj->mark();
     if (mark.is_neutral()) {
@@ -511,6 +515,8 @@ void ObjectSynchronizer::enter(Handle obj, BasicLock* lock, JavaThread* current)
 }
 
 void ObjectSynchronizer::exit(oop object, BasicLock* lock, JavaThread* current) {
+  current->dec_held_monitor_count();
+
   if (!useHeavyMonitors()) {
     markWord mark = object->mark();
 
@@ -579,8 +585,9 @@ intx ObjectSynchronizer::complete_exit(Handle obj, JavaThread* current) {
   // The ObjectMonitor* can't be async deflated until ownership is
   // dropped inside exit() and the ObjectMonitor* must be !is_busy().
   ObjectMonitor* monitor = inflate(current, obj(), inflate_cause_vm_internal);
-  intptr_t ret_code = monitor->complete_exit(current);
-  return ret_code;
+  intx recur_count = monitor->complete_exit(current);
+  current->dec_held_monitor_count(recur_count + 1);
+  return recur_count;
 }
 
 // NOTE: must use heavy weight monitor to handle complete_exit/reenter()
@@ -592,6 +599,7 @@ void ObjectSynchronizer::reenter(Handle obj, intx recursions, JavaThread* curren
   while (true) {
     ObjectMonitor* monitor = inflate(current, obj(), inflate_cause_vm_internal);
     if (monitor->reenter(recursions, current)) {
+      current->inc_held_monitor_count(recursions + 1);
       return;
     }
   }
@@ -613,6 +621,7 @@ void ObjectSynchronizer::jni_enter(Handle obj, JavaThread* current) {
   while (true) {
     ObjectMonitor* monitor = inflate(current, obj(), inflate_cause_jni_enter);
     if (monitor->enter(current)) {
+      current->inc_held_monitor_count(1, true);
       break;
     }
   }
@@ -631,6 +640,7 @@ void ObjectSynchronizer::jni_exit(oop obj, TRAPS) {
   // monitor even if an exception was already pending.
   if (monitor->check_owner(THREAD)) {
     monitor->exit(current);
+    current->dec_held_monitor_count(1, true);
   }
 }
 
@@ -1577,7 +1587,8 @@ class ReleaseJavaMonitorsClosure: public MonitorClosure {
  public:
   ReleaseJavaMonitorsClosure(JavaThread* thread) : _thread(thread) {}
   void do_monitor(ObjectMonitor* mid) {
-    (void)mid->complete_exit(_thread);
+    intx rec = mid->complete_exit(_thread);
+    _thread->dec_held_monitor_count(rec + 1);
   }
 };
 
@@ -1603,6 +1614,9 @@ void ObjectSynchronizer::release_monitors_owned_by_thread(JavaThread* current) {
   ObjectSynchronizer::monitors_iterate(&rjmc, current);
   assert(!current->has_pending_exception(), "Should not be possible");
   current->clear_pending_exception();
+  assert(current->held_monitor_count() == 0, "Should not be possible");
+  // All monitors (including entered via JNI) have been unlocked above, so we need to clear jni count.
+  current->clear_jni_monitor_count();
 }
 
 const char* ObjectSynchronizer::inflate_cause_name(const InflateCause cause) {
diff --git a/src/hotspot/share/runtime/thread.cpp b/src/hotspot/share/runtime/thread.cpp
index 424b0c54db4..43c2a16b380 100644
--- a/src/hotspot/share/runtime/thread.cpp
+++ b/src/hotspot/share/runtime/thread.cpp
@@ -552,7 +552,6 @@ bool Thread::set_as_starting_thread() {
   return os::create_main_thread(JavaThread::cast(this));
 }
 
-
 // Ad-hoc mutual exclusion primitives: SpinLock
 //
 // We employ SpinLocks _only for low-contention, fixed-length
diff --git a/test/hotspot/jtreg/runtime/Monitor/CompleteExit.java b/test/hotspot/jtreg/runtime/Monitor/CompleteExit.java
new file mode 100644
index 00000000000..7e63050dc9e
--- /dev/null
+++ b/test/hotspot/jtreg/runtime/Monitor/CompleteExit.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/*
+ * @test CompleteExit
+ * @summary Detach a natively attached thread that still holds monitors entered through JNI.
+ * @requires os.family == "linux"
+ * @library /testlibrary /test/lib
+ * @build CompleteExit
+ * @run main/native CompleteExit
+ */
+
+public class CompleteExit {
+    public static native void testIt(Object o1, Object o2); // implemented by Java_CompleteExit_testIt in libCompleteExit.c
+
+    static volatile Object o1 = new Object(); // first object handed to testIt
+    static volatile Object o2 = new Object(); // second object handed to testIt
+
+    static {
+        System.loadLibrary("CompleteExit"); // native library that provides testIt
+    }
+
+    public static void main(String[] args) throws Exception {
+        testIt(o1, o2); // the whole test runs inside the native call
+    }
+}
diff --git a/test/hotspot/jtreg/runtime/Monitor/libCompleteExit.c b/test/hotspot/jtreg/runtime/Monitor/libCompleteExit.c
new file mode 100644
index 00000000000..07ba0ff0ef8
--- /dev/null
+++ b/test/hotspot/jtreg/runtime/Monitor/libCompleteExit.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#include <jni.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <stdio.h>
+
+#define die(x) do { printf("%s:%s\n",x , __func__); perror(x); exit(EXIT_FAILURE); } while (0)
+
+#ifndef _Included_CompleteExit
+#define _Included_CompleteExit
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static JavaVM* jvm;
+static pthread_t attacher;
+
+static jobject t1, t2;
+
+static void* do_test(void* arg) {  // pthread start routine; arg is unused
+  JNIEnv* env;
+  int res = (*jvm)->AttachCurrentThread(jvm, (void**)&env, NULL);
+  if (res != JNI_OK) die("AttachCurrentThread");
+  // Enter both monitors through JNI; no matching MonitorExit follows.
+  if ((*env)->MonitorEnter(env, t1) != 0) die("MonitorEnter");
+  if ((*env)->MonitorEnter(env, t2) != 0) die("MonitorEnter");
+  // Detach while this thread still owns the two monitors entered above.
+  if ((*jvm)->DetachCurrentThread(jvm) != JNI_OK) die("DetachCurrentThread");
+  pthread_exit(NULL);
+
+  return NULL;  // not reached; pthread_exit() above does not return
+}
+
+/*
+ * Class:     CompleteExit
+ * Method:    testIt
+ * Signature: (Ljava/lang/Object;Ljava/lang/Object;)V
+ */
+JNIEXPORT void JNICALL Java_CompleteExit_testIt(JNIEnv* env, jclass jc, jobject o1, jobject o2) {
+    // Publish the VM and global refs to both objects for the thread started below.
+    if ((*env)->GetJavaVM(env, &jvm) != JNI_OK) die("GetJavaVM");
+
+    t1 = (*env)->NewGlobalRef(env, o1);
+    t2 = (*env)->NewGlobalRef(env, o2);
+    if (t1 == NULL || t2 == NULL) die("NewGlobalRef");
+
+    // NULL attributes request the defaults, so no pthread_attr_t has to be managed.
+    if (pthread_create(&attacher, NULL, do_test, NULL) != 0) die("pthread_create");
+    // Wait for the native thread to finish; its exit value is not needed.
+    if (pthread_join(attacher, NULL) != 0) die("pthread_join");
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif