6672778: G1 should trim task queues more aggressively during evacuation pauses

Regularly try to drain task queues. This helps memory usage and performance during garbage collection. Reviewed-by: sjohanss, sangheki
2018-04-27 12:06:46 +02:00 · 2018-04-27 12:06:46 +02:00 · 0f583c7817
commit 0f583c7817
parent 893bd31647
24 changed files with 315 additions and 570 deletions
--- a/src/hotspot/share/gc/g1/bufferingOopClosure.hpp
+++ b/src/hotspot/share/gc/g1/bufferingOopClosure.hpp
@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_VM_GC_G1_BUFFERINGOOPCLOSURE_HPP
-#define SHARE_VM_GC_G1_BUFFERINGOOPCLOSURE_HPP
-
-#include "memory/iterator.hpp"
-#include "oops/oopsHierarchy.hpp"
-#include "runtime/os.hpp"
-#include "utilities/debug.hpp"
-
-// A BufferingOops closure tries to separate out the cost of finding roots
-// from the cost of applying closures to them.  It maintains an array of
-// ref-containing locations.  Until the array is full, applying the closure
-// to an oop* merely records that location in the array.  Since this
-// closure app cost is small, an elapsed timer can approximately attribute
-// all of this cost to the cost of finding the roots.  When the array fills
-// up, the wrapped closure is applied to all elements, keeping track of
-// this elapsed time of this process, and leaving the array empty.
-// The caller must be sure to call "done" to process any unprocessed
-// buffered entries.
-
-class BufferingOopClosure: public OopClosure {
-  friend class BufferingOopClosureTest;
-protected:
-  static const size_t BufferLength = 1024;
-
-  // We need to know if the buffered addresses contain oops or narrowOops.
-  // We can't tag the addresses the way StarTask does, because we need to
-  // be able to handle unaligned addresses coming from oops embedded in code.
-  //
-  // The addresses for the full-sized oops are filled in from the bottom,
-  // while the addresses for the narrowOops are filled in from the top.
-  OopOrNarrowOopStar  _buffer[BufferLength];
-  OopOrNarrowOopStar* _oop_top;
-  OopOrNarrowOopStar* _narrowOop_bottom;
-
-  OopClosure* _oc;
-  double      _closure_app_seconds;
-
-
-  bool is_buffer_empty() {
-    return _oop_top == _buffer && _narrowOop_bottom == (_buffer + BufferLength - 1);
-  }
-
-  bool is_buffer_full() {
-    return _narrowOop_bottom < _oop_top;
-  }
-
-  // Process addresses containing full-sized oops.
-  void process_oops() {
-    for (OopOrNarrowOopStar* curr = _buffer; curr < _oop_top; ++curr) {
-      _oc->do_oop((oop*)(*curr));
-    }
-    _oop_top = _buffer;
-  }
-
-  // Process addresses containing narrow oops.
-  void process_narrowOops() {
-    for (OopOrNarrowOopStar* curr = _buffer + BufferLength - 1; curr > _narrowOop_bottom; --curr) {
-      _oc->do_oop((narrowOop*)(*curr));
-    }
-    _narrowOop_bottom = _buffer + BufferLength - 1;
-  }
-
-  // Apply the closure to all oops and clear the buffer.
-  // Accumulate the time it took.
-  void process_buffer() {
-    double start = os::elapsedTime();
-
-    process_oops();
-    process_narrowOops();
-
-    _closure_app_seconds += (os::elapsedTime() - start);
-  }
-
-  void process_buffer_if_full() {
-    if (is_buffer_full()) {
-      process_buffer();
-    }
-  }
-
-  void add_narrowOop(narrowOop* p) {
-    assert(!is_buffer_full(), "Buffer should not be full");
-    *_narrowOop_bottom = (OopOrNarrowOopStar)p;
-    _narrowOop_bottom--;
-  }
-
-  void add_oop(oop* p) {
-    assert(!is_buffer_full(), "Buffer should not be full");
-    *_oop_top = (OopOrNarrowOopStar)p;
-    _oop_top++;
-  }
-
-public:
-  virtual void do_oop(narrowOop* p) {
-    process_buffer_if_full();
-    add_narrowOop(p);
-  }
-
-  virtual void do_oop(oop* p)       {
-    process_buffer_if_full();
-    add_oop(p);
-  }
-
-  void done() {
-    if (!is_buffer_empty()) {
-      process_buffer();
-    }
-  }
-
-  double closure_app_seconds() {
-    return _closure_app_seconds;
-  }
-
-  BufferingOopClosure(OopClosure *oc) :
-    _oc(oc),
-    _oop_top(_buffer),
-    _narrowOop_bottom(_buffer + BufferLength - 1),
-    _closure_app_seconds(0.0) { }
-};
-
-#endif // SHARE_VM_GC_G1_BUFFERINGOOPCLOSURE_HPP
--- a/src/hotspot/share/gc/g1/g1Arguments.cpp
+++ b/src/hotspot/share/gc/g1/g1Arguments.cpp
@ -126,6 +126,11 @@ void G1Arguments::initialize() {

  log_trace(gc)("MarkStackSize: %uk  MarkStackSizeMax: %uk", (unsigned int) (MarkStackSize / K), (uint) (MarkStackSizeMax / K));

+  // By default do not let the target stack size to be more than 1/4 of the entries
+  if (FLAG_IS_DEFAULT(GCDrainStackTargetSize)) {
+    FLAG_SET_ERGO(uintx, GCDrainStackTargetSize, MIN2(GCDrainStackTargetSize, (uintx)TASKQUEUE_SIZE / 4));
+  }
+
 #ifdef COMPILER2
  // Enable loop strip mining to offer better pause time guarantees
  if (FLAG_IS_DEFAULT(UseCountedLoopSafepoints)) {
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
@ -28,7 +28,6 @@
 #include "classfile/symbolTable.hpp"
 #include "code/codeCache.hpp"
 #include "code/icBuffer.hpp"
-#include "gc/g1/bufferingOopClosure.hpp"
 #include "gc/g1/g1Allocator.inline.hpp"
 #include "gc/g1/g1BarrierSet.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
@ -1841,7 +1840,7 @@ void G1CollectedHeap::iterate_dirty_card_closure(CardTableEntryClosure* cl, uint
  while (dcqs.apply_closure_during_gc(cl, worker_i)) {
    n_completed_buffers++;
  }
-  g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, n_completed_buffers);
+  g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, n_completed_buffers, G1GCPhaseTimes::UpdateRSProcessedBuffers);
  dcqs.clear_n_completed_buffers();
  assert(!dcqs.completed_buffers_exist_dirty(), "Completed buffers exist!");
 }
@ -3130,15 +3129,13 @@ public:

      double start_strong_roots_sec = os::elapsedTime();

-      _root_processor->evacuate_roots(pss->closures(), worker_id);
+      _root_processor->evacuate_roots(pss, worker_id);

      // We pass a weak code blobs closure to the remembered set scanning because we want to avoid
      // treating the nmethods visited to act as roots for concurrent marking.
      // We only want to make sure that the oops in the nmethods are adjusted with regard to the
      // objects copied by the current evacuation.
-      _g1h->g1_rem_set()->oops_into_collection_set_do(pss,
-                                                      pss->closures()->weak_codeblobs(),
-                                                      worker_id);
+      _g1h->g1_rem_set()->oops_into_collection_set_do(pss, worker_id);

      double strong_roots_sec = os::elapsedTime() - start_strong_roots_sec;

@ -3152,9 +3149,11 @@ public:
        evac_term_attempts = evac.term_attempts();
        term_sec = evac.term_time();
        double elapsed_sec = os::elapsedTime() - start;
-        _g1h->g1_policy()->phase_times()->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_id, elapsed_sec - term_sec);
-        _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::Termination, worker_id, term_sec);
-        _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, evac_term_attempts);
+
+        G1GCPhaseTimes* p = _g1h->g1_policy()->phase_times();
+        p->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_id, elapsed_sec - term_sec);
+        p->record_time_secs(G1GCPhaseTimes::Termination, worker_id, term_sec);
+        p->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, evac_term_attempts);
      }

      assert(pss->queue_is_empty(), "should be empty");
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -26,6 +26,7 @@
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1GCPhaseTimes.hpp"
 #include "gc/g1/g1HotCardCache.hpp"
+#include "gc/g1/g1ParScanThreadState.inline.hpp"
 #include "gc/g1/g1StringDedup.hpp"
 #include "gc/shared/workerDataArray.inline.hpp"
 #include "memory/resourceArea.hpp"
@ -167,9 +168,12 @@ void G1GCPhaseTimes::note_gc_start() {
 }

 #define ASSERT_PHASE_UNINITIALIZED(phase) \
-    assert(_gc_par_phases[phase]->get(i) == uninitialized, "Phase " #phase " reported for thread that was not started");
+    assert(_gc_par_phases[phase] == NULL || _gc_par_phases[phase]->get(i) == uninitialized, "Phase " #phase " reported for thread that was not started");

 double G1GCPhaseTimes::worker_time(GCParPhases phase, uint worker) {
+  if (_gc_par_phases[phase] == NULL) {
+    return 0.0;
+  }
  double value = _gc_par_phases[phase]->get(worker);
  if (value != WorkerDataArray<double>::uninitialized()) {
    return value;
@ -189,21 +193,20 @@ void G1GCPhaseTimes::note_gc_end() {
      double total_worker_time = _gc_par_phases[GCWorkerEnd]->get(i) - _gc_par_phases[GCWorkerStart]->get(i);
      record_time_secs(GCWorkerTotal, i , total_worker_time);

-      double worker_known_time =
-          worker_time(ExtRootScan, i)
-          + worker_time(SATBFiltering, i)
-          + worker_time(UpdateRS, i)
-          + worker_time(ScanRS, i)
-          + worker_time(CodeRoots, i)
-          + worker_time(ObjCopy, i)
-          + worker_time(Termination, i);
+      double worker_known_time = worker_time(ExtRootScan, i) +
+                                 worker_time(ScanHCC, i) +
+                                 worker_time(UpdateRS, i) +
+                                 worker_time(ScanRS, i) +
+                                 worker_time(CodeRoots, i) +
+                                 worker_time(ObjCopy, i) +
+                                 worker_time(Termination, i);

      record_time_secs(Other, i, total_worker_time - worker_known_time);
    } else {
      // Make sure all slots are uninitialized since this thread did not seem to have been started
      ASSERT_PHASE_UNINITIALIZED(GCWorkerEnd);
      ASSERT_PHASE_UNINITIALIZED(ExtRootScan);
-      ASSERT_PHASE_UNINITIALIZED(SATBFiltering);
+      ASSERT_PHASE_UNINITIALIZED(ScanHCC);
      ASSERT_PHASE_UNINITIALIZED(UpdateRS);
      ASSERT_PHASE_UNINITIALIZED(ScanRS);
      ASSERT_PHASE_UNINITIALIZED(CodeRoots);
@ -225,6 +228,14 @@ void G1GCPhaseTimes::add_time_secs(GCParPhases phase, uint worker_i, double secs
  _gc_par_phases[phase]->add(worker_i, secs);
 }

+void G1GCPhaseTimes::record_or_add_objcopy_time_secs(uint worker_i, double secs) {
+  if (_gc_par_phases[ObjCopy]->get(worker_i) == _gc_par_phases[ObjCopy]->uninitialized()) {
+    record_time_secs(ObjCopy, worker_i, secs);
+  } else {
+    add_time_secs(ObjCopy, worker_i, secs);
+  }
+}
+
 void G1GCPhaseTimes::record_thread_work_item(GCParPhases phase, uint worker_i, size_t count, uint index) {
  _gc_par_phases[phase]->set_thread_work_item(worker_i, count, index);
 }
@ -463,16 +474,49 @@ void G1GCPhaseTimes::print() {
  }
 }

+G1EvacPhaseWithTrimTimeTracker::G1EvacPhaseWithTrimTimeTracker(G1ParScanThreadState* pss, Tickspan& total_time, Tickspan& trim_time) :
+  _pss(pss),
+  _start(Ticks::now()),
+  _total_time(total_time),
+  _trim_time(trim_time) {
+
+  assert(_pss->trim_ticks().value() == 0, "Possibly remaining trim ticks left over from previous use");
+}
+
+G1EvacPhaseWithTrimTimeTracker::~G1EvacPhaseWithTrimTimeTracker() {
+  _total_time += (Ticks::now() - _start) - _pss->trim_ticks();
+  _trim_time += _pss->trim_ticks();
+  _pss->reset_trim_ticks();
+}
+
 G1GCParPhaseTimesTracker::G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id) :
    _phase_times(phase_times), _phase(phase), _worker_id(worker_id) {
  if (_phase_times != NULL) {
-    _start_time = os::elapsedTime();
+    _start_time = Ticks::now();
  }
 }

 G1GCParPhaseTimesTracker::~G1GCParPhaseTimesTracker() {
  if (_phase_times != NULL) {
-    _phase_times->record_time_secs(_phase, _worker_id, os::elapsedTime() - _start_time);
+    _phase_times->record_time_secs(_phase, _worker_id, TicksToTimeHelper::seconds(Ticks::now() - _start_time));
+  }
+}
+
+G1EvacPhaseTimesTracker::G1EvacPhaseTimesTracker(G1GCPhaseTimes* phase_times,
+                                                 G1ParScanThreadState* pss,
+                                                 G1GCPhaseTimes::GCParPhases phase,
+                                                 uint worker_id) :
+  G1GCParPhaseTimesTracker(phase_times, phase, worker_id),
+  _total_time(),
+  _trim_time(),
+  _trim_tracker(pss, _total_time, _trim_time) {
+}
+
+G1EvacPhaseTimesTracker::~G1EvacPhaseTimesTracker() {
+  if (_phase_times != NULL) {
+    // Exclude trim time by increasing the start time.
+    _start_time += _trim_time;
+    _phase_times->record_or_add_objcopy_time_secs(_worker_id, TicksToTimeHelper::seconds(_trim_time));
  }
 }

--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -31,6 +31,7 @@
 #include "utilities/macros.hpp"

 class LineBuffer;
+class G1ParScanThreadState;
 class STWGCTimer;

 template <class T> class WorkerDataArray;
@ -198,6 +199,8 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
  // add a number of seconds to a phase
  void add_time_secs(GCParPhases phase, uint worker_i, double secs);

+  void record_or_add_objcopy_time_secs(uint worker_i, double secs);
+
  void record_thread_work_item(GCParPhases phase, uint worker_i, size_t count, uint index = 0);

  // return the average time for a phase in milliseconds
@ -369,14 +372,36 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
  ReferenceProcessorPhaseTimes* ref_phase_times() { return &_ref_phase_times; }
 };

-class G1GCParPhaseTimesTracker : public StackObj {
-  double _start_time;
+class G1EvacPhaseWithTrimTimeTracker : public StackObj {
+  G1ParScanThreadState* _pss;
+  Ticks _start;
+
+  Tickspan& _total_time;
+  Tickspan& _trim_time;
+public:
+  G1EvacPhaseWithTrimTimeTracker(G1ParScanThreadState* pss, Tickspan& total_time, Tickspan& trim_time);
+  ~G1EvacPhaseWithTrimTimeTracker();
+};
+
+class G1GCParPhaseTimesTracker : public CHeapObj<mtGC> {
+protected:
+  Ticks _start_time;
  G1GCPhaseTimes::GCParPhases _phase;
  G1GCPhaseTimes* _phase_times;
  uint _worker_id;
 public:
  G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id);
-  ~G1GCParPhaseTimesTracker();
+  virtual ~G1GCParPhaseTimesTracker();
+};
+
+class G1EvacPhaseTimesTracker : public G1GCParPhaseTimesTracker {
+  Tickspan _total_time;
+  Tickspan _trim_time;
+
+  G1EvacPhaseWithTrimTimeTracker _trim_tracker;
+public:
+  G1EvacPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1ParScanThreadState* pss, G1GCPhaseTimes::GCParPhases phase, uint worker_id);
+  virtual ~G1EvacPhaseTimesTracker();
 };

 #endif // SHARE_VM_GC_G1_G1GCPHASETIMES_HPP
--- a/src/hotspot/share/gc/g1/g1OopClosures.cpp
+++ b/src/hotspot/share/gc/g1/g1OopClosures.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -56,6 +56,8 @@ void G1CLDScanClosure::do_cld(ClassLoaderData* cld) {
    cld->oops_do(_closure, _must_claim, /*clear_modified_oops*/true);

    _closure->set_scanned_cld(NULL);
+
+    _closure->trim_queue_partially();
  }
  _count++;
 }
--- a/src/hotspot/share/gc/g1/g1OopClosures.hpp
+++ b/src/hotspot/share/gc/g1/g1OopClosures.hpp
@ -57,6 +57,8 @@ public:
  // This closure needs special handling for InstanceRefKlass.
  virtual ReferenceIterationMode reference_iteration_mode() { return DO_DISCOVERED_AND_DISCOVERY; }
  void set_region(HeapRegion* from) { _from = from; }
+
+  inline void trim_queue_partially();
 };

 // Used during the Update RS phase to refine remaining cards in the DCQ during garbage collection.
@ -126,6 +128,8 @@ protected:
 public:
  void set_scanned_cld(ClassLoaderData* cld) { _scanned_cld = cld; }
  inline void do_cld_barrier(oop new_obj);
+
+  inline void trim_queue_partially();
 };

 enum G1Barrier {
--- a/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp
+++ b/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp
@ -67,6 +67,10 @@ inline void G1ScanClosureBase::handle_non_cset_obj_common(InCSetState const stat
  }
 }

+inline void G1ScanClosureBase::trim_queue_partially() {
+  _par_scan_state->trim_queue_partially();
+}
+
 template <class T>
 inline void G1ScanEvacuatedObjClosure::do_oop_nv(T* p) {
  T heap_oop = RawAccess<>::oop_load(p);
@ -225,6 +229,10 @@ void G1ParCopyHelper::mark_forwarded_object(oop from_obj, oop to_obj) {
  _cm->mark_in_next_bitmap(_worker_id, to_obj, from_obj->size());
 }

+void G1ParCopyHelper::trim_queue_partially() {
+  _par_scan_state->trim_queue_partially();
+}
+
 template <G1Barrier barrier, G1Mark do_mark_object>
 template <class T>
 void G1ParCopyClosure<barrier, do_mark_object>::do_oop_work(T* p) {
@ -269,6 +277,7 @@ void G1ParCopyClosure<barrier, do_mark_object>::do_oop_work(T* p) {
      mark_object(obj);
    }
  }
+  trim_queue_partially();
 }

 template <class T> void G1RebuildRemSetClosure::do_oop_nv(T* p) {
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp
@ -43,11 +43,15 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id,
    _dcq(&g1h->dirty_card_queue_set()),
    _ct(g1h->card_table()),
    _closures(NULL),
+    _plab_allocator(NULL),
+    _age_table(false),
+    _tenuring_threshold(g1h->g1_policy()->tenuring_threshold()),
+    _scanner(g1h, this),
    _hash_seed(17),
    _worker_id(worker_id),
-    _tenuring_threshold(g1h->g1_policy()->tenuring_threshold()),
-    _age_table(false),
-    _scanner(g1h, this),
+    _stack_trim_upper_threshold(GCDrainStackTargetSize * 2 + 1),
+    _stack_trim_lower_threshold(GCDrainStackTargetSize),
+    _trim_ticks(),
    _old_gen_is_full(false)
 {
  // we allocate G1YoungSurvRateNumRegions plus one entries, since
@ -138,16 +142,8 @@ bool G1ParScanThreadState::verify_task(StarTask ref) const {
 void G1ParScanThreadState::trim_queue() {
  StarTask ref;
  do {
-    // Drain the overflow stack first, so other threads can steal.
-    while (_refs->pop_overflow(ref)) {
-      if (!_refs->try_push_to_taskqueue(ref)) {
-        dispatch_reference(ref);
-      }
-    }
-
-    while (_refs->pop_local(ref)) {
-      dispatch_reference(ref);
-    }
+    // Fully drain the queue.
+    trim_queue_to_threshold(0);
  } while (!_refs->is_empty());
 }

@ -314,7 +310,7 @@ oop G1ParScanThreadState::copy_to_survivor_space(InCSetState const state,
      // length field of the from-space object.
      arrayOop(obj)->set_length(0);
      oop* old_p = set_partial_array_mask(old);
-      push_on_queue(old_p);
+      do_oop_partial_array(old_p);
    } else {
      HeapRegion* const to_region = _g1h->heap_region_containing(obj_ptr);
      _scanner.set_region(to_region);
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp
@ -35,6 +35,7 @@
 #include "gc/shared/ageTable.hpp"
 #include "memory/allocation.hpp"
 #include "oops/oop.hpp"
+#include "utilities/ticks.hpp"

 class G1PLABAllocator;
 class G1EvacuationRootClosures;
@ -42,7 +43,6 @@ class HeapRegion;
 class outputStream;

 class G1ParScanThreadState : public CHeapObj<mtGC> {
- private:
  G1CollectedHeap* _g1h;
  RefToScanQueue*  _refs;
  DirtyCardQueue   _dcq;
@ -60,6 +60,11 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {
  int  _hash_seed;
  uint _worker_id;

+  // Upper and lower threshold to start and end work queue draining.
+  uint const _stack_trim_upper_threshold;
+  uint const _stack_trim_lower_threshold;
+
+  Tickspan _trim_ticks;
  // Map from young-age-index (0 == not young, 1 is youngest) to
  // surviving words. base is what we get back from the malloc call
  size_t* _surviving_young_words_base;
@ -83,7 +88,7 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {
    return _dest[original.value()];
  }

- public:
+public:
  G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, size_t young_cset_length);
  virtual ~G1ParScanThreadState();

@ -129,7 +134,7 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {

  void flush(size_t* surviving_young_words);

- private:
+private:
  #define G1_PARTIAL_ARRAY_MASK 0x2

  inline bool has_partial_array_mask(oop* ref) const {
@ -185,11 +190,19 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {
  void report_promotion_event(InCSetState const dest_state,
                              oop const old, size_t word_sz, uint age,
                              HeapWord * const obj_ptr) const;
- public:

+  inline bool needs_partial_trimming() const;
+  inline bool is_partially_trimmed() const;
+
+  inline void trim_queue_to_threshold(uint threshold);
+public:
  oop copy_to_survivor_space(InCSetState const state, oop const obj, markOop const old_mark);

  void trim_queue();
+  void trim_queue_partially();
+
+  Tickspan trim_ticks() const;
+  void reset_trim_ticks();

  inline void steal_and_trim_queue(RefToScanQueueSet *task_queues);

--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp
@ -29,6 +29,7 @@
 #include "gc/g1/g1RemSet.hpp"
 #include "oops/access.inline.hpp"
 #include "oops/oop.inline.hpp"
+#include "utilities/ticks.inline.hpp"

 template <class T> void G1ParScanThreadState::do_oop_evac(T* p) {
  // Reference should not be NULL here as such are never pushed to the task queue.
@ -151,4 +152,46 @@ void G1ParScanThreadState::steal_and_trim_queue(RefToScanQueueSet *task_queues)
  }
 }

+inline bool G1ParScanThreadState::needs_partial_trimming() const {
+  return !_refs->overflow_empty() || _refs->size() > _stack_trim_upper_threshold;
+}
+
+inline bool G1ParScanThreadState::is_partially_trimmed() const {
+  return _refs->overflow_empty() && _refs->size() <= _stack_trim_lower_threshold;
+}
+
+inline void G1ParScanThreadState::trim_queue_to_threshold(uint threshold) {
+  StarTask ref;
+  // Drain the overflow stack first, so other threads can potentially steal.
+  while (_refs->pop_overflow(ref)) {
+    if (!_refs->try_push_to_taskqueue(ref)) {
+      dispatch_reference(ref);
+    }
+  }
+
+  while (_refs->pop_local(ref, threshold)) {
+    dispatch_reference(ref);
+  }
+}
+
+inline void G1ParScanThreadState::trim_queue_partially() {
+  if (!needs_partial_trimming()) {
+    return;
+  }
+
+  const Ticks start = Ticks::now();
+  do {
+    trim_queue_to_threshold(_stack_trim_lower_threshold);
+  } while (!is_partially_trimmed());
+  _trim_ticks += Ticks::now() - start;
+}
+
+inline Tickspan G1ParScanThreadState::trim_ticks() const {
+  return _trim_ticks;
+}
+
+inline void G1ParScanThreadState::reset_trim_ticks() {
+  _trim_ticks = Tickspan();
+}
+
 #endif // SHARE_VM_GC_G1_G1PARSCANTHREADSTATE_INLINE_HPP
--- a/src/hotspot/share/gc/g1/g1Policy.cpp
+++ b/src/hotspot/share/gc/g1/g1Policy.cpp
@ -628,7 +628,7 @@ void G1Policy::record_collection_pause_end(double pause_time_ms, size_t cards_sc
  if (update_stats) {
    double cost_per_card_ms = 0.0;
    if (_pending_cards > 0) {
-      cost_per_card_ms = (average_time_ms(G1GCPhaseTimes::UpdateRS) - scan_hcc_time_ms) / (double) _pending_cards;
+      cost_per_card_ms = (average_time_ms(G1GCPhaseTimes::UpdateRS)) / (double) _pending_cards;
      _analytics->report_cost_per_card_ms(cost_per_card_ms);
    }
    _analytics->report_cost_scan_hcc(scan_hcc_time_ms);
@ -730,9 +730,9 @@ void G1Policy::record_collection_pause_end(double pause_time_ms, size_t cards_sc
  } else {
    update_rs_time_goal_ms -= scan_hcc_time_ms;
  }
-  _g1h->concurrent_refine()->adjust(average_time_ms(G1GCPhaseTimes::UpdateRS) - scan_hcc_time_ms,
-                                   phase_times()->sum_thread_work_items(G1GCPhaseTimes::UpdateRS),
-                                   update_rs_time_goal_ms);
+  _g1h->concurrent_refine()->adjust(average_time_ms(G1GCPhaseTimes::UpdateRS),
+                                    phase_times()->sum_thread_work_items(G1GCPhaseTimes::UpdateRS),
+                                    update_rs_time_goal_ms);

  cset_chooser()->verify();
 }
--- a/src/hotspot/share/gc/g1/g1RemSet.cpp
+++ b/src/hotspot/share/gc/g1/g1RemSet.cpp
@ -33,6 +33,7 @@
 #include "gc/g1/g1GCPhaseTimes.hpp"
 #include "gc/g1/g1HotCardCache.hpp"
 #include "gc/g1/g1OopClosures.inline.hpp"
+#include "gc/g1/g1RootClosures.hpp"
 #include "gc/g1/g1RemSet.hpp"
 #include "gc/g1/heapRegion.inline.hpp"
 #include "gc/g1/heapRegionManager.inline.hpp"
@ -306,32 +307,21 @@ void G1RemSet::initialize(size_t capacity, uint max_regions) {

 G1ScanRSForRegionClosure::G1ScanRSForRegionClosure(G1RemSetScanState* scan_state,
                                                   G1ScanObjsDuringScanRSClosure* scan_obj_on_card,
-                                                   CodeBlobClosure* code_root_cl,
+                                                   G1ParScanThreadState* pss,
                                                   uint worker_i) :
-  _scan_state(scan_state),
+  _g1h(G1CollectedHeap::heap()),
+  _ct(_g1h->card_table()),
+  _pss(pss),
  _scan_objs_on_card_cl(scan_obj_on_card),
-  _code_root_cl(code_root_cl),
-  _strong_code_root_scan_time_sec(0.0),
+  _scan_state(scan_state),
+  _worker_i(worker_i),
  _cards_claimed(0),
  _cards_scanned(0),
  _cards_skipped(0),
-  _worker_i(worker_i) {
-  _g1h = G1CollectedHeap::heap();
-  _bot = _g1h->bot();
-  _ct = _g1h->card_table();
-}
-
-void G1ScanRSForRegionClosure::scan_card(MemRegion mr, uint region_idx_for_card) {
-  HeapRegion* const card_region = _g1h->region_at(region_idx_for_card);
-  _scan_objs_on_card_cl->set_region(card_region);
-  card_region->oops_on_card_seq_iterate_careful<true>(mr, _scan_objs_on_card_cl);
-  _cards_scanned++;
-}
-
-void G1ScanRSForRegionClosure::scan_strong_code_roots(HeapRegion* r) {
-  double scan_start = os::elapsedTime();
-  r->strong_code_roots_do(_code_root_cl);
-  _strong_code_root_scan_time_sec += (os::elapsedTime() - scan_start);
+  _rem_set_root_scan_time(),
+  _rem_set_trim_partially_time(),
+  _strong_code_root_scan_time(),
+  _strong_code_trim_partially_time() {
 }

 void G1ScanRSForRegionClosure::claim_card(size_t card_index, const uint region_idx_for_card){
@ -339,13 +329,17 @@ void G1ScanRSForRegionClosure::claim_card(size_t card_index, const uint region_i
  _scan_state->add_dirty_region(region_idx_for_card);
 }

-bool G1ScanRSForRegionClosure::do_heap_region(HeapRegion* r) {
-  assert(r->in_collection_set(), "should only be called on elements of CS.");
-  uint region_idx = r->hrm_index();
+void G1ScanRSForRegionClosure::scan_card(MemRegion mr, uint region_idx_for_card) {
+  HeapRegion* const card_region = _g1h->region_at(region_idx_for_card);
+  _scan_objs_on_card_cl->set_region(card_region);
+  card_region->oops_on_card_seq_iterate_careful<true>(mr, _scan_objs_on_card_cl);
+  _scan_objs_on_card_cl->trim_queue_partially();
+  _cards_scanned++;
+}
+
+void G1ScanRSForRegionClosure::scan_rem_set_roots(HeapRegion* r) {
+  uint const region_idx = r->hrm_index();

-  if (_scan_state->iter_is_complete(region_idx)) {
-    return false;
-  }
  if (_scan_state->claim_iter(region_idx)) {
    // If we ever free the collection set concurrently, we should also
    // clear the card table concurrently therefore we won't need to
@ -397,33 +391,52 @@ bool G1ScanRSForRegionClosure::do_heap_region(HeapRegion* r) {

    scan_card(mr, region_idx_for_card);
  }
+}
+
+void G1ScanRSForRegionClosure::scan_strong_code_roots(HeapRegion* r) {
+  r->strong_code_roots_do(_pss->closures()->weak_codeblobs());
+}
+
+bool G1ScanRSForRegionClosure::do_heap_region(HeapRegion* r) {
+  assert(r->in_collection_set(),
+         "Should only be called on elements of the collection set but region %u is not.",
+         r->hrm_index());
+  uint const region_idx = r->hrm_index();
+
+  // Do an early out if we know we are complete.
+  if (_scan_state->iter_is_complete(region_idx)) {
+    return false;
+  }
+
+  {
+    G1EvacPhaseWithTrimTimeTracker timer(_pss, _rem_set_root_scan_time, _rem_set_trim_partially_time);
+    scan_rem_set_roots(r);
+  }
+
  if (_scan_state->set_iter_complete(region_idx)) {
+    G1EvacPhaseWithTrimTimeTracker timer(_pss, _strong_code_root_scan_time, _strong_code_trim_partially_time);
    // Scan the strong code root list attached to the current region
    scan_strong_code_roots(r);
  }
  return false;
 }

-void G1RemSet::scan_rem_set(G1ParScanThreadState* pss,
-                            CodeBlobClosure* heap_region_codeblobs,
-                            uint worker_i) {
-  double rs_time_start = os::elapsedTime();
-
+void G1RemSet::scan_rem_set(G1ParScanThreadState* pss, uint worker_i) {
  G1ScanObjsDuringScanRSClosure scan_cl(_g1h, pss);
-  G1ScanRSForRegionClosure cl(_scan_state, &scan_cl, heap_region_codeblobs, worker_i);
+  G1ScanRSForRegionClosure cl(_scan_state, &scan_cl, pss, worker_i);
  _g1h->collection_set_iterate_from(&cl, worker_i);

-  double scan_rs_time_sec = (os::elapsedTime() - rs_time_start) -
-                             cl.strong_code_root_scan_time_sec();
-
  G1GCPhaseTimes* p = _g1p->phase_times();

-  p->record_time_secs(G1GCPhaseTimes::ScanRS, worker_i, scan_rs_time_sec);
+  p->record_time_secs(G1GCPhaseTimes::ScanRS, worker_i, TicksToTimeHelper::seconds(cl.rem_set_root_scan_time()));
+  p->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_i, TicksToTimeHelper::seconds(cl.rem_set_trim_partially_time()));
+
  p->record_thread_work_item(G1GCPhaseTimes::ScanRS, worker_i, cl.cards_scanned(), G1GCPhaseTimes::ScanRSScannedCards);
  p->record_thread_work_item(G1GCPhaseTimes::ScanRS, worker_i, cl.cards_claimed(), G1GCPhaseTimes::ScanRSClaimedCards);
  p->record_thread_work_item(G1GCPhaseTimes::ScanRS, worker_i, cl.cards_skipped(), G1GCPhaseTimes::ScanRSSkippedCards);

-  p->record_time_secs(G1GCPhaseTimes::CodeRoots, worker_i, cl.strong_code_root_scan_time_sec());
+  p->record_time_secs(G1GCPhaseTimes::CodeRoots, worker_i, TicksToTimeHelper::seconds(cl.strong_code_root_scan_time()));
+  p->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_i, TicksToTimeHelper::seconds(cl.strong_code_root_trim_partially_time()));
 }

 // Closure used for updating rem sets. Only called during an evacuation pause.
@ -448,6 +461,7 @@ public:
    bool card_scanned = _g1rs->refine_card_during_gc(card_ptr, _update_rs_cl);

    if (card_scanned) {
+      _update_rs_cl->trim_queue_partially();
      _cards_scanned++;
    } else {
      _cards_skipped++;
@ -460,32 +474,37 @@ public:
 };

 void G1RemSet::update_rem_set(G1ParScanThreadState* pss, uint worker_i) {
-  G1ScanObjsDuringUpdateRSClosure update_rs_cl(_g1h, pss, worker_i);
-  G1RefineCardClosure refine_card_cl(_g1h, &update_rs_cl);
+  G1GCPhaseTimes* p = _g1p->phase_times();

-  G1GCParPhaseTimesTracker x(_g1p->phase_times(), G1GCPhaseTimes::UpdateRS, worker_i);
+  // Apply closure to log entries in the HCC.
  if (G1HotCardCache::default_use_cache()) {
-    // Apply the closure to the entries of the hot card cache.
-    G1GCParPhaseTimesTracker y(_g1p->phase_times(), G1GCPhaseTimes::ScanHCC, worker_i);
+    G1EvacPhaseTimesTracker x(p, pss, G1GCPhaseTimes::ScanHCC, worker_i);
+
+    G1ScanObjsDuringUpdateRSClosure scan_hcc_cl(_g1h, pss, worker_i);
+    G1RefineCardClosure refine_card_cl(_g1h, &scan_hcc_cl);
    _g1h->iterate_hcc_closure(&refine_card_cl, worker_i);
  }
-  // Apply the closure to all remaining log entries.
-  _g1h->iterate_dirty_card_closure(&refine_card_cl, worker_i);

-  G1GCPhaseTimes* p = _g1p->phase_times();
-  p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_scanned(), G1GCPhaseTimes::UpdateRSScannedCards);
-  p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_skipped(), G1GCPhaseTimes::UpdateRSSkippedCards);
+  // Now apply the closure to all remaining log entries.
+  {
+    G1EvacPhaseTimesTracker x(p, pss, G1GCPhaseTimes::UpdateRS, worker_i);
+
+    G1ScanObjsDuringUpdateRSClosure update_rs_cl(_g1h, pss, worker_i);
+    G1RefineCardClosure refine_card_cl(_g1h, &update_rs_cl);
+    _g1h->iterate_dirty_card_closure(&refine_card_cl, worker_i);
+
+    p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_scanned(), G1GCPhaseTimes::UpdateRSScannedCards);
+    p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_skipped(), G1GCPhaseTimes::UpdateRSSkippedCards);
+  }
 }

 void G1RemSet::cleanupHRRS() {
  HeapRegionRemSet::cleanup();
 }

-void G1RemSet::oops_into_collection_set_do(G1ParScanThreadState* pss,
-                                           CodeBlobClosure* heap_region_codeblobs,
-                                           uint worker_i) {
+void G1RemSet::oops_into_collection_set_do(G1ParScanThreadState* pss, uint worker_i) {
  update_rem_set(pss, worker_i);
-  scan_rem_set(pss, heap_region_codeblobs, worker_i);;
+  scan_rem_set(pss, worker_i);;
 }

 void G1RemSet::prepare_for_oops_into_collection_set_do() {
--- a/src/hotspot/share/gc/g1/g1RemSet.hpp
+++ b/src/hotspot/share/gc/g1/g1RemSet.hpp
@ -32,6 +32,7 @@
 #include "gc/g1/heapRegion.hpp"
 #include "memory/allocation.hpp"
 #include "memory/iterator.hpp"
+#include "utilities/ticks.hpp"

 // A G1RemSet provides ways of iterating over pointers into a selected
 // collection set.
@ -61,9 +62,7 @@ private:

  // Scan all remembered sets of the collection set for references into the collection
  // set.
-  void scan_rem_set(G1ParScanThreadState* pss,
-                    CodeBlobClosure* heap_region_codeblobs,
-                    uint worker_i);
+  void scan_rem_set(G1ParScanThreadState* pss, uint worker_i);

  // Flush remaining refinement buffers for cross-region references to either evacuate references
  // into the collection set or update the remembered set.
@ -102,9 +101,7 @@ public:
  //
  // Further applies heap_region_codeblobs on the oops of the unmarked nmethods on the strong code
  // roots list for each region in the collection set.
-  void oops_into_collection_set_do(G1ParScanThreadState* pss,
-                                   CodeBlobClosure* heap_region_codeblobs,
-                                   uint worker_i);
+  void oops_into_collection_set_do(G1ParScanThreadState* pss, uint worker_i);

  // Prepare for and cleanup after an oops_into_collection_set_do
  // call.  Must call each of these once before and after (in sequential
@ -138,37 +135,44 @@ public:
 };

 class G1ScanRSForRegionClosure : public HeapRegionClosure {
+  G1CollectedHeap* _g1h;
+  G1CardTable *_ct;
+
+  G1ParScanThreadState* _pss;
+  G1ScanObjsDuringScanRSClosure* _scan_objs_on_card_cl;
+
  G1RemSetScanState* _scan_state;

+  uint   _worker_i;
+
  size_t _cards_scanned;
  size_t _cards_claimed;
  size_t _cards_skipped;

-  G1CollectedHeap* _g1h;
+  Tickspan _rem_set_root_scan_time;
+  Tickspan _rem_set_trim_partially_time;

-  G1ScanObjsDuringScanRSClosure* _scan_objs_on_card_cl;
-  CodeBlobClosure* _code_root_cl;
-
-  G1BlockOffsetTable* _bot;
-  G1CardTable *_ct;
-
-  double _strong_code_root_scan_time_sec;
-  uint   _worker_i;
+  Tickspan _strong_code_root_scan_time;
+  Tickspan _strong_code_trim_partially_time;

  void claim_card(size_t card_index, const uint region_idx_for_card);
  void scan_card(MemRegion mr, uint region_idx_for_card);
+
+  void scan_rem_set_roots(HeapRegion* r);
  void scan_strong_code_roots(HeapRegion* r);
 public:
  G1ScanRSForRegionClosure(G1RemSetScanState* scan_state,
                           G1ScanObjsDuringScanRSClosure* scan_obj_on_card,
-                           CodeBlobClosure* code_root_cl,
+                           G1ParScanThreadState* pss,
                           uint worker_i);

  bool do_heap_region(HeapRegion* r);

-  double strong_code_root_scan_time_sec() {
-    return _strong_code_root_scan_time_sec;
-  }
+  Tickspan rem_set_root_scan_time() const { return _rem_set_root_scan_time; }
+  Tickspan rem_set_trim_partially_time() const { return _rem_set_trim_partially_time; }
+
+  Tickspan strong_code_root_scan_time() const { return _strong_code_root_scan_time;  }
+  Tickspan strong_code_root_trim_partially_time() const { return _strong_code_trim_partially_time; }

  size_t cards_scanned() const { return _cards_scanned; }
  size_t cards_claimed() const { return _cards_claimed; }
--- a/src/hotspot/share/gc/g1/g1RootClosures.cpp
+++ b/src/hotspot/share/gc/g1/g1RootClosures.cpp
@ -37,8 +37,8 @@ public:
                       bool in_young_gc) :
      _closures(g1h, pss, in_young_gc, /* must_claim_cld */ false) {}

-  OopClosure* weak_oops()   { return &_closures._buffered_oops; }
-  OopClosure* strong_oops() { return &_closures._buffered_oops; }
+  OopClosure* weak_oops()   { return &_closures._oops; }
+  OopClosure* strong_oops() { return &_closures._oops; }

  CLDClosure* weak_clds()             { return &_closures._clds; }
  CLDClosure* strong_clds()           { return &_closures._clds; }
@ -47,9 +47,6 @@ public:
  CodeBlobClosure* strong_codeblobs()      { return &_closures._codeblobs; }
  CodeBlobClosure* weak_codeblobs()        { return &_closures._codeblobs; }

-  void flush()                 { _closures._buffered_oops.done(); }
-  double closure_app_seconds() { return _closures._buffered_oops.closure_app_seconds(); }
-
  OopClosure* raw_strong_oops() { return &_closures._oops; }

  bool trace_metadata()         { return false; }
@ -79,8 +76,8 @@ public:
      _strong(g1h, pss, /* process_only_dirty_klasses */ false, /* must_claim_cld */ true),
      _weak(g1h, pss,   /* process_only_dirty_klasses */ false, /* must_claim_cld */ true) {}

-  OopClosure* weak_oops()   { return &_weak._buffered_oops; }
-  OopClosure* strong_oops() { return &_strong._buffered_oops; }
+  OopClosure* weak_oops()   { return &_weak._oops; }
+  OopClosure* strong_oops() { return &_strong._oops; }

  // If MarkWeak is G1MarkPromotedFromRoot then the weak CLDs must be processed in a second pass.
  CLDClosure* weak_clds()             { return null_if<G1MarkPromotedFromRoot>(&_weak._clds); }
@ -93,16 +90,6 @@ public:
  CodeBlobClosure* strong_codeblobs()      { return &_strong._codeblobs; }
  CodeBlobClosure* weak_codeblobs()        { return &_weak._codeblobs; }

-  void flush() {
-    _strong._buffered_oops.done();
-    _weak._buffered_oops.done();
-  }
-
-  double closure_app_seconds() {
-    return _strong._buffered_oops.closure_app_seconds() +
-           _weak._buffered_oops.closure_app_seconds();
-  }
-
  OopClosure* raw_strong_oops() { return &_strong._oops; }

  // If we are not marking all weak roots then we are tracing
--- a/src/hotspot/share/gc/g1/g1RootClosures.hpp
+++ b/src/hotspot/share/gc/g1/g1RootClosures.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -49,10 +49,6 @@ public:

 class G1EvacuationRootClosures : public G1RootClosures {
 public:
-  // Flush any buffered state and deferred processing
-  virtual void flush() = 0;
-  virtual double closure_app_seconds() = 0;
-
  // Applied to the weakly reachable CLDs when all strongly reachable
  // CLDs are guaranteed to have been processed.
  virtual CLDClosure* second_pass_weak_clds() = 0;
--- a/src/hotspot/share/gc/g1/g1RootProcessor.cpp
+++ b/src/hotspot/share/gc/g1/g1RootProcessor.cpp
@ -28,12 +28,12 @@
 #include "classfile/stringTable.hpp"
 #include "classfile/systemDictionary.hpp"
 #include "code/codeCache.hpp"
-#include "gc/g1/bufferingOopClosure.hpp"
 #include "gc/g1/g1BarrierSet.hpp"
 #include "gc/g1/g1CodeBlobClosure.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1CollectorState.hpp"
 #include "gc/g1/g1GCPhaseTimes.hpp"
+#include "gc/g1/g1ParScanThreadState.inline.hpp"
 #include "gc/g1/g1Policy.hpp"
 #include "gc/g1/g1RootClosures.hpp"
 #include "gc/g1/g1RootProcessor.hpp"
@ -73,10 +73,12 @@ G1RootProcessor::G1RootProcessor(G1CollectedHeap* g1h, uint n_workers) :
    _lock(Mutex::leaf, "G1 Root Scanning barrier lock", false, Monitor::_safepoint_check_never),
    _n_workers_discovered_strong_classes(0) {}

-void G1RootProcessor::evacuate_roots(G1EvacuationRootClosures* closures, uint worker_i) {
-  double ext_roots_start = os::elapsedTime();
+void G1RootProcessor::evacuate_roots(G1ParScanThreadState* pss, uint worker_i) {
  G1GCPhaseTimes* phase_times = _g1h->g1_policy()->phase_times();

+  G1EvacPhaseTimesTracker timer(phase_times, pss, G1GCPhaseTimes::ExtRootScan, worker_i);
+
+  G1EvacuationRootClosures* closures = pss->closures();
  process_java_roots(closures, phase_times, worker_i);

  // This is the point where this worker thread will not find more strong CLDs/nmethods.
@ -118,17 +120,6 @@ void G1RootProcessor::evacuate_roots(G1EvacuationRootClosures* closures, uint wo
    assert(closures->second_pass_weak_clds() == NULL, "Should be null if not tracing metadata.");
  }

-  // Finish up any enqueued closure apps (attributed as object copy time).
-  closures->flush();
-
-  double obj_copy_time_sec = closures->closure_app_seconds();
-
-  phase_times->record_time_secs(G1GCPhaseTimes::ObjCopy, worker_i, obj_copy_time_sec);
-
-  double ext_root_time_sec = os::elapsedTime() - ext_roots_start - obj_copy_time_sec;
-
-  phase_times->record_time_secs(G1GCPhaseTimes::ExtRootScan, worker_i, ext_root_time_sec);
-
  // During conc marking we have to filter the per-thread SATB buffers
  // to make sure we remove any oops into the CSet (which will show up
  // as implicitly live).
--- a/src/hotspot/share/gc/g1/g1RootProcessor.hpp
+++ b/src/hotspot/share/gc/g1/g1RootProcessor.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -34,6 +34,7 @@ class CodeBlobClosure;
 class G1CollectedHeap;
 class G1EvacuationRootClosures;
 class G1GCPhaseTimes;
+class G1ParScanThreadState;
 class G1RootClosures;
 class Monitor;
 class OopClosure;
@ -97,10 +98,10 @@ class G1RootProcessor : public StackObj {
 public:
  G1RootProcessor(G1CollectedHeap* g1h, uint n_workers);

-  // Apply closures to the strongly and weakly reachable roots in the system
+  // Apply correct closures from pss to the strongly and weakly reachable roots in the system
  // in a single pass.
  // Record and report timing measurements for sub phases using the worker_i
-  void evacuate_roots(G1EvacuationRootClosures* closures, uint worker_i);
+  void evacuate_roots(G1ParScanThreadState* pss, uint worker_id);

  // Apply oops, clds and blobs to all strongly reachable roots in the system
  void process_strong_roots(OopClosure* oops,
--- a/src/hotspot/share/gc/g1/g1SharedClosures.hpp
+++ b/src/hotspot/share/gc/g1/g1SharedClosures.hpp
@ -22,7 +22,6 @@
 *
 */

-#include "gc/g1/bufferingOopClosure.hpp"
 #include "gc/g1/g1CodeBlobClosure.hpp"
 #include "gc/g1/g1OopClosures.hpp"
 #include "memory/iterator.hpp"
@ -39,12 +38,10 @@ public:

  G1CLDScanClosure                _clds;
  G1CodeBlobClosure               _codeblobs;
-  BufferingOopClosure             _buffered_oops;

  G1SharedClosures(G1CollectedHeap* g1h, G1ParScanThreadState* pss, bool process_only_dirty, bool must_claim_cld) :
    _oops(g1h, pss),
    _oops_in_cld(g1h, pss),
    _clds(&_oops_in_cld, process_only_dirty, must_claim_cld),
-    _codeblobs(&_oops),
-    _buffered_oops(&_oops) {}
+    _codeblobs(&_oops) {}
 };
--- a/src/hotspot/share/gc/shared/taskqueue.hpp
+++ b/src/hotspot/share/gc/shared/taskqueue.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -285,9 +285,10 @@ public:
  inline bool push(E t);

  // Attempts to claim a task from the "local" end of the queue (the most
-  // recently pushed).  If successful, returns true and sets t to the task;
-  // otherwise, returns false (the queue is empty).
-  inline bool pop_local(volatile E& t);
+  // recently pushed) as long as the number of entries exceeds the threshold.
+  // If successful, returns true and sets t to the task; otherwise, returns false
+  // (the queue is empty or the number of elements below the threshold).
+  inline bool pop_local(volatile E& t, uint threshold = 0);

  // Like pop_local(), but uses the "global" end of the queue (the least
  // recently pushed).
--- a/src/hotspot/share/gc/shared/taskqueue.inline.hpp
+++ b/src/hotspot/share/gc/shared/taskqueue.inline.hpp
@ -152,7 +152,7 @@ bool GenericTaskQueue<E, F, N>::pop_local_slow(uint localBot, Age oldAge) {
 }

 template<class E, MEMFLAGS F, unsigned int N> inline bool
-GenericTaskQueue<E, F, N>::pop_local(volatile E& t) {
+GenericTaskQueue<E, F, N>::pop_local(volatile E& t, uint threshold) {
  uint localBot = _bottom;
  // This value cannot be N-1.  That can only occur as a result of
  // the assignment to bottom in this method.  If it does, this method
@ -160,7 +160,7 @@ GenericTaskQueue<E, F, N>::pop_local(volatile E& t) {
  // since this is pop_local.)
  uint dirty_n_elems = dirty_size(localBot, _age.top());
  assert(dirty_n_elems != N - 1, "Shouldn't be possible...");
-  if (dirty_n_elems == 0) return false;
+  if (dirty_n_elems <= threshold) return false;
  localBot = decrement_index(localBot);
  _bottom = localBot;
  // This is necessary to prevent any read below from being reordered
--- a/src/hotspot/share/utilities/ticks.hpp
+++ b/src/hotspot/share/utilities/ticks.hpp
@ -47,6 +47,11 @@ class Tickspan {
    return *this;
  }

+  Tickspan& operator-=(const Tickspan& rhs) {
+    _span_ticks -= rhs._span_ticks;
+    return *this;
+  }
+
  jlong value() const {
    return _span_ticks;
  }
--- a/src/hotspot/share/utilities/ticks.inline.hpp
+++ b/src/hotspot/share/utilities/ticks.inline.hpp
@ -32,6 +32,11 @@ inline Tickspan operator+(Tickspan lhs, const Tickspan& rhs) {
  return lhs;
 }

+inline Tickspan operator-(Tickspan lhs, const Tickspan& rhs) {
+  lhs -= rhs;
+  return lhs;
+}
+
 inline bool operator==(const Tickspan& lhs, const Tickspan& rhs) {
  return lhs.value() == rhs.value();
 }
--- a/test/hotspot/gtest/gc/g1/test_bufferingOopClosure.cpp
+++ b/test/hotspot/gtest/gc/g1/test_bufferingOopClosure.cpp
@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2014, 2016, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-#include "precompiled.hpp"
-#include "gc/g1/bufferingOopClosure.hpp"
-#include "memory/iterator.hpp"
-#include "unittest.hpp"
-
-class BufferingOopClosureTest : public ::testing::Test {
- public:
-  // Helper class to fake a set of oop*s and narrowOop*s.
-  class FakeRoots {
-   public:
-    // Used for sanity checking of the values passed to the do_oops functions in the test.
-    static const uintptr_t NarrowOopMarker = uintptr_t(1) << (BitsPerWord -1);
-
-    int    _num_narrow;
-    int    _num_full;
-    void** _narrow;
-    void** _full;
-
-    FakeRoots(int num_narrow, int num_full) :
-        _num_narrow(num_narrow),
-        _num_full(num_full),
-        _narrow((void**)::malloc(sizeof(void*) * num_narrow)),
-        _full((void**)::malloc(sizeof(void*) * num_full)) {
-
-      for (int i = 0; i < num_narrow; i++) {
-        _narrow[i] = (void*)(NarrowOopMarker + (uintptr_t)i);
-      }
-      for (int i = 0; i < num_full; i++) {
-        _full[i] = (void*)(uintptr_t)i;
-      }
-    }
-
-    ~FakeRoots() {
-      ::free(_narrow);
-      ::free(_full);
-    }
-
-    void oops_do_narrow_then_full(OopClosure* cl) {
-      for (int i = 0; i < _num_narrow; i++) {
-        cl->do_oop((narrowOop*)_narrow[i]);
-      }
-      for (int i = 0; i < _num_full; i++) {
-        cl->do_oop((oop*)_full[i]);
-      }
-    }
-
-    void oops_do_full_then_narrow(OopClosure* cl) {
-      for (int i = 0; i < _num_full; i++) {
-        cl->do_oop((oop*)_full[i]);
-      }
-      for (int i = 0; i < _num_narrow; i++) {
-        cl->do_oop((narrowOop*)_narrow[i]);
-      }
-    }
-
-    void oops_do_mixed(OopClosure* cl) {
-      int i;
-      for (i = 0; i < _num_full && i < _num_narrow; i++) {
-        cl->do_oop((oop*)_full[i]);
-        cl->do_oop((narrowOop*)_narrow[i]);
-      }
-      for (int j = i; j < _num_full; j++) {
-        cl->do_oop((oop*)_full[i]);
-      }
-      for (int j = i; j < _num_narrow; j++) {
-        cl->do_oop((narrowOop*)_narrow[i]);
-      }
-    }
-
-    static const int MaxOrder = 2;
-
-    void oops_do(OopClosure* cl, int do_oop_order) {
-      switch(do_oop_order) {
-        case 0:
-          oops_do_narrow_then_full(cl);
-          break;
-        case 1:
-          oops_do_full_then_narrow(cl);
-          break;
-        case 2:
-          oops_do_mixed(cl);
-          break;
-        default:
-          oops_do_narrow_then_full(cl);
-          break;
-      }
-    }
-  };
-
-  class CountOopClosure : public OopClosure {
-    int _narrow_oop_count;
-    int _full_oop_count;
-   public:
-    CountOopClosure() : _narrow_oop_count(0), _full_oop_count(0) {}
-    void do_oop(narrowOop* p) {
-      EXPECT_NE(uintptr_t(0), (uintptr_t(p) & FakeRoots::NarrowOopMarker))
-              << "The narrowOop was unexpectedly not marked with the NarrowOopMarker";
-      _narrow_oop_count++;
-    }
-
-    void do_oop(oop* p){
-      EXPECT_EQ(uintptr_t(0), (uintptr_t(p) & FakeRoots::NarrowOopMarker))
-              << "The oop was unexpectedly marked with the NarrowOopMarker";
-      _full_oop_count++;
-    }
-
-    int narrow_oop_count() { return _narrow_oop_count; }
-    int full_oop_count()   { return _full_oop_count; }
-    int all_oop_count()    { return _narrow_oop_count + _full_oop_count; }
-  };
-
-  class DoNothingOopClosure : public OopClosure {
-   public:
-    void do_oop(narrowOop* p) {}
-    void do_oop(oop* p)       {}
-  };
-
-  static void testCount(int num_narrow, int num_full, int do_oop_order) {
-    FakeRoots fr(num_narrow, num_full);
-
-    CountOopClosure coc;
-    BufferingOopClosure boc(&coc);
-
-    fr.oops_do(&boc, do_oop_order);
-
-    boc.done();
-
-    EXPECT_EQ(num_narrow, coc.narrow_oop_count()) << "when running testCount("
-            << num_narrow << ", " << num_full << ", " << do_oop_order << ")";
-
-    EXPECT_EQ(num_full, coc.full_oop_count()) << "when running testCount("
-            << num_narrow << ", " << num_full << ", " << do_oop_order << ")";
-
-    EXPECT_EQ(num_narrow + num_full, coc.all_oop_count()) << "when running testCount("
-            << num_narrow << ", " << num_full << ", " << do_oop_order << ")";
-  }
-
-  static void testIsBufferEmptyOrFull(int num_narrow, int num_full, bool expect_empty, bool expect_full) {
-    FakeRoots fr(num_narrow, num_full);
-
-    DoNothingOopClosure cl;
-    BufferingOopClosure boc(&cl);
-
-    fr.oops_do(&boc, 0);
-
-    EXPECT_EQ(expect_empty, boc.is_buffer_empty())
-            << "when running testIsBufferEmptyOrFull("
-            << num_narrow << ", " << num_full << ", "
-            << expect_empty << ", " << expect_full << ")";
-
-    EXPECT_EQ(expect_full, boc.is_buffer_full())
-            << "when running testIsBufferEmptyOrFull("
-            << num_narrow << ", " << num_full << ", "
-            << expect_empty << ", " << expect_full << ")";
-  }
-
-  static void testEmptyAfterDone(int num_narrow, int num_full) {
-    FakeRoots fr(num_narrow, num_full);
-
-    DoNothingOopClosure cl;
-    BufferingOopClosure boc(&cl);
-
-    fr.oops_do(&boc, 0);
-
-    // Make sure all get processed.
-    boc.done();
-
-    EXPECT_TRUE(boc.is_buffer_empty()) << "Should be empty after call to done()."
-            << " testEmptyAfterDone(" << num_narrow << ", " << num_full << ")";
-  }
-
-  static int get_buffer_length() {
-    return BufferingOopClosure::BufferLength;
-  }
-};
-
-TEST_VM_F(BufferingOopClosureTest, count_test) {
-  int bl = BufferingOopClosureTest::get_buffer_length();
-
-  for (int order = 0; order < FakeRoots::MaxOrder; order++) {
-    testCount(0,      0,      order);
-    testCount(10,     0,      order);
-    testCount(0,      10,     order);
-    testCount(10,     10,     order);
-    testCount(bl,     10,     order);
-    testCount(10,     bl,     order);
-    testCount(bl,     bl,     order);
-    testCount(bl + 1, 10,     order);
-    testCount(10,     bl + 1, order);
-    testCount(bl + 1, bl,     order);
-    testCount(bl,     bl + 1, order);
-    testCount(bl + 1, bl + 1, order);
-  }
-}
-
-TEST_VM_F(BufferingOopClosureTest, buffer_empty_or_full) {
-  int bl = BufferingOopClosureTest::get_buffer_length();
-
-  testIsBufferEmptyOrFull(0,      0,      true,  false);
-  testIsBufferEmptyOrFull(1,      0,      false, false);
-  testIsBufferEmptyOrFull(0,      1,      false, false);
-  testIsBufferEmptyOrFull(1,      1,      false, false);
-  testIsBufferEmptyOrFull(10,     0,      false, false);
-  testIsBufferEmptyOrFull(0,      10,     false, false);
-  testIsBufferEmptyOrFull(10,     10,     false, false);
-  testIsBufferEmptyOrFull(0,      bl,     false, true);
-  testIsBufferEmptyOrFull(bl,     0,      false, true);
-  testIsBufferEmptyOrFull(bl / 2, bl / 2, false, true);
-  testIsBufferEmptyOrFull(bl - 1, 1,      false, true);
-  testIsBufferEmptyOrFull(1,      bl - 1, false, true);
-  // Processed
-  testIsBufferEmptyOrFull(bl + 1, 0,      false, false);
-  testIsBufferEmptyOrFull(bl * 2, 0,      false, true);
-}
-
-TEST_VM_F(BufferingOopClosureTest, empty_after_done) {
-  int bl = BufferingOopClosureTest::get_buffer_length();
-
-  testEmptyAfterDone(0,      0);
-  testEmptyAfterDone(1,      0);
-  testEmptyAfterDone(0,      1);
-  testEmptyAfterDone(1,      1);
-  testEmptyAfterDone(10,     0);
-  testEmptyAfterDone(0,      10);
-  testEmptyAfterDone(10,     10);
-  testEmptyAfterDone(0,      bl);
-  testEmptyAfterDone(bl,     0);
-  testEmptyAfterDone(bl / 2, bl / 2);
-  testEmptyAfterDone(bl - 1, 1);
-  testEmptyAfterDone(1,      bl - 1);
-  // Processed
-  testEmptyAfterDone(bl + 1, 0);
-  testEmptyAfterDone(bl * 2, 0);
-}