6608385: G1: need to support parallel reference processing

Implement support for ParallelRefProcEnabled in the reference processing that takes place at the end of G1 concurrent marking. Reviewed-by: tonyp, ysr
2011-01-25 10:56:22 -08:00 · 2011-01-25 10:56:22 -08:00 · ecdb5848eb
commit ecdb5848eb
parent 14f4450d25
4 changed files with 296 additions and 29 deletions
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
@ -1055,7 +1055,12 @@ public:
      do {
        double start_vtime_sec = os::elapsedVTime();
        double start_time_sec = os::elapsedTime();
-        the_task->do_marking_step(10.0);
+        double mark_step_duration_ms = G1ConcMarkStepDurationMillis;
+
+        the_task->do_marking_step(mark_step_duration_ms,
+                                  true /* do_stealing    */,
+                                  true /* do_termination */);
+
        double end_time_sec = os::elapsedTime();
        double end_vtime_sec = os::elapsedVTime();
        double elapsed_vtime_sec = end_vtime_sec - start_vtime_sec;
@ -1111,7 +1116,8 @@ void ConcurrentMark::markFromRoots() {

  _restart_for_overflow = false;

-  set_phase(MAX2((size_t) 1, parallel_marking_threads()), true);
+  size_t active_workers = MAX2((size_t) 1, parallel_marking_threads());
+  set_phase(active_workers, true /* concurrent */);

  CMConcurrentMarkingTask markingTask(this, cmThread());
  if (parallel_marking_threads() > 0)
@ -1176,6 +1182,12 @@ void ConcurrentMark::checkpointRootsFinal(bool clear_all_soft_refs) {
                               /* silent */           false,
                               /* use_prev_marking */ false);
    }
+    assert(!restart_for_overflow(), "sanity");
+  }
+
+  // Reset the marking state if marking completed
+  if (!restart_for_overflow()) {
+    set_non_marking_state();
  }

 #if VERIFY_OBJS_PROCESSED
@ -1853,6 +1865,8 @@ void ConcurrentMark::completeCleanup() {
  assert(local_free_list.is_empty(), "post-condition");
 }

+// Support closures for reference procssing in G1
+
 bool G1CMIsAliveClosure::do_object_b(oop obj) {
  HeapWord* addr = (HeapWord*)obj;
  return addr != NULL &&
@ -1873,11 +1887,17 @@ class G1CMKeepAliveClosure: public OopClosure {
  virtual void do_oop(      oop* p) { do_oop_work(p); }

  template <class T> void do_oop_work(T* p) {
-    oop thisOop = oopDesc::load_decode_heap_oop(p);
-    HeapWord* addr = (HeapWord*)thisOop;
-    if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(thisOop)) {
+    oop obj = oopDesc::load_decode_heap_oop(p);
+    HeapWord* addr = (HeapWord*)obj;
+
+    if (_cm->verbose_high())
+      gclog_or_tty->print_cr("\t[0] we're looking at location "
+                               "*"PTR_FORMAT" = "PTR_FORMAT,
+                               p, (void*) obj);
+
+    if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(obj)) {
      _bitMap->mark(addr);
-      _cm->mark_stack_push(thisOop);
+      _cm->mark_stack_push(obj);
    }
  }
 };
@ -1899,6 +1919,199 @@ class G1CMDrainMarkingStackClosure: public VoidClosure {
  }
 };

+// 'Keep Alive' closure used by parallel reference processing.
+// An instance of this closure is used in the parallel reference processing
+// code rather than an instance of G1CMKeepAliveClosure. We could have used
+// the G1CMKeepAliveClosure as it is MT-safe. Also reference objects are
+// placed on to discovered ref lists once so we can mark and push with no
+// need to check whether the object has already been marked. Using the
+// G1CMKeepAliveClosure would mean, however, having all the worker threads
+// operating on the global mark stack. This means that an individual
+// worker would be doing lock-free pushes while it processes its own
+// discovered ref list followed by drain call. If the discovered ref lists
+// are unbalanced then this could cause interference with the other
+// workers. Using a CMTask (and its embedded local data structures)
+// avoids that potential interference.
+class G1CMParKeepAliveAndDrainClosure: public OopClosure {
+  ConcurrentMark*  _cm;
+  CMTask*          _task;
+  CMBitMap*        _bitMap;
+  int              _ref_counter_limit;
+  int              _ref_counter;
+ public:
+  G1CMParKeepAliveAndDrainClosure(ConcurrentMark* cm,
+                                  CMTask* task,
+                                  CMBitMap* bitMap) :
+    _cm(cm), _task(task), _bitMap(bitMap),
+    _ref_counter_limit(G1RefProcDrainInterval)
+  {
+    assert(_ref_counter_limit > 0, "sanity");
+    _ref_counter = _ref_counter_limit;
+  }
+
+  virtual void do_oop(narrowOop* p) { do_oop_work(p); }
+  virtual void do_oop(      oop* p) { do_oop_work(p); }
+
+  template <class T> void do_oop_work(T* p) {
+    if (!_cm->has_overflown()) {
+      oop obj = oopDesc::load_decode_heap_oop(p);
+      if (_cm->verbose_high())
+        gclog_or_tty->print_cr("\t[%d] we're looking at location "
+                               "*"PTR_FORMAT" = "PTR_FORMAT,
+                               _task->task_id(), p, (void*) obj);
+
+      _task->deal_with_reference(obj);
+      _ref_counter--;
+
+      if (_ref_counter == 0) {
+        // We have dealt with _ref_counter_limit references, pushing them and objects
+        // reachable from them on to the local stack (and possibly the global stack).
+        // Call do_marking_step() to process these entries. We call the routine in a
+        // loop, which we'll exit if there's nothing more to do (i.e. we're done
+        // with the entries that we've pushed as a result of the deal_with_reference
+        // calls above) or we overflow.
+        // Note: CMTask::do_marking_step() can set the CMTask::has_aborted() flag
+        // while there may still be some work to do. (See the comment at the
+        // beginning of CMTask::do_marking_step() for those conditions - one of which
+        // is reaching the specified time target.) It is only when
+        // CMTask::do_marking_step() returns without setting the has_aborted() flag
+        // that the marking has completed.
+        do {
+          double mark_step_duration_ms = G1ConcMarkStepDurationMillis;
+          _task->do_marking_step(mark_step_duration_ms,
+                                 false /* do_stealing    */,
+                                 false /* do_termination */);
+        } while (_task->has_aborted() && !_cm->has_overflown());
+        _ref_counter = _ref_counter_limit;
+      }
+    } else {
+       if (_cm->verbose_high())
+         gclog_or_tty->print_cr("\t[%d] CM Overflow", _task->task_id());
+    }
+  }
+};
+
+class G1CMParDrainMarkingStackClosure: public VoidClosure {
+  ConcurrentMark* _cm;
+  CMTask* _task;
+ public:
+  G1CMParDrainMarkingStackClosure(ConcurrentMark* cm, CMTask* task) :
+    _cm(cm), _task(task)
+  {}
+
+  void do_void() {
+    do {
+      if (_cm->verbose_high())
+        gclog_or_tty->print_cr("\t[%d] Drain: Calling do marking_step", _task->task_id());
+
+      // We call CMTask::do_marking_step() to completely drain the local and
+      // global marking stacks. The routine is called in a loop, which we'll
+      // exit if there's nothing more to do (i.e. we'completely drained the
+      // entries that were pushed as a result of applying the
+      // G1CMParKeepAliveAndDrainClosure to the entries on the discovered ref
+      // lists above) or we overflow the global marking stack.
+      // Note: CMTask::do_marking_step() can set the CMTask::has_aborted() flag
+      // while there may still be some work to do. (See the comment at the
+      // beginning of CMTask::do_marking_step() for those conditions - one of which
+      // is reaching the specified time target.) It is only when
+      // CMTask::do_marking_step() returns without setting the has_aborted() flag
+      // that the marking has completed.
+
+      _task->do_marking_step(1000000000.0 /* something very large */,
+                             true /* do_stealing    */,
+                             true /* do_termination */);
+    } while (_task->has_aborted() && !_cm->has_overflown());
+  }
+};
+
+// Implementation of AbstractRefProcTaskExecutor for G1
+class G1RefProcTaskExecutor: public AbstractRefProcTaskExecutor {
+private:
+  G1CollectedHeap* _g1h;
+  ConcurrentMark*  _cm;
+  CMBitMap*        _bitmap;
+  WorkGang*        _workers;
+  int              _active_workers;
+
+public:
+  G1RefProcTaskExecutor(G1CollectedHeap* g1h,
+                        ConcurrentMark* cm,
+                        CMBitMap* bitmap,
+                        WorkGang* workers,
+                        int n_workers) :
+    _g1h(g1h), _cm(cm), _bitmap(bitmap),
+    _workers(workers), _active_workers(n_workers)
+  { }
+
+  // Executes the given task using concurrent marking worker threads.
+  virtual void execute(ProcessTask& task);
+  virtual void execute(EnqueueTask& task);
+};
+
+class G1RefProcTaskProxy: public AbstractGangTask {
+  typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
+  ProcessTask&     _proc_task;
+  G1CollectedHeap* _g1h;
+  ConcurrentMark*  _cm;
+  CMBitMap*        _bitmap;
+
+public:
+  G1RefProcTaskProxy(ProcessTask& proc_task,
+                     G1CollectedHeap* g1h,
+                     ConcurrentMark* cm,
+                     CMBitMap* bitmap) :
+    AbstractGangTask("Process reference objects in parallel"),
+    _proc_task(proc_task), _g1h(g1h), _cm(cm), _bitmap(bitmap)
+  {}
+
+  virtual void work(int i) {
+    CMTask* marking_task = _cm->task(i);
+    G1CMIsAliveClosure g1_is_alive(_g1h);
+    G1CMParKeepAliveAndDrainClosure g1_par_keep_alive(_cm, marking_task, _bitmap);
+    G1CMParDrainMarkingStackClosure g1_par_drain(_cm, marking_task);
+
+    _proc_task.work(i, g1_is_alive, g1_par_keep_alive, g1_par_drain);
+  }
+};
+
+void G1RefProcTaskExecutor::execute(ProcessTask& proc_task) {
+  assert(_workers != NULL, "Need parallel worker threads.");
+
+  G1RefProcTaskProxy proc_task_proxy(proc_task, _g1h, _cm, _bitmap);
+
+  // We need to reset the phase for each task execution so that
+  // the termination protocol of CMTask::do_marking_step works.
+  _cm->set_phase(_active_workers, false /* concurrent */);
+  _g1h->set_par_threads(_active_workers);
+  _workers->run_task(&proc_task_proxy);
+  _g1h->set_par_threads(0);
+}
+
+class G1RefEnqueueTaskProxy: public AbstractGangTask {
+  typedef AbstractRefProcTaskExecutor::EnqueueTask EnqueueTask;
+  EnqueueTask& _enq_task;
+
+public:
+  G1RefEnqueueTaskProxy(EnqueueTask& enq_task) :
+    AbstractGangTask("Enqueue reference objects in parallel"),
+    _enq_task(enq_task)
+  { }
+
+  virtual void work(int i) {
+    _enq_task.work(i);
+  }
+};
+
+void G1RefProcTaskExecutor::execute(EnqueueTask& enq_task) {
+  assert(_workers != NULL, "Need parallel worker threads.");
+
+  G1RefEnqueueTaskProxy enq_task_proxy(enq_task);
+
+  _g1h->set_par_threads(_active_workers);
+  _workers->run_task(&enq_task_proxy);
+  _g1h->set_par_threads(0);
+}
+
 void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) {
  ResourceMark rm;
  HandleMark   hm;
@ -1917,18 +2130,52 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) {
  G1CMDrainMarkingStackClosure
    g1_drain_mark_stack(nextMarkBitMap(), &_markStack, &g1_keep_alive);

-  // XXXYYY  Also: copy the parallel ref processing code from CMS.
+  // We use the work gang from the G1CollectedHeap and we utilize all
+  // the worker threads.
+  int active_workers = MAX2(MIN2(g1h->workers()->total_workers(), (int)_max_task_num), 1);
+
+  G1RefProcTaskExecutor par_task_executor(g1h, this, nextMarkBitMap(),
+                                          g1h->workers(), active_workers);
+
+  if (rp->processing_is_mt()) {
+    // Set the degree of MT here.  If the discovery is done MT, there
+    // may have been a different number of threads doing the discovery
+    // and a different number of discovered lists may have Ref objects.
+    // That is OK as long as the Reference lists are balanced (see
+    // balance_all_queues() and balance_queues()).
+    rp->set_mt_degree(active_workers);
+
+    rp->process_discovered_references(&g1_is_alive,
+                                      &g1_keep_alive,
+                                      &g1_drain_mark_stack,
+                                      &par_task_executor);
+
+    // The work routines of the parallel keep_alive and drain_marking_stack
+    // will set the has_overflown flag if we overflow the global marking
+    // stack.
+  } else {
    rp->process_discovered_references(&g1_is_alive,
                                      &g1_keep_alive,
                                      &g1_drain_mark_stack,
                                      NULL);
+
+  }
+
  assert(_markStack.overflow() || _markStack.isEmpty(),
      "mark stack should be empty (unless it overflowed)");
  if (_markStack.overflow()) {
+    // Should have been done already when we tried to push an
+    // entry on to the global mark stack. But let's do it again.
    set_has_overflown();
  }

+  if (rp->processing_is_mt()) {
+    assert(rp->num_q() == active_workers, "why not");
+    rp->enqueue_discovered_references(&par_task_executor);
+  } else {
    rp->enqueue_discovered_references();
+  }
+
  rp->verify_no_references_recorded();
  assert(!rp->discovery_enabled(), "should have been disabled");

@ -1955,7 +2202,9 @@ public:
      CMTask* task = _cm->task(worker_i);
      task->record_start_time();
      do {
-        task->do_marking_step(1000000000.0 /* something very large */);
+        task->do_marking_step(1000000000.0 /* something very large */,
+                              true /* do_stealing    */,
+                              true /* do_termination */);
      } while (task->has_aborted() && !_cm->has_overflown());
      // If we overflow, then we do not want to restart. We instead
      // want to abort remark and do concurrent marking again.
@ -1978,7 +2227,7 @@ void ConcurrentMark::checkpointRootsFinalWork() {
    G1CollectedHeap::StrongRootsScope srs(g1h);
    // this is remark, so we'll use up all available threads
    int active_workers = ParallelGCThreads;
-    set_phase(active_workers, false);
+    set_phase(active_workers, false /* concurrent */);

    CMRemarkTask remarkTask(this);
    // We will start all available threads, even if we decide that the
@ -1992,7 +2241,7 @@ void ConcurrentMark::checkpointRootsFinalWork() {
    G1CollectedHeap::StrongRootsScope srs(g1h);
    // this is remark, so we'll use up all available threads
    int active_workers = 1;
-    set_phase(active_workers, false);
+    set_phase(active_workers, false /* concurrent */);

    CMRemarkTask remarkTask(this);
    // We will start all available threads, even if we decide that the
@ -2005,9 +2254,6 @@ void ConcurrentMark::checkpointRootsFinalWork() {

  print_stats();

-  if (!restart_for_overflow())
-    set_non_marking_state();
-
 #if VERIFY_OBJS_PROCESSED
  if (_scan_obj_cl.objs_processed != ThreadLocalObjQueue::objs_enqueued) {
    gclog_or_tty->print_cr("Processed = %d, enqueued = %d.",
@ -3249,7 +3495,7 @@ void CMTask::regular_clock_call() {
  double elapsed_time_ms = curr_time_ms - _start_time_ms;
  if (elapsed_time_ms > _time_target_ms) {
    set_has_aborted();
-    _has_aborted_timed_out = true;
+    _has_timed_out = true;
    statsOnly( ++_aborted_timed_out );
    return;
  }
@ -3754,7 +4000,9 @@ void CMTask::print_stats() {

 *****************************************************************************/

-void CMTask::do_marking_step(double time_target_ms) {
+void CMTask::do_marking_step(double time_target_ms,
+                             bool do_stealing,
+                             bool do_termination) {
  assert(time_target_ms >= 1.0, "minimum granularity is 1ms");
  assert(concurrent() == _cm->concurrent(), "they should be the same");

@ -3794,7 +4042,7 @@ void CMTask::do_marking_step(double time_target_ms) {

  // clear all flags
  clear_has_aborted();
-  _has_aborted_timed_out = false;
+  _has_timed_out = false;
  _draining_satb_buffers = false;

  ++_calls;
@ -3970,7 +4218,7 @@ void CMTask::do_marking_step(double time_target_ms) {
  drain_global_stack(false);

  // Attempt at work stealing from other task's queues.
-  if (!has_aborted()) {
+  if (do_stealing && !has_aborted()) {
    // We have not aborted. This means that we have finished all that
    // we could. Let's try to do some stealing...

@ -4011,7 +4259,7 @@ void CMTask::do_marking_step(double time_target_ms) {

  // We still haven't aborted. Now, let's try to get into the
  // termination protocol.
-  if (!has_aborted()) {
+  if (do_termination && !has_aborted()) {
    // We cannot check whether the global stack is empty, since other
    // tasks might be concurrently pushing objects on it. We also cannot
    // check if the region stack is empty because if a thread is aborting
@ -4087,7 +4335,7 @@ void CMTask::do_marking_step(double time_target_ms) {

    statsOnly( ++_aborted );

-    if (_has_aborted_timed_out) {
+    if (_has_timed_out) {
      double diff_ms = elapsed_time_ms - _time_target_ms;
      // Keep statistics of how well we did with respect to hitting
      // our target only if we actually timed out (if we aborted for
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp
@ -353,6 +353,10 @@ class ConcurrentMark: public CHeapObj {
  friend class CMConcurrentMarkingTask;
  friend class G1ParNoteEndTask;
  friend class CalcLiveObjectsClosure;
+  friend class G1RefProcTaskProxy;
+  friend class G1RefProcTaskExecutor;
+  friend class G1CMParKeepAliveAndDrainClosure;
+  friend class G1CMParDrainMarkingStackClosure;

 protected:
  ConcurrentMarkThread* _cmThread;   // the thread doing the work
@ -936,7 +940,7 @@ private:
  // if this is true, then the task has aborted for some reason
  bool                        _has_aborted;
  // set when the task aborts because it has met its time quota
-  bool                        _has_aborted_timed_out;
+  bool                        _has_timed_out;
  // true when we're draining SATB buffers; this avoids the task
  // aborting due to SATB buffers being available (as we're already
  // dealing with them)
@ -1041,7 +1045,7 @@ public:
  // trying not to exceed the given duration. However, it might exit
  // prematurely, according to some conditions (i.e. SATB buffers are
  // available for processing).
-  void do_marking_step(double target_ms);
+  void do_marking_step(double target_ms, bool do_stealing, bool do_termination);

  // These two calls start and stop the timer
  void record_start_time() {
@ -1063,6 +1067,7 @@ public:
  bool has_aborted()            { return _has_aborted; }
  void set_has_aborted()        { _has_aborted = true; }
  void clear_has_aborted()      { _has_aborted = false; }
+  bool has_timed_out()          { return _has_timed_out; }
  bool claimed()                { return _claimed; }

  // Support routines for the partially scanned region that may be
--- a/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
@ -81,6 +81,14 @@
  product(intx, G1MarkRegionStackSize, 1024 * 1024,                         \
          "Size of the region stack for concurrent marking.")               \
                                                                            \
+  product(double, G1ConcMarkStepDurationMillis, 10.0,                       \
+          "Target duration of individual concurrent marking steps "         \
+          "in milliseconds.")                                               \
+                                                                            \
+  product(intx, G1RefProcDrainInterval, 10,                                 \
+          "The number of discovered reference objects to process before "   \
+          "draining concurrent marking work queues.")                       \
+                                                                            \
  develop(bool, G1SATBBarrierPrintNullPreVals, false,                       \
          "If true, count frac of ptr writes with null pre-vals.")          \
                                                                            \
--- a/hotspot/src/share/vm/runtime/arguments.cpp
+++ b/hotspot/src/share/vm/runtime/arguments.cpp
@ -1941,10 +1941,16 @@ bool Arguments::check_vm_args_consistency() {
    status = false;
  }

+#ifndef SERIALGC
  if (UseG1GC) {
    status = status && verify_percentage(InitiatingHeapOccupancyPercent,
                                         "InitiatingHeapOccupancyPercent");
+    status = status && verify_min_value(G1RefProcDrainInterval, 1,
+                                        "G1RefProcDrainInterval");
+    status = status && verify_min_value((intx)G1ConcMarkStepDurationMillis, 1,
+                                        "G1ConcMarkStepDurationMillis");
  }
+#endif

  status = status && verify_interval(RefDiscoveryPolicy,
                                     ReferenceProcessor::DiscoveryPolicyMin,