6484957: G1: parallel concurrent refinement

6826318: G1: remove traversal-based refinement code. Removed the traversal-based refinement code, as it is no longer used, and made the (queue-based) concurrent refinement parallel. Reviewed-by: tonyp
2009-05-11 16:30:56 -07:00 · 2009-05-11 16:30:56 -07:00 · 8c764e214c
commit 8c764e214c
parent 955a453996
23 changed files with 230 additions and 666 deletions
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp
@ -4454,43 +4454,26 @@ void MacroAssembler::g1_write_barrier_post(Register store_addr, Register new_val
    delayed()->nop();
  }

-  // Now we decide how to generate the card table write.  If we're
-  // enqueueing, we call out to a generated function.  Otherwise, we do it
-  // inline here.
-
-  if (G1RSBarrierUseQueue) {
-    // If the "store_addr" register is an "in" or "local" register, move it to
-    // a scratch reg so we can pass it as an argument.
-    bool use_scr = !(store_addr->is_global() || store_addr->is_out());
-    // Pick a scratch register different from "tmp".
-    Register scr = (tmp == G1_scratch ? G3_scratch : G1_scratch);
-    // Make sure we use up the delay slot!
-    if (use_scr) {
-      post_filter_masm->mov(store_addr, scr);
-    } else {
-      post_filter_masm->nop();
-    }
-    generate_dirty_card_log_enqueue_if_necessary(bs->byte_map_base);
-    save_frame(0);
-    call(dirty_card_log_enqueue);
-    if (use_scr) {
-      delayed()->mov(scr, O0);
-    } else {
-      delayed()->mov(store_addr->after_save(), O0);
-    }
-    restore();
-
+  // If the "store_addr" register is an "in" or "local" register, move it to
+  // a scratch reg so we can pass it as an argument.
+  bool use_scr = !(store_addr->is_global() || store_addr->is_out());
+  // Pick a scratch register different from "tmp".
+  Register scr = (tmp == G1_scratch ? G3_scratch : G1_scratch);
+  // Make sure we use up the delay slot!
+  if (use_scr) {
+    post_filter_masm->mov(store_addr, scr);
  } else {
-
-#ifdef _LP64
-    post_filter_masm->srlx(store_addr, CardTableModRefBS::card_shift, store_addr);
-#else
-    post_filter_masm->srl(store_addr, CardTableModRefBS::card_shift, store_addr);
-#endif
-    assert(tmp != store_addr, "need separate temp reg");
-    set(bs->byte_map_base, tmp);
-    stb(G0, tmp, store_addr);
+    post_filter_masm->nop();
  }
+  generate_dirty_card_log_enqueue_if_necessary(bs->byte_map_base);
+  save_frame(0);
+  call(dirty_card_log_enqueue);
+  if (use_scr) {
+    delayed()->mov(scr, O0);
+  } else {
+    delayed()->mov(store_addr->after_save(), O0);
+  }
+  restore();

  bind(filtered);

--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
@ -25,23 +25,26 @@
 #include "incls/_precompiled.incl"
 #include "incls/_concurrentG1Refine.cpp.incl"

-bool ConcurrentG1Refine::_enabled = false;
-
 ConcurrentG1Refine::ConcurrentG1Refine() :
-  _pya(PYA_continue), _last_pya(PYA_continue),
-  _last_cards_during(), _first_traversal(false),
  _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL),
  _hot_cache(NULL),
  _def_use_cache(false), _use_cache(false),
-  _n_periods(0), _total_cards(0), _total_travs(0)
+  _n_periods(0), _total_cards(0), _total_travs(0),
+  _threads(NULL), _n_threads(0)
 {
  if (G1ConcRefine) {
-    _cg1rThread = new ConcurrentG1RefineThread(this);
-    assert(cg1rThread() != NULL, "Conc refine should have been created");
-    assert(cg1rThread()->cg1r() == this,
-           "Conc refine thread should refer to this");
-  } else {
-    _cg1rThread = NULL;
+    _n_threads = (G1ParallelRSetThreads > 0) ? G1ParallelRSetThreads : ParallelGCThreads;
+    if (_n_threads > 0) {
+      _threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads);
+      ConcurrentG1RefineThread *next = NULL;
+      for (int i = _n_threads - 1; i >= 0; i--) {
+        ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, i);
+        assert(t != NULL, "Conc refine should have been created");
+        assert(t->cg1r() == this, "Conc refine thread should refer to this");
+        _threads[i] = t;
+        next = t;
+      }
+    }
  }
 }

@ -75,6 +78,14 @@ void ConcurrentG1Refine::init() {
  }
 }

+void ConcurrentG1Refine::stop() {
+  if (_threads != NULL) {
+    for (int i = 0; i < _n_threads; i++) {
+      _threads[i]->stop();
+    }
+  }
+}
+
 ConcurrentG1Refine::~ConcurrentG1Refine() {
  if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
    assert(_card_counts != NULL, "Logic");
@ -88,104 +99,22 @@ ConcurrentG1Refine::~ConcurrentG1Refine() {
    assert(_hot_cache != NULL, "Logic");
    FREE_C_HEAP_ARRAY(jbyte*, _hot_cache);
  }
-}
-
-bool ConcurrentG1Refine::refine() {
-  G1CollectedHeap* g1h = G1CollectedHeap::heap();
-  unsigned cards_before = g1h->g1_rem_set()->conc_refine_cards();
-  clear_hot_cache();  // Any previous values in this are now invalid.
-  g1h->g1_rem_set()->concurrentRefinementPass(this);
-  _traversals++;
-  unsigned cards_after = g1h->g1_rem_set()->conc_refine_cards();
-  unsigned cards_during = cards_after-cards_before;
-  // If this is the first traversal in the current enabling
-  // and we did some cards, or if the number of cards found is decreasing
-  // sufficiently quickly, then keep going.  Otherwise, sleep a while.
-  bool res =
-    (_first_traversal && cards_during > 0)
-    ||
-    (!_first_traversal && cards_during * 3 < _last_cards_during * 2);
-  _last_cards_during = cards_during;
-  _first_traversal = false;
-  return res;
-}
-
-void ConcurrentG1Refine::enable() {
-  MutexLocker x(G1ConcRefine_mon);
-  if (!_enabled) {
-    _enabled = true;
-    _first_traversal = true; _last_cards_during = 0;
-    G1ConcRefine_mon->notify_all();
+  if (_threads != NULL) {  // NULL when no refinement threads were created
+    for (int i = 0; i < _n_threads; i++) {
+      delete _threads[i];
+    }
+    FREE_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _threads);  // free the array itself, not its length
  }
 }

-unsigned ConcurrentG1Refine::disable() {
-  MutexLocker x(G1ConcRefine_mon);
-  if (_enabled) {
-    _enabled = false;
-    return _traversals;
-  } else {
-    return 0;
-  }
-}
-
-void ConcurrentG1Refine::wait_for_ConcurrentG1Refine_enabled() {
-  G1ConcRefine_mon->lock();
-  while (!_enabled) {
-    G1ConcRefine_mon->wait(Mutex::_no_safepoint_check_flag);
-  }
-  G1ConcRefine_mon->unlock();
-  _traversals = 0;
-};
-
-void ConcurrentG1Refine::set_pya_restart() {
-  // If we're using the log-based RS barrier, the above will cause
-  // in-progress traversals of completed log buffers to quit early; we will
-  // also abandon all other buffers.
-  if (G1RSBarrierUseQueue) {
-    DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
-    dcqs.abandon_logs();
-    // Reset the post-yield actions.
-    _pya = PYA_continue;
-    _last_pya = PYA_continue;
-  } else {
-    _pya = PYA_restart;
-  }
-}
-
-void ConcurrentG1Refine::set_pya_cancel() {
-  _pya = PYA_cancel;
-}
-
-PostYieldAction ConcurrentG1Refine::get_pya() {
-  if (_pya != PYA_continue) {
-    jint val = _pya;
-    while (true) {
-      jint val_read = Atomic::cmpxchg(PYA_continue, &_pya, val);
-      if (val_read == val) {
-        PostYieldAction res = (PostYieldAction)val;
-        assert(res != PYA_continue, "Only the refine thread should reset.");
-        _last_pya = res;
-        return res;
-      } else {
-        val = val_read;
-      }
+void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
+  if (_threads != NULL) {
+    for (int i = 0; i < _n_threads; i++) {
+      tc->do_thread(_threads[i]);
    }
  }
-  // QQQ WELL WHAT DO WE RETURN HERE???
-  // make up something!
-  return PYA_continue;
 }

-PostYieldAction ConcurrentG1Refine::get_last_pya() {
-  PostYieldAction res = _last_pya;
-  _last_pya = PYA_continue;
-  return res;
-}
-
-bool ConcurrentG1Refine::do_traversal() {
-  return _cg1rThread->do_traversal();
-}

 int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) {
  size_t card_num = (card_ptr - _ct_bot);
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
@ -26,25 +26,9 @@
 class ConcurrentG1RefineThread;
 class G1RemSet;

-// What to do after a yield:
-enum PostYieldAction {
-  PYA_continue,  // Continue the traversal
-  PYA_restart,   // Restart
-  PYA_cancel     // It's been completed by somebody else: cancel.
-};
-
 class ConcurrentG1Refine: public CHeapObj {
-  ConcurrentG1RefineThread* _cg1rThread;
-
-  volatile jint _pya;
-  PostYieldAction _last_pya;
-
-  static bool _enabled;  // Protected by G1ConcRefine_mon.
-  unsigned _traversals;
-
-  // Number of cards processed during last refinement traversal.
-  unsigned _first_traversal;
-  unsigned _last_cards_during;
+  ConcurrentG1RefineThread** _threads;
+  int _n_threads;

  // The cache for card refinement.
  bool     _use_cache;
@ -74,37 +58,10 @@ class ConcurrentG1Refine: public CHeapObj {
  ~ConcurrentG1Refine();

  void init(); // Accomplish some initialization that has to wait.
+  void stop();

-  // Enabled Conc refinement, waking up thread if necessary.
-  void enable();
-
-  // Returns the number of traversals performed since this refiner was enabled.
-  unsigned disable();
-
-  // Requires G1ConcRefine_mon to be held.
-  bool enabled() { return _enabled; }
-
-  // Returns only when G1 concurrent refinement has been enabled.
-  void wait_for_ConcurrentG1Refine_enabled();
-
-  // Do one concurrent refinement pass over the card table.  Returns "true"
-  // if heuristics determine that another pass should be done immediately.
-  bool refine();
-
-  // Indicate that an in-progress refinement pass should start over.
-  void set_pya_restart();
-  // Indicate that an in-progress refinement pass should quit.
-  void set_pya_cancel();
-
-  // Get the appropriate post-yield action.  Also sets last_pya.
-  PostYieldAction get_pya();
-
-  // The last PYA read by "get_pya".
-  PostYieldAction get_last_pya();
-
-  bool do_traversal();
-
-  ConcurrentG1RefineThread* cg1rThread() { return _cg1rThread; }
+  // Iterate over the conc refine threads
+  void threads_do(ThreadClosure *tc);

  // If this is the first entry for the slot, writes into the cache and
  // returns NULL.  If it causes an eviction, returns the evicted pointer.
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.cpp
@ -30,12 +30,12 @@
 // The CM thread is created when the G1 garbage collector is used

 ConcurrentG1RefineThread::
-ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r) :
+ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread *next, int worker_id) :
  ConcurrentGCThread(),
+  _worker_id(worker_id),
+  _active(false),
+  _next(next),
  _cg1r(cg1r),
-  _started(false),
-  _in_progress(false),
-  _do_traversal(false),
  _vtime_accum(0.0),
  _co_tracker(G1CRGroup),
  _interval_ms(5.0)
@ -43,112 +43,6 @@ ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r) :
  create_and_start();
 }

-const long timeout = 200; // ms.
-
-void ConcurrentG1RefineThread::traversalBasedRefinement() {
-  _cg1r->wait_for_ConcurrentG1Refine_enabled();
-  MutexLocker x(G1ConcRefine_mon);
-  while (_cg1r->enabled()) {
-    MutexUnlocker ux(G1ConcRefine_mon);
-    ResourceMark rm;
-    HandleMark   hm;
-
-    if (G1TraceConcurrentRefinement) {
-      gclog_or_tty->print_cr("G1-Refine starting pass");
-    }
-    _sts.join();
-    bool no_sleep = _cg1r->refine();
-    _sts.leave();
-    if (!no_sleep) {
-      MutexLockerEx x(CGC_lock, Mutex::_no_safepoint_check_flag);
-      // We do this only for the timeout; we don't expect this to be signalled.
-      CGC_lock->wait(Mutex::_no_safepoint_check_flag, timeout);
-    }
-  }
-}
-
-void ConcurrentG1RefineThread::queueBasedRefinement() {
-  DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
-  // Wait for completed log buffers to exist.
-  {
-    MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
-    while (!_do_traversal && !dcqs.process_completed_buffers() &&
-           !_should_terminate) {
-      DirtyCardQ_CBL_mon->wait(Mutex::_no_safepoint_check_flag);
-    }
-  }
-
-  if (_should_terminate) {
-    return;
-  }
-
-  // Now we take them off (this doesn't hold locks while it applies
-  // closures.)  (If we did a full collection, then we'll do a full
-  // traversal.
-  _sts.join();
-  if (_do_traversal) {
-    (void)_cg1r->refine();
-    switch (_cg1r->get_last_pya()) {
-    case PYA_cancel: case PYA_continue:
-      // Continue was caught and handled inside "refine".  If it's still
-      // "continue" when we get here, we're done.
-      _do_traversal = false;
-      break;
-    case PYA_restart:
-      assert(_do_traversal, "Because of Full GC.");
-      break;
-    }
-  } else {
-    int n_logs = 0;
-    int lower_limit = 0;
-    double start_vtime_sec; // only used when G1SmoothConcRefine is on
-    int prev_buffer_num; // only used when G1SmoothConcRefine is on
-
-    if (G1SmoothConcRefine) {
-      lower_limit = 0;
-      start_vtime_sec = os::elapsedVTime();
-      prev_buffer_num = (int) dcqs.completed_buffers_num();
-    } else {
-      lower_limit = DCQBarrierProcessCompletedThreshold / 4; // For now.
-    }
-    while (dcqs.apply_closure_to_completed_buffer(0, lower_limit)) {
-      double end_vtime_sec;
-      double elapsed_vtime_sec;
-      int elapsed_vtime_ms;
-      int curr_buffer_num;
-
-      if (G1SmoothConcRefine) {
-        end_vtime_sec = os::elapsedVTime();
-        elapsed_vtime_sec = end_vtime_sec - start_vtime_sec;
-        elapsed_vtime_ms = (int) (elapsed_vtime_sec * 1000.0);
-        curr_buffer_num = (int) dcqs.completed_buffers_num();
-
-        if (curr_buffer_num > prev_buffer_num ||
-            curr_buffer_num > DCQBarrierProcessCompletedThreshold) {
-          decreaseInterval(elapsed_vtime_ms);
-        } else if (curr_buffer_num < prev_buffer_num) {
-          increaseInterval(elapsed_vtime_ms);
-        }
-      }
-
-      sample_young_list_rs_lengths();
-      _co_tracker.update(false);
-
-      if (G1SmoothConcRefine) {
-        prev_buffer_num = curr_buffer_num;
-        _sts.leave();
-        os::sleep(Thread::current(), (jlong) _interval_ms, false);
-        _sts.join();
-        start_vtime_sec = os::elapsedVTime();
-      }
-      n_logs++;
-    }
-    // Make sure we harvest the PYA, if any.
-    (void)_cg1r->get_pya();
-  }
-  _sts.leave();
-}
-
 void ConcurrentG1RefineThread::sample_young_list_rs_lengths() {
  G1CollectedHeap* g1h = G1CollectedHeap::heap();
  G1CollectorPolicy* g1p = g1h->g1_policy();
@ -184,15 +78,97 @@ void ConcurrentG1RefineThread::run() {
  _co_tracker.start();

  while (!_should_terminate) {
-    // wait until started is set.
-    if (G1RSBarrierUseQueue) {
-      queueBasedRefinement();
-    } else {
-      traversalBasedRefinement();
+    DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
+    // Wait for completed log buffers to exist.
+    {
+      MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
+      while (((_worker_id == 0 && !dcqs.process_completed_buffers()) ||
+              (_worker_id > 0 && !is_active())) &&
+             !_should_terminate) {
+         DirtyCardQ_CBL_mon->wait(Mutex::_no_safepoint_check_flag);
+      }
    }
+
+    if (_should_terminate) {
+      return;
+    }
+
+    // Now we take the completed buffers off the queue and process them.
+    // (The monitor acquired above is released at the end of the wait
+    // block, so it is not held while the closures are applied.)
    _sts.join();
-    _co_tracker.update();
+    int n_logs = 0;
+    int lower_limit = 0;
+    double start_vtime_sec; // only used when G1SmoothConcRefine is on
+    int prev_buffer_num; // only used when G1SmoothConcRefine is on
+    // This thread's activation threshold
+    int threshold = DCQBarrierProcessCompletedThreshold * _worker_id;
+    // Next thread's activation threshold
+    int next_threshold = threshold + DCQBarrierProcessCompletedThreshold;
+    int deactivation_threshold = MAX2<int>(threshold - DCQBarrierProcessCompletedThreshold / 2, 0);
+
+    if (G1SmoothConcRefine) {
+      lower_limit = 0;
+      start_vtime_sec = os::elapsedVTime();
+      prev_buffer_num = (int) dcqs.completed_buffers_num();
+    } else {
+      lower_limit = DCQBarrierProcessCompletedThreshold / 4; // For now.
+    }
+    while (dcqs.apply_closure_to_completed_buffer(_worker_id, lower_limit)) {
+      double end_vtime_sec;
+      double elapsed_vtime_sec;
+      int elapsed_vtime_ms;
+      int curr_buffer_num = (int) dcqs.completed_buffers_num();
+
+      if (G1SmoothConcRefine) {
+        end_vtime_sec = os::elapsedVTime();
+        elapsed_vtime_sec = end_vtime_sec - start_vtime_sec;
+        elapsed_vtime_ms = (int) (elapsed_vtime_sec * 1000.0);
+
+        if (curr_buffer_num > prev_buffer_num ||
+            curr_buffer_num > next_threshold) {
+          decreaseInterval(elapsed_vtime_ms);
+        } else if (curr_buffer_num < prev_buffer_num) {
+          increaseInterval(elapsed_vtime_ms);
+        }
+      }
+      if (_worker_id == 0) {
+        sample_young_list_rs_lengths();
+      } else if (curr_buffer_num < deactivation_threshold) {
+        // If the number of completed buffers has fallen below our
+        // deactivation threshold, we should deactivate. The predecessor will
+        // reactivate this thread if the count crosses the threshold again.
+        MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
+        deactivate();
+        if (G1TraceConcurrentRefinement) {
+          gclog_or_tty->print_cr("G1-Refine-deactivated worker %d", _worker_id);
+        }
+        break;
+      }
+      _co_tracker.update(false);
+
+      // Check if we need to activate the next thread.
+      if (curr_buffer_num > next_threshold && _next != NULL && !_next->is_active()) {
+        MutexLockerEx x(DirtyCardQ_CBL_mon, Mutex::_no_safepoint_check_flag);
+        _next->activate();
+        DirtyCardQ_CBL_mon->notify_all();
+        if (G1TraceConcurrentRefinement) {
+          gclog_or_tty->print_cr("G1-Refine-activated worker %d", _next->_worker_id);
+        }
+      }
+
+      if (G1SmoothConcRefine) {
+        prev_buffer_num = curr_buffer_num;
+        _sts.leave();
+        os::sleep(Thread::current(), (jlong) _interval_ms, false);
+        _sts.join();
+        start_vtime_sec = os::elapsedVTime();
+      }
+      n_logs++;
+    }
+    _co_tracker.update(false);
    _sts.leave();
+
    if (os::supports_vtime()) {
      _vtime_accum = (os::elapsedVTime() - _vtime_start);
    } else {
@ -240,7 +216,3 @@ void ConcurrentG1RefineThread::print() {
  Thread::print();
  gclog_or_tty->cr();
 }
-
-void ConcurrentG1RefineThread::set_do_traversal(bool b) {
-  _do_traversal = b;
-}
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentG1RefineThread.hpp
@ -33,21 +33,26 @@ class ConcurrentG1RefineThread: public ConcurrentGCThread {

  double _vtime_start;  // Initial virtual time.
  double _vtime_accum;  // Initial virtual time.
+  int _worker_id;

+  // The refinement threads are organized as a linked list. A predecessor can activate
+  // a successor when the number of rset update buffers crosses a certain threshold. A
+  // successor self-deactivates when the number of buffers falls below the threshold.
+  bool _active;
+  ConcurrentG1RefineThread *       _next;
 public:
  virtual void run();

+  bool is_active()  { return _active;  }
+  void activate()   { _active = true;  }
+  void deactivate() { _active = false; }
+
 private:
  ConcurrentG1Refine*              _cg1r;
-  bool                             _started;
-  bool                             _in_progress;
-  volatile bool                    _restart;

  COTracker                        _co_tracker;
  double                           _interval_ms;

-  bool                             _do_traversal;
-
  void decreaseInterval(int processing_time_ms) {
    double min_interval_ms = (double) processing_time_ms;
    _interval_ms = 0.8 * _interval_ms;
@ -63,16 +68,12 @@ class ConcurrentG1RefineThread: public ConcurrentGCThread {

  void sleepBeforeNextCycle();

-  void traversalBasedRefinement();
-
-  void queueBasedRefinement();
-
  // For use by G1CollectedHeap, which is a friend.
  static SuspendibleThreadSet* sts() { return &_sts; }

 public:
  // Constructor
-  ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r);
+  ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread* next, int worker_id);

  // Printing
  void print();
@ -82,23 +83,11 @@ class ConcurrentG1RefineThread: public ConcurrentGCThread {

  ConcurrentG1Refine* cg1r()                     { return _cg1r;     }

-
-  void            set_started()                  { _started = true;   }
-  void            clear_started()                { _started = false;  }
-  bool            started()                      { return _started;   }
-
-  void            set_in_progress()              { _in_progress = true;   }
-  void            clear_in_progress()            { _in_progress = false;  }
-  bool            in_progress()                  { return _in_progress;   }
-
-  void            set_do_traversal(bool b);
-  bool            do_traversal() { return _do_traversal; }
-
  void            sample_young_list_rs_lengths();

  // Yield for GC
  void            yield();

  // shutdown
-  static void stop();
+  void stop();
 };
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMarkThread.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMarkThread.hpp
@ -80,5 +80,5 @@ class ConcurrentMarkThread: public ConcurrentGCThread {
  void            yield();

  // shutdown
-  static void stop();
+  void stop();
 };
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentZFThread.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentZFThread.hpp
@ -73,7 +73,7 @@ class ConcurrentZFThread: public ConcurrentGCThread {
  // while holding the ZF_needed_mon lock.

  // shutdown
-  static void stop();
+  void stop();

  // Stats
  static void note_region_alloc() {_region_allocs++; }
--- a/hotspot/src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/dirtyCardQueue.cpp
@ -234,7 +234,7 @@ bool DirtyCardQueueSet::apply_closure_to_completed_buffer(int worker_i,
    nd = get_completed_buffer_lock(stop_at);
  }
  bool res = apply_closure_to_completed_buffer_helper(worker_i, nd);
-  if (res) _processed_buffers_rs_thread++;
+  if (res) Atomic::inc(&_processed_buffers_rs_thread);
  return res;
 }

--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@ -447,7 +447,7 @@ void YoungList::print() {
 }

 void G1CollectedHeap::stop_conc_gc_threads() {
-  _cg1r->cg1rThread()->stop();
+  _cg1r->stop();
  _czft->stop();
  _cmThread->stop();
 }
@ -1001,12 +1001,8 @@ void G1CollectedHeap::do_collection(bool full, bool clear_all_soft_refs,

    gc_epilogue(true);

-    // Abandon concurrent refinement.  This must happen last: in the
-    // dirty-card logging system, some cards may be dirty by weak-ref
-    // processing, and may be enqueued.  But the whole card table is
-    // dirtied, so this should abandon those logs, and set "do_traversal"
-    // to true.
-    concurrent_g1_refine()->set_pya_restart();
+    // Discard all rset updates
+    JavaThread::dirty_card_queue_set().abandon_logs();
    assert(!G1DeferredRSUpdate
           || (G1DeferredRSUpdate && (dirty_card_queue_set().completed_buffers_num() == 0)), "Should not be any");
    assert(regions_accounted_for(), "Region leakage!");
@ -1521,12 +1517,12 @@ jint G1CollectedHeap::initialize() {
                                               SATB_Q_FL_lock,
                                               0,
                                               Shared_SATB_Q_lock);
-  if (G1RSBarrierUseQueue) {
-    JavaThread::dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
-                                                  DirtyCardQ_FL_lock,
-                                                  G1DirtyCardQueueMax,
-                                                  Shared_DirtyCardQ_lock);
-  }
+
+  JavaThread::dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
+                                                DirtyCardQ_FL_lock,
+                                                G1DirtyCardQueueMax,
+                                                Shared_DirtyCardQ_lock);
+
  if (G1DeferredRSUpdate) {
    dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
                                      DirtyCardQ_FL_lock,
@ -2249,6 +2245,15 @@ void G1CollectedHeap::print_on(outputStream* st) const {
  _hrs->iterate(&blk);
 }

+class PrintOnThreadsClosure : public ThreadClosure {
+  outputStream* _st;
+public:
+  PrintOnThreadsClosure(outputStream* st) : _st(st) { }
+  virtual void do_thread(Thread *t) {
+    t->print_on(_st);
+  }
+};
+
 void G1CollectedHeap::print_gc_threads_on(outputStream* st) const {
  if (ParallelGCThreads > 0) {
    workers()->print_worker_threads();
@ -2256,8 +2261,9 @@ void G1CollectedHeap::print_gc_threads_on(outputStream* st) const {
  st->print("\"G1 concurrent mark GC Thread\" ");
  _cmThread->print();
  st->cr();
-  st->print("\"G1 concurrent refinement GC Thread\" ");
-  _cg1r->cg1rThread()->print_on(st);
+  st->print("\"G1 concurrent refinement GC Threads\" ");
+  PrintOnThreadsClosure p(st);
+  _cg1r->threads_do(&p);
  st->cr();
  st->print("\"G1 zero-fill GC Thread\" ");
  _czft->print_on(st);
@ -2269,7 +2275,7 @@ void G1CollectedHeap::gc_threads_do(ThreadClosure* tc) const {
    workers()->threads_do(tc);
  }
  tc->do_thread(_cmThread);
-  tc->do_thread(_cg1r->cg1rThread());
+  _cg1r->threads_do(tc);
  tc->do_thread(_czft);
 }

--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
@ -167,11 +167,6 @@ G1CollectorPolicy::G1CollectorPolicy() :

  _all_full_gc_times_ms(new NumberSeq()),

-  _conc_refine_enabled(0),
-  _conc_refine_zero_traversals(0),
-  _conc_refine_max_traversals(0),
-  _conc_refine_current_delta(G1ConcRefineInitialDelta),
-
  // G1PausesBtwnConcMark defaults to -1
  // so the hack is to do the cast  QQQ FIXME
  _pauses_btwn_concurrent_mark((size_t)G1PausesBtwnConcMark),
@ -1634,9 +1629,8 @@ void G1CollectorPolicy::record_collection_pause_end(bool abandoned) {
        print_stats(1, "Parallel Time", _cur_collection_par_time_ms);
        print_par_stats(2, "Update RS (Start)", _par_last_update_rs_start_times_ms, false);
        print_par_stats(2, "Update RS", _par_last_update_rs_times_ms);
-        if (G1RSBarrierUseQueue)
-          print_par_buffers(3, "Processed Buffers",
-                            _par_last_update_rs_processed_buffers, true);
+        print_par_buffers(3, "Processed Buffers",
+                          _par_last_update_rs_processed_buffers, true);
        print_par_stats(2, "Ext Root Scanning", _par_last_ext_root_scan_times_ms);
        print_par_stats(2, "Mark Stack Scanning", _par_last_mark_stack_scan_times_ms);
        print_par_stats(2, "Scan-Only Scanning", _par_last_scan_only_times_ms);
@ -1649,9 +1643,8 @@ void G1CollectorPolicy::record_collection_pause_end(bool abandoned) {
        print_stats(1, "Clear CT", _cur_clear_ct_time_ms);
      } else {
        print_stats(1, "Update RS", update_rs_time);
-        if (G1RSBarrierUseQueue)
-          print_stats(2, "Processed Buffers",
-                      (int)update_rs_processed_buffers);
+        print_stats(2, "Processed Buffers",
+                    (int)update_rs_processed_buffers);
        print_stats(1, "Ext Root Scanning", ext_root_scan_time);
        print_stats(1, "Mark Stack Scanning", mark_stack_scan_time);
        print_stats(1, "Scan-Only Scanning", scan_only_time);
@ -2467,18 +2460,6 @@ void G1CollectorPolicy::print_tracing_info() const {
               (double) _region_num_young / (double) all_region_num * 100.0,
               _region_num_tenured,
               (double) _region_num_tenured / (double) all_region_num * 100.0);
-
-    if (!G1RSBarrierUseQueue) {
-      gclog_or_tty->print_cr("Of %d times conc refinement was enabled, %d (%7.2f%%) "
-                    "did zero traversals.",
-                    _conc_refine_enabled, _conc_refine_zero_traversals,
-                    _conc_refine_enabled > 0 ?
-                    100.0 * (float)_conc_refine_zero_traversals/
-                    (float)_conc_refine_enabled : 0.0);
-      gclog_or_tty->print_cr("  Max # of traversals = %d.",
-                    _conc_refine_max_traversals);
-      gclog_or_tty->print_cr("");
-    }
  }
  if (TraceGen1Time) {
    if (_all_full_gc_times_ms->num() > 0) {
@ -2500,38 +2481,6 @@ void G1CollectorPolicy::print_yg_surv_rate_info() const {
 #endif // PRODUCT
 }

-void G1CollectorPolicy::update_conc_refine_data() {
-  unsigned traversals = _g1->concurrent_g1_refine()->disable();
-  if (traversals == 0) _conc_refine_zero_traversals++;
-  _conc_refine_max_traversals = MAX2(_conc_refine_max_traversals,
-                                     (size_t)traversals);
-
-  if (G1PolicyVerbose > 1)
-    gclog_or_tty->print_cr("Did a CR traversal series: %d traversals.", traversals);
-  double multiplier = 1.0;
-  if (traversals == 0) {
-    multiplier = 4.0;
-  } else if (traversals > (size_t)G1ConcRefineTargTraversals) {
-    multiplier = 1.0/1.5;
-  } else if (traversals < (size_t)G1ConcRefineTargTraversals) {
-    multiplier = 1.5;
-  }
-  if (G1PolicyVerbose > 1) {
-    gclog_or_tty->print_cr("  Multiplier = %7.2f.", multiplier);
-    gclog_or_tty->print("  Delta went from %d regions to ",
-               _conc_refine_current_delta);
-  }
-  _conc_refine_current_delta =
-    MIN2(_g1->n_regions(),
-         (size_t)(_conc_refine_current_delta * multiplier));
-  _conc_refine_current_delta =
-    MAX2(_conc_refine_current_delta, (size_t)1);
-  if (G1PolicyVerbose > 1) {
-    gclog_or_tty->print_cr("%d regions.", _conc_refine_current_delta);
-  }
-  _conc_refine_enabled++;
-}
-
 bool
 G1CollectorPolicy::should_add_next_region_to_young_list() {
  assert(in_young_gc_mode(), "should be in young GC mode");
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
@ -637,18 +637,6 @@ protected:
  // The number of collection pauses at the end of the last mark.
  size_t _n_pauses_at_mark_end;

-  // ==== This section is for stats related to starting Conc Refinement on time.
-  size_t _conc_refine_enabled;
-  size_t _conc_refine_zero_traversals;
-  size_t _conc_refine_max_traversals;
-  // In # of heap regions.
-  size_t _conc_refine_current_delta;
-
-  // At the beginning of a collection pause, update the variables above,
-  // especially the "delta".
-  void update_conc_refine_data();
-  // ====
-
  // Stash a pointer to the g1 heap.
  G1CollectedHeap* _g1;

--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
@ -105,28 +105,6 @@ StupidG1RemSet::oops_into_collection_set_do(OopsInHeapRegionClosure* oc,
  _g1->heap_region_iterate(&rc);
 }

-class UpdateRSOutOfRegionClosure: public HeapRegionClosure {
-  G1CollectedHeap*    _g1h;
-  ModRefBarrierSet*   _mr_bs;
-  UpdateRSOopClosure  _cl;
-  int _worker_i;
-public:
-  UpdateRSOutOfRegionClosure(G1CollectedHeap* g1, int worker_i = 0) :
-    _cl(g1->g1_rem_set()->as_HRInto_G1RemSet(), worker_i),
-    _mr_bs(g1->mr_bs()),
-    _worker_i(worker_i),
-    _g1h(g1)
-    {}
-  bool doHeapRegion(HeapRegion* r) {
-    if (!r->in_collection_set() && !r->continuesHumongous()) {
-      _cl.set_from(r);
-      r->set_next_filter_kind(HeapRegionDCTOC::OutOfRegionFilterKind);
-      _mr_bs->mod_oop_in_space_iterate(r, &_cl, true, true);
-    }
-    return false;
-  }
-};
-
 class VerifyRSCleanCardOopClosure: public OopClosure {
  G1CollectedHeap* _g1;
 public:
@ -350,30 +328,17 @@ void HRInto_G1RemSet::updateRS(int worker_i) {
  double start = os::elapsedTime();
  _g1p->record_update_rs_start_time(worker_i, start * 1000.0);

-  if (G1RSBarrierUseQueue && !cg1r->do_traversal()) {
-    // Apply the appropriate closure to all remaining log entries.
-    _g1->iterate_dirty_card_closure(false, worker_i);
-    // Now there should be no dirty cards.
-    if (G1RSLogCheckCardTable) {
-      CountNonCleanMemRegionClosure cl(_g1);
-      _ct_bs->mod_card_iterate(&cl);
-      // XXX This isn't true any more: keeping cards of young regions
-      // marked dirty broke it.  Need some reasonable fix.
-      guarantee(cl.n() == 0, "Card table should be clean.");
-    }
-  } else {
-    UpdateRSOutOfRegionClosure update_rs(_g1, worker_i);
-    _g1->heap_region_iterate(&update_rs);
-    // We did a traversal; no further one is necessary.
-    if (G1RSBarrierUseQueue) {
-      assert(cg1r->do_traversal(), "Or we shouldn't have gotten here.");
-      cg1r->set_pya_cancel();
-    }
-    if (_cg1r->use_cache()) {
-      _cg1r->clear_and_record_card_counts();
-      _cg1r->clear_hot_cache();
-    }
+  // Apply the appropriate closure to all remaining log entries.
+  _g1->iterate_dirty_card_closure(false, worker_i);
+  // Now there should be no dirty cards.
+  if (G1RSLogCheckCardTable) {
+    CountNonCleanMemRegionClosure cl(_g1);
+    _ct_bs->mod_card_iterate(&cl);
+    // XXX This isn't true any more: keeping cards of young regions
+    // marked dirty broke it.  Need some reasonable fix.
+    guarantee(cl.n() == 0, "Card table should be clean.");
  }
+
  _g1p->record_update_rs_time(worker_i, (os::elapsedTime() - start) * 1000.0);
 }

@ -486,11 +451,6 @@ HRInto_G1RemSet::scanNewRefsRS(OopsInHeapRegionClosure* oc,
                                  * 1000.0);
 }

-void HRInto_G1RemSet::set_par_traversal(bool b) {
-  _par_traversal_in_progress = b;
-  HeapRegionRemSet::set_par_traversal(b);
-}
-
 void HRInto_G1RemSet::cleanupHRRS() {
  HeapRegionRemSet::cleanup();
 }
@ -527,7 +487,7 @@ HRInto_G1RemSet::oops_into_collection_set_do(OopsInHeapRegionClosure* oc,
      updateRS(worker_i);
      scanNewRefsRS(oc, worker_i);
    } else {
-      _g1p->record_update_rs_start_time(worker_i, os::elapsedTime());
+      _g1p->record_update_rs_start_time(worker_i, os::elapsedTime() * 1000.0);
      _g1p->record_update_rs_processed_buffers(worker_i, 0.0);
      _g1p->record_update_rs_time(worker_i, 0.0);
      _g1p->record_scan_new_refs_time(worker_i, 0.0);
@ -535,7 +495,7 @@ HRInto_G1RemSet::oops_into_collection_set_do(OopsInHeapRegionClosure* oc,
    if (G1ParallelRSetScanningEnabled || (worker_i == 0)) {
      scanRS(oc, worker_i);
    } else {
-      _g1p->record_scan_rs_start_time(worker_i, os::elapsedTime());
+      _g1p->record_scan_rs_start_time(worker_i, os::elapsedTime() * 1000.0);
      _g1p->record_scan_rs_time(worker_i, 0.0);
    }
  } else {
@ -562,11 +522,6 @@ prepare_for_oops_into_collection_set_do() {
  if (ParallelGCThreads > 0) {
    set_par_traversal(true);
    _seq_task->set_par_threads((int)n_workers());
-    if (cg1r->do_traversal()) {
-      updateRS(0);
-      // Have to do this again after updaters
-      cleanupHRRS();
-    }
  }
  guarantee( _cards_scanned == NULL, "invariant" );
  _cards_scanned = NEW_C_HEAP_ARRAY(size_t, n_workers());
@ -647,11 +602,8 @@ void HRInto_G1RemSet::cleanup_after_oops_into_collection_set_do() {
  _g1->collection_set_iterate(&iterClosure);
  // Set all cards back to clean.
  _g1->cleanUpCardTable();
+
  if (ParallelGCThreads > 0) {
-    ConcurrentG1Refine* cg1r = _g1->concurrent_g1_refine();
-    if (cg1r->do_traversal()) {
-      cg1r->cg1rThread()->set_do_traversal(false);
-    }
    set_par_traversal(false);
  }

@ -721,139 +673,8 @@ void HRInto_G1RemSet::scrub_par(BitMap* region_bm, BitMap* card_bm,
 }


-class ConcRefineRegionClosure: public HeapRegionClosure {
-  G1CollectedHeap* _g1h;
-  CardTableModRefBS* _ctbs;
-  ConcurrentGCThread* _cgc_thrd;
-  ConcurrentG1Refine* _cg1r;
-  unsigned _cards_processed;
-  UpdateRSOopClosure _update_rs_oop_cl;
-public:
-  ConcRefineRegionClosure(CardTableModRefBS* ctbs,
-                          ConcurrentG1Refine* cg1r,
-                          HRInto_G1RemSet* g1rs) :
-    _ctbs(ctbs), _cg1r(cg1r), _cgc_thrd(cg1r->cg1rThread()),
-    _update_rs_oop_cl(g1rs), _cards_processed(0),
-    _g1h(G1CollectedHeap::heap())
-  {}
-
-  bool doHeapRegion(HeapRegion* r) {
-    if (!r->in_collection_set() &&
-        !r->continuesHumongous() &&
-        !r->is_young()) {
-      _update_rs_oop_cl.set_from(r);
-      UpdateRSObjectClosure update_rs_obj_cl(&_update_rs_oop_cl);
-
-      // For each run of dirty card in the region:
-      //   1) Clear the cards.
-      //   2) Process the range corresponding to the run, adding any
-      //      necessary RS entries.
-      // 1 must precede 2, so that a concurrent modification redirties the
-      // card.  If a processing attempt does not succeed, because it runs
-      // into an unparseable region, we will do binary search to find the
-      // beginning of the next parseable region.
-      HeapWord* startAddr = r->bottom();
-      HeapWord* endAddr = r->used_region().end();
-      HeapWord* lastAddr;
-      HeapWord* nextAddr;
-
-      for (nextAddr = lastAddr = startAddr;
-           nextAddr < endAddr;
-           nextAddr = lastAddr) {
-        MemRegion dirtyRegion;
-
-        // Get and clear dirty region from card table
-        MemRegion next_mr(nextAddr, endAddr);
-        dirtyRegion =
-          _ctbs->dirty_card_range_after_reset(
-                           next_mr,
-                           true, CardTableModRefBS::clean_card_val());
-        assert(dirtyRegion.start() >= nextAddr,
-               "returned region inconsistent?");
-
-        if (!dirtyRegion.is_empty()) {
-          HeapWord* stop_point =
-            r->object_iterate_mem_careful(dirtyRegion,
-                                          &update_rs_obj_cl);
-          if (stop_point == NULL) {
-            lastAddr = dirtyRegion.end();
-            _cards_processed +=
-              (int) (dirtyRegion.word_size() / CardTableModRefBS::card_size_in_words);
-          } else {
-            // We're going to skip one or more cards that we can't parse.
-            HeapWord* next_parseable_card =
-              r->next_block_start_careful(stop_point);
-            // Round this up to a card boundary.
-            next_parseable_card =
-              _ctbs->addr_for(_ctbs->byte_after_const(next_parseable_card));
-            // Now we invalidate the intervening cards so we'll see them
-            // again.
-            MemRegion remaining_dirty =
-              MemRegion(stop_point, dirtyRegion.end());
-            MemRegion skipped =
-              MemRegion(stop_point, next_parseable_card);
-            _ctbs->invalidate(skipped.intersection(remaining_dirty));
-
-            // Now start up again where we can parse.
-            lastAddr = next_parseable_card;
-
-            // Count how many we did completely.
-            _cards_processed +=
-              (stop_point - dirtyRegion.start()) /
-              CardTableModRefBS::card_size_in_words;
-          }
-          // Allow interruption at regular intervals.
-          // (Might need to make them more regular, if we get big
-          // dirty regions.)
-          if (_cgc_thrd != NULL) {
-            if (_cgc_thrd->should_yield()) {
-              _cgc_thrd->yield();
-              switch (_cg1r->get_pya()) {
-              case PYA_continue:
-                // This may have changed: re-read.
-                endAddr = r->used_region().end();
-                continue;
-              case PYA_restart: case PYA_cancel:
-                return true;
-              }
-            }
-          }
-        } else {
-          break;
-        }
-      }
-    }
-    // A good yield opportunity.
-    if (_cgc_thrd != NULL) {
-      if (_cgc_thrd->should_yield()) {
-        _cgc_thrd->yield();
-        switch (_cg1r->get_pya()) {
-        case PYA_restart: case PYA_cancel:
-          return true;
-        default:
-          break;
-        }
-
-      }
-    }
-    return false;
-  }
-
-  unsigned cards_processed() { return _cards_processed; }
-};
-
-
-void HRInto_G1RemSet::concurrentRefinementPass(ConcurrentG1Refine* cg1r) {
-  ConcRefineRegionClosure cr_cl(ct_bs(), cg1r, this);
-  _g1->heap_region_iterate(&cr_cl);
-  _conc_refine_traversals++;
-  _conc_refine_cards += cr_cl.cards_processed();
-}
-
 static IntHistogram out_of_histo(50, 50);

-
-
 void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
  // If the card is no longer dirty, nothing to do.
  if (*card_ptr != CardTableModRefBS::dirty_card_val()) return;
@ -983,10 +804,16 @@ public:
  HeapRegion* max_mem_sz_region() { return _max_mem_sz_region; }
 };

+class PrintRSThreadVTimeClosure : public ThreadClosure {  // Prints vtime_accum() of every visited thread to gclog_or_tty.
+public:
+  virtual void do_thread(Thread *t) {
+    ConcurrentG1RefineThread* crt = (ConcurrentG1RefineThread*) t;  // Unchecked downcast; NOTE(review): assumes only refinement threads are visited - confirm at call sites.
+    gclog_or_tty->print("    %5.2f", crt->vtime_accum());  // Format has no newline, so successive threads print on one line.
+  }
+};
+
 void HRInto_G1RemSet::print_summary_info() {
  G1CollectedHeap* g1 = G1CollectedHeap::heap();
-  ConcurrentG1RefineThread* cg1r_thrd =
-    g1->concurrent_g1_refine()->cg1rThread();

 #if CARD_REPEAT_HISTO
  gclog_or_tty->print_cr("\nG1 card_repeat count histogram: ");
@ -999,15 +826,13 @@ void HRInto_G1RemSet::print_summary_info() {
    gclog_or_tty->print_cr("  # of CS ptrs --> # of cards with that number.");
    out_of_histo.print_on(gclog_or_tty);
  }
-  gclog_or_tty->print_cr("\n Concurrent RS processed %d cards in "
-                "%5.2fs.",
-                _conc_refine_cards, cg1r_thrd->vtime_accum());
-
+  gclog_or_tty->print_cr("\n Concurrent RS processed %d cards",
+                         _conc_refine_cards);
  DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
  jint tot_processed_buffers =
    dcqs.processed_buffers_mut() + dcqs.processed_buffers_rs_thread();
  gclog_or_tty->print_cr("  Of %d completed buffers:", tot_processed_buffers);
-  gclog_or_tty->print_cr("     %8d (%5.1f%%) by conc RS thread.",
+  gclog_or_tty->print_cr("     %8d (%5.1f%%) by conc RS threads.",
                dcqs.processed_buffers_rs_thread(),
                100.0*(float)dcqs.processed_buffers_rs_thread()/
                (float)tot_processed_buffers);
@ -1015,15 +840,12 @@ void HRInto_G1RemSet::print_summary_info() {
                dcqs.processed_buffers_mut(),
                100.0*(float)dcqs.processed_buffers_mut()/
                (float)tot_processed_buffers);
-  gclog_or_tty->print_cr("   Did %d concurrent refinement traversals.",
-                _conc_refine_traversals);
-  if (!G1RSBarrierUseQueue) {
-    gclog_or_tty->print_cr("   Scanned %8.2f cards/traversal.",
-                  _conc_refine_traversals > 0 ?
-                  (float)_conc_refine_cards/(float)_conc_refine_traversals :
-                  0);
-  }
+  gclog_or_tty->print_cr("  Conc RS threads times(s)");
+  PrintRSThreadVTimeClosure p;
+  gclog_or_tty->print("     ");
+  g1->concurrent_g1_refine()->threads_do(&p);
  gclog_or_tty->print_cr("");
+
  if (G1UseHRIntoRS) {
    HRRSStatsIter blk;
    g1->heap_region_iterate(&blk);
--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.hpp
@ -33,15 +33,12 @@ class ConcurrentG1Refine;
 class G1RemSet: public CHeapObj {
 protected:
  G1CollectedHeap* _g1;
-
-  unsigned _conc_refine_traversals;
  unsigned _conc_refine_cards;
-
  size_t n_workers();

 public:
  G1RemSet(G1CollectedHeap* g1) :
-    _g1(g1), _conc_refine_traversals(0), _conc_refine_cards(0)
+    _g1(g1), _conc_refine_cards(0)
  {}

  // Invoke "blk->do_oop" on all pointers into the CS in object in regions
@ -81,19 +78,11 @@ public:
  virtual void scrub_par(BitMap* region_bm, BitMap* card_bm,
                         int worker_num, int claim_val) = 0;

-  // Do any "refinement" activity that might be appropriate to the given
-  // G1RemSet.  If "refinement" has iterateive "passes", do one pass.
-  // If "t" is non-NULL, it is the thread performing the refinement.
-  // Default implementation does nothing.
-  virtual void concurrentRefinementPass(ConcurrentG1Refine* cg1r) {}
-
  // Refine the card corresponding to "card_ptr".  If "sts" is non-NULL,
  // join and leave around parts that must be atomic wrt GC.  (NULL means
  // being done at a safepoint.)
  virtual void concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {}

-  unsigned conc_refine_cards() { return _conc_refine_cards; }
-
  // Print any relevant summary info.
  virtual void print_summary_info() {}

@ -153,7 +142,7 @@ protected:
  // progress.  If so, then cards added to remembered sets should also have
  // their references into the collection summarized in "_new_refs".
  bool _par_traversal_in_progress;
-  void set_par_traversal(bool b);
+  void set_par_traversal(bool b) { _par_traversal_in_progress = b; }  // Plain setter; only updates this object's flag.
  GrowableArray<oop*>** _new_refs;
  void new_refs_iterate(OopClosure* cl);

@ -194,7 +183,6 @@ public:
  void scrub_par(BitMap* region_bm, BitMap* card_bm,
                 int worker_num, int claim_val);

-  virtual void concurrentRefinementPass(ConcurrentG1Refine* t);
  virtual void concurrentRefineOneCard(jbyte* card_ptr, int worker_i);

  virtual void print_summary_info();
--- a/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
@ -147,9 +147,6 @@
  develop(bool, G1PrintCTFilterStats, false,                                \
          "If true, print stats on RS filtering effectiveness")             \
                                                                            \
-  develop(bool, G1RSBarrierUseQueue, true,                                  \
-          "If true, use queueing RS barrier")                               \
-                                                                            \
  develop(bool, G1DeferredRSUpdate, true,                                   \
          "If true, use deferred RS updates")                               \
                                                                            \
@ -253,6 +250,10 @@
                                                                            \
  experimental(bool, G1ParallelRSetScanningEnabled, false,                  \
          "Enables the parallelization of remembered set scanning "         \
-          "during evacuation pauses")
+          "during evacuation pauses")                                       \
+                                                                            \
+  product(uintx, G1ParallelRSetThreads, 0,                                  \
+          "If non-0 is the number of parallel rem set update threads, "     \
+          "otherwise the value is determined ergonomically.")

 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)
--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp
@ -1052,14 +1052,6 @@ bool OtherRegionsTable::contains_reference_locked(oop* from) const {

 }

-
-bool HeapRegionRemSet::_par_traversal = false;
-
-void HeapRegionRemSet::set_par_traversal(bool b) {
-  assert(_par_traversal != b, "Proper alternation...");
-  _par_traversal = b;
-}
-
 int HeapRegionRemSet::num_par_rem_sets() {
  // We always have at least two, so that a mutator thread can claim an
  // id and add to a rem set.
--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp
@ -177,8 +177,6 @@ private:
  G1BlockOffsetSharedArray* _bosa;
  G1BlockOffsetSharedArray* bosa() const { return _bosa; }

-  static bool _par_traversal;
-
  OtherRegionsTable _other_regions;

  // One set bit for every region that has an entry for this one.
@ -211,8 +209,6 @@ public:
                   HeapRegion* hr);

  static int num_par_rem_sets();
-  static bool par_traversal() { return _par_traversal; }
-  static void set_par_traversal(bool b);

  HeapRegion* hr() const {
    return _other_regions.hr();
--- a/hotspot/src/share/vm/gc_implementation/g1/ptrQueue.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/ptrQueue.cpp
@ -172,7 +172,7 @@ void PtrQueueSet::enqueue_complete_buffer(void** buf, size_t index, bool ignore_
  _n_completed_buffers++;

  if (!_process_completed &&
-      _n_completed_buffers == _process_completed_threshold) {
+      _n_completed_buffers >= _process_completed_threshold) {
    _process_completed = true;
    if (_notify_when_complete)
      _cbl_mon->notify_all();
--- a/hotspot/src/share/vm/gc_implementation/includeDB_gc_g1
+++ b/hotspot/src/share/vm/gc_implementation/includeDB_gc_g1
@ -49,6 +49,8 @@ concurrentG1Refine.cpp			space.inline.hpp

 concurrentG1Refine.hpp			globalDefinitions.hpp
 concurrentG1Refine.hpp			allocation.hpp
+concurrentG1Refine.hpp			thread.hpp
+

 concurrentG1RefineThread.cpp		concurrentG1Refine.hpp
 concurrentG1RefineThread.cpp		concurrentG1RefineThread.hpp
--- a/hotspot/src/share/vm/gc_implementation/shared/concurrentGCThread.cpp
+++ b/hotspot/src/share/vm/gc_implementation/shared/concurrentGCThread.cpp
@ -27,13 +27,12 @@
 # include "incls/_precompiled.incl"
 # include "incls/_concurrentGCThread.cpp.incl"

-bool ConcurrentGCThread::_should_terminate    = false;
-bool ConcurrentGCThread::_has_terminated      = false;
 int  ConcurrentGCThread::_CGC_flag            = CGC_nil;

 SuspendibleThreadSet ConcurrentGCThread::_sts;

-ConcurrentGCThread::ConcurrentGCThread() {
+ConcurrentGCThread::ConcurrentGCThread() :
+  _should_terminate(false), _has_terminated(false) {  // Both termination flags start out false for each constructed thread.
  _sts.initialize();
 };

--- a/hotspot/src/share/vm/gc_implementation/shared/concurrentGCThread.hpp
+++ b/hotspot/src/share/vm/gc_implementation/shared/concurrentGCThread.hpp
@ -72,8 +72,8 @@ class ConcurrentGCThread: public NamedThread {
  friend class VMStructs;

 protected:
-  static bool _should_terminate;
-  static bool _has_terminated;
+  bool _should_terminate;
+  bool _has_terminated;

  enum CGC_flag_type {
    CGC_nil           = 0x0,
--- a/hotspot/src/share/vm/memory/cardTableRS.cpp
+++ b/hotspot/src/share/vm/memory/cardTableRS.cpp
@ -33,12 +33,8 @@ CardTableRS::CardTableRS(MemRegion whole_heap,
 {
 #ifndef SERIALGC
  if (UseG1GC) {
-    if (G1RSBarrierUseQueue) {
      _ct_bs = new G1SATBCardTableLoggingModRefBS(whole_heap,
                                                  max_covered_regions);
-    } else {
-      _ct_bs = new G1SATBCardTableModRefBS(whole_heap, max_covered_regions);
-    }
  } else {
    _ct_bs = new CardTableModRefBSForCTRS(whole_heap, max_covered_regions);
  }
--- a/hotspot/src/share/vm/runtime/mutexLocker.cpp
+++ b/hotspot/src/share/vm/runtime/mutexLocker.cpp
@ -70,7 +70,6 @@ Monitor* FullGCCount_lock             = NULL;
 Monitor* CMark_lock                   = NULL;
 Monitor* ZF_mon                       = NULL;
 Monitor* Cleanup_mon                  = NULL;
-Monitor* G1ConcRefine_mon             = NULL;
 Mutex*   SATB_Q_FL_lock               = NULL;
 Monitor* SATB_Q_CBL_mon               = NULL;
 Mutex*   Shared_SATB_Q_lock           = NULL;
@ -168,7 +167,6 @@ void mutex_init() {
    def(CMark_lock                 , Monitor, nonleaf,     true ); // coordinate concurrent mark thread
    def(ZF_mon                     , Monitor, leaf,        true );
    def(Cleanup_mon                , Monitor, nonleaf,     true );
-    def(G1ConcRefine_mon           , Monitor, nonleaf,     true );
    def(SATB_Q_FL_lock             , Mutex  , special,     true );
    def(SATB_Q_CBL_mon             , Monitor, nonleaf,     true );
    def(Shared_SATB_Q_lock         , Mutex,   nonleaf,     true );
--- a/hotspot/src/share/vm/runtime/mutexLocker.hpp
+++ b/hotspot/src/share/vm/runtime/mutexLocker.hpp
@ -63,9 +63,6 @@ extern Monitor* FullGCCount_lock;                // in support of "concurrent" f
 extern Monitor* CMark_lock;                      // used for concurrent mark thread coordination
 extern Monitor* ZF_mon;                          // used for G1 conc zero-fill.
 extern Monitor* Cleanup_mon;                     // used for G1 conc cleanup.
-extern Monitor* G1ConcRefine_mon;                // used for G1 conc-refine
-                                                 // coordination.
-
 extern Mutex*   SATB_Q_FL_lock;                  // Protects SATB Q
                                                 // buffer free list.
 extern Monitor* SATB_Q_CBL_mon;                  // Protects SATB Q