8311691: C2: Remove legacy code related to PostLoopMultiversioning

Reviewed-by: kvn, sviswanathan
Pengfei Li 2023-07-13 01:45:52 +00:00
parent 38f74125d1
commit a38582e941
9 changed files with 49 additions and 606 deletions

@@ -512,15 +512,6 @@ bool CompilerConfig::check_args_consistency(bool status) {
FLAG_SET_CMDLINE(BackgroundCompilation, false);
}
#ifdef COMPILER2
if (PostLoopMultiversioning && !RangeCheckElimination) {
if (!FLAG_IS_DEFAULT(PostLoopMultiversioning)) {
warning("PostLoopMultiversioning disabled because RangeCheckElimination is disabled.");
}
FLAG_SET_CMDLINE(PostLoopMultiversioning, false);
}
#endif // COMPILER2
if (CompilerConfig::is_interpreter_only()) {
if (UseCompiler) {
if (!FLAG_IS_DEFAULT(UseCompiler)) {
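For context, the combination this removed guard policed is the one exercised by the regression test deleted at the bottom of this commit. Before this change, a command line such as (flags taken verbatim from that test)

java -XX:+UnlockExperimentalVMOptions -XX:+PostLoopMultiversioning -XX:-RangeCheckElimination -version

started the VM, printed the warning above, and forced PostLoopMultiversioning back off. After this change the flag no longer exists, so the same command only starts under -XX:+IgnoreUnrecognizedVMOptions.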

@@ -182,9 +182,6 @@
"Map number of unrolls for main loop via " \
"Superword Level Parallelism analysis") \
\
product(bool, PostLoopMultiversioning, false, EXPERIMENTAL, \
"Multi versioned post loops to eliminate range checks") \
\
notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false, \
"Trace what Superword Level Parallelism analysis applies") \
\

@@ -1888,55 +1888,6 @@ void PhaseIdealLoop::insert_vector_post_loop(IdealLoopTree *loop, Node_List &old
loop->record_for_igvn();
}
//-------------------------insert_scalar_rced_post_loop------------------------
// Insert a copy of the rce'd main loop as a post loop.
// We have not unrolled the main loop, so this is the right time to inject this.
// Later we will examine the partner of this post loop pair, which still has range checks,
// to see if we can inject code which tests at runtime whether the range checks are applicable.
void PhaseIdealLoop::insert_scalar_rced_post_loop(IdealLoopTree *loop, Node_List &old_new) {
if (!loop->_head->is_CountedLoop()) return;
CountedLoopNode *cl = loop->_head->as_CountedLoop();
// only process RCE'd main loops
if (!cl->is_main_loop() || loop->range_checks_present()) return;
#ifndef PRODUCT
if (TraceLoopOpts) {
tty->print("PostScalarRce ");
loop->dump_head();
}
#endif
C->set_major_progress();
// Find common pieces of the loop being guarded with pre & post loops
CountedLoopNode *main_head = loop->_head->as_CountedLoop();
CountedLoopEndNode *main_end = main_head->loopexit();
// diagnostic to show loop end is not properly formed
assert(main_end->outcnt() == 2, "1 true, 1 false path only");
Node *incr = main_end->incr();
Node *limit = main_end->limit();
// In this case we throw away the result as we are not using it to connect anything else.
CountedLoopNode *post_head = nullptr;
insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
copy_assertion_predicates_to_post_loop(main_head->skip_strip_mined(), post_head, incr, main_head->stride());
// It's difficult to be precise about the trip-counts
// for post loops. They are usually very short,
// so guess that unit vector trips is a reasonable value.
post_head->set_profile_trip_cnt(4.0);
post_head->set_is_rce_post_loop();
// Now force out all loop-invariant dominating tests. The optimizer
// finds some, but we _know_ they are all useless.
peeled_dom_test_elim(loop, old_new);
loop->record_for_igvn();
}
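To picture the transformation, here is a hypothetical source-level sketch (illustrative Java, not JDK code; names are invented): after iteration splitting and RCE, the main loop runs without per-iteration bounds checks and a short post loop finishes the remaining trips. The pass above cloned that post loop into an additional unchecked copy, to be guarded at run time by the multiversioning code further below.

class PostLoopShape {
    static void scale(int[] a, int k, int limit) {
        int i = 0;
        int safe = Math.min(limit, a.length); // the bound RCE proves for the main loop
        for (; i < safe; i++) {
            a[i] *= k;   // main loop: bounds check eliminated
        }
        for (; i < limit; i++) {
            a[i] *= k;   // post loop: range checks remain; the pass added an
        }                // RCE'd clone of this loop as a runtime-guarded fast path
    }
}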
//------------------------------insert_post_loop-------------------------------
// Insert post loops. Add a post loop to the given loop passed.
Node *PhaseIdealLoop::insert_post_loop(IdealLoopTree* loop, Node_List& old_new,
@@ -3198,143 +3149,6 @@ bool IdealLoopTree::compute_has_range_checks() const {
return false;
}
//-------------------------multi_version_post_loops----------------------------
// Check the range checks that remain; if simple, use the bounds to guard
// which version of the post loop we execute, one with range checks or one without
bool PhaseIdealLoop::multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop) {
bool multi_version_succeeded = false;
assert(RangeCheckElimination, "");
CountedLoopNode *legacy_cl = legacy_loop->_head->as_CountedLoop();
assert(legacy_cl->is_post_loop(), "");
// Check for existence of range checks using the unique instance to make a guard with
Unique_Node_List worklist;
for (uint i = 0; i < legacy_loop->_body.size(); i++) {
Node *iff = legacy_loop->_body[i];
int iff_opc = iff->Opcode();
if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
worklist.push(iff);
}
}
// Find RCE'd post loop so that we can stage its guard.
if (legacy_cl->is_canonical_loop_entry() == nullptr) {
return multi_version_succeeded;
}
Node* ctrl = legacy_cl->in(LoopNode::EntryControl);
Node* iffm = ctrl->in(0);
// Now we test that both the post loops are connected
Node* post_loop_region = iffm->in(0);
if (post_loop_region == nullptr) return multi_version_succeeded;
if (!post_loop_region->is_Region()) return multi_version_succeeded;
Node* covering_region = post_loop_region->in(RegionNode::Control+1);
if (covering_region == nullptr) return multi_version_succeeded;
if (!covering_region->is_Region()) return multi_version_succeeded;
Node* p_f = covering_region->in(RegionNode::Control);
if (p_f == nullptr) return multi_version_succeeded;
if (!p_f->is_IfFalse()) return multi_version_succeeded;
if (!p_f->in(0)->is_CountedLoopEnd()) return multi_version_succeeded;
CountedLoopEndNode* rce_loop_end = p_f->in(0)->as_CountedLoopEnd();
if (rce_loop_end == nullptr) return multi_version_succeeded;
CountedLoopNode* rce_cl = rce_loop_end->loopnode();
if (rce_cl == nullptr || !rce_cl->is_post_loop()) return multi_version_succeeded;
CountedLoopNode *known_rce_cl = rce_loop->_head->as_CountedLoop();
if (rce_cl != known_rce_cl) return multi_version_succeeded;
// Then we fetch the cover entry test
ctrl = rce_cl->in(LoopNode::EntryControl);
if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return multi_version_succeeded;
#ifndef PRODUCT
if (TraceLoopOpts) {
tty->print("PostMultiVersion\n");
rce_loop->dump_head();
legacy_loop->dump_head();
}
#endif
// Now fetch the limit we want to compare against
Node *limit = rce_cl->limit();
bool first_time = true;
// If we got this far, we identified the post loop which has been RCE'd and
// we have a work list. Now we will try to transform the if guard so that the
// loop pair is executed as multiple versions, with the determination left to
// runtime, or to the optimizer if full information about the given arrays is
// known at compile time.
Node *last_min = nullptr;
multi_version_succeeded = true;
while (worklist.size()) {
Node* rc_iffm = worklist.pop();
if (rc_iffm->is_If()) {
Node *rc_bolzm = rc_iffm->in(1);
if (rc_bolzm->is_Bool()) {
Node *rc_cmpzm = rc_bolzm->in(1);
if (rc_cmpzm->is_Cmp()) {
Node *rc_left = rc_cmpzm->in(2);
if (rc_left->Opcode() != Op_LoadRange) {
multi_version_succeeded = false;
break;
}
if (first_time) {
last_min = rc_left;
first_time = false;
} else {
Node *cur_min = new MinINode(last_min, rc_left);
last_min = cur_min;
_igvn.register_new_node_with_optimizer(last_min);
}
}
}
}
}
// All we have to do is update the limit of the rce loop
// with the min of our expression and the current limit.
// We will use this expression to replace the current limit.
if (last_min && multi_version_succeeded) {
Node *cur_min = new MinINode(last_min, limit);
_igvn.register_new_node_with_optimizer(cur_min);
Node *cmp_node = rce_loop_end->cmp_node();
_igvn.replace_input_of(cmp_node, 2, cur_min);
set_ctrl(cur_min, ctrl);
set_loop(cur_min, rce_loop->_parent);
legacy_cl->mark_is_multiversioned();
rce_cl->mark_is_multiversioned();
multi_version_succeeded = true;
C->set_major_progress();
}
return multi_version_succeeded;
}
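A scalar analogue of the guard built here (hypothetical Java, not JDK code): the LoadRange nodes are array-length loads, so the MinI chain computes the minimum of all involved array lengths, and that minimum combined with the old limit becomes the new limit of the RCE'd post loop.

class MultiVersionGuard {
    static void copy(int[] src, int[] dst, int i, int limit) {
        // min over every range-checked length, then min with the original limit
        int guarded = Math.min(limit, Math.min(src.length, dst.length));
        for (; i < guarded; i++) {
            dst[i] = src[i];   // RCE'd post loop: provably in bounds, no checks
        }
        for (; i < limit; i++) {
            dst[i] = src[i];   // legacy post loop: keeps its range checks
        }
    }
}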
//-------------------------poison_rce_post_loop--------------------------------
// Causes the rce'd post loop to be optimized away if multiversioning fails
void PhaseIdealLoop::poison_rce_post_loop(IdealLoopTree *rce_loop) {
CountedLoopNode *rce_cl = rce_loop->_head->as_CountedLoop();
Node* ctrl = rce_cl->in(LoopNode::EntryControl);
if (ctrl->is_IfTrue() || ctrl->is_IfFalse()) {
Node* iffm = ctrl->in(0);
if (iffm->is_If()) {
Node* cur_bool = iffm->in(1);
if (cur_bool->is_Bool()) {
Node* cur_cmp = cur_bool->in(1);
if (cur_cmp->is_Cmp()) {
BoolTest::mask new_test = BoolTest::gt;
BoolNode *new_bool = new BoolNode(cur_cmp, new_test);
_igvn.replace_node(cur_bool, new_bool);
_igvn._worklist.push(new_bool);
Node* left_op = cur_cmp->in(1);
_igvn.replace_input_of(cur_cmp, 2, left_op);
C->set_major_progress();
}
}
}
}
}
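The poisoning rewrites the guard's Bool to a BoolTest::gt test whose compare has the same node on both inputs, which is constant-false; IGVN then folds the branch and the RCE'd copy dies. In scalar terms (illustrative only):

static boolean poisonedGuard(int x) {
    return x > x;   // gt with identical operands: never true, so the guarded loop is dead
}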
//------------------------------DCE_loop_body----------------------------------
// Remove simplistic dead code from loop body
void IdealLoopTree::DCE_loop_body() {
@@ -3864,14 +3678,6 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
phase->do_range_check(this, old_new);
}
if (should_unroll && !should_peel && PostLoopMultiversioning &&
Matcher::has_predicated_vectors()) {
// Try to setup multiversioning on main loops before they are unrolled
if (cl->is_main_loop() && (cl->unrolled_count() == 1)) {
phase->insert_scalar_rced_post_loop(this, old_new);
}
}
// Double loop body for unrolling. Adjust the minimum-trip test (will do
// twice as many iterations as before) and the main body limit (only do
// an even number of trips). If we are peeling, we might enable some RCE

@@ -4013,7 +4013,6 @@ void IdealLoopTree::dump_head() {
if (cl->is_post_loop()) tty->print(" post");
if (cl->is_vectorized_loop()) tty->print(" vector");
if (range_checks_present()) tty->print(" rc ");
if (cl->is_multiversioned()) tty->print(" multi ");
}
if (_has_call) tty->print(" has_call");
if (_has_sfpt) tty->print(" has_sfpt");
@@ -4653,29 +4652,7 @@ void PhaseIdealLoop::build_and_optimize() {
IdealLoopTree* lpt = iter.current();
if (lpt->is_counted()) {
CountedLoopNode *cl = lpt->_head->as_CountedLoop();
- if (cl->is_rce_post_loop() && !cl->is_vectorized_loop()) {
-   assert(PostLoopMultiversioning, "multiversioning must be enabled");
-   // Check that the rce'd post loop is encountered first; multiversion after all
-   // major main loop optimizations are concluded
-   if (!C->major_progress()) {
-     IdealLoopTree *lpt_next = lpt->_next;
-     if (lpt_next && lpt_next->is_counted()) {
-       CountedLoopNode *cl = lpt_next->_head->as_CountedLoop();
-       if (cl->is_post_loop() && lpt_next->range_checks_present()) {
-         if (!cl->is_multiversioned()) {
-           if (multi_version_post_loops(lpt, lpt_next) == false) {
-             // Cause the rce loop to be optimized away if we fail
-             cl->mark_is_multiversioned();
-             cl->set_slp_max_unroll(0);
-             poison_rce_post_loop(lpt);
-           }
-         }
-       }
-     }
-     sw.transform_loop(lpt, true);
-   }
- } else if (cl->is_main_loop()) {
+ if (cl->is_main_loop()) {
if (!sw.transform_loop(lpt, true)) {
// Instigate more unrolling for optimization when vectorization fails.
if (cl->has_passed_slp()) {

@@ -72,16 +72,13 @@ protected:
DoUnrollOnly = 1<<9,
VectorizedLoop = 1<<10,
HasAtomicPostLoop = 1<<11,
- IsMultiversioned = 1<<12,
- StripMined = 1<<13,
- SubwordLoop = 1<<14,
- ProfileTripFailed = 1<<15,
- LoopNestInnerLoop = 1<<16,
- LoopNestLongOuterLoop = 1<<17};
+ StripMined = 1<<12,
+ SubwordLoop = 1<<13,
+ ProfileTripFailed = 1<<14,
+ LoopNestInnerLoop = 1<<15,
+ LoopNestLongOuterLoop = 1<<16 };
char _unswitch_count;
enum { _unswitch_max=3 };
- char _postloop_flags;
- enum { RCEPostLoop = 1 };
// Expected trip count from profile data
float _profile_trip_cnt;
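These flags are a plain bit set, which is why deleting IsMultiversioned (bit 12) renumbers everything above it. A minimal sketch of the same idiom, in illustrative Java mirroring the accessors below (not JDK code):

class LoopFlagBits {
    static final int STRIP_MINED = 1 << 12;   // was 1 << 13 before this commit
    static boolean isStripMined(int flags) { return (flags & STRIP_MINED) != 0; }
    static int markStripMined(int flags)  { return flags | STRIP_MINED; }
    static int clearStripMined(int flags) { return flags & ~STRIP_MINED; }
}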
@@ -93,7 +90,6 @@ public:
bool is_inner_loop() const { return _loop_flags & InnerLoop; }
void set_inner_loop() { _loop_flags |= InnerLoop; }
bool is_multiversioned() const { return _loop_flags & IsMultiversioned; }
bool is_vectorized_loop() const { return _loop_flags & VectorizedLoop; }
bool is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; }
void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
@@ -110,7 +106,6 @@ public:
void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
void mark_strip_mined() { _loop_flags |= StripMined; }
void clear_strip_mined() { _loop_flags &= ~StripMined; }
void mark_profile_trip_failed() { _loop_flags |= ProfileTripFailed; }
@@ -121,9 +116,6 @@ public:
int unswitch_max() { return _unswitch_max; }
int unswitch_count() { return _unswitch_count; }
int is_rce_post_loop() const { return _postloop_flags & RCEPostLoop; }
void set_is_rce_post_loop() { _postloop_flags |= RCEPostLoop; }
void set_unswitch_count(int val) {
assert (val <= unswitch_max(), "too many unswitches");
_unswitch_count = val;
@@ -134,7 +126,7 @@ public:
LoopNode(Node *entry, Node *backedge)
: RegionNode(3), _loop_flags(0), _unswitch_count(0),
- _postloop_flags(0), _profile_trip_cnt(COUNT_UNKNOWN) {
+ _profile_trip_cnt(COUNT_UNKNOWN) {
init_class_id(Class_Loop);
init_req(EntryControl, entry);
init_req(LoopBackControl, backedge);
@@ -322,8 +314,6 @@ public:
int node_count_before_unroll() { return _node_count_before_unroll; }
void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; }
int slp_max_unroll() const { return _slp_maximum_unroll_factor; }
void set_slp_pack_count(int pack_count) { _slp_vector_pack_count = pack_count; }
int slp_pack_count() const { return _slp_vector_pack_count; }
virtual LoopNode* skip_strip_mined(int expect_skeleton = 1);
OuterStripMinedLoopNode* outer_loop() const;
@@ -1305,9 +1295,6 @@ public:
CountedLoopNode* main_head, CountedLoopEndNode* main_end,
Node*& incr, Node* limit, CountedLoopNode*& post_head);
// Add an RCE'd post loop which we will multi-version and adapt for run-time test path usage
void insert_scalar_rced_post_loop( IdealLoopTree *loop, Node_List &old_new );
// Add a vector post loop between a vector main loop and the current post loop
void insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new);
// If Node n lives in the back_ctrl block, we clone a private version of n
@@ -1402,13 +1389,6 @@ public:
// Eliminate range-checks and other trip-counter vs loop-invariant tests.
void do_range_check(IdealLoopTree *loop, Node_List &old_new);
// Process post loops which have range checks and try to build a multi-version
// guard to safely determine if we can execute the post loop which was RCE'd.
bool multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop);
// Cause the rce'd post loop to be optimized away; this happens if we cannot complete multiversioning
void poison_rce_post_loop(IdealLoopTree *rce_loop);
// Create a slow version of the loop by cloning the loop
// and inserting an if to select fast-slow versions.
// Return the inserted if.

@@ -54,7 +54,6 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
_packset(arena(), 8, 0, nullptr), // packs for the current block
_bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
_block(arena(), 8, 0, nullptr), // nodes in current block
_post_block(arena(), 8, 0, nullptr), // nodes common to current block which are marked as post loop vectorizable
_data_entry(arena(), 8, 0, nullptr), // nodes with all inputs from outside
_mem_slice_head(arena(), 8, 0, nullptr), // memory slice heads
_mem_slice_tail(arena(), 8, 0, nullptr), // memory slice tails
@@ -116,11 +115,6 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
mark_reductions();
}
if (cl->is_rce_post_loop() && is_marked_reduction_loop()) {
// Post loop vectorization doesn't support reductions
return false;
}
// skip any loop that has not been assigned max unroll by analysis
if (do_optimization) {
if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) {
@@ -176,24 +170,6 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
if (do_optimization) {
assert(_packset.length() == 0, "packset must be empty");
success = SLP_extract();
if (PostLoopMultiversioning) {
if (cl->is_vectorized_loop() && cl->is_main_loop() && !is_marked_reduction_loop()) {
IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next;
CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
// Main loop SLP works well for manually unrolled loops. But post loop
// vectorization doesn't work for these. To bail out the optimization
// earlier, we have range check and loop stride conditions below.
if (cl_next->is_post_loop() && !lpt_next->range_checks_present() &&
cl_next->stride_is_con() && abs(cl_next->stride_con()) == 1) {
if (!cl_next->is_vectorized_loop()) {
// Propagate some main loop attributes to its corresponding scalar
// rce'd post loop for vectorization with vector masks
cl_next->set_slp_max_unroll(cl->slp_max_unroll());
cl_next->set_slp_pack_count(cl->slp_pack_count());
}
}
}
}
}
return success;
}
@@ -206,9 +182,6 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
Node_Stack nstack((int)ignored_size);
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
Node *cl_exit = cl->loopexit_or_null();
int rpo_idx = _post_block.length();
assert(rpo_idx == 0, "post loop block is empty");
// First clear the entries
for (uint i = 0; i < lpt()->_body.size(); i++) {
@@ -313,27 +286,6 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
}
if (is_slp) {
// In the main loop, SLP works well even if parts of the operations in the loop
// body are not vectorizable, because those non-vectorizable parts will simply
// be unrolled. But in post loops with vector masks, we create singleton packs
// directly from scalars, so all operations should be vectorized together. This
// compares the number of packs in the post loop with the main loop and bails
// out if the post loop potentially has more packs.
if (cl->is_rce_post_loop()) {
for (uint i = 0; i < lpt()->_body.size(); i++) {
if (ignored_loop_nodes[i] == -1) {
_post_block.at_put_grow(rpo_idx++, lpt()->_body.at(i));
}
}
if (_post_block.length() > cl->slp_pack_count()) {
// Clear local_loop_unroll_factor and bail out directly from here
local_loop_unroll_factor = 0;
cl->mark_was_slp();
cl->set_slp_max_unroll(0);
return;
}
}
// Now we try to find the maximum supported consistent vector which the machine
// description can use
bool flag_small_bt = false;
@@ -404,7 +356,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
cl->mark_passed_slp();
}
cl->mark_was_slp();
- if (cl->is_main_loop() || cl->is_rce_post_loop()) {
+ if (cl->is_main_loop()) {
cl->set_slp_max_unroll(local_loop_unroll_factor);
}
}
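The removed bail-out can be restated in scalar form (hypothetical names, not JDK code): every post-loop node the analysis did not ignore would become a singleton pack, so if that count exceeds the pack count recorded for the main loop, masked post loop vectorization gives up.

static boolean postLoopHasMorePacks(boolean[] ignored, int mainLoopPackCount) {
    int packs = 0;
    for (boolean ign : ignored) {
        if (!ign) packs++;   // each non-ignored node forms one singleton pack
    }
    return packs > mainLoopPackCount;   // true means: bail out of vectorization
}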
@@ -590,80 +542,43 @@ bool SuperWord::SLP_extract() {
}
}
#endif
+ CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
+ assert(cl->is_main_loop(), "SLP should only work on main loops");
// Ready the block
if (!construct_bb()) {
return false; // Exit if no interesting nodes or complex graph.
}
- // build _dg, _disjoint_ptrs
+ // build _dg, _disjoint_ptrs
dependence_graph();
// compute function depth(Node*)
compute_max_depth();
- CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
- if (cl->is_main_loop()) {
-   compute_vector_element_type();
+ // Compute vector element types
+ compute_vector_element_type();
- // Attempt vectorization
-   find_adjacent_refs();
+ // Attempt vectorization
+ find_adjacent_refs();
-   if (align_to_ref() == nullptr) {
-     return false; // Did not find memory reference to align vectors
-   }
-   extend_packlist();
-   combine_packs();
-   construct_my_pack_map();
-   filter_packs();
-   DEBUG_ONLY(verify_packs();)
-   schedule();
-   // Record eventual count of vector packs for checks in post loop vectorization
-   if (PostLoopMultiversioning) {
-     cl->set_slp_pack_count(_packset.length());
-   }
- } else {
-   assert(cl->is_rce_post_loop(), "Must be an rce'd post loop");
-   int saved_mapped_unroll_factor = cl->slp_max_unroll();
-   if (saved_mapped_unroll_factor) {
-     int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
-     // now reset the slp_unroll_factor so that we can check the analysis mapped to
-     // what the vector loop was mapped to
-     cl->set_slp_max_unroll(0);
-     // do the analysis on the post loop
-     unrolling_analysis(vector_mapped_unroll_factor);
-     // if our analyzed loop is a canonical fit, start processing it
-     if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
-       // now add the vector nodes to packsets
-       for (int i = 0; i < _post_block.length(); i++) {
-         Node* n = _post_block.at(i);
-         Node_List* singleton = new Node_List();
-         singleton->push(n);
-         _packset.append(singleton);
-         set_my_pack(n, singleton);
-       }
-       // map base types for vector usage
-       compute_vector_element_type();
-     } else {
-       return false;
-     }
-   } else {
-     // for some reason we could not map the slp analysis state of the vectorized loop
-     return false;
-   }
- }
+ if (align_to_ref() == nullptr) {
+   return false; // Did not find memory reference to align vectors
+ }
+ extend_packlist();
+ combine_packs();
+ construct_my_pack_map();
+ filter_packs();
+ DEBUG_ONLY(verify_packs();)
+ schedule();
return output();
}
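With the post-loop branch gone, SLP_extract handles only main loops, as the new assert states. For reference, the kind of loop it targets (plain Java, illustrative only; SuperWord packs the unrolled iterations into vector operations):

class SlpTarget {
    static void axpy(float[] x, float[] y, float a) {
        for (int i = 0; i < x.length; i++) {
            y[i] = a * x[i] + y[i];   // independent, same-typed lanes: packable
        }
    }
}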
@@ -1143,6 +1058,8 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
// A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
void SuperWord::dependence_graph() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+ assert(cl->is_main_loop(), "SLP should only work on main loops");
// First, assign a dependence node to each memory node
for (int i = 0; i < _block.length(); i++ ) {
Node *n = _block.at(i);
@@ -1157,9 +1074,7 @@
Node* n_tail = _mem_slice_tail.at(i);
// Get slice in predecessor order (last is first)
- if (cl->is_main_loop()) {
-   mem_slice_preds(n_tail, n, _nlist);
- }
+ mem_slice_preds(n_tail, n, _nlist);
#ifndef PRODUCT
if(TraceSuperWord && Verbose) {
@@ -2591,6 +2506,7 @@ void SuperWord::print_loop(bool whole) {
// Convert packs into vector node operations
bool SuperWord::output() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+ assert(cl->is_main_loop(), "SLP should only work on main loops");
Compile* C = _phase->C;
if (_packset.length() == 0) {
return false;
@@ -2603,16 +2519,13 @@ bool SuperWord::output() {
}
#endif
- if (cl->is_main_loop()) {
-   // MUST ENSURE main loop's initial value is properly aligned:
-   //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
-   align_initial_loop_index(align_to_ref());
-   // Insert extract (unpack) operations for scalar uses
-   for (int i = 0; i < _packset.length(); i++) {
-     insert_extracts(_packset.at(i));
-   }
- }
+ // Ensure main loop's initial value is properly aligned
+ //   (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
+ align_initial_loop_index(align_to_ref());
+ // Insert extract (unpack) operations for scalar uses
+ for (int i = 0; i < _packset.length(); i++) {
+   insert_extracts(_packset.at(i));
+ }
uint max_vlen_in_bytes = 0;
@@ -2629,16 +2542,6 @@ bool SuperWord::output() {
return false;
}
Node* vmask = nullptr;
if (cl->is_rce_post_loop() && do_reserve_copy()) {
// Create a vector mask node for post loop, bail out if not created
vmask = create_post_loop_vmask();
if (vmask == nullptr) {
// create_post_loop_vmask checks many conditions, any of them could fail
return false; // and reverse to backup IG
}
}
for (int i = 0; i < _block.length(); i++) {
Node* n = _block.at(i);
Node_List* p = my_pack(n);
@@ -2650,10 +2553,6 @@ bool SuperWord::output() {
uint vlen = p->size();
uint vlen_in_bytes = 0;
Node* vn = nullptr;
if (cl->is_rce_post_loop()) {
// override vlen with the main loop's vector length
vlen = cl->slp_max_unroll();
}
NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
int opc = n->Opcode();
if (n->is_Load()) {
@@ -2675,13 +2574,7 @@ bool SuperWord::output() {
}
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
- if (cl->is_rce_post_loop()) {
-   assert(vmask != nullptr, "vector mask should be generated");
-   const TypeVect* vt = TypeVect::make(velt_basic_type(n), vlen);
-   vn = new LoadVectorMaskedNode(ctl, mem, adr, atyp, vt, vmask);
- } else {
-   vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
- }
+ vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
vlen_in_bytes = vn->as_LoadVector()->memory_size();
} else if (n->is_Store()) {
// Promote value to be stored to vector
@@ -2699,13 +2592,7 @@ bool SuperWord::output() {
Node* mem = first->in(MemNode::Memory);
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
- if (cl->is_rce_post_loop()) {
-   assert(vmask != nullptr, "vector mask should be generated");
-   const TypeVect* vt = TypeVect::make(velt_basic_type(n), vlen);
-   vn = new StoreVectorMaskedNode(ctl, mem, adr, val, atyp, vmask);
- } else {
-   vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
- }
+ vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
vlen_in_bytes = vn->as_StoreVector()->memory_size();
} else if (VectorNode::is_scalar_rotate(n)) {
Node* in1 = first->in(1);
@@ -2961,20 +2848,12 @@ bool SuperWord::output() {
if (TraceSuperWordLoopUnrollAnalysis) {
tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
}
- // For atomic unrolled loops which are vector mapped, instigate more unrolling
cl->set_notpassed_slp();
- if (cl->is_main_loop()) {
-   // if vector resources are limited, do not allow additional unrolling, also
-   // do not unroll more on pure vector loops which were not reduced so that we can
-   // program the post loop to single iteration execution.
-   if (Matcher::float_pressure_limit() > 8) {
-     C->set_major_progress();
-     cl->mark_do_unroll_only();
-   }
- }
- if (cl->is_rce_post_loop() && do_reserve_copy()) {
-   cl->mark_is_multiversioned();
- }
+ // if vector resources are limited, do not allow additional unrolling
+ if (Matcher::float_pressure_limit() > 8) {
+   C->set_major_progress();
+   cl->mark_do_unroll_only();
+ }
}
@@ -2988,107 +2867,6 @@ bool SuperWord::output() {
return true;
}
//-------------------------create_post_loop_vmask-------------------------
// Check the post loop vectorizability and create a vector mask if yes.
// Return null to bail out if post loop is not vectorizable.
Node* SuperWord::create_post_loop_vmask() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
assert(cl->is_rce_post_loop(), "Must be an rce post loop");
assert(!is_marked_reduction_loop(), "no vector reduction in post loop");
assert(abs(cl->stride_con()) == 1, "post loop stride can only be +/-1");
// Collect vector element types of all post loop packs. Also collect
// superword pointers of each memory access operation if the address
// expression is supported. (Note that vectorizable post loop should
// only have positive scale in counting-up loop and negative scale in
// counting-down loop.) Collected SWPointer(s) are also used for data
// dependence check next.
VectorElementSizeStats stats(_arena);
GrowableArray<SWPointer*> swptrs(_arena, _packset.length(), 0, nullptr);
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
assert(p->size() == 1, "all post loop packs should be singleton");
Node* n = p->at(0);
BasicType bt = velt_basic_type(n);
if (!is_java_primitive(bt)) {
return nullptr;
}
if (n->is_Mem()) {
SWPointer* mem_p = new (_arena) SWPointer(n->as_Mem(), this, nullptr, false);
// For each memory access, we check if the scale (in bytes) in its
// address expression is equal to the data size times loop stride.
// With this, only positive scales exist in counting-up loops and
// negative scales exist in counting-down loops.
if (mem_p->scale_in_bytes() != type2aelembytes(bt) * cl->stride_con()) {
return nullptr;
}
swptrs.append(mem_p);
}
stats.record_size(type2aelembytes(bt));
}
// Find the vector data type for generating vector masks. Currently we
// don't support post loops with mixed vector data sizes
int unique_size = stats.unique_size();
BasicType vmask_bt;
switch (unique_size) {
case 1: vmask_bt = T_BYTE; break;
case 2: vmask_bt = T_SHORT; break;
case 4: vmask_bt = T_INT; break;
case 8: vmask_bt = T_LONG; break;
default: return nullptr;
}
// Currently we can't remove this MaxVectorSize constraint. Without it,
// it's not guaranteed that the RCE'd post loop runs at most "vlen - 1"
// iterations, because the vector drain loop may not be cloned from the
// vectorized main loop. We should re-engineer PostLoopMultiversioning
// to fix this problem.
int vlen = cl->slp_max_unroll();
if (unique_size * vlen != MaxVectorSize) {
return nullptr;
}
// Bail out if target doesn't support mask generator or masked load/store
if (!Matcher::match_rule_supported_vector(Op_LoadVectorMasked, vlen, vmask_bt) ||
!Matcher::match_rule_supported_vector(Op_StoreVectorMasked, vlen, vmask_bt) ||
!Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, vmask_bt)) {
return nullptr;
}
// Bail out if potential data dependence exists between memory accesses
if (SWPointer::has_potential_dependence(swptrs)) {
return nullptr;
}
// Create vector mask with the post loop trip count. Note there's another
// vector drain loop which is cloned from main loop before super-unrolling
// so the scalar post loop runs at most vlen-1 trips. Hence, this version
// only runs at most 1 iteration after vector mask transformation.
Node* trip_cnt;
Node* new_incr;
if (cl->stride_con() > 0) {
trip_cnt = new SubINode(cl->limit(), cl->init_trip());
new_incr = new AddINode(cl->phi(), trip_cnt);
} else {
trip_cnt = new SubINode(cl->init_trip(), cl->limit());
new_incr = new SubINode(cl->phi(), trip_cnt);
}
_igvn.register_new_node_with_optimizer(trip_cnt);
_igvn.register_new_node_with_optimizer(new_incr);
_igvn.replace_node(cl->incr(), new_incr);
Node* length = new ConvI2LNode(trip_cnt);
_igvn.register_new_node_with_optimizer(length);
Node* vmask = VectorMaskGenNode::make(length, vmask_bt);
_igvn.register_new_node_with_optimizer(vmask);
// Remove exit test to transform 1-iteration loop to straight-line code.
// This results in redundant cmp+branch instructions being eliminated.
Node *cl_exit = cl->loopexit();
_igvn.replace_input_of(cl_exit, 1, _igvn.intcon(0));
return vmask;
}
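What this removed function synthesized at the IR level can be written by hand with the Vector API (real jdk.incubator.vector classes; run with --add-modules jdk.incubator.vector): a tail of fewer than one vector's worth of elements handled by a single masked load/store pair, matching the LoadVectorMasked/StoreVectorMasked + VectorMaskGen pattern above. A minimal sketch:

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorSpecies;

class MaskedTail {
    static final VectorSpecies<Integer> S = IntVector.SPECIES_PREFERRED;

    // Assumes limit - i < S.length(), as the transformed post loop guaranteed.
    static void copyTail(int[] src, int[] dst, int i, int limit) {
        VectorMask<Integer> m = S.indexInRange(i, limit); // VectorMaskGen analogue
        IntVector v = IntVector.fromArray(S, src, i, m);  // masked vector load
        v.intoArray(dst, i, m);                           // masked vector store
    }
}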
//------------------------------vector_opd---------------------------
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
@@ -3098,19 +2876,11 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
bool have_same_inputs = same_inputs(p, opd_idx);
- if (cl->is_rce_post_loop()) {
-   // override vlen with the main loop's vector length
-   assert(p->size() == 1, "Packs in post loop should have only one node");
-   vlen = cl->slp_max_unroll();
- }
// Insert index population operation to create a vector of increasing
// indices starting from the iv value. In some special unrolled loops
// (see JDK-8286125), we need scalar replications of the iv value if
- // all inputs are the same iv, so we do a same inputs check here. But
- // in post loops, "have_same_inputs" is always true because all packs
- // are singleton. That's why a pack size check is also required.
- if (opd == iv() && (!have_same_inputs || p->size() == 1)) {
+ // all inputs are the same iv, so we do a same inputs check here.
+ if (opd == iv() && !have_same_inputs) {
BasicType p0_bt = velt_basic_type(p0);
BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
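The index population operation mentioned here also has a direct Vector API counterpart (real API, illustrative use only): broadcast the iv value into all lanes, then add each lane's index to get <iv, iv+1, iv+2, ...>.

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorSpecies;

class PopulateIndex {
    static final VectorSpecies<Integer> S = IntVector.SPECIES_PREFERRED;

    static IntVector fromIv(int iv) {
        return IntVector.broadcast(S, iv).addIndex(1); // lane value: iv + laneIndex
    }
}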
@@ -4026,7 +3796,6 @@ void SuperWord::init() {
_packset.clear();
_disjoint_ptrs.clear();
_block.clear();
_post_block.clear();
_data_entry.clear();
_mem_slice_head.clear();
_mem_slice_tail.clear();
@@ -4447,34 +4216,6 @@ void SWPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
_invar = register_if_new(add);
}
//-----------------has_potential_dependence-----------------
// Check potential data dependence among all memory accesses.
// We require every two accesses (with at least one store) of
// the same element type to have the same address expression.
bool SWPointer::has_potential_dependence(GrowableArray<SWPointer*> swptrs) {
for (int i1 = 0; i1 < swptrs.length(); i1++) {
SWPointer* p1 = swptrs.at(i1);
MemNode* n1 = p1->mem();
BasicType bt1 = n1->memory_type();
// Iterate over remaining SWPointers
for (int i2 = i1 + 1; i2 < swptrs.length(); i2++) {
SWPointer* p2 = swptrs.at(i2);
MemNode* n2 = p2->mem();
BasicType bt2 = n2->memory_type();
// Data dependence exists between load-store, store-load
// or store-store with the same element type or subword
// size (subword load/store may have inaccurate type)
if ((n1->is_Store() || n2->is_Store()) &&
same_type_or_subword_size(bt1, bt2) && !p1->equal(*p2)) {
return true;
}
}
}
return false;
}
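A scalar restatement of the removed check (hypothetical Access type, not JDK code): among all memory accesses, any pair involving at least one store and the same element size must share one address expression, otherwise a dependence is conservatively assumed and masked vectorization is abandoned.

import java.util.List;

class DependenceCheck {
    record Access(boolean isStore, int elemSize, String addrExpr) {}

    static boolean hasPotentialDependence(List<Access> accs) {
        for (int i1 = 0; i1 < accs.size(); i1++) {
            for (int i2 = i1 + 1; i2 < accs.size(); i2++) {
                Access p1 = accs.get(i1), p2 = accs.get(i2);
                if ((p1.isStore() || p2.isStore())
                        && p1.elemSize() == p2.elemSize()
                        && !p1.addrExpr().equals(p2.addrExpr())) {
                    return true;   // possible load-store, store-load or store-store overlap
                }
            }
        }
        return false;
    }
}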
//----------------------------print------------------------
void SWPointer::print() {
#ifndef PRODUCT

@@ -285,7 +285,6 @@ class SuperWord : public ResourceObj {
GrowableArray<int> _bb_idx; // Map from Node _idx to index within block
GrowableArray<Node*> _block; // Nodes in current block
GrowableArray<Node*> _post_block; // Nodes in post loop block
GrowableArray<Node*> _data_entry; // Nodes with all inputs from outside
GrowableArray<Node*> _mem_slice_head; // Memory slice head nodes
GrowableArray<Node*> _mem_slice_tail; // Memory slice tail nodes
@@ -579,8 +578,6 @@ private:
// Convert packs into vector node operations
bool output();
// Create vector mask for post loop vectorization
Node* create_post_loop_vmask();
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
Node* vector_opd(Node_List* p, int opd_idx);
// Can code be generated for pack p?
@@ -725,8 +722,6 @@ class SWPointer : public ArenaObj {
static bool equal(int cmp) { return cmp == Equal; }
static bool comparable(int cmp) { return cmp < NotComparable; }
static bool has_potential_dependence(GrowableArray<SWPointer*> swptrs);
void print();
#ifndef PRODUCT

@@ -1,41 +0,0 @@
/*
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test TestRangeCheckEliminationDisabled
* @bug 8154763
* @summary Tests PostLoopMultiversioning with RangeCheckElimination disabled.
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions
* -XX:+UnlockExperimentalVMOptions -XX:+PostLoopMultiversioning -XX:-RangeCheckElimination
* compiler.rangechecks.TestRangeCheckEliminationDisabled
*/
package compiler.rangechecks;
public class TestRangeCheckEliminationDisabled {
public static void main(String[] args) {
System.out.println("Passed");
}
}

@@ -61,9 +61,6 @@ public class VectorizationTestRunner {
// each test method returning a primitive value or an array of primitive type.
// And each test method should not throw any exceptions.
Class klass = getClass();
// Add extra VM options to verify experimental auto-vectorization
WB.setBooleanVMFlag("UnlockExperimentalVMOptions", true);
WB.setBooleanVMFlag("PostLoopMultiversioning", true);
for (Method method : klass.getDeclaredMethods()) {
try {
if (method.isAnnotationPresent(Test.class)) {