8324890: C2 SuperWord: refactor out VLoop, make unrolling_analysis static, remove init/reset mechanism

Reviewed-by: kvn, roland
2024-02-10 14:19:01 +00:00 · 2024-02-10 14:19:01 +00:00 · 232d136885
commit 232d136885
parent 71d2dbd0b6
9 changed files with 483 additions and 365 deletions
--- a/src/hotspot/share/opto/loopTransform.cpp
+++ b/src/hotspot/share/opto/loopTransform.cpp
@ -1104,12 +1104,9 @@ void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLo
    if (!cl->was_slp_analyzed()) {
      Compile::TracePhase tp("autoVectorize", &Phase::timers[Phase::_t_autoVectorize]);

-      SuperWord sw(phase);
-      sw.transform_loop(this, false);
-
-      // If the loop is slp canonical analyze it
-      if (sw.early_return() == false) {
-        sw.unrolling_analysis(_local_loop_unroll_factor);
+      VLoop vloop(this, true);
+      if (vloop.check_preconditions()) {
+        SuperWord::unrolling_analysis(vloop, _local_loop_unroll_factor);
      }
    }

--- a/src/hotspot/share/opto/loopnode.cpp
+++ b/src/hotspot/share/opto/loopnode.cpp
@ -45,7 +45,7 @@
 #include "opto/predicates.hpp"
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
-#include "opto/superword.hpp"
+#include "opto/vectorization.hpp"
 #include "runtime/sharedRuntime.hpp"
 #include "utilities/checkedCast.hpp"
 #include "utilities/powerOfTwo.hpp"
@ -4863,30 +4863,30 @@ void PhaseIdealLoop::build_and_optimize() {
     C->set_major_progress();
  }

-  // Convert scalar to superword operations at the end of all loop opts.
+  // Auto-vectorize main-loop
  if (C->do_superword() && C->has_loops() && !C->major_progress()) {
    Compile::TracePhase tp("autoVectorize", &timers[_t_autoVectorize]);
-    // SuperWord transform
-    SuperWord sw(this);
+
+    // Shared data structures for all AutoVectorizations, to reduce allocations
+    // of large arrays.
+    VSharedData vshared;
    for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
      IdealLoopTree* lpt = iter.current();
-      if (lpt->is_counted()) {
-        CountedLoopNode *cl = lpt->_head->as_CountedLoop();
-        if (cl->is_main_loop()) {
-          if (!sw.transform_loop(lpt, true)) {
-            // Instigate more unrolling for optimization when vectorization fails.
-            if (cl->has_passed_slp()) {
-              C->set_major_progress();
-              cl->set_notpassed_slp();
-              cl->mark_do_unroll_only();
-            }
-          }
+      AutoVectorizeStatus status = auto_vectorize(lpt, vshared);
+
+      if (status == AutoVectorizeStatus::TriedAndFailed) {
+        // We tried vectorization, but failed. From now on only unroll the loop.
+        CountedLoopNode* cl = lpt->_head->as_CountedLoop();
+        if (cl->has_passed_slp()) {
+          C->set_major_progress();
+          cl->set_notpassed_slp();
+          cl->mark_do_unroll_only();
        }
      }
    }
  }

-  // Move UnorderedReduction out of counted loop. Can be introduced by SuperWord.
+  // Move UnorderedReduction out of counted loop. Can be introduced by AutoVectorization.
  if (C->has_loops() && !C->major_progress()) {
    for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
      IdealLoopTree* lpt = iter.current();
@ -5963,30 +5963,6 @@ CountedLoopEndNode* CountedLoopNode::find_pre_loop_end() {
  return pre_end;
 }

-  CountedLoopNode* CountedLoopNode::pre_loop_head() const {
-    assert(is_main_loop(), "Only main loop has pre loop");
-    assert(_pre_loop_end != nullptr && _pre_loop_end->loopnode() != nullptr,
-           "should find head from pre loop end");
-    return _pre_loop_end->loopnode();
-  }
-
-  CountedLoopEndNode* CountedLoopNode::pre_loop_end() {
-#ifdef ASSERT
-    assert(is_main_loop(), "Only main loop has pre loop");
-    assert(_pre_loop_end != nullptr, "should be set when fetched");
-    Node* found_pre_end = find_pre_loop_end();
-    assert(_pre_loop_end == found_pre_end && _pre_loop_end == pre_loop_head()->loopexit(),
-           "should find the pre loop end and must be the same result");
-#endif
-    return _pre_loop_end;
-  }
-
-  void CountedLoopNode::set_pre_loop_end(CountedLoopEndNode* pre_loop_end) {
-    assert(is_main_loop(), "Only main loop has pre loop");
-    assert(pre_loop_end, "must be valid");
-    _pre_loop_end = pre_loop_end;
-  }
-
 //------------------------------get_late_ctrl----------------------------------
 // Compute latest legal control.
 Node *PhaseIdealLoop::get_late_ctrl( Node *n, Node *early ) {
--- a/src/hotspot/share/opto/loopnode.hpp
+++ b/src/hotspot/share/opto/loopnode.hpp
@ -43,6 +43,7 @@ class PredicateBlock;
 class PathFrequency;
 class PhaseIdealLoop;
 class VectorSet;
+class VSharedData;
 class Invariance;
 struct small_cache;

@ -231,14 +232,11 @@ class CountedLoopNode : public BaseCountedLoopNode {
  // vector mapped unroll factor here
  int _slp_maximum_unroll_factor;

-  // Cached CountedLoopEndNode of pre loop for main loops
-  CountedLoopEndNode* _pre_loop_end;
-
 public:
  CountedLoopNode(Node *entry, Node *backedge)
    : BaseCountedLoopNode(entry, backedge), _main_idx(0), _trip_count(max_juint),
      _unrolled_count_log2(0), _node_count_before_unroll(0),
-      _slp_maximum_unroll_factor(0), _pre_loop_end(nullptr) {
+      _slp_maximum_unroll_factor(0) {
    init_class_id(Class_CountedLoop);
    // Initialize _trip_count to the largest possible value.
    // Will be reset (lower) if the loop's trip count is known.
@ -330,9 +328,6 @@ public:

  Node* is_canonical_loop_entry();
  CountedLoopEndNode* find_pre_loop_end();
-  CountedLoopNode* pre_loop_head() const;
-  CountedLoopEndNode* pre_loop_end();
-  void set_pre_loop_end(CountedLoopEndNode* pre_loop_end);

 #ifndef PRODUCT
  virtual void dump_spec(outputStream *st) const;
@ -1437,6 +1432,14 @@ public:
  bool partial_peel( IdealLoopTree *loop, Node_List &old_new );
  bool duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old_new);

+  // AutoVectorize the loop: replace scalar ops with vector ops. The outcome is reported as an AutoVectorizeStatus.
+  enum AutoVectorizeStatus {
+    Impossible,      // This loop has the wrong shape to even try vectorization: not a counted loop, or not a main-loop.
+    Success,         // We just successfully vectorized the loop (SuperWord::transform_loop returned true).
+    TriedAndFailed,  // We tried to vectorize, but the VLoop preconditions or SuperWord::transform_loop failed.
+  };
+  AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared); // vshared: data reused across loops, cleared by auto_vectorize before use.
+
  // Move UnorderedReduction out of loop if possible
  void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);

--- a/src/hotspot/share/opto/loopopts.cpp
+++ b/src/hotspot/share/opto/loopopts.cpp
@ -41,6 +41,7 @@
 #include "opto/rootnode.hpp"
 #include "opto/subnode.hpp"
 #include "opto/subtypenode.hpp"
+#include "opto/superword.hpp"
 #include "opto/vectornode.hpp"
 #include "utilities/macros.hpp"

@ -4209,6 +4210,36 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old
  return true;
 }

+// AutoVectorize the loop: replace scalar ops with vector ops. Returns Impossible, TriedAndFailed or Success (see the returns below).
+PhaseIdealLoop::AutoVectorizeStatus
+PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
+  // Counted loop only: anything else is never attempted.
+  if (!lpt->is_counted()) {
+    return AutoVectorizeStatus::Impossible;
+  }
+
+  // Main-loop only: other counted loops are never attempted.
+  CountedLoopNode* cl = lpt->_head->as_CountedLoop();
+  if (!cl->is_main_loop()) {
+    return AutoVectorizeStatus::Impossible;
+  }
+
+  VLoop vloop(lpt, false); // NOTE(review): the meaning of 'false' is defined by VLoop's constructor (not visible here); policy_unroll_slp_analysis passes 'true'.
+  if (!vloop.check_preconditions()) {
+    return AutoVectorizeStatus::TriedAndFailed; // Counts as an attempt, so build_and_optimize may mark the loop unroll-only.
+  }
+
+  // Ensure the shared data is cleared before each use: the caller reuses one VSharedData for all loops.
+  vshared.clear();
+
+  SuperWord sw(vloop, vshared);
+  if (!sw.transform_loop()) {
+    return AutoVectorizeStatus::TriedAndFailed;
+  }
+
+  return AutoVectorizeStatus::Success;
+}
+
 // Having ReductionNodes in the loop is expensive. They need to recursively
 // fold together the vector values, for every vectorized loop iteration. If
 // we encounter the following pattern, we can vector accumulate the values
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@ -38,134 +38,40 @@
 #include "opto/movenode.hpp"
 #include "utilities/powerOfTwo.hpp"

-//
-//                  S U P E R W O R D   T R A N S F O R M
-//=============================================================================
-
-//------------------------------SuperWord---------------------------
-SuperWord::SuperWord(PhaseIdealLoop* phase) :
-  _phase(phase),
-  _arena(phase->C->comp_arena()),
-  _igvn(phase->_igvn),
+SuperWord::SuperWord(const VLoop &vloop, VSharedData &vshared) :
+  _vloop(vloop),
+  _arena(mtCompiler),
  _packset(arena(), 8,  0, nullptr),                        // packs for the current block
-  _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
-  _block(arena(), 8,  0, nullptr),                          // nodes in current block
+  _bb_idx(vshared.node_idx_to_loop_body_idx()),             // node idx to index in bb
+  _block(arena(), vloop.estimated_body_length(), 0, nullptr), // nodes in current block
  _mem_slice_head(arena(), 8,  0, nullptr),                 // memory slice heads
  _mem_slice_tail(arena(), 8,  0, nullptr),                 // memory slice tails
-  _node_info(arena(), 8,  0, SWNodeInfo::initial),          // info needed per node
-  _clone_map(phase->C->clone_map()),                        // map of nodes created in cloning
+  _node_info(arena(), vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
+  _clone_map(phase()->C->clone_map()),                      // map of nodes created in cloning
  _align_to_ref(nullptr),                                   // memory reference to align vectors to
-  _dg(_arena),                                              // dependence graph
-  _nlist(arena(), 8, 0, nullptr),                           // scratch list of nodes
-  _lpt(nullptr),                                            // loop tree node
-  _lp(nullptr),                                             // CountedLoopNode
+  _dg(arena()),                                             // dependence graph
+  _nlist(arena(), vloop.estimated_body_length(), 0, nullptr), // scratch list of nodes
  _loop_reductions(arena()),                                // reduction nodes in the current loop
-  _bb(nullptr),                                             // basic block
-  _iv(nullptr),                                             // induction var
  _race_possible(false),                                    // cases where SDMU is true
-  _early_return(true),                                      // analysis evaluations routine
-  _do_vector_loop(phase->C->do_vector_loop()),              // whether to do vectorization/simd style
+  _do_vector_loop(phase()->C->do_vector_loop()),            // whether to do vectorization/simd style
  _num_work_vecs(0),                                        // amount of vector work we have
  _num_reductions(0)                                        // amount of reduction work we have
 {
 }

-//------------------------------transform_loop---------------------------
-bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
-  assert(_phase->C->do_superword(), "SuperWord option should be enabled");
-  // SuperWord only works with power of two vector sizes.
-  int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
-  if (vector_width < 2 || !is_power_of_2(vector_width)) {
-    return false;
-  }
+void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor) {
+  IdealLoopTree* lpt    = vloop.lpt();
+  CountedLoopNode* cl   = vloop.cl();
+  Node* cl_exit         = vloop.cl_exit();
+  PhaseIdealLoop* phase = vloop.phase();

-  assert(lpt->_head->is_CountedLoop(), "must be");
-  CountedLoopNode *cl = lpt->_head->as_CountedLoop();
-
-  if (!cl->is_valid_counted_loop(T_INT)) {
-    return false; // skip malformed counted loop
-  }
-
-  // Initialize simple data used by reduction marking early.
-  set_lpt(lpt);
-  set_lp(cl);
-  // For now, define one block which is the entire loop body.
-  set_bb(cl);
-
-  if (SuperWordReductions) {
-    mark_reductions();
-  }
-
-  // skip any loop that has not been assigned max unroll by analysis
-  if (do_optimization) {
-    if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) {
-      return false;
-    }
-  }
-
-  // Check for no control flow in body (other than exit)
-  Node *cl_exit = cl->loopexit();
-  if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
-    #ifndef PRODUCT
-      if (is_trace_superword_precondition()) {
-        tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
-        tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
-        tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
-        tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
-        lpt->dump_head();
-      }
-    #endif
-    return false;
-  }
-
-  // Make sure the are no extra control users of the loop backedge
-  if (cl->back_control()->outcnt() != 1) {
-    return false;
-  }
-
-  // Skip any loops already optimized by slp
-  if (cl->is_vectorized_loop()) {
-    return false;
-  }
-
-  if (cl->is_unroll_only()) {
-    return false;
-  }
-
-  if (cl->is_main_loop()) {
-    // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
-    CountedLoopEndNode* pre_end = cl->find_pre_loop_end();
-    if (pre_end == nullptr) {
-      return false;
-    }
-    Node* pre_opaq1 = pre_end->limit();
-    if (pre_opaq1->Opcode() != Op_Opaque1) {
-      return false;
-    }
-    cl->set_pre_loop_end(pre_end);
-  }
-
-  init(); // initialize data structures
-
-  bool success = true;
-  if (do_optimization) {
-    assert(_packset.length() == 0, "packset must be empty");
-    success = SLP_extract();
-  }
-  return success;
-}
-
-//------------------------------early unrolling analysis------------------------------
-void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
  bool is_slp = true;
-  size_t ignored_size = lpt()->_body.size();
+  size_t ignored_size = lpt->_body.size();
  int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
  Node_Stack nstack((int)ignored_size);
-  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
-  Node *cl_exit = cl->loopexit_or_null();

  // First clear the entries
-  for (uint i = 0; i < lpt()->_body.size(); i++) {
+  for (uint i = 0; i < lpt->_body.size(); i++) {
    ignored_loop_nodes[i] = -1;
  }

@ -173,8 +79,8 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {

  // Process the loop, some/all of the stack entries will not be in order, ergo
  // need to preprocess the ignored initial state before we process the loop
-  for (uint i = 0; i < lpt()->_body.size(); i++) {
-    Node* n = lpt()->_body.at(i);
+  for (uint i = 0; i < lpt->_body.size(); i++) {
+    Node* n = lpt->_body.at(i);
    if (n == cl->incr() ||
      n->is_AddP() ||
      n->is_Cmp() ||
@ -189,7 +95,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
    if (n->is_If()) {
      IfNode *iff = n->as_If();
      if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
-        if (lpt()->is_loop_exit(iff)) {
+        if (lpt->is_loop_exit(iff)) {
          ignored_loop_nodes[i] = n->_idx;
          continue;
        }
@ -233,10 +139,10 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
    if (n->is_Mem()) {
      MemNode* current = n->as_Mem();
      Node* adr = n->in(MemNode::Address);
-      Node* n_ctrl = _phase->get_ctrl(adr);
+      Node* n_ctrl = phase->get_ctrl(adr);

      // save a queue of post process nodes
-      if (n_ctrl != nullptr && lpt()->is_member(_phase->get_loop(n_ctrl))) {
+      if (n_ctrl != nullptr && lpt->is_member(phase->get_loop(n_ctrl))) {
        // Process the memory expression
        int stack_idx = 0;
        bool have_side_effects = true;
@ -244,15 +150,15 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
          nstack.push(adr, stack_idx++);
        } else {
          // Mark the components of the memory operation in nstack
-          VPointer p1(current, phase(), lpt(), &nstack, true);
+          VPointer p1(current, vloop, &nstack);
          have_side_effects = p1.node_stack()->is_nonempty();
        }

        // Process the pointer stack
        while (have_side_effects) {
          Node* pointer_node = nstack.node();
-          for (uint j = 0; j < lpt()->_body.size(); j++) {
-            Node* cur_node = lpt()->_body.at(j);
+          for (uint j = 0; j < lpt->_body.size(); j++) {
+            Node* cur_node = lpt->_body.at(j);
            if (cur_node == pointer_node) {
              ignored_loop_nodes[j] = cur_node->_idx;
              break;
@ -269,11 +175,11 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
    // Now we try to find the maximum supported consistent vector which the machine
    // description can use
    bool flag_small_bt = false;
-    for (uint i = 0; i < lpt()->_body.size(); i++) {
+    for (uint i = 0; i < lpt->_body.size(); i++) {
      if (ignored_loop_nodes[i] != -1) continue;

      BasicType bt;
-      Node* n = lpt()->_body.at(i);
+      Node* n = lpt->_body.at(i);
      if (n->is_Mem()) {
        bt = n->as_Mem()->memory_type();
      } else {
@ -313,11 +219,11 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
            for (uint j = start; j < end; j++) {
              Node* in = n->in(j);
              // Don't propagate through a memory
-              if (!in->is_Mem() && in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
+              if (!in->is_Mem() && vloop.in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
                bool same_type = true;
                for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
                  Node *use = in->fast_out(k);
-                  if (!in_bb(use) && use->bottom_type()->basic_type() != bt) {
+                  if (!vloop.in_bb(use) && use->bottom_type()->basic_type() != bt) {
                    same_type = false;
                    break;
                  }
@ -403,8 +309,8 @@ void SuperWord::mark_reductions() {

  // Iterate through all phi nodes associated to the loop and search for
  // reduction cycles in the basic block.
-  for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
-    const Node* phi = lp()->fast_out(i);
+  for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) {
+    const Node* phi = cl()->fast_out(i);
    if (!phi->is_Phi()) {
      continue;
    }
@ -482,6 +388,44 @@ void SuperWord::mark_reductions() {
  }
 }

+bool SuperWord::transform_loop() { // Returns true only if SLP_extract() succeeded; false if the unroll-analysis check below or SLP_extract() rejects the loop.
+  assert(phase()->C->do_superword(), "SuperWord option should be enabled");
+  assert(cl()->is_main_loop(), "SLP should only work on main loops");
+#ifndef PRODUCT
+  if (is_trace_superword_any()) {
+    tty->print_cr("\nSuperWord::transform_loop:");
+    lpt()->dump_head();
+    cl()->dump();
+  }
+#endif
+
+  // Skip any loop that has not been assigned max unroll by analysis (only checked when SuperWordLoopUnrollAnalysis is enabled).
+  if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) {
+#ifndef PRODUCT
+    if (is_trace_superword_any()) {
+      tty->print_cr("\nSuperWord::transform_loop failed: slp max unroll analysis was not already done");
+    }
+#endif
+    return false;
+  }
+
+  if (!SLP_extract()) { // SLP_extract() does the actual pack extraction; a false result means nothing was vectorized.
+#ifndef PRODUCT
+    if (is_trace_superword_any()) {
+      tty->print_cr("\nSuperWord::transform_loop failed: SuperWord::SLP_extract did not vectorize");
+    }
+#endif
+    return false;
+  }
+
+#ifndef PRODUCT
+  if (is_trace_superword_any()) {
+    tty->print_cr("\nSuperWord::transform_loop: success");
+  }
+#endif
+  return true;
+}
+
 //------------------------------SLP_extract---------------------------
 // Extract the superword level parallelism
 //
@ -517,8 +461,11 @@ void SuperWord::mark_reductions() {
 //    extraction of scalar values from vectors.
 //
 bool SuperWord::SLP_extract() {
-  CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
-  assert(cl->is_main_loop(), "SLP should only work on main loops");
+  assert(cl()->is_main_loop(), "SLP should only work on main loops");
+
+  if (SuperWordReductions) {
+    mark_reductions();
+  }

  // Find memory slices
  find_memory_slices();
@ -629,13 +576,13 @@ void SuperWord::find_adjacent_refs() {
      set_align_to_ref(align_to_mem_ref);
    }

-    VPointer align_to_ref_p(mem_ref, phase(), lpt(), nullptr, false);
+    VPointer align_to_ref_p(mem_ref, vloop());
    // Set alignment relative to "align_to_ref" for all related memory operations.
    for (int i = memops.size() - 1; i >= 0; i--) {
      MemNode* s = memops.at(i)->as_Mem();
      if (isomorphic(s, mem_ref) &&
           (!_do_vector_loop || same_origin_idx(s, mem_ref))) {
-        VPointer p2(s, phase(), lpt(), nullptr, false);
+        VPointer p2(s, vloop());
        if (p2.comparable(align_to_ref_p)) {
          int align = memory_alignment(s, iv_adjustment);
          set_alignment(s, align);
@ -694,11 +641,11 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
  // Count number of comparable memory ops
  for (uint i = 0; i < memops.size(); i++) {
    MemNode* s1 = memops.at(i)->as_Mem();
-    VPointer p1(s1, phase(), lpt(), nullptr, false);
+    VPointer p1(s1, vloop());
    for (uint j = i+1; j < memops.size(); j++) {
      MemNode* s2 = memops.at(j)->as_Mem();
      if (isomorphic(s1, s2)) {
-        VPointer p2(s2, phase(), lpt(), nullptr, false);
+        VPointer p2(s2, vloop());
        if (p1.comparable(p2)) {
          (*cmp_ct.adr_at(i))++;
          (*cmp_ct.adr_at(j))++;
@ -719,7 +666,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
    if (s->is_Store()) {
      int vw = vector_width_in_bytes(s);
      assert(vw > 1, "sanity");
-      VPointer p(s, phase(), lpt(), nullptr, false);
+      VPointer p(s, vloop());
      if ( cmp_ct.at(j) >  max_ct ||
          (cmp_ct.at(j) == max_ct &&
            ( vw >  max_vw ||
@ -742,7 +689,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
      if (s->is_Load()) {
        int vw = vector_width_in_bytes(s);
        assert(vw > 1, "sanity");
-        VPointer p(s, phase(), lpt(), nullptr, false);
+        VPointer p(s, vloop());
        if ( cmp_ct.at(j) >  max_ct ||
            (cmp_ct.at(j) == max_ct &&
              ( vw >  max_vw ||
@ -815,7 +762,7 @@ int SuperWord::get_vw_bytes_special(MemNode* s) {
 //---------------------------get_iv_adjustment---------------------------
 // Calculate loop's iv adjustment for this memory ops.
 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
-  VPointer align_to_ref_p(mem_ref, phase(), lpt(), nullptr, false);
+  VPointer align_to_ref_p(mem_ref, vloop());
  int offset = align_to_ref_p.offset_in_bytes();
  int scale  = align_to_ref_p.scale_in_bytes();
  int elt_size = align_to_ref_p.memory_size();
@ -884,13 +831,13 @@ void SuperWord::dependence_graph() {
      if (_dg.dep(s1)->in_cnt() == 0) {
        _dg.make_edge(slice, s1);
      }
-      VPointer p1(s1->as_Mem(), phase(), lpt(), nullptr, false);
+      VPointer p1(s1->as_Mem(), vloop());
      bool sink_dependent = true;
      for (int k = j - 1; k >= 0; k--) {
        Node* s2 = _nlist.at(k);
        if (s1->is_Load() && s2->is_Load())
          continue;
-        VPointer p2(s2->as_Mem(), phase(), lpt(), nullptr, false);
+        VPointer p2(s2->as_Mem(), vloop());

        int cmp = p1.cmp(p2);
        if (!VPointer::not_equal(cmp)) {
@ -923,8 +870,8 @@ void SuperWord::find_memory_slices() {
  assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");

  // Iterate over all memory phis
-  for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
-    PhiNode* phi = lp()->fast_out(i)->isa_Phi();
+  for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) {
+    PhiNode* phi = cl()->fast_out(i)->isa_Phi();
    if (phi != nullptr && in_bb(phi) && phi->is_memory_phi()) {
      Node* phi_tail = phi->in(LoopNode::LoopBackControl);
      if (phi_tail != phi->in(LoopNode::EntryControl)) {
@ -1060,8 +1007,8 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {

  // Adjacent memory references must have the same base, be comparable
  // and have the correct distance between them.
-  VPointer p1(s1->as_Mem(), phase(), lpt(), nullptr, false);
-  VPointer p2(s2->as_Mem(), phase(), lpt(), nullptr, false);
+  VPointer p1(s1->as_Mem(), vloop());
+  VPointer p2(s2->as_Mem(), vloop());
  if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
  int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
  return diff == data_size(s1);
@ -1352,7 +1299,7 @@ bool SuperWord::follow_def_uses(Node_List* p) {
        // Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
        continue;
      }
-      if (t2->Opcode() == Op_AddI && t2 == _lp->as_CountedLoop()->incr()) continue; // don't mess with the iv
+      if (t2->Opcode() == Op_AddI && t2 == cl()->incr()) continue; // don't mess with the iv
      if (!opnd_positions_match(s1, t1, s2, t2))
        continue;
      int adjusted_align = alignment(s1);
@ -1651,8 +1598,8 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(Node_List* pack) {
  assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs");

  const MemNode* mem_ref = pack->at(0)->as_Mem();
-  VPointer mem_ref_p(mem_ref, phase(), lpt(), nullptr, false);
-  const CountedLoopEndNode* pre_end = lp()->pre_loop_end();
+  VPointer mem_ref_p(mem_ref, vloop());
+  const CountedLoopEndNode* pre_end = vloop().pre_loop_end();
  assert(pre_end->stride_is_con(), "pre loop stride is constant");

  AlignmentSolver solver(pack->at(0)->as_Mem(),
@ -1971,8 +1918,8 @@ bool SuperWord::profitable(Node_List* p) {
            // Reductions should only have a Phi use at the loop head or a non-phi use
            // outside of the loop if it is the last element of the pack (e.g. SafePoint).
            if (is_marked_reduction(def) &&
-                ((use->is_Phi() && use->in(0) == _lpt->_head) ||
-                 (!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) {
+                ((use->is_Phi() && use->in(0) == lpt()->_head) ||
+                 (!lpt()->is_member(phase()->get_loop(phase()->ctrl_or_self(use))) && i == p->size()-1))) {
              continue;
            }
            if (!is_vector_use(use, k)) {
@ -2327,7 +2274,7 @@ void SuperWord::schedule() {
 #endif

  CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
-  _phase->C->print_method(PHASE_SUPERWORD1_BEFORE_SCHEDULE, 4, cl);
+  phase()->C->print_method(PHASE_SUPERWORD1_BEFORE_SCHEDULE, 4, cl);

  // (4) Use the memops_schedule to re-order the memops in all slices.
  schedule_reorder_memops(memops_schedule);
@ -2337,7 +2284,7 @@ void SuperWord::schedule() {
 // Reorder the memory graph for all slices in parallel. We walk over the schedule once,
 // and track the current memory state of each slice.
 void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
-  int max_slices = _phase->C->num_alias_types();
+  int max_slices = phase()->C->num_alias_types();
  // When iterating over the memops_schedule, we keep track of the current memory state,
  // which is the Phi or a store in the loop.
  GrowableArray<Node*> current_state_in_slice(max_slices, max_slices, nullptr);
@ -2349,7 +2296,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
  for (int i = 0; i < _mem_slice_head.length(); i++) {
    Node* phi  = _mem_slice_head.at(i);
    assert(phi->is_Phi(), "must be phi");
-    int alias_idx = _phase->C->get_alias_index(phi->adr_type());
+    int alias_idx = phase()->C->get_alias_index(phi->adr_type());
    current_state_in_slice.at_put(alias_idx, phi);

    // If we have a memory phi, we have a last store in the loop, find it over backedge.
@ -2362,7 +2309,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
  for (uint i = 0; i < memops_schedule.size(); i++) {
    MemNode* n = memops_schedule.at(i)->as_Mem();
    assert(n->is_Load() || n->is_Store(), "only loads or stores");
-    int alias_idx = _phase->C->get_alias_index(n->adr_type());
+    int alias_idx = phase()->C->get_alias_index(n->adr_type());
    Node* current_state = current_state_in_slice.at(alias_idx);
    if (current_state == nullptr) {
      // If there are only loads in a slice, we never update the memory
@ -2371,7 +2318,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
      assert(n->is_Load() && !in_bb(n->in(MemNode::Memory)),
             "only loads can have memory state from outside loop");
    } else {
-      _igvn.replace_input_of(n, MemNode::Memory, current_state);
+      igvn().replace_input_of(n, MemNode::Memory, current_state);
      if (n->is_Store()) {
        current_state_in_slice.at_put(alias_idx, n);
      }
@ -2384,12 +2331,12 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
  Node_List uses_after_loop;
  for (int i = 0; i < _mem_slice_head.length(); i++) {
    Node* phi  = _mem_slice_head.at(i);
-    int alias_idx = _phase->C->get_alias_index(phi->adr_type());
+    int alias_idx = phase()->C->get_alias_index(phi->adr_type());
    Node* current_state = current_state_in_slice.at(alias_idx);
    assert(current_state != nullptr, "slice is mapped");
    assert(current_state != phi, "did some work in between");
    assert(current_state->is_Store(), "sanity");
-    _igvn.replace_input_of(phi, 2, current_state);
+    igvn().replace_input_of(phi, 2, current_state);

    // Replace uses of old last store with current_state (new last store)
    // Do it in two loops: first find all the uses, and change the graph
@ -2408,7 +2355,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
      for (uint j = 0; j < use->req(); j++) {
        Node* def = use->in(j);
        if (def == last_store) {
-          _igvn.replace_input_of(use, j, current_state);
+          igvn().replace_input_of(use, j, current_state);
        }
      }
    }
@ -2425,7 +2372,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
 bool SuperWord::output() {
  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
  assert(cl->is_main_loop(), "SLP should only work on main loops");
-  Compile* C = _phase->C;
+  Compile* C = phase()->C;
  if (_packset.length() == 0) {
    return false;
  }
@ -2436,7 +2383,7 @@ bool SuperWord::output() {
    lpt()->dump_head();
  }
 #endif
-  _phase->C->print_method(PHASE_SUPERWORD2_BEFORE_OUTPUT, 4, cl);
+  phase()->C->print_method(PHASE_SUPERWORD2_BEFORE_OUTPUT, 4, cl);

  adjust_pre_loop_limit_to_align_main_loop_vectors();

@ -2464,7 +2411,7 @@ bool SuperWord::output() {
        // Walk up the memory chain, and ignore any StoreVector that provably
        // does not have any memory dependency.
        while (mem->is_StoreVector()) {
-          VPointer p_store(mem->as_Mem(), phase(), lpt(), nullptr, false);
+          VPointer p_store(mem->as_Mem(), vloop());
          if (p_store.overlap_possible_with_any_in(p)) {
            break;
          } else {
@ -2598,13 +2545,13 @@ bool SuperWord::output() {
        }

        // VectorMaskCmp
-        ConINode* bol_test_node  = _igvn.intcon((int)bol_test);
+        ConINode* bol_test_node  = igvn().intcon((int)bol_test);
        BasicType bt = velt_basic_type(cmp);
        const TypeVect* vt = TypeVect::make(bt, vlen);
        VectorNode* mask = new VectorMaskCmpNode(bol_test, cmp_in1, cmp_in2, bol_test_node, vt);
-        _igvn.register_new_node_with_optimizer(mask);
-        _phase->set_ctrl(mask, _phase->get_ctrl(p->at(0)));
-        _igvn._worklist.push(mask);
+        igvn().register_new_node_with_optimizer(mask);
+        phase()->set_ctrl(mask, phase()->get_ctrl(p->at(0)));
+        igvn()._worklist.push(mask);

        // VectorBlend
        vn = new VectorBlendNode(blend_in1, blend_in2, mask);
@ -2677,8 +2624,8 @@ bool SuperWord::output() {
        assert(n->req() == 2, "only one input expected");
        Node* in = vector_opd(p, 1);
        Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
-        _igvn.register_new_node_with_optimizer(longval);
-        _phase->set_ctrl(longval, _phase->get_ctrl(first));
+        igvn().register_new_node_with_optimizer(longval);
+        phase()->set_ctrl(longval, phase()->get_ctrl(first));
        vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
      } else if (VectorNode::is_convert_opcode(opc)) {
@ -2719,13 +2666,13 @@ bool SuperWord::output() {
 #endif

      _block.at_put(i, vn);
-      _igvn.register_new_node_with_optimizer(vn);
-      _phase->set_ctrl(vn, _phase->get_ctrl(first));
+      igvn().register_new_node_with_optimizer(vn);
+      phase()->set_ctrl(vn, phase()->get_ctrl(first));
      for (uint j = 0; j < p->size(); j++) {
        Node* pm = p->at(j);
-        _igvn.replace_node(pm, vn);
+        igvn().replace_node(pm, vn);
      }
-      _igvn._worklist.push(vn);
+      igvn()._worklist.push(vn);

      if (vlen > max_vlen) {
        max_vlen = vlen;
@ -2764,7 +2711,7 @@ bool SuperWord::output() {
    }
  }

-  _phase->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl);
+  phase()->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl);

  return true;
 }
@ -2787,10 +2734,10 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
    BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
    assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
    const TypeVect* vt = TypeVect::make(iv_bt, vlen);
-    Node* vn = new PopulateIndexNode(iv(), _igvn.intcon(1), vt);
+    Node* vn = new PopulateIndexNode(iv(), igvn().intcon(1), vt);
    VectorNode::trace_new_vector(vn, "SuperWord");
-    _igvn.register_new_node_with_optimizer(vn);
-    _phase->set_ctrl(vn, _phase->get_ctrl(opd));
+    igvn().register_new_node_with_optimizer(vn);
+    phase()->set_ctrl(vn, phase()->get_ctrl(opd));
    return vn;
  }

@ -2811,15 +2758,15 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
        juint shift = t->get_con();
        if (shift > mask) { // Unsigned cmp
          cnt = ConNode::make(TypeInt::make(shift & mask));
-          _igvn.register_new_node_with_optimizer(cnt);
+          igvn().register_new_node_with_optimizer(cnt);
        }
      } else {
        if (t == nullptr || t->_lo < 0 || t->_hi > (int)mask) {
          cnt = ConNode::make(TypeInt::make(mask));
-          _igvn.register_new_node_with_optimizer(cnt);
+          igvn().register_new_node_with_optimizer(cnt);
          cnt = new AndINode(opd, cnt);
-          _igvn.register_new_node_with_optimizer(cnt);
-          _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
+          igvn().register_new_node_with_optimizer(cnt);
+          phase()->set_ctrl(cnt, phase()->get_ctrl(opd));
        }
        if (!opd->bottom_type()->isa_int()) {
          assert(false, "int type only");
@ -2828,8 +2775,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
      }
      // Move shift count into vector register.
      cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0));
-      _igvn.register_new_node_with_optimizer(cnt);
-      _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
+      igvn().register_new_node_with_optimizer(cnt);
+      phase()->set_ctrl(cnt, phase()->get_ctrl(opd));
      return cnt;
    }
    if (opd->is_StoreVector()) {
@ -2847,8 +2794,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
       if (p0->bottom_type()->isa_long()) {
         p0_t = TypeLong::LONG;
         conv = new ConvI2LNode(opd);
-         _igvn.register_new_node_with_optimizer(conv);
-         _phase->set_ctrl(conv, _phase->get_ctrl(opd));
+         igvn().register_new_node_with_optimizer(conv);
+         phase()->set_ctrl(conv, phase()->get_ctrl(opd));
       }
       vn = VectorNode::scalar2vector(conv, vlen, p0_t);
    } else {
@ -2856,8 +2803,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
       vn = VectorNode::scalar2vector(opd, vlen, p0_t);
    }

-    _igvn.register_new_node_with_optimizer(vn);
-    _phase->set_ctrl(vn, _phase->get_ctrl(opd));
+    igvn().register_new_node_with_optimizer(vn);
+    phase()->set_ctrl(vn, phase()->get_ctrl(opd));
    VectorNode::trace_new_vector(vn, "SuperWord");
    return vn;
  }
@ -2886,8 +2833,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
      pk->add_opd(in2);
    }
  }
-  _igvn.register_new_node_with_optimizer(pk);
-  _phase->set_ctrl(pk, _phase->get_ctrl(opd));
+  igvn().register_new_node_with_optimizer(pk);
+  phase()->set_ctrl(pk, phase()->get_ctrl(opd));
  VectorNode::trace_new_vector(pk, "SuperWord");
  return pk;
 }
@ -3050,8 +2997,8 @@ bool SuperWord::construct_bb() {
  VectorSet visited;
  VectorSet post_visited;

-  visited.set(bb_idx(bb()));
-  stack.push(bb());
+  visited.set(bb_idx(cl()));
+  stack.push(cl());

  // Do a depth first walk over out edges
  int rpo_idx = block_count - 1;
@ -3066,7 +3013,7 @@ bool SuperWord::construct_bb() {
        Node* use = n->fast_out(i);
        if (in_bb(use) && !visited.test(bb_idx(use)) &&
            // Don't go around backedge
-            (!use->is_Phi() || n == bb())) {
+            (!use->is_Phi() || n == cl())) {
          stack.push(use);
        }
      }
@ -3297,7 +3244,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
    tty->print("SuperWord::memory_alignment within a vector memory reference for %d:  ", s->_idx); s->dump();
  }
 #endif
-  VPointer p(s, phase(), lpt(), nullptr, false);
+  VPointer p(s, vloop());
  if (!p.valid()) {
    NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: VPointer p invalid, return bottom_align");)
    return bottom_align;
@ -3338,7 +3285,7 @@ const Type* SuperWord::container_type(Node* n) {
    }
    return Type::get_const_basic_type(bt);
  }
-  const Type* t = _igvn.type(n);
+  const Type* t = igvn().type(n);
  if (t->basic_type() == T_INT) {
    // A narrow type of arithmetic operations will be determined by
    // propagating the type of memory operations.
@ -3358,7 +3305,7 @@ bool SuperWord::same_velt_type(Node* n1, Node* n2) {
 }

 bool SuperWord::same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const {
-  return _phase->C->get_alias_index(mem_ref->adr_type()) == _phase->C->get_alias_index(best_align_to_mem_ref->adr_type());
+  return phase()->C->get_alias_index(mem_ref->adr_type()) == phase()->C->get_alias_index(best_align_to_mem_ref->adr_type());
 }

 //------------------------------in_packset---------------------------
@ -3438,22 +3385,22 @@ LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
 void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
  const MemNode* align_to_ref = _align_to_ref;
  assert(align_to_ref != nullptr, "align_to_ref must be set");
-  assert(lp()->is_main_loop(), "can only do alignment for main loop");
+  assert(cl()->is_main_loop(), "can only do alignment for main loop");

  // The opaque node for the limit, where we adjust the input
-  Opaque1Node* pre_opaq = lp()->pre_loop_end()->limit()->as_Opaque1();
+  Opaque1Node* pre_opaq = vloop().pre_loop_end()->limit()->as_Opaque1();

  // Current pre-loop limit.
  Node* old_limit = pre_opaq->in(1);

  // Where we put new limit calculations.
-  Node* pre_ctrl = lp()->pre_loop_head()->in(LoopNode::EntryControl);
+  Node* pre_ctrl = vloop().pre_loop_head()->in(LoopNode::EntryControl);

  // Ensure the original loop limit is available from the pre-loop Opaque1 node.
  Node* orig_limit = pre_opaq->original_loop_limit();
-  assert(orig_limit != nullptr && _igvn.type(orig_limit) != Type::TOP, "");
+  assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, "");

-  VPointer align_to_ref_p(align_to_ref, phase(), lpt(), nullptr, false);
+  VPointer align_to_ref_p(align_to_ref, vloop());
  assert(align_to_ref_p.valid(), "sanity");

  // For the main-loop, we want the address of align_to_ref to be memory aligned
@ -3647,17 +3594,17 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
  const bool is_sub = scale * stride > 0;

  // 1.1: offset
-  Node* xboi = _igvn.intcon(is_sub ? -offset : offset);
+  Node* xboi = igvn().intcon(is_sub ? -offset : offset);
  TRACE_ALIGN_VECTOR_NODE(xboi);

  // 1.2: invar (if it exists)
  if (invar != nullptr) {
-    if (_igvn.type(invar)->isa_long()) {
+    if (igvn().type(invar)->isa_long()) {
      // Computations are done % (vector width/element size) so it's
      // safe to simply convert invar to an int and lose the upper 32
      // bit half.
      invar = new ConvL2INode(invar);
-      _igvn.register_new_node_with_optimizer(invar);
+      igvn().register_new_node_with_optimizer(invar);
      TRACE_ALIGN_VECTOR_NODE(invar);
   }
    if (is_sub) {
@ -3665,8 +3612,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
    } else {
      xboi = new AddINode(xboi, invar);
    }
-    _igvn.register_new_node_with_optimizer(xboi);
-    _phase->set_ctrl(xboi, pre_ctrl);
+    igvn().register_new_node_with_optimizer(xboi);
+    phase()->set_ctrl(xboi, pre_ctrl);
    TRACE_ALIGN_VECTOR_NODE(xboi);
  }

@ -3676,11 +3623,11 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
    // When the base() is top, we have no alignment guarantee at all.
    // Hence, we must now take the base into account for the calculation.
    Node* xbase = new CastP2XNode(nullptr, base);
-    _igvn.register_new_node_with_optimizer(xbase);
+    igvn().register_new_node_with_optimizer(xbase);
    TRACE_ALIGN_VECTOR_NODE(xbase);
 #ifdef _LP64
    xbase  = new ConvL2INode(xbase);
-    _igvn.register_new_node_with_optimizer(xbase);
+    igvn().register_new_node_with_optimizer(xbase);
    TRACE_ALIGN_VECTOR_NODE(xbase);
 #endif
    if (is_sub) {
@ -3688,18 +3635,18 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
    } else {
      xboi = new AddINode(xboi, xbase);
    }
-    _igvn.register_new_node_with_optimizer(xboi);
-    _phase->set_ctrl(xboi, pre_ctrl);
+    igvn().register_new_node_with_optimizer(xboi);
+    phase()->set_ctrl(xboi, pre_ctrl);
    TRACE_ALIGN_VECTOR_NODE(xboi);
  }

  // 2: Compute (14):
  //    XBOI = xboi / abs(scale)
  //    The division is executed as shift
-  Node* log2_abs_scale = _igvn.intcon(exact_log2(abs(scale)));
+  Node* log2_abs_scale = igvn().intcon(exact_log2(abs(scale)));
  Node* XBOI = new URShiftINode(xboi, log2_abs_scale);
-  _igvn.register_new_node_with_optimizer(XBOI);
-  _phase->set_ctrl(XBOI, pre_ctrl);
+  igvn().register_new_node_with_optimizer(XBOI);
+  phase()->set_ctrl(XBOI, pre_ctrl);
  TRACE_ALIGN_VECTOR_NODE(log2_abs_scale);
  TRACE_ALIGN_VECTOR_NODE(XBOI);

@ -3713,8 +3660,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
  } else {
    XBOI_OP_old_limit = new AddINode(XBOI, old_limit);
  }
-  _igvn.register_new_node_with_optimizer(XBOI_OP_old_limit);
-  _phase->set_ctrl(XBOI_OP_old_limit, pre_ctrl);
+  igvn().register_new_node_with_optimizer(XBOI_OP_old_limit);
+  phase()->set_ctrl(XBOI_OP_old_limit, pre_ctrl);
  TRACE_ALIGN_VECTOR_NODE(XBOI_OP_old_limit);

  // 3.2: Compute:
@ -3723,10 +3670,10 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
  //                    = XBOI_OP_old_limit AND (AW - 1)
  //    Since AW is a power of 2, the modulo operation can be replaced with
  //    a bitmask operation.
-  Node* mask_AW = _igvn.intcon(AW-1);
+  Node* mask_AW = igvn().intcon(AW-1);
  Node* adjust_pre_iter = new AndINode(XBOI_OP_old_limit, mask_AW);
-  _igvn.register_new_node_with_optimizer(adjust_pre_iter);
-  _phase->set_ctrl(adjust_pre_iter, pre_ctrl);
+  igvn().register_new_node_with_optimizer(adjust_pre_iter);
+  phase()->set_ctrl(adjust_pre_iter, pre_ctrl);
  TRACE_ALIGN_VECTOR_NODE(mask_AW);
  TRACE_ALIGN_VECTOR_NODE(adjust_pre_iter);

@ -3739,8 +3686,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
  } else {
    new_limit = new AddINode(old_limit, adjust_pre_iter);
  }
-  _igvn.register_new_node_with_optimizer(new_limit);
-  _phase->set_ctrl(new_limit, pre_ctrl);
+  igvn().register_new_node_with_optimizer(new_limit);
+  phase()->set_ctrl(new_limit, pre_ctrl);
  TRACE_ALIGN_VECTOR_NODE(new_limit);

  // 5: Compute (15a, b):
@ -3748,27 +3695,12 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
  Node* constrained_limit =
    (stride > 0) ? (Node*) new MinINode(new_limit, orig_limit)
                 : (Node*) new MaxINode(new_limit, orig_limit);
-  _igvn.register_new_node_with_optimizer(constrained_limit);
-  _phase->set_ctrl(constrained_limit, pre_ctrl);
+  igvn().register_new_node_with_optimizer(constrained_limit);
+  phase()->set_ctrl(constrained_limit, pre_ctrl);
  TRACE_ALIGN_VECTOR_NODE(constrained_limit);

  // 6: Hack the pre-loop limit
-  _igvn.replace_input_of(pre_opaq, 1, constrained_limit);
-}
-
-//------------------------------init---------------------------
-void SuperWord::init() {
-  _dg.init();
-  _packset.clear();
-  _block.clear();
-  _mem_slice_head.clear();
-  _mem_slice_tail.clear();
-  _node_info.clear();
-  _align_to_ref = nullptr;
-  _race_possible = 0;
-  _early_return = false;
-  _num_work_vecs = 0;
-  _num_reductions = 0;
+  igvn().replace_input_of(pre_opaq, 1, constrained_limit);
 }

 //------------------------------print_packset---------------------------
--- a/src/hotspot/share/opto/superword.hpp
+++ b/src/hotspot/share/opto/superword.hpp
@ -139,8 +139,6 @@ class DepGraph {
  DepEdge* make_edge(DepMem* pred, Node* succ)   { return make_edge(pred,      dep(succ)); }
  DepEdge* make_edge(Node* pred,   DepMem* succ) { return make_edge(dep(pred), succ);      }

-  void init() { _map.clear(); } // initialize
-
  void print(Node* n)   { dep(n)->print(); }
  void print(DepMem* d) { d->print(); }
 };
@ -200,18 +198,18 @@ class SWNodeInfo {
 // -----------------------------SuperWord---------------------------------
 // Transforms scalar operations into packed (superword) operations.
 class SuperWord : public ResourceObj {
- friend class VPointer;
- friend class CMoveKit;
 private:
-  PhaseIdealLoop* _phase;
-  Arena*          _arena;
-  PhaseIterGVN   &_igvn;
+  const VLoop& _vloop;
+
+  // Arena for small data structures. Large data structures are allocated in
+  // VSharedData, and reused over many AutoVectorizations.
+  Arena _arena;

  enum consts { top_align = -1, bottom_align = -666 };

  GrowableArray<Node_List*> _packset;    // Packs for the current block

-  GrowableArray<int> _bb_idx;            // Map from Node _idx to index within block
+  GrowableArray<int> &_bb_idx;           // Map from Node _idx to index within block

  GrowableArray<Node*> _block;           // Nodes in current block
  GrowableArray<PhiNode*> _mem_slice_head; // Memory slice head nodes
@ -226,88 +224,87 @@ class SuperWord : public ResourceObj {
  GrowableArray<Node*> _nlist; // List of nodes

 public:
-  SuperWord(PhaseIdealLoop* phase);
+  SuperWord(const VLoop &vloop, VSharedData &vshared);

-  bool transform_loop(IdealLoopTree* lpt, bool do_optimization);
+  // Attempt to run the SuperWord algorithm on the loop. Return true if we succeed.
+  bool transform_loop();

-  void unrolling_analysis(int &local_loop_unroll_factor);
+  // Decide if loop can eventually be vectorized, and what unrolling factor is required.
+  static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor);

-  // Accessors for VPointer
-  PhaseIdealLoop* phase() const    { return _phase; }
-  IdealLoopTree* lpt() const       { return _lpt; }
-  PhiNode* iv() const              { return _iv; }
-
-  bool early_return() const        { return _early_return; }
+  // VLoop Accessors
+  const VLoop& vloop()        const { return _vloop; }
+  PhaseIdealLoop* phase()     const { return vloop().phase(); }
+  PhaseIterGVN& igvn()        const { return vloop().phase()->igvn(); }
+  IdealLoopTree* lpt()        const { return vloop().lpt(); }
+  CountedLoopNode* cl()       const { return vloop().cl(); }
+  PhiNode* iv()               const { return vloop().iv(); }
+  int iv_stride()             const { return cl()->stride_con(); }
+  bool in_bb(const Node* n)   const { return vloop().in_bb(n); }

 #ifndef PRODUCT
  // TraceAutoVectorization and TraceSuperWord
-  bool is_trace_superword_precondition() const {
-    return TraceSuperWord ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_PRECONDITION);
-  }
-
  bool is_trace_superword_vector_element_type() const {
    // Too verbose for TraceSuperWord
-    return _vtrace.is_trace(TraceAutoVectorizationTag::SW_TYPES);
+    return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES);
  }

  bool is_trace_superword_alignment() const {
    // Too verbose for TraceSuperWord
-    return _vtrace.is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
+    return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
  }

  bool is_trace_superword_memory_slices() const {
    return TraceSuperWord ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES);
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES);
  }

  bool is_trace_superword_dependence_graph() const {
    return TraceSuperWord ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
  }

  bool is_trace_superword_adjacent_memops() const {
    return TraceSuperWord ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
  }

  bool is_trace_superword_rejections() const {
    return TraceSuperWord ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_REJECTIONS);
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS);
  }

  bool is_trace_superword_packset() const {
    return TraceSuperWord ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_PACKSET);
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET);
  }

  bool is_trace_superword_info() const {
    return TraceSuperWord ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_INFO);
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO);
  }

  bool is_trace_superword_verbose() const {
    // Too verbose for TraceSuperWord
-    return _vtrace.is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
+    return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
  }

  bool is_trace_superword_any() const {
    return TraceSuperWord ||
           is_trace_align_vector() ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_PRECONDITION) ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_TYPES) ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES) ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_INFO) ||
-           _vtrace.is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES) ||
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES) ||
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO) ||
+           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
  }

  bool is_trace_align_vector() const {
-    return _vtrace.is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
+    return vloop().vtrace().is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
           is_trace_superword_verbose();
  }
 #endif
@ -318,30 +315,14 @@ class SuperWord : public ResourceObj {
  const GrowableArray<Node*>&      block()   const { return _block; }
  const DepGraph&                  dg()      const { return _dg; }
 private:
-  IdealLoopTree* _lpt;             // Current loop tree node
-  CountedLoopNode* _lp;            // Current CountedLoopNode
  VectorSet      _loop_reductions; // Reduction nodes in the current loop
-  Node*          _bb;              // Current basic block
-  PhiNode*       _iv;              // Induction var
  bool           _race_possible;   // In cases where SDMU is true
-  bool           _early_return;    // True if we do not initialize
  bool           _do_vector_loop;  // whether to do vectorization/simd style
  int            _num_work_vecs;   // Number of non memory vector operations
  int            _num_reductions;  // Number of reduction expressions applied
-  NOT_PRODUCT(VTrace _vtrace);

  // Accessors
-  Arena* arena()                   { return _arena; }
-
-  Node* bb()                       { return _bb; }
-  void set_bb(Node* bb)            { _bb = bb; }
-  void set_lpt(IdealLoopTree* lpt) { _lpt = lpt; }
-  CountedLoopNode* lp() const      { return _lp; }
-  void set_lp(CountedLoopNode* lp) {
-    _lp = lp;
-    _iv = lp->as_CountedLoop()->phi()->as_Phi();
-  }
-  int iv_stride() const            { return lp()->stride_con(); }
+  Arena* arena()                   { return &_arena; }

  int vector_width(const Node* n) const {
    BasicType bt = velt_basic_type(n);
@ -355,11 +336,8 @@ class SuperWord : public ResourceObj {
  const MemNode* align_to_ref() const { return _align_to_ref; }
  void set_align_to_ref(const MemNode* m) { _align_to_ref = m; }

-  const Node* ctrl(const Node* n) const { return _phase->has_ctrl(n) ? _phase->get_ctrl(n) : n; }
-
  // block accessors
 public:
-  bool in_bb(const Node* n) const  { return n != nullptr && n->outcnt() > 0 && ctrl(n) == _bb; }
  int  bb_idx(const Node* n) const { assert(in_bb(n), "must be"); return _bb_idx.at(n->_idx); }
 private:
  void set_bb_idx(Node* n, int i)  { _bb_idx.at_put_grow(n->_idx, i); }
@ -563,7 +541,6 @@ private:
  void adjust_pre_loop_limit_to_align_main_loop_vectors();
  // Is the use of d1 in u1 at the same operand position as d2 in u2?
  bool opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2);
-  void init();

  // print methods
  void print_packset();
--- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
+++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
@ -30,7 +30,7 @@

 #define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \
  flags(POINTER_ANALYSIS,     "Trace VPointer") \
-  flags(SW_PRECONDITION,      "Trace SuperWord precondition") \
+  flags(PRECONDITIONS,        "Trace VLoop::check_preconditions") \
  flags(SW_TYPES,             "Trace SuperWord::compute_vector_element_type") \
  flags(SW_ALIGNMENT,         "Trace SuperWord alignment analysis") \
  flags(SW_MEMORY_SLICES,     "Trace SuperWord memory slices") \
@ -112,7 +112,6 @@ class TraceAutoVectorizationTagValidator {
      } else if (ALL == tag) {
        _tags.set_range(0, TRACE_AUTO_VECTORIZATION_TAG_NUM);
      } else if (SW_VERBOSE == tag) {
-        _tags.at_put(SW_PRECONDITION, set_bit);
        _tags.at_put(SW_TYPES, set_bit);
        _tags.at_put(SW_ALIGNMENT, set_bit);
        _tags.at_put(SW_MEMORY_SLICES, set_bit);
@ -123,7 +122,6 @@ class TraceAutoVectorizationTagValidator {
        _tags.at_put(SW_INFO, set_bit);
        _tags.at_put(SW_VERBOSE, set_bit);
      } else if (SW_INFO == tag) {
-        _tags.at_put(SW_PRECONDITION, set_bit);
        _tags.at_put(SW_MEMORY_SLICES, set_bit);
        _tags.at_put(SW_DEPENDENCE_GRAPH, set_bit);
        _tags.at_put(SW_ADJACENT_MEMOPS, set_bit);
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@ -31,22 +31,103 @@
 #include "opto/rootnode.hpp"
 #include "opto/vectorization.hpp"

+// Entry point for the precondition checks of this loop.
+// Returns true iff check_preconditions_helper() reports VLoop::SUCCESS.
+// In non-product builds with precondition tracing enabled, the loop is dumped
+// up front and the failure reason is printed when the checks do not pass.
+bool VLoop::check_preconditions() {
+#ifndef PRODUCT
+  const bool trace_enabled = is_trace_preconditions();
+  if (trace_enabled) {
+    tty->print_cr("\nVLoop::check_preconditions");
+    lpt()->dump_head();
+    lpt()->head()->dump();
+  }
+#endif
+
+  // The helper hands back one of the static reason strings; SUCCESS is
+  // recognized by pointer identity.
+  const char* const outcome = check_preconditions_helper();
+  assert(outcome != nullptr, "must have return state");
+  const bool passed = (outcome == VLoop::SUCCESS);
+
+#ifndef PRODUCT
+  if (!passed && trace_enabled) {
+    tty->print_cr("VLoop::check_preconditions: failed: %s", outcome);
+  }
+#endif
+  return passed;
+}
+
+const char* VLoop::check_preconditions_helper() { // returns VLoop::SUCCESS or one of the VLoop::FAILURE_* reason strings
+  // Only accept a vector width (in bytes, queried for T_BYTE) that is at least 2 and a power of 2
+  int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
+  if (vector_width < 2 || !is_power_of_2(vector_width)) {
+    return VLoop::FAILURE_VECTOR_WIDTH;
+  }
+
+  // Only accept valid counted loops (int)
+  if (!_lpt->_head->as_Loop()->is_valid_counted_loop(T_INT)) {
+    return VLoop::FAILURE_VALID_COUNTED_LOOP;
+  }
+  _cl = _lpt->_head->as_CountedLoop(); // cached: later checks and the cl() accessor read it
+  _iv = _cl->phi()->as_Phi();          // cached: returned by the iv() accessor
+
+  if (_cl->is_vectorized_loop()) {
+    return VLoop::FAILURE_ALREADY_VECTORIZED;
+  }
+
+  if (_cl->is_unroll_only()) {
+    return VLoop::FAILURE_UNROLL_ONLY;
+  }
+
+  // Check for control flow in the body: it counts as present when the loop exit is not controlled directly by the loop head
+  _cl_exit = _cl->loopexit();
+  bool has_cfg = _cl_exit->in(0) != _cl;
+  if (has_cfg && !is_allow_cfg()) { // control flow is only a failure when the VLoop was created with allow_cfg == false
+#ifndef PRODUCT
+    if (is_trace_preconditions()) {
+      tty->print_cr("VLoop::check_preconditions: fails because of control flow.");
+      tty->print("  cl_exit %d", _cl_exit->_idx); _cl_exit->dump();
+      tty->print("  cl_exit->in(0) %d", _cl_exit->in(0)->_idx); _cl_exit->in(0)->dump();
+      tty->print("  lpt->_head %d", _cl->_idx); _cl->dump();
+      _lpt->dump_head();
+    }
+#endif
+    return VLoop::FAILURE_CONTROL_FLOW;
+  }
+
+  // Make sure there are no extra control users of the loop backedge
+  if (_cl->back_control()->outcnt() != 1) {
+    return VLoop::FAILURE_BACKEDGE;
+  }
+
+  // To align vector memory accesses in the main-loop, we will have to adjust
+  // the pre-loop limit. Require that the pre-loop end is found and that its limit is an Opaque1 node.
+  if (_cl->is_main_loop()) {
+    CountedLoopEndNode* pre_end = _cl->find_pre_loop_end();
+    if (pre_end == nullptr) {
+      return VLoop::FAILURE_PRE_LOOP_LIMIT;
+    }
+    Node* pre_opaq1 = pre_end->limit();
+    if (pre_opaq1->Opcode() != Op_Opaque1) {
+      return VLoop::FAILURE_PRE_LOOP_LIMIT;
+    }
+    _pre_loop_end = pre_end; // only set for main loops; read back through pre_loop_end()
+  }
+
+  return VLoop::SUCCESS;
+}
+
 #ifndef PRODUCT
 int VPointer::Tracer::_depth = 0;
 #endif

-VPointer::VPointer(const MemNode* mem,
-                   PhaseIdealLoop* phase, IdealLoopTree* lpt,
+VPointer::VPointer(const MemNode* mem, const VLoop& vloop,
                   Node_Stack* nstack, bool analyze_only) :
-  _mem(mem), _phase(phase), _lpt(lpt),
-  _iv(lpt->_head->as_CountedLoop()->phi()->as_Phi()),
+  _mem(mem), _vloop(vloop),
  _base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
 #ifdef ASSERT
  _debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
 #endif
  _nstack(nstack), _analyze_only(analyze_only), _stack_idx(0)
 #ifndef PRODUCT
-  , _tracer(phase->C->directive()->trace_auto_vectorization_tags().at(TraceAutoVectorizationTag::POINTER_ANALYSIS))
+  , _tracer(vloop.is_trace_pointer_analysis())
 #endif
 {
  NOT_PRODUCT(_tracer.ctor_1(mem);)
@ -109,7 +190,7 @@ VPointer::VPointer(const MemNode* mem,
 // Following is used to create a temporary object during
 // the pattern match of an address expression.
 VPointer::VPointer(VPointer* p) :
-  _mem(p->_mem), _phase(p->_phase), _lpt(p->_lpt), _iv(p->_iv),
+  _mem(p->_mem), _vloop(p->_vloop),
  _base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
 #ifdef ASSERT
  _debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
@ -153,7 +234,7 @@ bool VPointer::invariant(Node* n) const {
      // main loop (Illegal invariant happens when n_c is a CastII node that
      // prevents data nodes to flow above the main loop).
      Node* n_c = phase()->get_ctrl(n);
-      return phase()->is_dominator(n_c, cl->pre_loop_head());
+      return phase()->is_dominator(n_c, vloop().pre_loop_head());
    }
  }
  return is_not_member;
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@ -48,15 +48,131 @@ public:
 };
 #endif

+// Basic loop structure accessors and vectorization preconditions checking.
+//
+// A VLoop is constructed from an IdealLoopTree. The loop-derived fields
+// (_cl, _cl_exit, _iv, _pre_loop_end) all start out as nullptr and are
+// assigned in check_preconditions_helper(), so the corresponding accessors
+// return nullptr until check_preconditions() has run.
+class VLoop : public StackObj {
+private:
+  PhaseIdealLoop* const _phase;
+  IdealLoopTree* const _lpt;
+  const bool _allow_cfg;             // if false, control flow in the loop body fails the preconditions
+  CountedLoopNode* _cl;
+  Node* _cl_exit;
+  PhiNode* _iv;
+  CountedLoopEndNode* _pre_loop_end; // cache access to pre-loop for main loops only
+
+  NOT_PRODUCT(VTrace _vtrace;)
+
+  // Reason strings returned by check_preconditions_helper().
+  static constexpr char const* SUCCESS                    = "success";
+  static constexpr char const* FAILURE_ALREADY_VECTORIZED = "loop already vectorized";
+  static constexpr char const* FAILURE_UNROLL_ONLY        = "loop only wants to be unrolled";
+  static constexpr char const* FAILURE_VECTOR_WIDTH       = "vector_width must be power of 2";
+  static constexpr char const* FAILURE_VALID_COUNTED_LOOP = "must be valid counted loop (int)";
+  static constexpr char const* FAILURE_CONTROL_FLOW       = "control flow in loop not allowed";
+  static constexpr char const* FAILURE_BACKEDGE           = "nodes on backedge not allowed";
+  static constexpr char const* FAILURE_PRE_LOOP_LIMIT     = "main-loop must be able to adjust pre-loop-limit (not found)";
+
+public:
+  // lpt:       the loop to analyze; its _phase is used as the PhaseIdealLoop.
+  // allow_cfg: whether control flow in the loop body is tolerated.
+  VLoop(IdealLoopTree* lpt, bool allow_cfg) :
+    _phase        (lpt->_phase),
+    _lpt          (lpt),
+    _allow_cfg    (allow_cfg),
+    _cl           (nullptr),
+    _cl_exit      (nullptr),
+    _iv           (nullptr),
+    // Must be initialized: pre_loop_end() asserts on nullptr, which is only
+    // meaningful if the field does not hold an indeterminate value.
+    _pre_loop_end (nullptr) {}
+  NONCOPYABLE(VLoop);
+
+  IdealLoopTree* lpt()        const { return _lpt; }
+  PhaseIdealLoop* phase()     const { return _phase; }
+  CountedLoopNode* cl()       const { return _cl; }
+  Node* cl_exit()             const { return _cl_exit; }
+  PhiNode* iv()               const { return _iv; }
+  int iv_stride()             const { return cl()->stride_con(); }
+  bool is_allow_cfg()         const { return _allow_cfg; }
+
+  // Cached pre-loop end. Only valid for main loops, after the preconditions
+  // check has found it.
+  CountedLoopEndNode* pre_loop_end() const {
+    assert(cl()->is_main_loop(), "only main loop can reference pre-loop");
+    assert(_pre_loop_end != nullptr, "must have found it");
+    return _pre_loop_end;
+  }
+
+  // Head of the pre-loop, reached through the cached pre-loop end.
+  CountedLoopNode* pre_loop_head() const {
+    CountedLoopNode* head = pre_loop_end()->loopnode();
+    assert(head != nullptr, "must find head");
+    return head;
+  }
+
+  // Estimate maximum size for data structures, to avoid repeated reallocation
+  int estimated_body_length() const { return lpt()->_body.size(); }
+  int estimated_node_count()  const { return (int)(1.10 * phase()->C->unique()); }
+
+#ifndef PRODUCT
+  const VTrace& vtrace()      const { return _vtrace; }
+
+  bool is_trace_preconditions() const {
+    return vtrace().is_trace(TraceAutoVectorizationTag::PRECONDITIONS);
+  }
+
+  bool is_trace_pointer_analysis() const {
+    return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
+  }
+#endif
+
+  // Is the node in the basic block of the loop?
+  // We only accept any nodes which have the loop head as their ctrl.
+  // Returns false for nullptr and for nodes without any outputs.
+  bool in_bb(const Node* n) const {
+    // Filter out nullptr and nodes without outputs first, so that the ctrl
+    // lookup below is never handed a null pointer.
+    if (n == nullptr || n->outcnt() == 0) {
+      return false;
+    }
+    const Node* ctrl = _phase->has_ctrl(n) ? _phase->get_ctrl(n) : n;
+    return ctrl == _cl;
+  }
+
+  // Check if the loop passes some basic preconditions for vectorization.
+  // Return indicates if analysis succeeded.
+  bool check_preconditions();
+
+private:
+  // Performs the individual checks; returns SUCCESS or a FAILURE_* reason.
+  const char* check_preconditions_helper();
+};
+
+// Optimization to keep allocation of large arrays in AutoVectorization low.
+// We allocate the arrays once, and reuse them for multiple loops that we
+// AutoVectorize, clearing them before every new use.
+class VSharedData : public StackObj {
+private:
+  // Arena, used to allocate all arrays from.
+  Arena _arena;
+
+  // An array that maps node->_idx to a much smaller idx, which is at most the
+  // size of a loop body. This allows us to have smaller arrays for other data
+  // structures, since we are using smaller indices.
+  GrowableArray<int> _node_idx_to_loop_body_idx;
+
+public:
+  VSharedData() :
+    _arena(mtCompiler),
+    _node_idx_to_loop_body_idx(&_arena, estimated_node_count(), 0, 0) // array storage comes from _arena, declared above it
+  {
+  }
+
+  GrowableArray<int>& node_idx_to_loop_body_idx() {
+    return _node_idx_to_loop_body_idx;
+  }
+
+  // Must be cleared before each AutoVectorization use
+  void clear() {
+    _node_idx_to_loop_body_idx.clear();
+  }
+
+private:
+  static int estimated_node_count() {
+    return (int)(1.10 * Compile::current()->unique()); // current unique node count scaled by 1.10
+  }
+};
+
 // A vectorization pointer (VPointer) has information about an address for
 // dependence checking and vector alignment. It's usually bound to a memory
 // operation in a counted loop for vectorizable analysis.
 class VPointer : public ArenaObj {
 protected:
  const MemNode*  _mem;      // My memory reference node
-  PhaseIdealLoop* _phase;    // PhaseIdealLoop handle
-  IdealLoopTree*  _lpt;      // Current IdealLoopTree
-  PhiNode*        _iv;       // The loop induction variable
+  const VLoop&    _vloop;

  Node* _base;               // null if unsafe nonheap reference
  Node* _adr;                // address pointer
@ -74,9 +190,10 @@ class VPointer : public ArenaObj {
  bool        _analyze_only; // Used in loop unrolling only for vpointer trace
  uint        _stack_idx;    // Used in loop unrolling only for vpointer trace

-  PhaseIdealLoop* phase() const { return _phase; }
-  IdealLoopTree*  lpt() const   { return _lpt; }
-  PhiNode*        iv() const    { return _iv; }
+  const VLoop&    vloop() const { return _vloop; }
+  PhaseIdealLoop* phase() const { return vloop().phase(); }
+  IdealLoopTree*  lpt() const   { return vloop().lpt(); }
+  PhiNode*        iv() const    { return vloop().iv(); }

  bool is_loop_member(Node* n) const;
  bool invariant(Node* n) const;
@ -97,13 +214,19 @@ class VPointer : public ArenaObj {
    NotComparable = (Less | Greater | Equal)
  };

-  VPointer(const MemNode* mem,
-           PhaseIdealLoop* phase, IdealLoopTree* lpt,
+  VPointer(const MemNode* mem, const VLoop& vloop) :
+    VPointer(mem, vloop, nullptr, false) {}
+  VPointer(const MemNode* mem, const VLoop& vloop, Node_Stack* nstack) :
+    VPointer(mem, vloop, nstack, true) {}
+ private:
+  VPointer(const MemNode* mem, const VLoop& vloop,
           Node_Stack* nstack, bool analyze_only);
  // Following is used to create a temporary object during
  // the pattern match of an address expression.
  VPointer(VPointer* p);
+  NONCOPYABLE(VPointer);

+ public:
  bool valid()             const { return _adr != nullptr; }
  bool has_iv()            const { return _scale != 0; }

@ -143,7 +266,7 @@ class VPointer : public ArenaObj {
  bool overlap_possible_with_any_in(Node_List* p) {
    for (uint k = 0; k < p->size(); k++) {
      MemNode* mem = p->at(k)->as_Mem();
-      VPointer p_mem(mem, phase(), lpt(), nullptr, false);
+      VPointer p_mem(mem, vloop());
      // Only if we know that we have Less or Greater can we
      // be sure that there can never be an overlap between
      // the two memory regions.