From 5cddc2de493d9d8712e4bee3aed4f1a0d4c228c3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 2 Apr 2024 06:10:08 +0000 Subject: [PATCH] 8325252: C2 SuperWord: refactor the packset Reviewed-by: chagedorn, kvn --- src/hotspot/share/opto/superword.cpp | 873 ++++++++---------- src/hotspot/share/opto/superword.hpp | 515 +++++++---- src/hotspot/share/opto/vectorization.hpp | 2 +- src/hotspot/share/opto/vectornode.cpp | 2 +- src/hotspot/share/opto/vectornode.hpp | 2 +- .../loopopts/superword/TestMulAddS2I.java | 119 ++- 6 files changed, 862 insertions(+), 651 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index d5b05426a89..8bef9d09980 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -42,11 +42,14 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : _vloop_analyzer(vloop_analyzer), _vloop(vloop_analyzer.vloop()), _arena(mtCompiler), - _packset(arena(), 8, 0, nullptr), // packs for the current block _node_info(arena(), _vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node _clone_map(phase()->C->clone_map()), // map of nodes created in cloning _align_to_ref(nullptr), // memory reference to align vectors to - _race_possible(false), // cases where SDMU is true + _pairset(&_arena, _vloop_analyzer), + _packset(&_arena, _vloop_analyzer + NOT_PRODUCT(COMMA is_trace_superword_packset()) + NOT_PRODUCT(COMMA is_trace_superword_rejections()) + ), _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style _num_work_vecs(0), // amount of vector work we have _num_reductions(0) // amount of reduction work we have @@ -454,7 +457,7 @@ bool SuperWord::SLP_extract() { // Attempt vectorization find_adjacent_refs(); - if (_packset.length() == 0) { + if (_pairset.is_empty()) { #ifndef PRODUCT if (is_trace_superword_any()) { tty->print_cr("\nNo pair packs generated, abort SuperWord."); @@ -464,18 +467,15 @@ bool 
SuperWord::SLP_extract() { return false; } - extend_packset_with_more_pairs_by_following_use_and_def(); + extend_pairset_with_more_pairs_by_following_use_and_def(); combine_pairs_to_longer_packs(); - construct_my_pack_map(); - split_packs_at_use_def_boundaries(); // a first time: create natural boundaries split_packs_only_implemented_with_smaller_size(); split_packs_to_break_mutual_dependence(); split_packs_at_use_def_boundaries(); // again: propagate split of other packs - // Now we only remove packs: filter_packs_for_power_of_2_size(); filter_packs_for_mutual_independence(); filter_packs_for_alignment(); @@ -555,11 +555,8 @@ void SuperWord::find_adjacent_refs() { if (alignment(s2) == top_align) continue; if (s1 != s2 && are_adjacent_refs(s1, s2)) { if (stmts_can_pack(s1, s2, align)) { - Node_List* pair = new Node_List(); - pair->push(s1); - pair->push(s2); if (!_do_vector_loop || same_origin_idx(s1, s2)) { - _packset.append(pair); + _pairset.add_pair(s1, s2); } } } @@ -575,13 +572,13 @@ void SuperWord::find_adjacent_refs() { } } // while (memops.size() != 0) - assert(_packset.is_empty() || align_to_mem_ref != nullptr, - "packset empty or we find the alignment reference"); + assert(_pairset.is_empty() || align_to_mem_ref != nullptr, + "pairset empty or we find the alignment reference"); #ifndef PRODUCT if (is_trace_superword_packset()) { tty->print_cr("\nAfter Superword::find_adjacent_refs"); - print_packset(); + _pairset.print(); } #endif } @@ -843,7 +840,7 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) { // and will still be vectorized by SuperWord::vector_opd. 
if (isomorphic(s1, s2) && !is_populate_index(s1, s2)) { if ((independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2)) { - if (!exists_at(s1, 0) && !exists_at(s2, 1)) { + if (!_pairset.is_left(s1) && !_pairset.is_right(s2)) { if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) { int s1_align = alignment(s1); int s2_align = alignment(s2); @@ -859,21 +856,9 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) { return false; } -//------------------------------exists_at--------------------------- -// Does s exist in a pack at position pos? -bool SuperWord::exists_at(Node* s, uint pos) { - for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); - if (p->at(pos) == s) { - return true; - } - } - return false; -} - //------------------------------are_adjacent_refs--------------------------- // Is s1 immediately before s2 in memory? -bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) { +bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) const { if (!s1->is_Mem() || !s2->is_Mem()) return false; if (!in_bb(s1) || !in_bb(s2)) return false; @@ -1024,7 +1009,7 @@ bool SuperWord::have_similar_inputs(Node* s1, Node* s2) { return true; } -bool VLoopReductions::is_marked_reduction_pair(Node* s1, Node* s2) const { +bool VLoopReductions::is_marked_reduction_pair(const Node* s1, const Node* s2) const { if (is_marked_reduction(s1) && is_marked_reduction(s2)) { // This is an ordered set, so s1 should define s2 @@ -1049,30 +1034,37 @@ void SuperWord::set_alignment(Node* s1, Node* s2, int align) { } } -// Extend packset by following use->def and def->use links from pack members. -void SuperWord::extend_packset_with_more_pairs_by_following_use_and_def() { +// Extend pairset by following use->def and def->use links from pair members. 
+void SuperWord::extend_pairset_with_more_pairs_by_following_use_and_def() { bool changed; do { - packset_sort(_packset.length()); changed = false; - for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); - changed |= follow_use_defs(p); - changed |= follow_def_uses(p); + // Iterate the pairs in insertion order. + for (int i = 0; i < _pairset.length(); i++) { + Node* left = _pairset.left_at_in_insertion_order(i); + Node* right = _pairset.right_at_in_insertion_order(i); + changed |= extend_pairset_with_more_pairs_by_following_def(left, right); + changed |= extend_pairset_with_more_pairs_by_following_use(left, right); } } while (changed); - if (_race_possible) { - for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); - order_def_uses(p); - } + // During extend_pairset_with_more_pairs_by_following_use, we may have re-ordered the + // inputs of some nodes, when calling order_inputs_of_uses_to_match_def_pair. If a def + // node has multiple uses, we may have re-ordered some of the inputs one use after + // packing another use with the old order. Now that we have all pairs, we must ensure + // that the order between the pairs is matching again. Since the PairSetIterator visits + // all pair-chains from left-to-right, we essencially impose the order of the first + // element on all other elements in the pair-chain. 
+ for (PairSetIterator pair(_pairset); !pair.done(); pair.next()) { + Node* left = pair.left(); + Node* right = pair.right(); + order_inputs_of_all_use_pairs_to_match_def_pair(left, right); } #ifndef PRODUCT if (is_trace_superword_packset()) { - tty->print_cr("\nAfter Superword::extend_packset_with_more_pairs_by_following_use_and_def"); - print_packset(); + tty->print_cr("\nAfter Superword::extend_pairset_with_more_pairs_by_following_use_and_def"); + _pairset.print(); } #endif } @@ -1093,12 +1085,8 @@ int SuperWord::adjust_alignment_for_type_conversion(Node* s, Node* t, int align) return align; } -//------------------------------follow_use_defs--------------------------- -// Extend the packset by visiting operand definitions of nodes in pack p -bool SuperWord::follow_use_defs(Node_List* p) { - assert(p->size() == 2, "just checking"); - Node* s1 = p->at(0); - Node* s2 = p->at(1); +bool SuperWord::extend_pairset_with_more_pairs_by_following_def(Node* s1, Node* s2) { + assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair"); assert(s1->req() == s2->req(), "just checking"); assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking"); @@ -1106,7 +1094,7 @@ bool SuperWord::follow_use_defs(Node_List* p) { #ifndef PRODUCT if (is_trace_superword_alignment()) { - tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", + tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_def: s1 %d, align %d", s1->_idx, alignment(s1)); } #endif @@ -1123,14 +1111,11 @@ bool SuperWord::follow_use_defs(Node_List* p) { } align = adjust_alignment_for_type_conversion(s1, t1, align); if (stmts_can_pack(t1, t2, align)) { - if (est_savings(t1, t2) >= 0) { - Node_List* pair = new Node_List(); - pair->push(t1); - pair->push(t2); - _packset.append(pair); + if (estimate_cost_savings_when_packing_as_pair(t1, t2) >= 0) { + _pairset.add_pair(t1, t2); #ifndef PRODUCT if (is_trace_superword_alignment()) { - tty->print_cr("SuperWord::follow_use_defs: set_alignment(%d, %d, 
%d)", + tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_def: set_alignment(%d, %d, %d)", t1->_idx, t2->_idx, align); } #endif @@ -1142,13 +1127,11 @@ bool SuperWord::follow_use_defs(Node_List* p) { return changed; } -//------------------------------follow_def_uses--------------------------- -// Extend the packset by visiting uses of nodes in pack p -bool SuperWord::follow_def_uses(Node_List* p) { - bool changed = false; - Node* s1 = p->at(0); - Node* s2 = p->at(1); - assert(p->size() == 2, "just checking"); +// Note: we only extend with a single pair (the one with most savings) for every call. Since we keep +// calling this method as long as there are some changes, we will eventually pack all pairs that +// can be packed. +bool SuperWord::extend_pairset_with_more_pairs_by_following_use(Node* s1, Node* s2) { + assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair"); assert(s1->req() == s2->req(), "just checking"); assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking"); @@ -1157,17 +1140,15 @@ bool SuperWord::follow_def_uses(Node_List* p) { int align = alignment(s1); #ifndef PRODUCT if (is_trace_superword_alignment()) { - tty->print_cr("SuperWord::follow_def_uses: s1 %d, align %d", + tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_use: s1 %d, align %d", s1->_idx, align); } #endif int savings = -1; - int num_s1_uses = 0; Node* u1 = nullptr; Node* u2 = nullptr; for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { Node* t1 = s1->fast_out(i); - num_s1_uses++; if (!in_bb(t1) || t1->is_Mem()) { // Only follow non-memory nodes in block - we do not want to resurrect misaligned packs. 
continue; @@ -1179,12 +1160,11 @@ bool SuperWord::follow_def_uses(Node_List* p) { continue; } if (t2->Opcode() == Op_AddI && t2 == cl()->incr()) continue; // don't mess with the iv - if (!opnd_positions_match(s1, t1, s2, t2)) - continue; + if (order_inputs_of_uses_to_match_def_pair(s1, s2, t1, t2) != PairOrderStatus::Ordered) { continue; } int adjusted_align = alignment(s1); adjusted_align = adjust_alignment_for_type_conversion(s1, t1, adjusted_align); if (stmts_can_pack(t1, t2, adjusted_align)) { - int my_savings = est_savings(t1, t2); + int my_savings = estimate_cost_savings_when_packing_as_pair(t1, t2); if (my_savings > savings) { savings = my_savings; u1 = t1; @@ -1194,130 +1174,150 @@ bool SuperWord::follow_def_uses(Node_List* p) { } } } - if (num_s1_uses > 1) { - _race_possible = true; - } if (savings >= 0) { - Node_List* pair = new Node_List(); - pair->push(u1); - pair->push(u2); - _packset.append(pair); + _pairset.add_pair(u1, u2); #ifndef PRODUCT if (is_trace_superword_alignment()) { - tty->print_cr("SuperWord::follow_def_uses: set_alignment(%d, %d, %d)", + tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_use: set_alignment(%d, %d, %d)", u1->_idx, u2->_idx, align); } #endif set_alignment(u1, u2, align); - changed = true; + return true; // changed } - return changed; + return false; // no change } -//------------------------------order_def_uses--------------------------- -// For extended packsets, ordinally arrange uses packset by major component -void SuperWord::order_def_uses(Node_List* p) { - Node* s1 = p->at(0); +// For a pair (def1, def2), find all use packs (use1, use2), and ensure that their inputs have an order +// that matches the (def1, def2) pair. 
+void SuperWord::order_inputs_of_all_use_pairs_to_match_def_pair(Node* def1, Node* def2) { + assert(_pairset.is_pair(def1, def2), "(def1, def2) must be a pair"); - if (s1->is_Store()) return; + if (def1->is_Store()) return; // reductions are always managed beforehand - if (is_marked_reduction(s1)) return; + if (is_marked_reduction(def1)) return; - for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { - Node* t1 = s1->fast_out(i); + for (DUIterator_Fast imax, i = def1->fast_outs(imax); i < imax; i++) { + Node* use1 = def1->fast_out(i); // Only allow operand swap on commuting operations - if (!t1->is_Add() && !t1->is_Mul() && !VectorNode::is_muladds2i(t1)) { + if (!use1->is_Add() && !use1->is_Mul() && !VectorNode::is_muladds2i(use1)) { break; } - // Now find t1's packset - Node_List* p2 = nullptr; - for (int j = 0; j < _packset.length(); j++) { - p2 = _packset.at(j); - Node* first = p2->at(0); - if (t1 == first) { - break; - } - p2 = nullptr; - } - // Arrange all sub components by the major component - if (p2 != nullptr) { - for (uint j = 1; j < p->size(); j++) { - Node* d1 = p->at(j); - Node* u1 = p2->at(j); - opnd_positions_match(s1, t1, d1, u1); - } - } + // Find pair (use1, use2) + Node* use2 = _pairset.get_right_or_null_for(use1); + if (use2 == nullptr) { break; } + + order_inputs_of_uses_to_match_def_pair(def1, def2, use1, use2); } } -//---------------------------opnd_positions_match------------------------- -// Is the use of d1 in u1 at the same operand position as d2 in u2? 
-bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) { - // check reductions to see if they are marshalled to represent the reduction - // operator in a specified opnd - if (is_marked_reduction(u1) && is_marked_reduction(u2)) { - // ensure reductions have phis and reduction definitions feeding the 1st operand - Node* first = u1->in(2); - if (first->is_Phi() || is_marked_reduction(first)) { - u1->swap_edges(1, 2); +// For a def-pair (def1, def2), and their use-nodes (use1, use2): +// Ensure that the input order of (use1, use2) matches the order of (def1, def2). +// +// We have different cases: +// +// 1. Reduction (use1, use2): must always reduce left-to-right. Make sure that we have pattern: +// +// phi/reduction x1 phi/reduction x2 phi/reduction x1 +// | | | | and hopefully: | | +// use1 use2 use1 x2 +// | | +// use2 +// +// 2: Commutative operations, just as Add/Mul and their subclasses: we can try to swap edges: +// +// def1 x1 x2 def2 def1 x1 def2 x2 +// | | | | ==> | | | | +// use1 use2 use1 use2 +// +// 3: MulAddS2I (use1, use2): we can try to swap edges: +// +// (x1 * x2) + (x3 * x4) ==> 3.a: (x2 * x1) + (x4 * x3) +// 3.b: (x4 * x3) + (x2 * x1) +// 3.c: (x3 * x4) + (x1 * x2) +// +// Note: MulAddS2I with its 4 inputs is too complicated, if there is any mismatch, we always +// return PairOrderStatus::Unknown. +// Therefore, extend_pairset_with_more_pairs_by_following_use cannot extend to MulAddS2I, +// but there is a chance that extend_pairset_with_more_pairs_by_following_def can do it. +// +// 4: Otherwise, check if the inputs of (use1, use2) already match (def1, def2), i.e. for all input indices i: +// +// use1->in(i) == def1 || use2->in(i) == def2 -> use1->in(i) == def1 && use2->in(i) == def2 +// +SuperWord::PairOrderStatus SuperWord::order_inputs_of_uses_to_match_def_pair(Node* def1, Node* def2, Node* use1, Node* use2) { + assert(_pairset.is_pair(def1, def2), "(def1, def2) must be a pair"); + + // 1. 
Reduction + if (is_marked_reduction(use1) && is_marked_reduction(use2)) { + Node* use1_in2 = use1->in(2); + if (use1_in2->is_Phi() || is_marked_reduction(use1_in2)) { + use1->swap_edges(1, 2); } - // ensure reductions have phis and reduction definitions feeding the 1st operand - first = u2->in(2); - if (first->is_Phi() || is_marked_reduction(first)) { - u2->swap_edges(1, 2); + Node* use2_in2 = use2->in(2); + if (use2_in2->is_Phi() || is_marked_reduction(use2_in2)) { + use2->swap_edges(1, 2); } - return true; + return PairOrderStatus::Ordered; } - uint ct = u1->req(); - if (ct != u2->req()) return false; + uint ct = use1->req(); + if (ct != use2->req()) { return PairOrderStatus::Unordered; }; uint i1 = 0; uint i2 = 0; do { - for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break; - for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break; + for (i1++; i1 < ct; i1++) { if (use1->in(i1) == def1) { break; } } + for (i2++; i2 < ct; i2++) { if (use2->in(i2) == def2) { break; } } if (i1 != i2) { - if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) { - // Further analysis relies on operands position matching. - u2->swap_edges(i1, i2); - } else if (VectorNode::is_muladds2i(u2) && u1 != u2) { + if ((i1 == (3-i2)) && (use2->is_Add() || use2->is_Mul())) { + // 2. Commutative: swap edges, and hope the other position matches too. + use2->swap_edges(i1, i2); + } else if (VectorNode::is_muladds2i(use2) && use1 != use2) { + // 3.a/b: MulAddS2I. 
if (i1 == 5 - i2) { // ((i1 == 3 && i2 == 2) || (i1 == 2 && i2 == 3) || (i1 == 1 && i2 == 4) || (i1 == 4 && i2 == 1)) - u2->swap_edges(1, 2); - u2->swap_edges(3, 4); + use2->swap_edges(1, 2); + use2->swap_edges(3, 4); } if (i1 == 3 - i2 || i1 == 7 - i2) { // ((i1 == 1 && i2 == 2) || (i1 == 2 && i2 == 1) || (i1 == 3 && i2 == 4) || (i1 == 4 && i2 == 3)) - u2->swap_edges(2, 3); - u2->swap_edges(1, 4); + use2->swap_edges(2, 3); + use2->swap_edges(1, 4); } - return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs + return PairOrderStatus::Unknown; } else { - return false; + // 4. The inputs are not ordered, and we cannot do anything about it. + return PairOrderStatus::Unordered; } - } else if (i1 == i2 && VectorNode::is_muladds2i(u2) && u1 != u2) { - u2->swap_edges(1, 3); - u2->swap_edges(2, 4); - return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs + } else if (i1 == i2 && VectorNode::is_muladds2i(use2) && use1 != use2) { + // 3.c: MulAddS2I. + use2->swap_edges(1, 3); + use2->swap_edges(2, 4); + return PairOrderStatus::Unknown; } } while (i1 < ct); - return true; + + // 4. All inputs match. + return PairOrderStatus::Ordered; } -//------------------------------est_savings--------------------------- -// Estimate the savings from executing s1 and s2 as a pack -int SuperWord::est_savings(Node* s1, Node* s2) { +// Estimate the savings from executing s1 and s2 as a pair. 
+int SuperWord::estimate_cost_savings_when_packing_as_pair(const Node* s1, const Node* s2) const { int save_in = 2 - 1; // 2 operations per instruction in packed form + const int adjacent_profit = 2; + auto pack_cost = [&] (const int size) { return size; }; + auto unpack_cost = [&] (const int size) { return size; }; + // inputs for (uint i = 1; i < s1->req(); i++) { Node* x1 = s1->in(i); Node* x2 = s2->in(i); if (x1 != x2) { if (are_adjacent_refs(x1, x2)) { - save_in += adjacent_profit(x1, x2); - } else if (!in_packset(x1, x2)) { + save_in += adjacent_profit; + } else if (!_pairset.is_pair(x1, x2)) { save_in -= pack_cost(2); } else { save_in += unpack_cost(2); @@ -1326,85 +1326,76 @@ int SuperWord::est_savings(Node* s1, Node* s2) { } // uses of result - uint ct = 0; + uint number_of_packed_use_pairs = 0; int save_use = 0; for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { - Node* s1_use = s1->fast_out(i); - for (int j = 0; j < _packset.length(); j++) { - Node_List* p = _packset.at(j); - if (p->at(0) == s1_use) { - for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) { - Node* s2_use = s2->fast_out(k); - if (p->at(p->size()-1) == s2_use) { - ct++; - if (are_adjacent_refs(s1_use, s2_use)) { - save_use += adjacent_profit(s1_use, s2_use); - } - } + Node* use1 = s1->fast_out(i); + + // Find pair (use1, use2) + Node* use2 = _pairset.get_right_or_null_for(use1); + if (use2 == nullptr) { continue; } + + for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) { + if (use2 == s2->fast_out(k)) { + // We have pattern: + // + // s1 s2 + // | | + // [use1, use2] + // + number_of_packed_use_pairs++; + if (are_adjacent_refs(use1, use2)) { + save_use += adjacent_profit; } } } } - if (ct < s1->outcnt()) save_use += unpack_cost(1); - if (ct < s2->outcnt()) save_use += unpack_cost(1); + if (number_of_packed_use_pairs < s1->outcnt()) save_use += unpack_cost(1); + if (number_of_packed_use_pairs < s2->outcnt()) save_use += unpack_cost(1); return 
MAX2(save_in, save_use); } -//------------------------------costs--------------------------- -int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; } -int SuperWord::pack_cost(int ct) { return ct; } -int SuperWord::unpack_cost(int ct) { return ct; } - -// Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last +// Combine pairs (n1, n2), (n2, n3), ... into pack (n1, n2, n3 ...) void SuperWord::combine_pairs_to_longer_packs() { #ifdef ASSERT - assert(!_packset.is_empty(), "packset not empty"); - for (int i = 0; i < _packset.length(); i++) { - assert(_packset.at(i) != nullptr, "no nullptr in packset"); - assert(_packset.at(i)->size() == 2, "all packs are pairs"); - } + assert(!_pairset.is_empty(), "pairset not empty"); + assert(_packset.is_empty(), "packset not empty"); #endif - bool changed = true; - // Combine packs regardless max vector size. - while (changed) { - changed = false; - for (int i = 0; i < _packset.length(); i++) { - Node_List* p1 = _packset.at(i); - if (p1 == nullptr) continue; - // Because of sorting we can start at i + 1 - for (int j = i + 1; j < _packset.length(); j++) { - Node_List* p2 = _packset.at(j); - if (p2 == nullptr) continue; - if (p1->at(p1->size()-1) == p2->at(0)) { - for (uint k = 1; k < p2->size(); k++) { - p1->push(p2->at(k)); - } - _packset.at_put(j, nullptr); - changed = true; - } - } + // Iterate pair-chain by pair-chain, each from left-most to right-most. 
+ Node_List* pack = nullptr; + for (PairSetIterator pair(_pairset); !pair.done(); pair.next()) { + Node* left = pair.left(); + Node* right = pair.right(); + if (_pairset.is_left_in_a_left_most_pair(left)) { + assert(pack == nullptr, "no unfinished pack"); + pack = new (arena()) Node_List(arena()); + pack->push(left); + } + assert(pack != nullptr, "must have unfinished pack"); + pack->push(right); + if (_pairset.is_right_in_a_right_most_pair(right)) { + _packset.add_pack(pack); + pack = nullptr; } } - - // Remove all nullptr from packset - compress_packset(); + assert(pack == nullptr, "no unfinished pack"); assert(!_packset.is_empty(), "must have combined some packs"); #ifndef PRODUCT if (is_trace_superword_packset()) { tty->print_cr("\nAfter Superword::combine_pairs_to_longer_packs"); - print_packset(); + _packset.print(); } #endif } -SuperWord::SplitStatus SuperWord::split_pack(const char* split_name, - Node_List* pack, - SplitTask task) +SplitStatus PackSet::split_pack(const char* split_name, + Node_List* pack, + SplitTask task) { uint pack_size = pack->size(); @@ -1416,14 +1407,11 @@ SuperWord::SplitStatus SuperWord::split_pack(const char* split_name, #ifndef PRODUCT if (is_trace_superword_rejections()) { tty->cr(); - tty->print_cr("WARNING: Removed pack during split: %s:", task.message()); + tty->print_cr("WARNING: Removed pack: %s:", task.message()); print_pack(pack); } #endif - for (uint i = 0; i < pack_size; i++) { - Node* n = pack->at(i); - set_my_pack(n, nullptr); - } + unmap_all_nodes_in_pack(pack); return SplitStatus::make_rejected(); } @@ -1453,10 +1441,7 @@ SuperWord::SplitStatus SuperWord::split_pack(const char* split_name, print_pack(pack); } #endif - for (uint i = 0; i < pack_size; i++) { - Node* n = pack->at(i); - set_my_pack(n, nullptr); - } + unmap_all_nodes_in_pack(pack); return SplitStatus::make_rejected(); } @@ -1464,7 +1449,7 @@ SuperWord::SplitStatus SuperWord::split_pack(const char* split_name, if (new_size < 2) { assert(new_size == 1 && 
old_size >= 2, "implied"); Node* n = pack->pop(); - set_my_pack(n, nullptr); + unmap_node_in_pack(n); #ifndef PRODUCT if (is_trace_superword_rejections()) { tty->cr(); @@ -1480,7 +1465,7 @@ SuperWord::SplitStatus SuperWord::split_pack(const char* split_name, assert(old_size == 1 && new_size >= 2, "implied"); Node* n = pack->at(0); pack->remove(0); - set_my_pack(n, nullptr); + unmap_node_in_pack(n); #ifndef PRODUCT if (is_trace_superword_rejections()) { tty->cr(); @@ -1498,7 +1483,7 @@ SuperWord::SplitStatus SuperWord::split_pack(const char* split_name, for (uint i = 0; i < new_size; i++) { Node* n = pack->at(old_size + i); new_pack->push(n); - set_my_pack(n, new_pack); + remap_node_in_pack(n, new_pack); } for (uint i = 0; i < new_size; i++) { @@ -1511,130 +1496,110 @@ SuperWord::SplitStatus SuperWord::split_pack(const char* split_name, } template -void SuperWord::split_packs(const char* split_name, - SplitStrategy strategy) { +void PackSet::split_packs(const char* split_name, + SplitStrategy strategy) { bool changed; do { changed = false; int new_packset_length = 0; - for (int i = 0; i < _packset.length(); i++) { - Node_List* pack = _packset.at(i); + for (int i = 0; i < _packs.length(); i++) { + Node_List* pack = _packs.at(i); assert(pack != nullptr && pack->size() >= 2, "no nullptr, at least size 2"); SplitTask task = strategy(pack); SplitStatus status = split_pack(split_name, pack, task); changed |= !status.is_unchanged(); Node_List* first_pack = status.first_pack(); Node_List* second_pack = status.second_pack(); - _packset.at_put(i, nullptr); // take out pack + _packs.at_put(i, nullptr); // take out pack if (first_pack != nullptr) { // The first pack can be put at the current position assert(i >= new_packset_length, "only move packs down"); - _packset.at_put(new_packset_length++, first_pack); + _packs.at_put(new_packset_length++, first_pack); } if (second_pack != nullptr) { // The second node has to be appended at the end - _packset.append(second_pack); + 
_packs.append(second_pack); } } - _packset.trunc_to(new_packset_length); + _packs.trunc_to(new_packset_length); } while (changed); #ifndef PRODUCT if (is_trace_superword_packset()) { tty->print_cr("\nAfter %s", split_name); - print_packset(); + print(); } #endif } // Split packs at boundaries where left and right have different use or def packs. void SuperWord::split_packs_at_use_def_boundaries() { - split_packs("SuperWord::split_packs_at_use_def_boundaries", - [&](const Node_List* pack) { - uint pack_size = pack->size(); - uint boundary = find_use_def_boundary(pack); - assert(boundary < pack_size, "valid boundary %d", boundary); - if (boundary != 0) { - return SplitTask::make_split(pack_size - boundary, "found a use/def boundary"); - } - return SplitTask::make_unchanged(); - }); + auto split_strategy = [&](const Node_List* pack) { + uint pack_size = pack->size(); + uint boundary = find_use_def_boundary(pack); + assert(boundary < pack_size, "valid boundary %d", boundary); + if (boundary != 0) { + return SplitTask::make_split(pack_size - boundary, "found a use/def boundary"); + } + return SplitTask::make_unchanged(); + }; + _packset.split_packs("SuperWord::split_packs_at_use_def_boundaries", split_strategy); } // Split packs that are only implemented with a smaller pack size. Also splits packs // such that they eventually have power of 2 size. 
void SuperWord::split_packs_only_implemented_with_smaller_size() { - split_packs("SuperWord::split_packs_only_implemented_with_smaller_size", - [&](const Node_List* pack) { - uint pack_size = pack->size(); - uint implemented_size = max_implemented_size(pack); - if (implemented_size == 0) { - return SplitTask::make_rejected("not implemented at any smaller size"); - } - assert(is_power_of_2(implemented_size), "power of 2 size or zero: %d", implemented_size); - if (implemented_size != pack_size) { - return SplitTask::make_split(implemented_size, "only implemented at smaller size"); - } - return SplitTask::make_unchanged(); - }); + auto split_strategy = [&](const Node_List* pack) { + uint pack_size = pack->size(); + uint implemented_size = max_implemented_size(pack); + if (implemented_size == 0) { + return SplitTask::make_rejected("not implemented at any smaller size"); + } + assert(is_power_of_2(implemented_size), "power of 2 size or zero: %d", implemented_size); + if (implemented_size != pack_size) { + return SplitTask::make_split(implemented_size, "only implemented at smaller size"); + } + return SplitTask::make_unchanged(); + }; + _packset.split_packs("SuperWord::split_packs_only_implemented_with_smaller_size", split_strategy); } // Split packs that have a mutual dependency, until all packs are mutually_independent. void SuperWord::split_packs_to_break_mutual_dependence() { - split_packs("SuperWord::split_packs_to_break_mutual_dependence", - [&](const Node_List* pack) { - uint pack_size = pack->size(); - assert(is_power_of_2(pack_size), "ensured by earlier splits %d", pack_size); - if (!is_marked_reduction(pack->at(0)) && - !mutually_independent(pack)) { - // As a best guess, we split the pack in half. This way, we iteratively make the - // packs smaller, until there is no dependency. 
- return SplitTask::make_split(pack_size >> 1, "was not mutually independent"); - } - return SplitTask::make_unchanged(); - }); + auto split_strategy = [&](const Node_List* pack) { + uint pack_size = pack->size(); + assert(is_power_of_2(pack_size), "ensured by earlier splits %d", pack_size); + if (!is_marked_reduction(pack->at(0)) && + !mutually_independent(pack)) { + // As a best guess, we split the pack in half. This way, we iteratively make the + // packs smaller, until there is no dependency. + return SplitTask::make_split(pack_size >> 1, "was not mutually independent"); + } + return SplitTask::make_unchanged(); + }; + _packset.split_packs("SuperWord::split_packs_to_break_mutual_dependence", split_strategy); } template -void SuperWord::filter_packs(const char* filter_name, - const char* error_message, +void PackSet::filter_packs(const char* filter_name, + const char* rejection_message, FilterPredicate filter) { - int new_packset_length = 0; - for (int i = 0; i < _packset.length(); i++) { - Node_List* pack = _packset.at(i); - assert(pack != nullptr, "no nullptr in packset"); + auto split_strategy = [&](const Node_List* pack) { if (filter(pack)) { - assert(i >= new_packset_length, "only move packs down"); - _packset.at_put(new_packset_length++, pack); + return SplitTask::make_unchanged(); } else { - remove_pack_at(i); -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->cr(); - tty->print_cr("WARNING: Removed pack: %s:", error_message); - print_pack(pack); - } -#endif + return SplitTask::make_rejected(rejection_message); } - } - - assert(_packset.length() >= new_packset_length, "filter only reduces number of packs"); - _packset.trunc_to(new_packset_length); - -#ifndef PRODUCT - if (is_trace_superword_packset() && filter_name != nullptr) { - tty->print_cr("\nAfter %s:", filter_name); - print_packset(); - } -#endif + }; + split_packs(filter_name, split_strategy); } void SuperWord::filter_packs_for_power_of_2_size() { - 
filter_packs("SuperWord::filter_packs_for_power_of_2_size", - "size is not a power of 2", - [&](const Node_List* pack) { - return is_power_of_2(pack->size()); - }); + auto filter = [&](const Node_List* pack) { + return is_power_of_2(pack->size()); + }; + _packset.filter_packs("SuperWord::filter_packs_for_power_of_2_size", + "size is not a power of 2", filter); } // We know that the nodes in a pair pack were independent - this gives us independence @@ -1658,13 +1623,13 @@ void SuperWord::filter_packs_for_power_of_2_size() { // for (int i ...) { v[i] = v[i + 1] + 5; } // for (int i ...) { v[i - 1] = v[i] + 5; } void SuperWord::filter_packs_for_mutual_independence() { - filter_packs("SuperWord::filter_packs_for_mutual_independence", - "found dependency between nodes at distance greater than 1", - [&](const Node_List* pack) { - // reductions are trivially connected - return is_marked_reduction(pack->at(0)) || - mutually_independent(pack); - }); + auto filter = [&](const Node_List* pack) { + // reductions are trivially connected + return is_marked_reduction(pack->at(0)) || + mutually_independent(pack); + }; + _packset.filter_packs("SuperWord::filter_packs_for_mutual_independence", + "found dependency between nodes at distance greater than 1", filter); } // Find the set of alignment solutions for load/store pack. @@ -1713,35 +1678,36 @@ void SuperWord::filter_packs_for_alignment() { int mem_ops_count = 0; int mem_ops_rejected = 0; - filter_packs("SuperWord::filter_packs_for_alignment", - "rejected by AlignVector (strict alignment requirement)", - [&](const Node_List* pack) { - // Only memops need to be aligned. - if (!pack->at(0)->is_Load() && - !pack->at(0)->is_Store()) { - return true; // accept all non memops - } + auto filter = [&](const Node_List* pack) { + // Only memops need to be aligned. 
+ if (!pack->at(0)->is_Load() && + !pack->at(0)->is_Store()) { + return true; // accept all non memops + } - mem_ops_count++; - const AlignmentSolution* s = pack_alignment_solution(pack); - const AlignmentSolution* intersect = current->filter(s); + mem_ops_count++; + const AlignmentSolution* s = pack_alignment_solution(pack); + const AlignmentSolution* intersect = current->filter(s); #ifndef PRODUCT - if (is_trace_align_vector()) { - tty->print(" solution for pack: "); - s->print(); - tty->print(" intersection with current: "); - intersect->print(); - } + if (is_trace_align_vector()) { + tty->print(" solution for pack: "); + s->print(); + tty->print(" intersection with current: "); + intersect->print(); + } #endif - if (intersect->is_empty()) { - mem_ops_rejected++; - return false; // reject because of empty solution - } + if (intersect->is_empty()) { + mem_ops_rejected++; + return false; // reject because of empty solution + } - current = intersect; - return true; // accept because of non-empty solution - }); + current = intersect; + return true; // accept because of non-empty solution + }; + + _packset.filter_packs("SuperWord::filter_packs_for_alignment", + "rejected by AlignVector (strict alignment requirement)", filter); #ifndef PRODUCT if (is_trace_superword_info() || is_trace_align_vector()) { @@ -1760,47 +1726,13 @@ void SuperWord::filter_packs_for_alignment() { } } -// Compress packset, such that it has no nullptr entries -void SuperWord::compress_packset() { - int j = 0; - for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); - if (p != nullptr) { - _packset.at_put(j, p); - j++; - } - } - _packset.trunc_to(j); -} - -//-----------------------------construct_my_pack_map-------------------------- -// Construct the map from nodes to packs. Only valid after the -// point where a node is only in one pack (after combine_pairs_to_longer_packs). 
-void SuperWord::construct_my_pack_map() { - for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); - for (uint j = 0; j < p->size(); j++) { - Node* s = p->at(j); -#ifdef ASSERT - if (my_pack(s) != nullptr) { - s->dump(1); - tty->print_cr("packs[%d]:", i); - print_pack(p); - assert(false, "only in one pack"); - } -#endif - set_my_pack(s, p); - } - } -} - // Remove packs that are not implemented void SuperWord::filter_packs_for_implemented() { - filter_packs("SuperWord::filter_packs_for_implemented", - "Unimplemented", - [&](const Node_List* pack) { - return implemented(pack, pack->size()); - }); + auto filter = [&](const Node_List* pack) { + return implemented(pack, pack->size()); + }; + _packset.filter_packs("SuperWord::filter_packs_for_implemented", + "Unimplemented", filter); } // Remove packs that are not profitable. @@ -1818,30 +1750,15 @@ void SuperWord::filter_packs_for_profitable() { } // Remove packs that are not profitable - while (true) { - int old_packset_length = _packset.length(); - filter_packs(nullptr, // don't dump each time - "not profitable", - [&](const Node_List* pack) { - return profitable(pack); - }); - // Repeat until stable - if (old_packset_length == _packset.length()) { - break; - } - } - -#ifndef PRODUCT - if (is_trace_superword_packset()) { - tty->print_cr("\nAfter Superword::filter_packs_for_profitable"); - print_packset(); - tty->cr(); - } -#endif + auto filter = [&](const Node_List* pack) { + return profitable(pack); + }; + _packset.filter_packs("Superword::filter_packs_for_profitable", + "not profitable", filter); } // Can code be generated for the pack, restricted to size nodes? 
-bool SuperWord::implemented(const Node_List* pack, uint size) { +bool SuperWord::implemented(const Node_List* pack, const uint size) const { assert(size >= 2 && size <= pack->size() && is_power_of_2(size), "valid size"); bool retValue = false; Node* p0 = pack->at(0); @@ -1917,7 +1834,7 @@ bool SuperWord::requires_long_to_int_conversion(int opc) { //------------------------------same_inputs-------------------------- // For pack p, are all idx operands the same? -bool SuperWord::same_inputs(const Node_List* p, int idx) { +bool SuperWord::same_inputs(const Node_List* p, int idx) const { Node* p0 = p->at(0); uint vlen = p->size(); Node* p0_def = p0->in(idx); @@ -1933,7 +1850,7 @@ bool SuperWord::same_inputs(const Node_List* p, int idx) { //------------------------------profitable--------------------------- // For pack p, are all operands and all uses (with in the block) vector? -bool SuperWord::profitable(const Node_List* p) { +bool SuperWord::profitable(const Node_List* p) const { Node* p0 = p->at(0); uint start, end; VectorNode::vector_operands(p0, &start, &end); @@ -1951,7 +1868,7 @@ bool SuperWord::profitable(const Node_List* p) { // Check if reductions are connected if (is_marked_reduction(p0)) { Node* second_in = p0->in(2); - Node_List* second_pk = my_pack(second_in); + Node_List* second_pk = get_pack(second_in); if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) { // No parent pack or not enough work // to cover reduction expansion overhead @@ -1964,7 +1881,7 @@ bool SuperWord::profitable(const Node_List* p) { // For now, return false if shift count is vector or not scalar promotion // case (different shift counts) because it is not supported yet. 
Node* cnt = p0->in(2); - Node_List* cnt_pk = my_pack(cnt); + Node_List* cnt_pk = get_pack(cnt); if (cnt_pk != nullptr) return false; if (!same_inputs(p, 2)) @@ -2017,12 +1934,12 @@ bool SuperWord::profitable(const Node_List* p) { if (p0->is_CMove()) { // Verify that CMove has a matching Bool pack BoolNode* bol = p0->in(1)->as_Bool(); - if (bol == nullptr || my_pack(bol) == nullptr) { + if (bol == nullptr || get_pack(bol) == nullptr) { return false; } // Verify that Bool has a matching Cmp pack CmpNode* cmp = bol->in(1)->as_Cmp(); - if (cmp == nullptr || my_pack(cmp) == nullptr) { + if (cmp == nullptr || get_pack(cmp) == nullptr) { return false; } } @@ -2030,37 +1947,57 @@ bool SuperWord::profitable(const Node_List* p) { } #ifdef ASSERT -void SuperWord::verify_packs() { - // Verify independence at pack level. +void SuperWord::verify_packs() const { + _packset.verify(); + + // All packs must be: for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); - if (!is_marked_reduction(p->at(0)) && - !mutually_independent(p)) { + Node_List* pack = _packset.at(i); + + // 1. Mutually independent (or a reduction). + if (!is_marked_reduction(pack->at(0)) && + !mutually_independent(pack)) { tty->print_cr("FAILURE: nodes not mutually independent in pack[%d]", i); - print_pack(p); + _packset.print_pack(pack); assert(false, "pack nodes not mutually independent"); } - } - // Verify all nodes in packset have my_pack set correctly. + // 2. Implemented. + if (!implemented(pack, pack->size())) { + tty->print_cr("FAILURE: nodes not implementable in pack[%d]", i); + _packset.print_pack(pack); + assert(false, "pack not implementable"); + } + + // 3. Profitable. + if (!profitable(pack)) { + tty->print_cr("FAILURE: nodes not profitable in pack[%d]", i); + _packset.print_pack(pack); + assert(false, "pack not profitable"); + } + } +} + +void PackSet::verify() const { + // Verify all nodes in packset have pack set correctly. 
ResourceMark rm; Unique_Node_List processed; - for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); + for (int i = 0; i < _packs.length(); i++) { + Node_List* p = _packs.at(i); for (uint k = 0; k < p->size(); k++) { Node* n = p->at(k); - assert(in_bb(n), "only nodes in bb can be in packset"); + assert(_vloop.in_bb(n), "only nodes in bb can be in packset"); assert(!processed.member(n), "node should only occur once in packset"); - assert(my_pack(n) == p, "n has consisten packset info"); + assert(get_pack(n) == p, "n has consisten packset info"); processed.push(n); } } - // Check that no other node has my_pack set. - for (int i = 0; i < body().length(); i++) { - Node* n = body().at(i); + // Check that no other node has pack set. + for (int i = 0; i < _body.body().length(); i++) { + Node* n = _body.body().at(i); if (!processed.member(n)) { - assert(my_pack(n) == nullptr, "should not have pack if not in packset"); + assert(get_pack(n) == nullptr, "should not have pack if not in packset"); } } } @@ -2087,7 +2024,7 @@ class PacksetGraph { private: // pid: packset graph node id. GrowableArray _pid; // bb_idx(n) -> pid - GrowableArray _pid_to_node; // one node per pid, find rest via my_pack + GrowableArray _pid_to_node; // one node per pid, find rest via _packset.pack GrowableArray> _out; // out-edges GrowableArray _incnt; // number of (implicit) in-edges int _max_pid = 0; @@ -2141,7 +2078,7 @@ public: // Create nodes (from packs and scalar-nodes), and add edges, based on the dependency graph. 
void build() { - const GrowableArray& packset = _slp->packset(); + const PackSet& packset = _slp->packset(); const GrowableArray& body = _slp->body(); // Map nodes in packsets for (int i = 0; i < packset.length(); i++) { @@ -2150,7 +2087,7 @@ public: for (uint k = 0; k < p->size(); k++) { Node* n = p->at(k); set_pid(n, pid); - assert(_slp->my_pack(n) == p, "matching packset"); + assert(packset.get_pack(n) == p, "matching packset"); } } @@ -2166,7 +2103,7 @@ public: if (pid == 0) { pid = new_pid(); set_pid(n, pid); - assert(_slp->my_pack(n) == nullptr, "no packset"); + assert(packset.get_pack(n) == nullptr, "no packset"); } } @@ -2232,7 +2169,7 @@ public: // Add memops to memops_schedule Node* n = get_node(pid); - Node_List* p = _slp->my_pack(n); + Node_List* p = _slp->packset().get_pack(n); if (n->is_Mem()) { if (p == nullptr) { memops_schedule.push(n); @@ -2440,7 +2377,7 @@ bool SuperWord::output() { CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); assert(cl->is_main_loop(), "SLP should only work on main loops"); Compile* C = phase()->C; - if (_packset.length() == 0) { + if (_packset.is_empty()) { return false; } @@ -2461,7 +2398,7 @@ bool SuperWord::output() { for (int i = 0; i < body().length(); i++) { Node* n = body().at(i); - Node_List* p = my_pack(n); + Node_List* p = get_pack(n); if (p != nullptr && n == p->at(p->size()-1)) { // After schedule_reorder_memops, we know that the memops have the same order in the pack // as in the memory slice. 
Hence, "first" is the first memop in the slice from the pack, @@ -2551,7 +2488,7 @@ bool SuperWord::output() { bol_test == BoolTest::lt || bol_test == BoolTest::le, "CMove bool should be one of: eq,ne,ge,ge,lt,le"); - Node_List* p_bol = my_pack(bol); + Node_List* p_bol = get_pack(bol); assert(p_bol != nullptr, "CMove must have matching Bool pack"); #ifdef ASSERT @@ -2564,7 +2501,7 @@ bool SuperWord::output() { CmpNode* cmp = bol->in(1)->as_Cmp(); assert(cmp != nullptr, "must have cmp above CMove"); - Node_List* p_cmp = my_pack(cmp); + Node_List* p_cmp = get_pack(cmp); assert(p_cmp != nullptr, "Bool must have matching Cmp pack"); Node* cmp_in1 = vector_opd(p_cmp, 1); @@ -2883,7 +2820,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { for (uint i = 1; i < vlen; i++) { Node* pi = p->at(i); Node* in = pi->in(opd_idx); - if (my_pack(in) != nullptr) { + if (get_pack(in) != nullptr) { assert(false, "Should already have been unpacked"); return nullptr; } @@ -2891,7 +2828,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { pk->add_opd(in); if (VectorNode::is_muladds2i(pi)) { Node* in2 = pi->in(opd_idx + 2); - if (my_pack(in2) != nullptr) { + if (get_pack(in2) != nullptr) { assert(false, "Should already have been unpacked"); return nullptr; } @@ -2924,7 +2861,7 @@ void SuperWord::verify_no_extract() { for (uint k = 0; k < use->req(); k++) { Node* maybe_def = use->in(k); if (def == maybe_def) { - Node_List* p_use = my_pack(use); + Node_List* p_use = get_pack(use); if (is_marked_reduction(def)) { continue; } assert(p_use != nullptr && is_vector_use(use, k), "all uses must be vector uses"); } @@ -2937,13 +2874,13 @@ void SuperWord::verify_no_extract() { // Check if n_super's pack uses are a superset of n_sub's pack uses. 
bool SuperWord::has_use_pack_superset(const Node* n_super, const Node* n_sub) const { - Node_List* pack = my_pack(n_super); - assert(pack != nullptr && pack == my_pack(n_sub), "must have the same pack"); + Node_List* pack = get_pack(n_super); + assert(pack != nullptr && pack == get_pack(n_sub), "must have the same pack"); // For all uses of n_sub that are in a pack (use_sub) ... for (DUIterator_Fast jmax, j = n_sub->fast_outs(jmax); j < jmax; j++) { Node* use_sub = n_sub->fast_out(j); - Node_List* pack_use_sub = my_pack(use_sub); + Node_List* pack_use_sub = get_pack(use_sub); if (pack_use_sub == nullptr) { continue; } // ... and all input edges: use_sub->in(i) == n_sub. @@ -2956,7 +2893,7 @@ bool SuperWord::has_use_pack_superset(const Node* n_super, const Node* n_sub) co bool found = false; for (DUIterator_Fast kmax, k = n_super->fast_outs(kmax); k < kmax; k++) { Node* use_super = n_super->fast_out(k); - Node_List* pack_use_super = my_pack(use_super); + Node_List* pack_use_super = get_pack(use_super); if (pack_use_sub != pack_use_super) { continue; } // ... and where there is an edge use_super->in(i) == n_super. @@ -3005,7 +2942,7 @@ uint SuperWord::find_use_def_boundary(const Node_List* pack) const { // No boundary if: // 1) the same packs OR // 2) reduction edge n0->n1 or n1->n0 - if (my_pack(n0_in) != my_pack(n1_in) && + if (get_pack(n0_in) != get_pack(n1_in) && !((n0 == n1_in || n1 == n0_in) && is_reduction_pack)) { return i + 1; } @@ -3025,12 +2962,12 @@ uint SuperWord::find_use_def_boundary(const Node_List* pack) const { //------------------------------is_vector_use--------------------------- // Is use->in(u_idx) a vector use? 
-bool SuperWord::is_vector_use(Node* use, int u_idx) { - Node_List* u_pk = my_pack(use); +bool SuperWord::is_vector_use(Node* use, int u_idx) const { + Node_List* u_pk = get_pack(use); if (u_pk == nullptr) return false; if (is_marked_reduction(use)) return true; Node* def = use->in(u_idx); - Node_List* d_pk = my_pack(def); + Node_List* d_pk = get_pack(def); if (d_pk == nullptr) { Node* n = u_pk->at(0)->in(u_idx); if (n == iv()) { @@ -3213,7 +3150,7 @@ void SuperWord::initialize_node_info() { grow_node_info(bb_idx(last)); } -BasicType SuperWord::longer_type_for_conversion(Node* n) { +BasicType SuperWord::longer_type_for_conversion(Node* n) const { if (!(VectorNode::is_convert_opcode(n->Opcode()) || requires_long_to_int_conversion(n->Opcode())) || !in_bb(n->in(1))) { @@ -3427,49 +3364,6 @@ bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const { _vloop.phase()->C->get_alias_index(m2->adr_type()); } -//------------------------------in_packset--------------------------- -// Are s1 and s2 in a pack pair and ordered as s1,s2? 
-bool SuperWord::in_packset(Node* s1, Node* s2) { - for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); - assert(p->size() == 2, "must be"); - if (p->at(0) == s1 && p->at(p->size()-1) == s2) { - return true; - } - } - return false; -} - -//------------------------------remove_pack_at--------------------------- -// Remove the pack at position pos in the packset -void SuperWord::remove_pack_at(int pos) { - Node_List* p = _packset.at(pos); - for (uint i = 0; i < p->size(); i++) { - Node* s = p->at(i); - set_my_pack(s, nullptr); - } - _packset.at_put(pos, nullptr); -} - -void SuperWord::packset_sort(int n) { - // simple bubble sort so that we capitalize with O(n) when its already sorted - do { - int max_swap_index = 0; - for (int i = 1; i < n; i++) { - Node_List* q_low = _packset.at(i-1); - Node_List* q_i = _packset.at(i); - - // only swap when we find something to swap - if (alignment(q_low->at(0)) > alignment(q_i->at(0))) { - *(_packset.adr_at(i)) = q_low; - *(_packset.adr_at(i-1)) = q_i; - max_swap_index = i; - } - } - n = max_swap_index; - } while (n > 1); -} - LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) { LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest; for (uint i = 0; i < p->size(); i++) { @@ -3822,29 +3716,46 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() { igvn().replace_input_of(pre_opaq, 1, constrained_limit); } -//------------------------------print_packset--------------------------- -void SuperWord::print_packset() { #ifndef PRODUCT - tty->print_cr("packset"); - for (int i = 0; i < _packset.length(); i++) { - tty->print_cr("Pack: %d", i); - Node_List* p = _packset.at(i); - if (p == nullptr) { - tty->print_cr(" nullptr"); - } else { - print_pack(p); +void PairSet::print() const { + tty->print_cr("\nPairSet::print: %d pairs", length()); + int chain = 0; + int chain_index = 0; + for (PairSetIterator pair(*this); !pair.done(); pair.next()) { + Node* left = pair.left(); 
+ Node* right = pair.right(); + if (is_left_in_a_left_most_pair(left)) { + chain_index = 0; + tty->print_cr(" Pair-chain %d:", chain++); + tty->print(" %3d: ", chain_index++); + left->dump(); } + tty->print(" %3d: ", chain_index++); + right->dump(); } -#endif } -//------------------------------print_pack--------------------------- -void SuperWord::print_pack(Node_List* p) { - for (uint i = 0; i < p->size(); i++) { - print_stmt(p->at(i)); +void PackSet::print() const { + tty->print_cr("\nPackSet::print: %d packs", _packs.length()); + for (int i = 0; i < _packs.length(); i++) { + tty->print_cr(" Pack: %d", i); + Node_List* pack = _packs.at(i); + if (pack == nullptr) { + tty->print_cr(" nullptr"); + } else { + print_pack(pack); + } } } +void PackSet::print_pack(Node_List* pack) { + for (uint i = 0; i < pack->size(); i++) { + tty->print(" %3d: ", i); + pack->at(i)->dump(); + } +} +#endif + #ifndef PRODUCT void VLoopBody::print() const { tty->print_cr("\nBlock"); @@ -3858,14 +3769,6 @@ void VLoopBody::print() const { } #endif -//------------------------------print_stmt--------------------------- -void SuperWord::print_stmt(Node* s) { -#ifndef PRODUCT - tty->print(" align: %d \t", alignment(s)); - s->dump(); -#endif -} - // ========================= SWNodeInfo ===================== const SWNodeInfo SWNodeInfo::initial; diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index ba5944d48b4..e4f2e30052c 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -57,6 +57,333 @@ class VPointer; +// The PairSet is a set of pairs. These are later combined to packs, +// and stored in the PackSet. +class PairSet : public StackObj { +private: + const VLoop& _vloop; + const VLoopBody& _body; + + // Doubly-linked pairs. 
If not linked: -1 + GrowableArray _left_to_right; // bb_idx -> bb_idx + GrowableArray _right_to_left; // bb_idx -> bb_idx + // Example: + // + // Pairs: (n1, n2) and (n2, n3) + // bb_idx(n1) = 1 + // bb_idx(n2) = 3 + // bb_idx(n3) = 5 + // + // index / bb_idx: 0 1 2 3 4 5 6 + // + // left_to_right: | | 3 | | 5 | | | | + // n1-----> + // n2-----> + // + // right_to_left: | | | | 1 | | 3 | | + // <------n2 + // <------n3 + // + // Nodes with bb_idx 0, 2, 4, and 6 are in no pair, they are thus neither left nor right elements, + // and hence have no entries in the mapping. + // + // Nodes with bb_idx 1 and 3 (n1 and n2) are both a left element in some pair. Therefore, they both + // have an entry in the left_to_right mapping. This mapping indicates which right element they are + // paired with, namely the nodes with bb_idx 3 and 5 (n2 and n3), respectively. + // + // Nodes with bb_idx 3 and 5 (n2 and n4) are both a right element in some pair. Therefore, they both + // have an entry in the right_to_left mapping. This mapping indicates which left element they are + // paired with, namely the nodes with bb_idx 1 and 3 (n1 and n2), respectively. + // + // Node n1 with bb_idx 1 is not a right element in any pair, thus its right_to_left is empty. + // + // Node n2 with bb_idx 3 is both a left element of pair (n2, n3), and a right element of pair (n1, n2). + // Thus it has entries in both left_to_right (mapping n2->n3) and right_to_left (mapping n2->n1). + // + // Node n3 with bb_idx 5 is not a left element in any pair, thus its left_to_right is empty. + + // List of all left elements bb_idx, in the order of pair addition. + GrowableArray _lefts_in_insertion_order; + +public: + // Initialize empty, i.e. all not linked (-1). 
+ PairSet(Arena* arena, const VLoopAnalyzer& vloop_analyzer) : + _vloop(vloop_analyzer.vloop()), + _body(vloop_analyzer.body()), + _left_to_right(arena, _body.body().length(), _body.body().length(), -1), + _right_to_left(arena, _body.body().length(), _body.body().length(), -1), + _lefts_in_insertion_order(arena, 8, 0, 0) {} + + const VLoopBody& body() const { return _body; } + + bool is_empty() const { return _lefts_in_insertion_order.is_empty(); } + + bool is_left(int i) const { return _left_to_right.at(i) != -1; } + bool is_right(int i) const { return _right_to_left.at(i) != -1; } + bool is_left(const Node* n) const { return _vloop.in_bb(n) && is_left( _body.bb_idx(n)); } + bool is_right(const Node* n) const { return _vloop.in_bb(n) && is_right(_body.bb_idx(n)); } + + bool is_pair(const Node* n1, const Node* n2) const { return is_left(n1) && get_right_for(n1) == n2; } + + bool is_left_in_a_left_most_pair(int i) const { return is_left(i) && !is_right(i); } + bool is_right_in_a_right_most_pair(int i) const { return !is_left(i) && is_right(i); } + bool is_left_in_a_left_most_pair(const Node* n) const { return is_left_in_a_left_most_pair( _body.bb_idx(n)); } + bool is_right_in_a_right_most_pair(const Node* n) const { return is_right_in_a_right_most_pair(_body.bb_idx(n)); } + + int get_right_for(int i) const { return _left_to_right.at(i); } + Node* get_right_for(const Node* n) const { return _body.body().at(get_right_for(_body.bb_idx(n))); } + Node* get_right_or_null_for(const Node* n) const { return is_left(n) ? 
get_right_for(n) : nullptr; } + + // To access elements in insertion order: + int length() const { return _lefts_in_insertion_order.length(); } + Node* left_at_in_insertion_order(int i) const { return _body.body().at(_lefts_in_insertion_order.at(i)); } + Node* right_at_in_insertion_order(int i) const { return _body.body().at(get_right_for(_lefts_in_insertion_order.at(i))); } + + void add_pair(Node* n1, Node* n2) { + assert(n1 != nullptr && n2 != nullptr && n1 != n2, "no nullptr, and different nodes"); + assert(!is_left(n1) && !is_right(n2), "cannot be left twice, or right twice"); + int bb_idx_1 = _body.bb_idx(n1); + int bb_idx_2 = _body.bb_idx(n2); + _left_to_right.at_put(bb_idx_1, bb_idx_2); + _right_to_left.at_put(bb_idx_2, bb_idx_1); + _lefts_in_insertion_order.append(bb_idx_1); + assert(is_left(n1) && is_right(n2), "must be set now"); + } + + NOT_PRODUCT(void print() const;) +}; + +// Iterate over the PairSet, pair-chain by pair-chain. +// A pair-chain starts with a "left-most" pair (n1, n2), where n1 is never a right-element +// in any pair. We walk a chain: (n2, n3), (n3, n4) ... until we hit a "right-most" pair +// where the right-element is never a left-element of any pair. +// These pair-chains will later be combined into packs by combine_pairs_to_longer_packs. +class PairSetIterator : public StackObj { +private: + const PairSet& _pairset; + const VLoopBody& _body; + + int _chain_start_bb_idx; // bb_idx of left-element in the left-most pair. + int _current_bb_idx; // bb_idx of left-element of the current pair. 
+ const int _end_bb_idx; + +public: + PairSetIterator(const PairSet& pairset) : + _pairset(pairset), + _body(pairset.body()), + _chain_start_bb_idx(-1), + _current_bb_idx(-1), + _end_bb_idx(_body.body().length()) + { + next_chain(); + } + + bool done() const { + return _chain_start_bb_idx >= _end_bb_idx; + } + + Node* left() const { + return _body.body().at(_current_bb_idx); + } + + Node* right() const { + int bb_idx_2 = _pairset.get_right_for(_current_bb_idx); + return _body.body().at(bb_idx_2); + } + + // Try to keep walking on the current pair-chain, else find a new pair-chain. + void next() { + assert(_pairset.is_left(_current_bb_idx), "current was valid"); + _current_bb_idx = _pairset.get_right_for(_current_bb_idx); + if (!_pairset.is_left(_current_bb_idx)) { + next_chain(); + } + } + +private: + void next_chain() { + do { + _chain_start_bb_idx++; + } while (!done() && !_pairset.is_left_in_a_left_most_pair(_chain_start_bb_idx)); + _current_bb_idx = _chain_start_bb_idx; + } +}; + +class SplitTask { +private: + enum Kind { + // The lambda method for split_packs can return one of these tasks: + Unchanged, // The pack is left in the packset, unchanged. + Rejected, // The pack is removed from the packset. + Split, // Split away split_size nodes from the end of the pack. 
+ }; + const Kind _kind; + const uint _split_size; + const char* _message; + + SplitTask(const Kind kind, const uint split_size, const char* message) : + _kind(kind), _split_size(split_size), _message(message) + { + assert(message != nullptr, "must have message"); + assert(_kind != Unchanged || split_size == 0, "unchanged task conditions"); + assert(_kind != Rejected || split_size == 0, "reject task conditions"); + assert(_kind != Split || split_size != 0, "split task conditions"); + } + +public: + static SplitTask make_split(const uint split_size, const char* message) { + return SplitTask(Split, split_size, message); + } + + static SplitTask make_unchanged() { + return SplitTask(Unchanged, 0, "unchanged"); + } + + static SplitTask make_rejected(const char* message) { + return SplitTask(Rejected, 0, message); + } + + bool is_unchanged() const { return _kind == Unchanged; } + bool is_rejected() const { return _kind == Rejected; } + bool is_split() const { return _kind == Split; } + const char* message() const { return _message; } + + uint split_size() const { + assert(is_split(), "only split tasks have split_size"); + return _split_size; + } +}; + +class SplitStatus { +private: + enum Kind { + // After split_pack, we have: first_pack second_pack + Unchanged, // The pack is left in the pack, unchanged. old_pack nullptr + Rejected, // The pack is removed from the packset. nullptr nullptr + Modified, // The pack had some nodes removed. old_pack nullptr + Split, // The pack was split into two packs. 
pack1 pack2 + }; + Kind _kind; + Node_List* _first_pack; + Node_List* _second_pack; + + SplitStatus(Kind kind, Node_List* first_pack, Node_List* second_pack) : + _kind(kind), _first_pack(first_pack), _second_pack(second_pack) + { + assert(_kind != Unchanged || (first_pack != nullptr && second_pack == nullptr), "unchanged status conditions"); + assert(_kind != Rejected || (first_pack == nullptr && second_pack == nullptr), "rejected status conditions"); + assert(_kind != Modified || (first_pack != nullptr && second_pack == nullptr), "modified status conditions"); + assert(_kind != Split || (first_pack != nullptr && second_pack != nullptr), "split status conditions"); + } + +public: + static SplitStatus make_unchanged(Node_List* old_pack) { + return SplitStatus(Unchanged, old_pack, nullptr); + } + + static SplitStatus make_rejected() { + return SplitStatus(Rejected, nullptr, nullptr); + } + + static SplitStatus make_modified(Node_List* first_pack) { + return SplitStatus(Modified, first_pack, nullptr); + } + + static SplitStatus make_split(Node_List* first_pack, Node_List* second_pack) { + return SplitStatus(Split, first_pack, second_pack); + } + + bool is_unchanged() const { return _kind == Unchanged; } + Node_List* first_pack() const { return _first_pack; } + Node_List* second_pack() const { return _second_pack; } +}; + +class PackSet : public StackObj { +private: + const VLoop& _vloop; + const VLoopBody& _body; + + // Set of all packs: + GrowableArray _packs; + + // Mapping from nodes to their pack: bb_idx -> pack + GrowableArray _node_to_pack; + + NOT_PRODUCT(const bool _trace_packset;) + NOT_PRODUCT(const bool _trace_rejections;) + +public: + // Initialize empty, i.e. no packs, and unmapped (nullptr). 
+ PackSet(Arena* arena, const VLoopAnalyzer& vloop_analyzer + NOT_PRODUCT(COMMA bool trace_packset COMMA bool trace_rejections) + ) : + _vloop(vloop_analyzer.vloop()), + _body(vloop_analyzer.body()), + _packs(arena, 8, 0, nullptr), + _node_to_pack(arena, _body.body().length(), _body.body().length(), nullptr) + NOT_PRODUCT(COMMA _trace_packset(trace_packset)) + NOT_PRODUCT(COMMA _trace_rejections(trace_rejections)) + {} + + // Accessors to iterate over packs. + int length() const { return _packs.length(); } + bool is_empty() const { return _packs.is_empty(); } + Node_List* at(int i) const { return _packs.at(i); } + +private: + void map_node_in_pack(const Node* n, Node_List* new_pack) { + assert(get_pack(n) == nullptr, "was previously unmapped"); + _node_to_pack.at_put(_body.bb_idx(n), new_pack); + } + + void remap_node_in_pack(const Node* n, Node_List* new_pack) { + assert(get_pack(n) != nullptr && new_pack != nullptr && get_pack(n) != new_pack, "was previously mapped"); + _node_to_pack.at_put(_body.bb_idx(n), new_pack); + } + + void unmap_node_in_pack(const Node* n) { + assert(get_pack(n) != nullptr, "was previously mapped"); + _node_to_pack.at_put(_body.bb_idx(n), nullptr); + } + + void unmap_all_nodes_in_pack(Node_List* old_pack) { + for (uint i = 0; i < old_pack->size(); i++) { + unmap_node_in_pack(old_pack->at(i)); + } + } +public: + Node_List* get_pack(const Node* n) const { return !_vloop.in_bb(n) ? 
nullptr : _node_to_pack.at(_body.bb_idx(n)); } + + void add_pack(Node_List* pack) { + _packs.append(pack); + for (uint i = 0; i < pack->size(); i++) { + Node* n = pack->at(i); + map_node_in_pack(n, pack); + } + } + +private: + SplitStatus split_pack(const char* split_name, Node_List* pack, SplitTask task); +public: + template <typename SplitStrategy> + void split_packs(const char* split_name, SplitStrategy strategy); + + template <typename FilterPredicate> + void filter_packs(const char* filter_name, + const char* rejection_message, + FilterPredicate filter); + + void clear() { _packs.clear(); } + +private: + NOT_PRODUCT(bool is_trace_superword_packset() const { return _trace_packset; }) + NOT_PRODUCT(bool is_trace_superword_rejections() const { return _trace_rejections; }) +public: + DEBUG_ONLY(void verify() const;) + NOT_PRODUCT(void print() const;) + NOT_PRODUCT(static void print_pack(Node_List* pack);) +}; + // ========================= SuperWord ===================== // -----------------------------SWNodeInfo--------------------------------- @@ -64,9 +391,8 @@ class VPointer; class SWNodeInfo { public: int _alignment; // memory alignment for a node - Node_List* _my_pack; // pack containing this node - SWNodeInfo() : _alignment(-1), _my_pack(nullptr) {} + SWNodeInfo() : _alignment(-1) {} static const SWNodeInfo initial; }; @@ -83,12 +409,13 @@ class SuperWord : public ResourceObj { enum consts { top_align = -1, bottom_align = -666 }; - GrowableArray<Node_List*> _packset; // Packs for the current block - GrowableArray<SWNodeInfo> _node_info; // Info needed per node CloneMap& _clone_map; // map of nodes created in cloning MemNode const* _align_to_ref; // Memory reference that pre-loop will align to + PairSet _pairset; + PackSet _packset; + public: SuperWord(const VLoopAnalyzer &vloop_analyzer); @@ -112,7 +439,7 @@ class SuperWord : public ResourceObj { return _vloop_analyzer.reductions().is_marked_reduction(n); } - bool reduction(Node* n1, Node* n2) const { + bool reduction(const Node* n1, const Node* n2) const { return
_vloop_analyzer.reductions().is_marked_reduction_pair(n1, n2); } @@ -219,9 +546,10 @@ class SuperWord : public ResourceObj { bool do_vector_loop() { return _do_vector_loop; } - const GrowableArray<Node_List*>& packset() const { return _packset; } + const PackSet& packset() const { return _packset; } + Node_List* get_pack(const Node* n) const { return _packset.get_pack(n); } + private: - bool _race_possible; // In cases where SDMU is true bool _do_vector_loop; // whether to do vectorization/simd style int _num_work_vecs; // Number of non memory vector operations int _num_reductions; // Number of reduction expressions applied @@ -240,18 +568,13 @@ class SuperWord : public ResourceObj { bool vectors_should_be_aligned() { return !Matcher::misaligned_vectors_ok() || AlignVector; } // memory alignment for a node - int alignment(Node* n) { return _node_info.adr_at(bb_idx(n))->_alignment; } + int alignment(Node* n) const { return _node_info.adr_at(bb_idx(n))->_alignment; } void set_alignment(Node* n, int a) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_alignment = a; } - // my_pack - public: - Node_List* my_pack(const Node* n) const { return !in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; } - private: - void set_my_pack(Node* n, Node_List* p) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; } // is pack good for converting into one vector node replacing bunches of Cmp, Bool, CMov nodes. static bool requires_long_to_int_conversion(int opc); // For pack p, are all idx operands the same? - bool same_inputs(const Node_List* p, int idx); + bool same_inputs(const Node_List* p, int idx) const; // CloneMap utilities bool same_origin_idx(Node* a, Node* b) const; bool same_generation(Node* a, Node* b) const; @@ -267,10 +590,8 @@ private: // Can s1 and s2 be in a pack with s1 immediately preceding s2 and s1 aligned at "align" bool stmts_can_pack(Node* s1, Node* s2, int align); - // Does s exist in a pack at position pos?
- bool exists_at(Node* s, uint pos); // Is s1 immediately before s2 in memory? - bool are_adjacent_refs(Node* s1, Node* s2); + bool are_adjacent_refs(Node* s1, Node* s2) const; // Are s1 and s2 similar? bool isomorphic(Node* s1, Node* s2); // Do we have pattern n1 = (iv + c) and n2 = (iv + c + 1)? @@ -279,143 +600,31 @@ private: // do s1 and s2 have similar input edges? bool have_similar_inputs(Node* s1, Node* s2); void set_alignment(Node* s1, Node* s2, int align); - // Extend packset by following use->def and def->use links from pack members. - void extend_packset_with_more_pairs_by_following_use_and_def(); int adjust_alignment_for_type_conversion(Node* s, Node* t, int align); - // Extend the packset by visiting operand definitions of nodes in pack p - bool follow_use_defs(Node_List* p); - // Extend the packset by visiting uses of nodes in pack p - bool follow_def_uses(Node_List* p); - // For extended packsets, ordinally arrange uses packset by major component - void order_def_uses(Node_List* p); - // Estimate the savings from executing s1 and s2 as a pack - int est_savings(Node* s1, Node* s2); - int adjacent_profit(Node* s1, Node* s2); - int pack_cost(int ct); - int unpack_cost(int ct); - // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last + void extend_pairset_with_more_pairs_by_following_use_and_def(); + bool extend_pairset_with_more_pairs_by_following_def(Node* s1, Node* s2); + bool extend_pairset_with_more_pairs_by_following_use(Node* s1, Node* s2); + void order_inputs_of_all_use_pairs_to_match_def_pair(Node* def1, Node* def2); + enum PairOrderStatus { Ordered, Unordered, Unknown }; + PairOrderStatus order_inputs_of_uses_to_match_def_pair(Node* def1, Node* def2, Node* use1, Node* use2); + int estimate_cost_savings_when_packing_as_pair(const Node* s1, const Node* s2) const; + void combine_pairs_to_longer_packs(); - class SplitTask { - private: - enum Kind { - // The lambda method for split_packs can return one of these 
tasks: - Unchanged, // The pack is left in the packset, unchanged. - Rejected, // The pack is removed from the packset. - Split, // Split away split_size nodes from the end of the pack. - }; - const Kind _kind; - const uint _split_size; - const char* _message; - - SplitTask(const Kind kind, const uint split_size, const char* message) : - _kind(kind), _split_size(split_size), _message(message) - { - assert(message != nullptr, "must have message"); - assert(_kind != Unchanged || split_size == 0, "unchanged task conditions"); - assert(_kind != Rejected || split_size == 0, "reject task conditions"); - assert(_kind != Split || split_size != 0, "split task conditions"); - } - - public: - static SplitTask make_split(const uint split_size, const char* message) { - return SplitTask(Split, split_size, message); - } - - static SplitTask make_unchanged() { - return SplitTask(Unchanged, 0, "unchanged"); - } - - static SplitTask make_rejected(const char* message) { - return SplitTask(Rejected, 0, message); - } - - bool is_unchanged() const { return _kind == Unchanged; } - bool is_rejected() const { return _kind == Rejected; } - bool is_split() const { return _kind == Split; } - const char* message() const { return _message; } - - uint split_size() const { - assert(is_split(), "only split tasks have split_size"); - return _split_size; - } - }; - - class SplitStatus { - private: - enum Kind { - // After split_pack, we have: first_pack second_pack - Unchanged, // The pack is left in the pack, unchanged. old_pack nullptr - Rejected, // The pack is removed from the packset. nullptr nullptr - Modified, // The pack had some nodes removed. old_pack nullptr - Split, // The pack was split into two packs. 
pack1 pack2 - }; - Kind _kind; - Node_List* _first_pack; - Node_List* _second_pack; - - SplitStatus(Kind kind, Node_List* first_pack, Node_List* second_pack) : - _kind(kind), _first_pack(first_pack), _second_pack(second_pack) - { - assert(_kind != Unchanged || (first_pack != nullptr && second_pack == nullptr), "unchanged status conditions"); - assert(_kind != Rejected || (first_pack == nullptr && second_pack == nullptr), "rejected status conditions"); - assert(_kind != Modified || (first_pack != nullptr && second_pack == nullptr), "modified status conditions"); - assert(_kind != Split || (first_pack != nullptr && second_pack != nullptr), "split status conditions"); - } - - public: - static SplitStatus make_unchanged(Node_List* old_pack) { - return SplitStatus(Unchanged, old_pack, nullptr); - } - - static SplitStatus make_rejected() { - return SplitStatus(Rejected, nullptr, nullptr); - } - - static SplitStatus make_modified(Node_List* first_pack) { - return SplitStatus(Modified, first_pack, nullptr); - } - - static SplitStatus make_split(Node_List* first_pack, Node_List* second_pack) { - return SplitStatus(Split, first_pack, second_pack); - } - - bool is_unchanged() const { return _kind == Unchanged; } - Node_List* first_pack() const { return _first_pack; } - Node_List* second_pack() const { return _second_pack; } - }; - - SplitStatus split_pack(const char* split_name, Node_List* pack, SplitTask task); - template - void split_packs(const char* split_name, SplitStrategy strategy); - void split_packs_at_use_def_boundaries(); void split_packs_only_implemented_with_smaller_size(); void split_packs_to_break_mutual_dependence(); - // Filter out packs with various filter predicates - template - void filter_packs(const char* filter_name, - const char* error_message, - FilterPredicate filter); void filter_packs_for_power_of_2_size(); void filter_packs_for_mutual_independence(); - // Ensure all packs are aligned, if AlignVector is on. 
void filter_packs_for_alignment(); - // Find the set of alignment solutions for load/store pack. const AlignmentSolution* pack_alignment_solution(const Node_List* pack); - // Compress packset, such that it has no nullptr entries. - void compress_packset(); - // Construct the map from nodes to packs. - void construct_my_pack_map(); - // Remove packs that are not implemented. void filter_packs_for_implemented(); - // Remove packs that are not profitable. void filter_packs_for_profitable(); - // Verify that for every pack, all nodes are mutually independent. - // Also verify that packset and my_pack are consistent. - DEBUG_ONLY(void verify_packs();) + + DEBUG_ONLY(void verify_packs() const;) + // Adjust the memory graph for the packed operations void schedule(); // Helper function for schedule, that reorders all memops, slice by slice, according to the schedule @@ -427,12 +636,13 @@ private: Node* vector_opd(Node_List* p, int opd_idx); // Can code be generated for the pack, restricted to size nodes? - bool implemented(const Node_List* pack, uint size); + bool implemented(const Node_List* pack, const uint size) const; // Find the maximal implemented size smaller or equal to the packs size uint max_implemented_size(const Node_List* pack); // For pack p, are all operands and all uses (with in the block) vector? - bool profitable(const Node_List* p); + bool profitable(const Node_List* p) const; + // Verify that all uses of packs are also packs, i.e. we do not need extract operations. DEBUG_ONLY(void verify_no_extract();) @@ -440,35 +650,22 @@ private: bool has_use_pack_superset(const Node* n1, const Node* n2) const; // Find a boundary in the pack, where left and right have different pack uses and defs. uint find_use_def_boundary(const Node_List* pack) const; + // Is use->in(u_idx) a vector use? 
- bool is_vector_use(Node* use, int u_idx); + bool is_vector_use(Node* use, int u_idx) const; // Initialize per node info void initialize_node_info(); - // Compute max depth for expressions from beginning of block - void compute_max_depth(); // Return the longer type for vectorizable type-conversion node or illegal type for other nodes. - BasicType longer_type_for_conversion(Node* n); + BasicType longer_type_for_conversion(Node* n) const; // Find the longest type in def-use chain for packed nodes, and then compute the max vector size. int max_vector_size_in_def_use_chain(Node* n); - // Are s1 and s2 in a pack pair and ordered as s1,s2? - bool in_packset(Node* s1, Node* s2); - // Remove the pack at position pos in the packset - void remove_pack_at(int pos); + static LoadNode::ControlDependency control_dependency(Node_List* p); // Alignment within a vector memory reference int memory_alignment(MemNode* s, int iv_adjust); // Ensure that the main loop vectors are aligned by adjusting the pre loop limit. void adjust_pre_loop_limit_to_align_main_loop_vectors(); - // Is the use of d1 in u1 at the same operand position as d2 in u2? - bool opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2); - - // print methods - void print_packset(); - void print_pack(Node_List* p); - void print_stmt(Node* s); - - void packset_sort(int n); }; #endif // SHARE_OPTO_SUPERWORD_HPP diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 88a46a3d688..acc6bbf475d 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -275,7 +275,7 @@ public: bool is_marked_reduction_loop() const { return !_loop_reductions.is_empty(); } // Are s1 and s2 reductions with a data path between them? - bool is_marked_reduction_pair(Node* s1, Node* s2) const; + bool is_marked_reduction_pair(const Node* s1, const Node* s2) const; private: // Whether n is a standard reduction operator. 
diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index f5fdae79e32..f81c29649bb 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -426,7 +426,7 @@ bool VectorNode::is_type_transition_to_int(Node* n) { return is_type_transition_short_to_int(n); } -bool VectorNode::is_muladds2i(Node* n) { +bool VectorNode::is_muladds2i(const Node* n) { if (n->Opcode() == Op_MulAddS2I) { return true; } diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index e4d3d013cd3..740c07d64ff 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -102,7 +102,7 @@ class VectorNode : public TypeNode { static bool is_vshift_cnt(Node* n); static bool is_type_transition_short_to_int(Node* n); static bool is_type_transition_to_int(Node* n); - static bool is_muladds2i(Node* n); + static bool is_muladds2i(const Node* n); static bool is_roundopD(Node* n); static bool is_scalar_rotate(Node* n); static bool is_vector_rotate_supported(int opc, uint vlen, BasicType bt); diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestMulAddS2I.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestMulAddS2I.java index 3cb0152659c..c65da58b285 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestMulAddS2I.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestMulAddS2I.java @@ -23,7 +23,7 @@ /** * @test - * @bug 8310886 + * @bug 8310886 8325252 * @summary Test MulAddS2I vectorization. 
* @library /test/lib / * @run driver compiler.loopopts.superword.TestMulAddS2I @@ -36,14 +36,20 @@ import jdk.test.lib.Asserts; import jdk.test.lib.Platform; public class TestMulAddS2I { - static final int RANGE = 1024; + static final int RANGE = 1024*16; static final int ITER = RANGE/2 - 1; static short[] sArr1 = new short[RANGE]; static short[] sArr2 = new short[RANGE]; + static int[] ioutArr = new int[RANGE]; static final int[] GOLDEN_A; static final int[] GOLDEN_B; static final int[] GOLDEN_C; + static final int[] GOLDEN_D; + static final int[] GOLDEN_E; + static final int[] GOLDEN_F; + static final int[] GOLDEN_G; + static final int[] GOLDEN_H; static { for (int i = 0; i < RANGE; i++) { @@ -53,6 +59,11 @@ public class TestMulAddS2I { GOLDEN_A = testa(); GOLDEN_B = testb(); GOLDEN_C = testc(); + GOLDEN_D = testd(); + GOLDEN_E = teste(); + GOLDEN_F = testf(); + GOLDEN_G = testg(); + GOLDEN_H = testh(); } @@ -65,12 +76,17 @@ public class TestMulAddS2I { } } - @Run(test = {"testa", "testb", "testc"}) + @Run(test = {"testa", "testb", "testc", "testd", "teste", "testf", "testg", "testh"}) @Warmup(0) public static void run() { compare(testa(), GOLDEN_A, "testa"); compare(testb(), GOLDEN_B, "testb"); - compare(testb(), GOLDEN_C, "testc"); + compare(testc(), GOLDEN_C, "testc"); + compare(testd(), GOLDEN_D, "testd"); + compare(teste(), GOLDEN_E, "teste"); + compare(testf(), GOLDEN_F, "testf"); + compare(testg(), GOLDEN_G, "testg"); + compare(testh(), GOLDEN_H, "testh"); } public static void compare(int[] out, int[] golden, String name) { @@ -133,4 +149,99 @@ public class TestMulAddS2I { } return out; } + + @Test + @IR(applyIfCPUFeature = {"sse2", "true"}, + applyIfPlatform = {"64-bit", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + @IR(applyIfCPUFeature = {"asimd", "true"}, + applyIf = {"MaxVectorSize", "16"}, // AD file requires vector_length = 16 + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + 
@IR(applyIfCPUFeature = {"avx512_vnni", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI_VNNI, "> 0"}) + public static int[] testd() { + int[] out = ioutArr; + for (int i = 0; i < ITER-2; i+=2) { + // Unrolled, with the same structure. + out[i+0] += ((sArr1[2*i+0] * sArr2[2*i+0]) + (sArr1[2*i+1] * sArr2[2*i+1])); + out[i+1] += ((sArr1[2*i+2] * sArr2[2*i+2]) + (sArr1[2*i+3] * sArr2[2*i+3])); + } + return out; + } + + @Test + @IR(applyIfCPUFeature = {"sse2", "true"}, + applyIfPlatform = {"64-bit", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + @IR(applyIfCPUFeature = {"asimd", "true"}, + applyIf = {"MaxVectorSize", "16"}, // AD file requires vector_length = 16 + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + @IR(applyIfCPUFeature = {"avx512_vnni", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI_VNNI, "> 0"}) + public static int[] teste() { + int[] out = ioutArr; + for (int i = 0; i < ITER-2; i+=2) { + // Unrolled, with some swaps. + out[i+0] += ((sArr1[2*i+0] * sArr2[2*i+0]) + (sArr1[2*i+1] * sArr2[2*i+1])); + out[i+1] += ((sArr2[2*i+2] * sArr1[2*i+2]) + (sArr1[2*i+3] * sArr2[2*i+3])); // swap(1 2) + } + return out; + } + + @Test + @IR(applyIfCPUFeature = {"sse2", "true"}, + applyIfPlatform = {"64-bit", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + @IR(applyIfCPUFeature = {"asimd", "true"}, + applyIf = {"MaxVectorSize", "16"}, // AD file requires vector_length = 16 + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + @IR(applyIfCPUFeature = {"avx512_vnni", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI_VNNI, "> 0"}) + public static int[] testf() { + int[] out = ioutArr; + for (int i = 0; i < ITER-2; i+=2) { + // Unrolled, with some swaps. 
+ out[i+0] += ((sArr1[2*i+0] * sArr2[2*i+0]) + (sArr1[2*i+1] * sArr2[2*i+1])); + out[i+1] += ((sArr2[2*i+2] * sArr1[2*i+2]) + (sArr2[2*i+3] * sArr1[2*i+3])); // swap(1 2), swap(3 4) + } + return out; + } + + @Test + @IR(applyIfCPUFeature = {"sse2", "true"}, + applyIfPlatform = {"64-bit", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + @IR(applyIfCPUFeature = {"asimd", "true"}, + applyIf = {"MaxVectorSize", "16"}, // AD file requires vector_length = 16 + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + @IR(applyIfCPUFeature = {"avx512_vnni", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI_VNNI, "> 0"}) + public static int[] testg() { + int[] out = ioutArr; + for (int i = 0; i < ITER-2; i+=2) { + // Unrolled, with some swaps. + out[i+0] += ((sArr1[2*i+0] * sArr2[2*i+0]) + (sArr1[2*i+1] * sArr2[2*i+1])); + out[i+1] += ((sArr1[2*i+3] * sArr2[2*i+3]) + (sArr1[2*i+2] * sArr2[2*i+2])); // swap(1 3), swap(2 4) + } + return out; + } + + @Test + @IR(applyIfCPUFeature = {"sse2", "true"}, + applyIfPlatform = {"64-bit", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + @IR(applyIfCPUFeature = {"asimd", "true"}, + applyIf = {"MaxVectorSize", "16"}, // AD file requires vector_length = 16 + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI, "> 0"}) + @IR(applyIfCPUFeature = {"avx512_vnni", "true"}, + counts = {IRNode.MUL_ADD_S2I, "> 0", IRNode.MUL_ADD_VS2VI_VNNI, "> 0"}) + public static int[] testh() { + int[] out = ioutArr; + for (int i = 0; i < ITER-2; i+=2) { + // Unrolled, with some swaps. + out[i+0] += ((sArr1[2*i+0] * sArr2[2*i+0]) + (sArr1[2*i+1] * sArr2[2*i+1])); + out[i+1] += ((sArr2[2*i+3] * sArr1[2*i+3]) + (sArr2[2*i+2] * sArr1[2*i+2])); // swap(1 4), swap(2 3) + } + return out; + } }