8325155: C2 SuperWord: remove alignment boundaries

Reviewed-by: chagedorn, kvn
This commit is contained in:
Emanuel Peter 2024-06-07 05:01:23 +00:00
parent d8af58941b
commit 944aeb81b1
7 changed files with 722 additions and 470 deletions

View File

@ -43,7 +43,6 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
_vloop_analyzer(vloop_analyzer),
_vloop(vloop_analyzer.vloop()),
_arena(mtCompiler),
_node_info(arena(), _vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
_clone_map(phase()->C->clone_map()), // map of nodes created in cloning
_pairset(&_arena, _vloop_analyzer),
_packset(&_arena, _vloop_analyzer
@ -453,11 +452,8 @@ bool SuperWord::transform_loop() {
bool SuperWord::SLP_extract() {
assert(cl()->is_main_loop(), "SLP should only work on main loops");
// Ensure extra info is allocated.
initialize_node_info();
// Attempt vectorization
find_adjacent_refs();
// Find "seed" pairs.
create_adjacent_memop_pairs();
if (_pairset.is_empty()) {
#ifndef PRODUCT
@ -491,245 +487,133 @@ bool SuperWord::SLP_extract() {
return output();
}
//------------------------------find_adjacent_refs---------------------------
// Find the adjacent memory references and create pack pairs for them.
// We can find adjacent memory references by comparing their relative
// alignment. Whether the final vectors can be aligned is determined later
// once all vectors are extended and combined.
void SuperWord::find_adjacent_refs() {
// Get list of memory operations
Node_List memops;
for (int i = 0; i < body().length(); i++) {
Node* n = body().at(i);
if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
is_java_primitive(n->as_Mem()->memory_type())) {
int align = memory_alignment(n->as_Mem(), 0);
if (align != bottom_align) {
memops.push(n);
}
}
}
// Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization.
void SuperWord::create_adjacent_memop_pairs() {
ResourceMark rm;
GrowableArray<const VPointer*> vpointers;
collect_valid_vpointers(vpointers);
// Sort the VPointers. This does 2 things:
// - Separate the VPointer into groups: all memops that have the same opcode and the same
// VPointer, except for the offset. Adjacent memops must have the same opcode and the
// same VPointer, except for a shift in the offset. Thus, two memops can only be adjacent
// if they are in the same group. This decreases the work.
// - Sort by offset inside the groups. This decreases the work needed to determine adjacent
// memops inside a group.
vpointers.sort(VPointer::cmp_for_sort);
#ifndef PRODUCT
if (is_trace_superword_adjacent_memops()) {
tty->print_cr("\nfind_adjacent_refs found %d memops", memops.size());
tty->print_cr("\nSuperWord::create_adjacent_memop_pairs:");
}
#endif
int max_idx;
while (memops.size() != 0) {
// Find a memory reference to align to.
MemNode* mem_ref = find_align_to_ref(memops, max_idx);
if (mem_ref == nullptr) break;
int iv_adjustment = get_iv_adjustment(mem_ref);
const VPointer& align_to_ref_p = vpointer(mem_ref);
// Set alignment relative to "align_to_ref" for all related memory operations.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* s = memops.at(i)->as_Mem();
if (isomorphic(s, mem_ref) &&
(!_do_vector_loop || same_origin_idx(s, mem_ref))) {
const VPointer& p2 = vpointer(s);
if (p2.comparable(align_to_ref_p)) {
int align = memory_alignment(s, iv_adjustment);
set_alignment(s, align);
}
}
}
// Create initial pack pairs of memory operations for which alignment was set.
for (uint i = 0; i < memops.size(); i++) {
Node* s1 = memops.at(i);
int align = alignment(s1);
if (align == top_align) continue;
for (uint j = 0; j < memops.size(); j++) {
Node* s2 = memops.at(j);
if (alignment(s2) == top_align) continue;
if (s1 != s2 && are_adjacent_refs(s1, s2)) {
if (stmts_can_pack(s1, s2, align)) {
if (!_do_vector_loop || same_origin_idx(s1, s2)) {
_pairset.add_pair(s1, s2);
}
}
}
}
}
// Remove used mem nodes.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* m = memops.at(i)->as_Mem();
if (alignment(m) != top_align) {
memops.remove(i);
}
}
} // while (memops.size() != 0)
create_adjacent_memop_pairs_in_all_groups(vpointers);
#ifndef PRODUCT
if (is_trace_superword_packset()) {
tty->print_cr("\nAfter Superword::find_adjacent_refs");
tty->print_cr("\nAfter Superword::create_adjacent_memop_pairs");
_pairset.print();
}
#endif
}
//------------------------------find_align_to_ref---------------------------
// Find a memory reference to align the loop induction variable to.
// Looks first at stores then at loads, looking for a memory reference
// with the largest number of references similar to it.
MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
// Count number of comparable memory ops
for (uint i = 0; i < memops.size(); i++) {
MemNode* s1 = memops.at(i)->as_Mem();
const VPointer& p1 = vpointer(s1);
for (uint j = i+1; j < memops.size(); j++) {
MemNode* s2 = memops.at(j)->as_Mem();
if (isomorphic(s1, s2)) {
const VPointer& p2 = vpointer(s2);
if (p1.comparable(p2)) {
(*cmp_ct.adr_at(i))++;
(*cmp_ct.adr_at(j))++;
}
}
// Collect all memops vpointers that could potentially be vectorized.
void SuperWord::collect_valid_vpointers(GrowableArray<const VPointer*>& vpointers) {
for_each_mem([&] (const MemNode* mem, int bb_idx) {
const VPointer& p = vpointer(mem);
if (p.valid() &&
!mem->is_LoadStore() &&
is_java_primitive(mem->memory_type())) {
vpointers.append(&p);
}
}
// Find Store (or Load) with the greatest number of "comparable" references,
// biggest vector size, smallest data size and smallest iv offset.
int max_ct = 0;
int max_vw = 0;
int max_idx = -1;
int min_size = max_jint;
int min_iv_offset = max_jint;
for (uint j = 0; j < memops.size(); j++) {
MemNode* s = memops.at(j)->as_Mem();
if (s->is_Store()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
const VPointer& p = vpointer(s);
if ( cmp_ct.at(j) > max_ct ||
(cmp_ct.at(j) == max_ct &&
( vw > max_vw ||
(vw == max_vw &&
( data_size(s) < min_size ||
(data_size(s) == min_size &&
p.offset_in_bytes() < min_iv_offset)))))) {
max_ct = cmp_ct.at(j);
max_vw = vw;
max_idx = j;
min_size = data_size(s);
min_iv_offset = p.offset_in_bytes();
}
}
}
// If no stores, look at loads
if (max_ct == 0) {
for (uint j = 0; j < memops.size(); j++) {
MemNode* s = memops.at(j)->as_Mem();
if (s->is_Load()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
const VPointer& p = vpointer(s);
if ( cmp_ct.at(j) > max_ct ||
(cmp_ct.at(j) == max_ct &&
( vw > max_vw ||
(vw == max_vw &&
( data_size(s) < min_size ||
(data_size(s) == min_size &&
p.offset_in_bytes() < min_iv_offset)))))) {
max_ct = cmp_ct.at(j);
max_vw = vw;
max_idx = j;
min_size = data_size(s);
min_iv_offset = p.offset_in_bytes();
}
}
}
}
#ifndef PRODUCT
if (is_trace_superword_verbose()) {
tty->print_cr("\nVector memops after find_align_to_ref");
for (uint i = 0; i < memops.size(); i++) {
MemNode* s = memops.at(i)->as_Mem();
s->dump();
}
}
#endif
idx = max_idx;
if (max_ct > 0) {
#ifndef PRODUCT
if (is_trace_superword_adjacent_memops()) {
tty->print("SuperWord::find_align_to_ref: ");
memops.at(max_idx)->as_Mem()->dump();
}
#endif
return memops.at(max_idx)->as_Mem();
}
return nullptr;
});
}
//---------------------------get_vw_bytes_special------------------------
int SuperWord::get_vw_bytes_special(MemNode* s) {
// Get the vector width in bytes.
int vw = vector_width_in_bytes(s);
// Check for special case where there is an MulAddS2I usage where short vectors are going to need combined.
BasicType btype = velt_basic_type(s);
if (type2aelembytes(btype) == 2) {
bool should_combine_adjacent = true;
for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) {
Node* user = s->fast_out(i);
if (!VectorNode::is_muladds2i(user)) {
should_combine_adjacent = false;
}
}
if (should_combine_adjacent) {
vw = MIN2(Matcher::max_vector_size_auto_vectorization(btype)*type2aelembytes(btype), vw * 2);
}
// For each group, find the adjacent memops.
void SuperWord::create_adjacent_memop_pairs_in_all_groups(const GrowableArray<const VPointer*> &vpointers) {
int group_start = 0;
while (group_start < vpointers.length()) {
int group_end = find_group_end(vpointers, group_start);
create_adjacent_memop_pairs_in_one_group(vpointers, group_start, group_end);
group_start = group_end;
}
// Check for special case where there is a type conversion between different data size.
int vectsize = max_vector_size_in_def_use_chain(s);
if (vectsize < Matcher::max_vector_size_auto_vectorization(btype)) {
vw = MIN2(vectsize * type2aelembytes(btype), vw);
}
return vw;
}
//---------------------------get_iv_adjustment---------------------------
// Calculate loop's iv adjustment for this memory ops.
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
const VPointer& align_to_ref_p = vpointer(mem_ref);
int offset = align_to_ref_p.offset_in_bytes();
int scale = align_to_ref_p.scale_in_bytes();
int elt_size = align_to_ref_p.memory_size();
int vw = get_vw_bytes_special(mem_ref);
assert(vw > 1, "sanity");
int iv_adjustment;
if (scale != 0) {
int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
// At least one iteration is executed in pre-loop by default. As result
// several iterations are needed to align memory operations in main-loop even
// if offset is 0.
int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
iv_adjustment = iv_adjustment_in_bytes/elt_size;
} else {
// This memory op is not dependent on iv (scale == 0)
iv_adjustment = 0;
// Step forward until we find a VPointer of another group, or we reach the end of the array.
int SuperWord::find_group_end(const GrowableArray<const VPointer*>& vpointers, int group_start) {
  const int length = vpointers.length();
  int group_end = group_start + 1;
  // Advance while the next VPointer still compares equal to the group's first
  // element under the group ordering (same base, opcode, scale and invar).
  for (; group_end < length; group_end++) {
    if (VPointer::cmp_for_sort_by_group(vpointers.adr_at(group_start),
                                        vpointers.adr_at(group_end)) != 0) {
      break;
    }
  }
  return group_end;
}
// Find adjacent memops for a single group, e.g. for all LoadI of the same base, invar, etc.
// Create pairs and add them to the pairset.
void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArray<const VPointer*>& vpointers, const int group_start, const int group_end) {
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
mem_ref->dump();
if (is_trace_superword_adjacent_memops()) {
tty->print_cr(" group:");
for (int i = group_start; i < group_end; i++) {
const VPointer* p = vpointers.at(i);
tty->print(" ");
p->print();
}
}
#endif
return iv_adjustment;
MemNode* first = vpointers.at(group_start)->mem();
int element_size = data_size(first);
// For each ref in group: find others that can be paired:
for (int i = group_start; i < group_end; i++) {
const VPointer* p1 = vpointers.at(i);
MemNode* mem1 = p1->mem();
bool found = false;
// For each ref in group with larger or equal offset:
for (int j = i + 1; j < group_end; j++) {
const VPointer* p2 = vpointers.at(j);
MemNode* mem2 = p2->mem();
assert(mem1 != mem2, "look only at pair of different memops");
// Check for correct distance.
assert(data_size(mem1) == element_size, "all nodes in group must have the same element size");
assert(data_size(mem2) == element_size, "all nodes in group must have the same element size");
assert(p1->offset_in_bytes() <= p2->offset_in_bytes(), "must be sorted by offset");
if (p1->offset_in_bytes() + element_size > p2->offset_in_bytes()) { continue; }
if (p1->offset_in_bytes() + element_size < p2->offset_in_bytes()) { break; }
// Only allow nodes from same origin idx to be packed (see CompileCommand Option Vectorize)
if (_do_vector_loop && !same_origin_idx(mem1, mem2)) { continue; }
if (!can_pack_into_pair(mem1, mem2)) { continue; }
#ifndef PRODUCT
if (is_trace_superword_adjacent_memops()) {
if (found) {
tty->print_cr(" WARNING: multiple pairs with the same node. Ignored pairing:");
} else {
tty->print_cr(" pair:");
}
tty->print(" ");
p1->print();
tty->print(" ");
p2->print();
}
#endif
if (!found) {
_pairset.add_pair(mem1, mem2);
}
}
}
}
void VLoopMemorySlices::find_memory_slices() {
@ -809,10 +693,8 @@ void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail,
#endif
}
//------------------------------stmts_can_pack---------------------------
// Can s1 and s2 be in a pack with s1 immediately preceding s2 and
// s1 aligned at "align"
bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
// Check if two nodes can be packed into a pair.
bool SuperWord::can_pack_into_pair(Node* s1, Node* s2) {
// Do not use superword for non-primitives
BasicType bt1 = velt_basic_type(s1);
@ -831,13 +713,7 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
if ((independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2)) {
if (!_pairset.is_left(s1) && !_pairset.is_right(s2)) {
if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
int s1_align = alignment(s1);
int s2_align = alignment(s2);
if (s1_align == top_align || s1_align == align) {
if (s2_align == top_align || s2_align == align + data_size(s1)) {
return true;
}
}
return true;
}
}
}
@ -1013,16 +889,6 @@ bool VLoopReductions::is_marked_reduction_pair(const Node* s1, const Node* s2) c
return false;
}
//------------------------------set_alignment---------------------------
void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
set_alignment(s1, align);
if (align == top_align || align == bottom_align) {
set_alignment(s2, align);
} else {
set_alignment(s2, align + data_size(s1));
}
}
// Extend pairset by following use->def and def->use links from pair members.
void SuperWord::extend_pairset_with_more_pairs_by_following_use_and_def() {
bool changed;
@ -1058,57 +924,25 @@ void SuperWord::extend_pairset_with_more_pairs_by_following_use_and_def() {
#endif
}
//------------------------------adjust_alignment_for_type_conversion---------------------------------
// Adjust the target alignment if conversion between different data size exists in def-use nodes.
int SuperWord::adjust_alignment_for_type_conversion(Node* s, Node* t, int align) {
// Do not use superword for non-primitives
BasicType bt1 = velt_basic_type(s);
BasicType bt2 = velt_basic_type(t);
if (!is_java_primitive(bt1) || !is_java_primitive(bt2)) {
return align;
}
if (longer_type_for_conversion(s) != T_ILLEGAL ||
longer_type_for_conversion(t) != T_ILLEGAL) {
align = align / data_size(s) * data_size(t);
}
return align;
}
bool SuperWord::extend_pairset_with_more_pairs_by_following_def(Node* s1, Node* s2) {
assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair");
assert(s1->req() == s2->req(), "just checking");
assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
if (s1->is_Load()) return false;
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_def: s1 %d, align %d",
s1->_idx, alignment(s1));
}
#endif
bool changed = false;
int start = s1->is_Store() ? MemNode::ValueIn : 1;
int end = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
for (int j = start; j < end; j++) {
int align = alignment(s1);
Node* t1 = s1->in(j);
Node* t2 = s2->in(j);
if (!in_bb(t1) || !in_bb(t2) || t1->is_Mem() || t2->is_Mem()) {
// Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
continue;
}
align = adjust_alignment_for_type_conversion(s1, t1, align);
if (stmts_can_pack(t1, t2, align)) {
if (can_pack_into_pair(t1, t2)) {
if (estimate_cost_savings_when_packing_as_pair(t1, t2) >= 0) {
_pairset.add_pair(t1, t2);
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_def: set_alignment(%d, %d, %d)",
t1->_idx, t2->_idx, align);
}
#endif
set_alignment(t1, t2, align);
changed = true;
}
}
@ -1122,17 +956,9 @@ bool SuperWord::extend_pairset_with_more_pairs_by_following_def(Node* s1, Node*
bool SuperWord::extend_pairset_with_more_pairs_by_following_use(Node* s1, Node* s2) {
assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair");
assert(s1->req() == s2->req(), "just checking");
assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
if (s1->is_Store()) return false;
int align = alignment(s1);
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_use: s1 %d, align %d",
s1->_idx, align);
}
#endif
int savings = -1;
Node* u1 = nullptr;
Node* u2 = nullptr;
@ -1150,28 +976,18 @@ bool SuperWord::extend_pairset_with_more_pairs_by_following_use(Node* s1, Node*
}
if (t2->Opcode() == Op_AddI && t2 == cl()->incr()) continue; // don't mess with the iv
if (order_inputs_of_uses_to_match_def_pair(s1, s2, t1, t2) != PairOrderStatus::Ordered) { continue; }
int adjusted_align = alignment(s1);
adjusted_align = adjust_alignment_for_type_conversion(s1, t1, adjusted_align);
if (stmts_can_pack(t1, t2, adjusted_align)) {
if (can_pack_into_pair(t1, t2)) {
int my_savings = estimate_cost_savings_when_packing_as_pair(t1, t2);
if (my_savings > savings) {
savings = my_savings;
u1 = t1;
u2 = t2;
align = adjusted_align;
}
}
}
}
if (savings >= 0) {
_pairset.add_pair(u1, u2);
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_use: set_alignment(%d, %d, %d)",
u1->_idx, u2->_idx, align);
}
#endif
set_alignment(u1, u2, align);
return true; // changed
}
return false; // no change
@ -1814,6 +1630,11 @@ uint SuperWord::max_implemented_size(const Node_List* pack) {
}
}
// Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
// returns int type, but Vector API for them returns long type. To unify
// the implementation in backend, superword splits the vector implementation
// for Java API into an execution node with long type plus another node
// converting long to int.
bool SuperWord::requires_long_to_int_conversion(int opc) {
switch(opc) {
case Op_PopCountL:
@ -2948,7 +2769,17 @@ uint SuperWord::find_use_def_boundary(const Node_List* pack) const {
bool SuperWord::is_vector_use(Node* use, int u_idx) const {
Node_List* u_pk = get_pack(use);
if (u_pk == nullptr) return false;
if (is_marked_reduction(use)) return true;
// Reduction: first input is internal connection.
if (is_marked_reduction(use) && u_idx == 1) {
#ifdef ASSERT
for (uint i = 1; i < u_pk->size(); i++) {
assert(u_pk->at(i - 1) == u_pk->at(i)->in(1), "internal connection");
}
#endif
return true;
}
Node* def = use->in(u_idx);
Node_List* d_pk = get_pack(def);
if (d_pk == nullptr) {
@ -2975,51 +2806,64 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) const {
return true;
}
if (!is_velt_basic_type_compatible_use_def(use, def)) {
return false;
}
if (VectorNode::is_muladds2i(use)) {
// MulAddS2I takes shorts and produces ints - hence the special checks
// on alignment and size.
// MulAddS2I takes shorts and produces ints.
if (u_pk->size() * 2 != d_pk->size()) {
return false;
}
for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) {
Node* ui = u_pk->at(i);
Node* di = d_pk->at(i);
if (alignment(ui) != alignment(di) * 2) {
return false;
}
}
return true;
}
if (u_pk->size() != d_pk->size())
if (u_pk->size() != d_pk->size()) {
return false;
if (longer_type_for_conversion(use) != T_ILLEGAL) {
// These opcodes take a type of a kind of size and produce a type of
// another size - hence the special checks on alignment and size.
for (uint i = 0; i < u_pk->size(); i++) {
Node* ui = u_pk->at(i);
Node* di = d_pk->at(i);
if (ui->in(u_idx) != di) {
return false;
}
if (alignment(ui) / type2aelembytes(velt_basic_type(ui)) !=
alignment(di) / type2aelembytes(velt_basic_type(di))) {
return false;
}
}
return true;
}
for (uint i = 0; i < u_pk->size(); i++) {
Node* ui = u_pk->at(i);
Node* di = d_pk->at(i);
if (ui->in(u_idx) != di || alignment(ui) != alignment(di))
if (ui->in(u_idx) != di) {
return false;
}
}
return true;
}
// Check if the output type of def is compatible with the input type of use, i.e. if the
// types have the same size.
bool SuperWord::is_velt_basic_type_compatible_use_def(Node* use, Node* def) const {
  assert(in_bb(def) && in_bb(use), "both use and def are in loop");
  // A conversion node accepts an input of a different size by definition.
  if (VectorNode::is_convert_opcode(use->Opcode())) {
    return true;
  }
  const BasicType use_bt = velt_basic_type(use);
  const BasicType def_bt = velt_basic_type(def);
  assert(is_java_primitive(use_bt), "sanity %s", type2name(use_bt));
  assert(is_java_primitive(def_bt), "sanity %s", type2name(def_bt));
  const int use_size = type2aelembytes(use_bt);
  const int def_size = type2aelembytes(def_bt);
  // Nodes like Long.bitCount: expect long (8 byte) input, and int (4 byte) output.
  if (requires_long_to_int_conversion(use->Opcode())) {
    return def_size == 8 && use_size == 4;
  }
  // MulAddS2I: expect short (2 byte) input, and int (4 byte) output.
  if (VectorNode::is_muladds2i(use)) {
    return def_size == 2 && use_size == 4;
  }
  // Default case: input size of use must equal output size of def.
  return use_size == def_size;
}
// Return nullptr if success, else failure message
VStatus VLoopBody::construct() {
assert(_body.is_empty(), "body is empty");
@ -3150,12 +2994,6 @@ VStatus VLoopBody::construct() {
return VStatus::make_success();
}
// Initialize per node info
void SuperWord::initialize_node_info() {
Node* last = body().at(body().length() - 1);
grow_node_info(bb_idx(last));
}
BasicType SuperWord::longer_type_for_conversion(Node* n) const {
if (!(VectorNode::is_convert_opcode(n->Opcode()) ||
requires_long_to_int_conversion(n->Opcode())) ||
@ -3177,34 +3015,6 @@ BasicType SuperWord::longer_type_for_conversion(Node* n) const {
: (src_size > dst_size ? src_t : dst_t);
}
int SuperWord::max_vector_size_in_def_use_chain(Node* n) {
BasicType bt = velt_basic_type(n);
BasicType vt = bt;
// find the longest type among def nodes.
uint start, end;
VectorNode::vector_operands(n, &start, &end);
for (uint i = start; i < end; ++i) {
Node* input = n->in(i);
if (!in_bb(input)) continue;
BasicType newt = longer_type_for_conversion(input);
vt = (newt == T_ILLEGAL) ? vt : newt;
}
// find the longest type among use nodes.
for (uint i = 0; i < n->outcnt(); ++i) {
Node* output = n->raw_out(i);
if (!in_bb(output)) continue;
BasicType newt = longer_type_for_conversion(output);
vt = (newt == T_ILLEGAL) ? vt : newt;
}
int max = Matcher::max_vector_size_auto_vectorization(vt);
// If now there is no vectors for the longest type, the nodes with the longest
// type in the def-use chain are not packed in SuperWord::stmts_can_pack.
return max < 2 ? Matcher::max_vector_size_auto_vectorization(bt) : max;
}
void VLoopTypes::compute_vector_element_type() {
#ifndef PRODUCT
if (_vloop.is_trace_vector_element_type()) {
@ -3308,36 +3118,6 @@ void VLoopTypes::compute_vector_element_type() {
#endif
}
//------------------------------memory_alignment---------------------------
// Alignment within a vector memory reference
int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print("SuperWord::memory_alignment within a vector memory reference for %d: ", s->_idx); s->dump();
}
#endif
const VPointer& p = vpointer(s);
if (!p.valid()) {
NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: VPointer p invalid, return bottom_align");)
return bottom_align;
}
int vw = get_vw_bytes_special(s);
if (vw < 2) {
NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
return bottom_align; // No vectors for this type
}
int offset = p.offset_in_bytes();
offset += iv_adjust*p.memory_size();
int off_rem = offset % vw;
int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::memory_alignment: off_rem = %d, off_mod = %d (offset = %d)", off_rem, off_mod, offset);
}
#endif
return off_mod;
}
// Smallest type containing range of values
const Type* VLoopTypes::container_type(Node* n) const {
if (n->is_Mem()) {
@ -3794,10 +3574,6 @@ void VLoopBody::print() const {
}
#endif
// ========================= SWNodeInfo =====================
const SWNodeInfo SWNodeInfo::initial;
//
// --------------------------------- vectorization/simd -----------------------------------
//

View File

@ -384,18 +384,6 @@ public:
NOT_PRODUCT(static void print_pack(Node_List* pack);)
};
// ========================= SuperWord =====================
// -----------------------------SWNodeInfo---------------------------------
// Per node info needed by SuperWord
class SWNodeInfo {
public:
int _alignment; // memory alignment for a node
SWNodeInfo() : _alignment(-1) {}
static const SWNodeInfo initial;
};
// -----------------------------SuperWord---------------------------------
// Transforms scalar operations into packed (superword) operations.
class SuperWord : public ResourceObj {
@ -407,9 +395,6 @@ class SuperWord : public ResourceObj {
// VSharedData, and reused over many AutoVectorizations.
Arena _arena;
enum consts { top_align = -1, bottom_align = -666 };
GrowableArray<SWNodeInfo> _node_info; // Info needed per node
CloneMap& _clone_map; // map of nodes created in cloning
PairSet _pairset;
@ -461,6 +446,11 @@ class SuperWord : public ResourceObj {
return _vloop_analyzer.body().bb_idx(n);
}
// Convenience forwarder: iterate over all memops of the loop body,
// invoking "callback" for each one (delegates to VLoopBody::for_each_mem).
template<typename Callback>
void for_each_mem(Callback callback) const {
return _vloop_analyzer.body().for_each_mem(callback);
}
// VLoopTypes accessors
const Type* velt_type(Node* n) const {
return _vloop_analyzer.types().velt_type(n);
@ -506,11 +496,6 @@ class SuperWord : public ResourceObj {
#ifndef PRODUCT
// TraceAutoVectorization and TraceSuperWord
bool is_trace_superword_alignment() const {
// Too verbose for TraceSuperWord
return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
}
bool is_trace_superword_adjacent_memops() const {
return TraceSuperWord ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
@ -531,15 +516,9 @@ class SuperWord : public ResourceObj {
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO);
}
bool is_trace_superword_verbose() const {
// Too verbose for TraceSuperWord
return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
}
bool is_trace_superword_any() const {
return TraceSuperWord ||
is_trace_align_vector() ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
@ -549,7 +528,7 @@ class SuperWord : public ResourceObj {
bool is_trace_align_vector() const {
return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
is_trace_superword_verbose();
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
}
#endif
@ -566,37 +545,28 @@ class SuperWord : public ResourceObj {
// Accessors
Arena* arena() { return &_arena; }
int get_vw_bytes_special(MemNode* s);
// Ensure node_info contains element "i"
void grow_node_info(int i) { if (i >= _node_info.length()) _node_info.at_put_grow(i, SWNodeInfo::initial); }
// should we align vector memory references on this platform?
bool vectors_should_be_aligned() { return !Matcher::misaligned_vectors_ok() || AlignVector; }
// memory alignment for a node
int alignment(Node* n) const { return _node_info.adr_at(bb_idx(n))->_alignment; }
void set_alignment(Node* n, int a) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_alignment = a; }
// is pack good for converting into one vector node replacing bunches of Cmp, Bool, CMov nodes.
static bool requires_long_to_int_conversion(int opc);
// For pack p, are all idx operands the same?
bool same_inputs(const Node_List* p, int idx) const;
// CloneMap utilities
bool same_origin_idx(Node* a, Node* b) const;
bool same_generation(Node* a, Node* b) const;
private:
bool SLP_extract();
// Find the adjacent memory references and create pack pairs for them.
void find_adjacent_refs();
// Find a memory reference to align the loop induction variable to.
MemNode* find_align_to_ref(Node_List &memops, int &idx);
// Calculate loop's iv adjustment for this memory ops.
int get_iv_adjustment(MemNode* mem);
// Can s1 and s2 be in a pack with s1 immediately preceding s2 and s1 aligned at "align"
bool stmts_can_pack(Node* s1, Node* s2, int align);
// Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization.
void create_adjacent_memop_pairs();
void collect_valid_vpointers(GrowableArray<const VPointer*>& vpointers);
void create_adjacent_memop_pairs_in_all_groups(const GrowableArray<const VPointer*>& vpointers);
static int find_group_end(const GrowableArray<const VPointer*>& vpointers, int group_start);
void create_adjacent_memop_pairs_in_one_group(const GrowableArray<const VPointer*>& vpointers, const int group_start, int group_end);
// Various methods to check if we can pack two nodes.
bool can_pack_into_pair(Node* s1, Node* s2);
// Is s1 immediately before s2 in memory?
bool are_adjacent_refs(Node* s1, Node* s2) const;
// Are s1 and s2 similar?
@ -606,8 +576,6 @@ private:
// For a node pair (s1, s2) which is isomorphic and independent,
// do s1 and s2 have similar input edges?
bool have_similar_inputs(Node* s1, Node* s2);
void set_alignment(Node* s1, Node* s2, int align);
int adjust_alignment_for_type_conversion(Node* s, Node* t, int align);
void extend_pairset_with_more_pairs_by_following_use_and_def();
bool extend_pairset_with_more_pairs_by_following_def(Node* s1, Node* s2);
@ -661,16 +629,15 @@ private:
// Is use->in(u_idx) a vector use?
bool is_vector_use(Node* use, int u_idx) const;
// Initialize per node info
void initialize_node_info();
// Return the longer type for vectorizable type-conversion node or illegal type for other nodes.
BasicType longer_type_for_conversion(Node* n) const;
// Find the longest type in def-use chain for packed nodes, and then compute the max vector size.
int max_vector_size_in_def_use_chain(Node* n);
static bool requires_long_to_int_conversion(int opc);
bool is_velt_basic_type_compatible_use_def(Node* use, Node* def) const;
static LoadNode::ControlDependency control_dependency(Node_List* p);
// Alignment within a vector memory reference
int memory_alignment(MemNode* s, int iv_adjust);
// Ensure that the main loop vectors are aligned by adjusting the pre loop limit.
void determine_mem_ref_and_aw_for_main_loop_alignment();
void adjust_pre_loop_limit_to_align_main_loop_vectors();

View File

@ -37,8 +37,7 @@
flags(TYPES, "Trace VLoopTypes") \
flags(POINTERS, "Trace VLoopPointers") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_refs") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
@ -115,7 +114,6 @@ class TraceAutoVectorizationTagValidator {
} else if (ALL == tag) {
_tags.set_range(0, TRACE_AUTO_VECTORIZATION_TAG_NUM);
} else if (SW_VERBOSE == tag) {
_tags.at_put(SW_ALIGNMENT, set_bit);
_tags.at_put(SW_ADJACENT_MEMOPS, set_bit);
_tags.at_put(SW_REJECTIONS, set_bit);
_tags.at_put(SW_PACKSET, set_bit);

View File

@ -202,7 +202,7 @@ void VLoopVPointers::allocate_vpointers_array() {
void VLoopVPointers::compute_and_cache_vpointers() {
int pointers_idx = 0;
_body.for_each_mem([&] (const MemNode* mem, int bb_idx) {
_body.for_each_mem([&] (MemNode* const mem, int bb_idx) {
// Placement new: construct directly into the array.
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop);
_bb_idx_to_vpointer.at_put(bb_idx, pointers_idx);
@ -410,7 +410,7 @@ void VLoopDependencyGraph::PredsIterator::next() {
int VPointer::Tracer::_depth = 0;
#endif
VPointer::VPointer(const MemNode* mem, const VLoop& vloop,
VPointer::VPointer(MemNode* const mem, const VLoop& vloop,
Node_Stack* nstack, bool analyze_only) :
_mem(mem), _vloop(vloop),
_base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
@ -807,10 +807,50 @@ void VPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
_invar = register_if_new(add);
}
// To be in the same group, two VPointers must be the same,
// except for the offset.
int VPointer::cmp_for_sort_by_group(const VPointer** p1, const VPointer** p2) {
const VPointer* a = *p1;
const VPointer* b = *p2;
int cmp_base = a->base()->_idx - b->base()->_idx;
if (cmp_base != 0) { return cmp_base; }
int cmp_opcode = a->mem()->Opcode() - b->mem()->Opcode();
if (cmp_opcode != 0) { return cmp_opcode; }
int cmp_scale = a->scale_in_bytes() - b->scale_in_bytes();
if (cmp_scale != 0) { return cmp_scale; }
int cmp_invar = (a->invar() == nullptr ? 0 : a->invar()->_idx) -
(b->invar() == nullptr ? 0 : b->invar()->_idx);
return cmp_invar;
}
// We compare by group, then by offset, and finally by node idx.
int VPointer::cmp_for_sort(const VPointer** p1, const VPointer** p2) {
int cmp_group = cmp_for_sort_by_group(p1, p2);
if (cmp_group != 0) { return cmp_group; }
const VPointer* a = *p1;
const VPointer* b = *p2;
int cmp_offset = a->offset_in_bytes() - b->offset_in_bytes();
if (cmp_offset != 0) { return cmp_offset; }
return a->mem()->_idx - b->mem()->_idx;
}
#ifndef PRODUCT
// Function for printing the fields of a VPointer
void VPointer::print() const {
tty->print("VPointer[mem: %4d %10s, ", _mem->_idx, _mem->Name());
if (!valid()) {
tty->print_cr("invalid]");
return;
}
tty->print("base: %4d, ", _base != nullptr ? _base->_idx : 0);
tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);

View File

@ -669,7 +669,7 @@ private:
// operation in a counted loop for vectorizable analysis.
class VPointer : public ArenaObj {
protected:
const MemNode* _mem; // My memory reference node
MemNode* const _mem; // My memory reference node
const VLoop& _vloop;
Node* _base; // null if unsafe nonheap reference
@ -711,12 +711,12 @@ class VPointer : public ArenaObj {
NotComparable = (Less | Greater | Equal)
};
VPointer(const MemNode* mem, const VLoop& vloop) :
VPointer(MemNode* const mem, const VLoop& vloop) :
VPointer(mem, vloop, nullptr, false) {}
VPointer(const MemNode* mem, const VLoop& vloop, Node_Stack* nstack) :
VPointer(MemNode* const mem, const VLoop& vloop, Node_Stack* nstack) :
VPointer(mem, vloop, nstack, true) {}
private:
VPointer(const MemNode* mem, const VLoop& vloop,
VPointer(MemNode* const mem, const VLoop& vloop,
Node_Stack* nstack, bool analyze_only);
// Following is used to create a temporary object during
// the pattern match of an address expression.
@ -729,7 +729,7 @@ class VPointer : public ArenaObj {
Node* base() const { return _base; }
Node* adr() const { return _adr; }
const MemNode* mem() const { return _mem; }
MemNode* mem() const { return _mem; }
int scale_in_bytes() const { return _scale; }
Node* invar() const { return _invar; }
int offset_in_bytes() const { return _offset; }
@ -781,6 +781,11 @@ class VPointer : public ArenaObj {
static bool equal(int cmp) { return cmp == Equal; }
static bool comparable(int cmp) { return cmp < NotComparable; }
// We need to be able to sort the VPointer to efficiently group the
// memops into groups, and to find adjacent memops.
static int cmp_for_sort_by_group(const VPointer** p1, const VPointer** p2);
static int cmp_for_sort(const VPointer** p1, const VPointer** p2);
NOT_PRODUCT( void print() const; )
#ifndef PRODUCT

View File

@ -0,0 +1,474 @@
/*
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
import jdk.test.lib.Utils;
import jdk.test.whitebox.WhiteBox;
import java.lang.reflect.Array;
import java.util.Map;
import java.util.HashMap;
import java.util.Random;
import java.nio.ByteOrder;
/*
* @test
* @bug 8325155
* @summary Test some cases that vectorize after the removal of the alignment boundaries code.
* Now, we instead check if use-def connections have compatible type size.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestCompatibleUseDefTypeSize
*/
public class TestCompatibleUseDefTypeSize {
// Number of elements in every generated input array.
static int RANGE = 1024*8;
private static final Random RANDOM = Utils.getRandomInstance();
// Inputs
byte[] aB;
byte[] bB;
short[] aS;
short[] bS;
char[] aC;
char[] bC;
int[] aI;
int[] bI;
long[] aL;
long[] bL;
float[] aF;
float[] bF;
double[] aD;
double[] bD;
// List of tests
Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
// List of gold, the results from the first run before compilation
Map<String,Object[]> golds = new HashMap<String,Object[]>();
// A test returns every array it touched, so the gold (interpreter) run and
// the compiled run can be compared output-by-output.
interface TestFunction {
Object[] run();
}
public static void main(String[] args) {
TestFramework.run();
}
// Generate inputs, register all tests, and record "gold" results from the
// first run, before any test method is JIT compiled.
public TestCompatibleUseDefTypeSize() {
// Generate input once
aB = generateB();
bB = generateB();
aS = generateS();
bS = generateS();
aC = generateC();
bC = generateC();
aI = generateI();
bI = generateI();
aL = generateL();
bL = generateL();
aF = generateF();
bF = generateF();
aD = generateD();
bD = generateD();
// Add all tests to list
tests.put("test0", () -> { return test0(aB.clone(), bC.clone()); });
tests.put("test1", () -> { return test1(aB.clone(), bC.clone()); });
tests.put("test2", () -> { return test2(aB.clone(), bC.clone()); });
tests.put("test3", () -> { return test3(aI.clone(), bI.clone()); });
tests.put("test4", () -> { return test4(aI.clone(), bI.clone()); });
tests.put("test5", () -> { return test5(aI.clone(), bF.clone()); });
tests.put("test6", () -> { return test6(aI.clone(), bF.clone()); });
tests.put("test7", () -> { return test7(aI.clone(), bF.clone()); });
tests.put("test8", () -> { return test8(aL.clone(), bD.clone()); });
tests.put("test9", () -> { return test9(aL.clone(), bD.clone()); });
tests.put("test10", () -> { return test10(aL.clone(), bD.clone()); });
tests.put("test11", () -> { return test11(aC.clone()); });
// Compute gold value for all test methods before compilation
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
String name = entry.getKey();
TestFunction test = entry.getValue();
Object[] gold = test.run();
golds.put(name, gold);
}
}
// Run every registered test and verify its result against the gold value
// recorded before compilation.
@Warmup(100)
@Run(test = {"test0",
"test1",
"test2",
"test3",
"test4",
"test5",
"test6",
"test7",
"test8",
"test9",
"test10",
"test11"})
public void runTests() {
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
String name = entry.getKey();
TestFunction test = entry.getValue();
// Recall gold value from before compilation
Object[] gold = golds.get(name);
// Compute new result
Object[] result = test.run();
// Compare gold and new result
verify(name, gold, result);
}
}
// Random input generators, one per primitive element type.
static byte[] generateB() {
byte[] a = new byte[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = (byte)RANDOM.nextInt();
}
return a;
}
static short[] generateS() {
short[] a = new short[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = (short)RANDOM.nextInt();
}
return a;
}
static char[] generateC() {
char[] a = new char[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = (char)RANDOM.nextInt();
}
return a;
}
static int[] generateI() {
int[] a = new int[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = RANDOM.nextInt();
}
return a;
}
static long[] generateL() {
long[] a = new long[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = RANDOM.nextLong();
}
return a;
}
static float[] generateF() {
float[] a = new float[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = Float.intBitsToFloat(RANDOM.nextInt());
}
return a;
}
static double[] generateD() {
double[] a = new double[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = Double.longBitsToDouble(RANDOM.nextLong());
}
return a;
}
// Verify that gold and result have the same number of outputs, and that each
// pair of outputs is a pair of distinct arrays of the same component type,
// same length, and same content. Throws RuntimeException on any mismatch.
static void verify(String name, Object[] gold, Object[] result) {
if (gold.length != result.length) {
throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
gold.length + ", result.length = " + result.length);
}
for (int i = 0; i < gold.length; i++) {
Object g = gold[i];
Object r = result[i];
if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
throw new RuntimeException("verify " + name + ": must both be array of same type:" +
" gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
" result[" + i + "].getClass() = " + r.getClass().getSimpleName());
}
if (g == r) {
throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
" gold[" + i + "] == result[" + i + "]");
}
if (Array.getLength(g) != Array.getLength(r)) {
throw new RuntimeException("verify " + name + ": arrays must have same length:" +
" gold[" + i + "].length = " + Array.getLength(g) +
" result[" + i + "].length = " + Array.getLength(r));
}
// Dispatch to the element-type specific comparison.
Class c = g.getClass().getComponentType();
if (c == byte.class) {
verifyB(name, i, (byte[])g, (byte[])r);
} else if (c == short.class) {
verifyS(name, i, (short[])g, (short[])r);
} else if (c == char.class) {
verifyC(name, i, (char[])g, (char[])r);
} else if (c == int.class) {
verifyI(name, i, (int[])g, (int[])r);
} else if (c == long.class) {
verifyL(name, i, (long[])g, (long[])r);
} else if (c == float.class) {
verifyF(name, i, (float[])g, (float[])r);
} else if (c == double.class) {
verifyD(name, i, (double[])g, (double[])r);
} else {
throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
" gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
" result[" + i + "].getClass() = " + r.getClass().getSimpleName());
}
}
}
// Element-wise comparisons; the float/double variants compare raw bits so
// that NaN patterns and signed zeros are distinguished.
static void verifyB(String name, int i, byte[] g, byte[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyS(String name, int i, short[] g, short[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyC(String name, int i, char[] g, char[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyI(String name, int i, int[] g, int[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyL(String name, int i, long[] g, long[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyF(String name, int i, float[] g, float[] r) {
for (int j = 0; j < g.length; j++) {
if (Float.floatToIntBits(g[j]) != Float.floatToIntBits(r[j])) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyD(String name, int i, double[] g, double[] r) {
for (int j = 0; j < g.length; j++) {
if (Double.doubleToLongBits(g[j]) != Double.doubleToLongBits(r[j])) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// "inflate" method: 1 byte -> 2 byte.
// Java scalar code has no explicit conversion.
// Vector code would need a conversion. We may add this in the future.
static Object[] test0(byte[] src, char[] dst) {
for (int i = 0; i < src.length; i++) {
dst[i] = (char)(src[i] & 0xff);
}
return new Object[]{ src, dst };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// "inflate" method: 1 byte -> 2 byte.
// Java scalar code has no explicit conversion.
// Vector code would need a conversion. We may add this in the future.
static Object[] test1(byte[] src, char[] dst) {
for (int i = 0; i < src.length; i++) {
dst[i] = (char)(src[i]);
}
return new Object[]{ src, dst };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// "deflate" method: 2 byte -> 1 byte.
// Java scalar code has no explicit conversion.
// Vector code would need a conversion. We may add this in the future.
static Object[] test2(byte[] src, char[] dst) {
for (int i = 0; i < src.length; i++) {
src[i] = (byte)(dst[i]);
}
return new Object[]{ src, dst };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Used to not vectorize because of "alignment boundaries".
// Assume 64 byte vector width:
// a[i+0:i+15] and a[i+1:i+16], each are 4 * 16 = 64 byte.
// The alignment boundary is every 64 byte, so one of the two vectors gets cut up.
static Object[] test3(int[] a, int[] b) {
for (int i = 0; i < a.length-1; i++) {
a[i] = (int)(b[i] + a[i+1]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// same as test3, but hand-unrolled
static Object[] test4(int[] a, int[] b) {
for (int i = 0; i < a.length-2; i+=2) {
a[i+0] = (int)(b[i+0] + a[i+1]);
a[i+1] = (int)(b[i+1] + a[i+2]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// In theory, one would expect this to be a simple 4byte -> 4byte conversion.
// But there is a CmpF and CMove here because we check for isNaN. Plus a MoveF2I.
//
// Would be nice to vectorize: Missing support for CmpF, CMove and MoveF2I.
static Object[] test5(int[] a, float[] b) {
for (int i = 0; i < a.length; i++) {
a[i] = Float.floatToIntBits(b[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support for MoveF2I
static Object[] test6(int[] a, float[] b) {
for (int i = 0; i < a.length; i++) {
a[i] = Float.floatToRawIntBits(b[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support for MoveI2F
static Object[] test7(int[] a, float[] b) {
for (int i = 0; i < a.length; i++) {
b[i] = Float.intBitsToFloat(a[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support: needs CmpD, CMove and MoveD2L
static Object[] test8(long[] a, double[] b) {
for (int i = 0; i < a.length; i++) {
a[i] = Double.doubleToLongBits(b[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support for MoveD2L
static Object[] test9(long[] a, double[] b) {
for (int i = 0; i < a.length; i++) {
a[i] = Double.doubleToRawLongBits(b[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support for MoveL2D
static Object[] test10(long[] a, double[] b) {
for (int i = 0; i < a.length; i++) {
b[i] = Double.longBitsToDouble(a[i]);
}
return new Object[]{ a, b };
}
@Test
// MaxI reduction is with char type, but the MaxI char vector is not implemented.
static Object[] test11(char[] a) {
char m = 0;
for (int i = 0; i < a.length; i++) {
m = (char)Math.max(m, a[i]);
a[i] = 0;
}
return new Object[]{ a, new char[] { m } };
}
}

View File

@ -390,9 +390,9 @@ public class TestSplitPacks {
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
@ -405,8 +405,6 @@ public class TestSplitPacks {
// | | \ \ \ \
// 0 1 - - 4 5 6 7
//
// The 4-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
static Object[] test2a(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
@ -428,9 +426,9 @@ public class TestSplitPacks {
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "= 0",
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
@ -444,8 +442,6 @@ public class TestSplitPacks {
// | | | | \ \
// 0 1 2 3 -- 6 7
//
// The 2-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
static Object[] test2b(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
@ -468,9 +464,9 @@ public class TestSplitPacks {
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
@ -483,8 +479,6 @@ public class TestSplitPacks {
// | | / / / /
// 0 1 2 3 4 5 - -
//
// The 4-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
static Object[] test2c(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
@ -506,9 +500,9 @@ public class TestSplitPacks {
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "= 0",
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
@ -522,8 +516,6 @@ public class TestSplitPacks {
// | | | | / /
// 0 1 2 3 4 5 - -
//
// The 2-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
static Object[] test2d(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;