From ad0e5a99ca1ad9dd04105f502985735a3536c3f4 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 8 May 2023 06:09:10 +0000 Subject: [PATCH] 8304720: SuperWord::schedule should rebuild C2-graph from SuperWord dependency-graph Reviewed-by: kvn, fgao --- src/hotspot/share/opto/superword.cpp | 514 ++++++------------ src/hotspot/share/opto/superword.hpp | 24 +- ...tIndependentPacksWithCyclicDependency.java | 51 +- ...IndependentPacksWithCyclicDependency2.java | 2 - .../TestScheduleReordersScalarMemops.java | 159 ++++++ 5 files changed, 351 insertions(+), 399 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index af8f08f722b..b1e4677a7cb 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -669,8 +669,6 @@ bool SuperWord::SLP_extract() { DEBUG_ONLY(verify_packs();) - remove_cycles(); - schedule(); // Record eventual count of vector packs for checks in post loop vectorization @@ -1431,11 +1429,13 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) { return false; } - // FIXME - co_locate_pack fails on Stores in different mem-slices, so - // only pack memops that are in the same alias set until that's fixed. + // Adjacent memory references must be on the same slice. if (!same_memory_slice(s1->as_Mem(), s2->as_Mem())) { return false; } + + // Adjacent memory references must have the same base, be comparable + // and have the correct distance between them. SWPointer p1(s1->as_Mem(), this, nullptr, false); SWPointer p2(s2->as_Mem(), this, nullptr, false); if (p1.base() != p2.base() || !p1.comparable(p2)) return false; @@ -2497,10 +2497,13 @@ class PacksetGraph { private: // pid: packset graph node id. GrowableArray _pid; // bb_idx(n) -> pid + GrowableArray _pid_to_node; // one node per pid, find rest via my_pack GrowableArray> _out; // out-edges GrowableArray _incnt; // number of (implicit) in-edges int _max_pid = 0; + bool _schedule_success; + SuperWord* _slp; public: PacksetGraph(SuperWord* slp) @@ -2523,11 +2526,18 @@ public: assert(poz != 0, "pid should not be zero"); return poz; } - void set_pid(const Node* n, int pid) { + void set_pid(Node* n, int pid) { assert(n != nullptr && pid > 0, "sane inputs"); assert(_slp->in_bb(n), "must be"); int idx = _slp->bb_idx(n); _pid.at_put_grow(idx, pid); + _pid_to_node.at_put_grow(pid - 1, n, nullptr); + } + Node* get_node(int pid) { + assert(pid > 0 && pid <= _pid_to_node.length(), "pid must be mapped"); + Node* n = _pid_to_node.at(pid - 1); + assert(n != nullptr, "sanity"); + return n; } int new_pid() { _incnt.push(0); @@ -2537,6 +2547,7 @@ public: int incnt(int pid) { return _incnt.at(pid - 1); } void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); } GrowableArray& out(int pid) { return _out.at(pid - 1); } + bool schedule_success() const { return _schedule_success; } // Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds. void build() { @@ -2550,6 +2561,7 @@ public: for (uint k = 0; k < p->size(); k++) { Node* n = p->at(k); set_pid(n, pid); + assert(_slp->my_pack(n) == p, "matching packset"); } } @@ -2565,6 +2577,7 @@ public: if (pid == 0) { pid = new_pid(); set_pid(n, pid); + assert(_slp->my_pack(n) == nullptr || UseVectorCmov, "no packset"); } } @@ -2610,11 +2623,13 @@ public: } } } - // Schedule the graph to worklist. Returns true iff all nodes were scheduled. 
- // This implies that we return true iff the PacksetGraph is acyclic. - // We schedule with topological sort: schedule any node that has zero incnt. - // Then remove that node, which decrements the incnt of all its uses (outputs). - bool schedule() { + + // Schedule nodes of PacksetGraph to worklist, using topsort: schedule a node + // that has zero incnt. If a PacksetGraph node corresponds to memops, then add + // those to the memops_schedule. At the end, we return the memops_schedule, and + // note if topsort was successful. + Node_List schedule() { + Node_List memops_schedule; GrowableArray worklist; // Directly schedule all nodes without precedence for (int pid = 1; pid <= _max_pid; pid++) { @@ -2625,6 +2640,22 @@ public: // Continue scheduling via topological sort for (int i = 0; i < worklist.length(); i++) { int pid = worklist.at(i); + + // Add memops to memops_schedule + Node* n = get_node(pid); + Node_List* p = _slp->my_pack(n); + if (n->is_Mem()) { + if (p == nullptr) { + memops_schedule.push(n); + } else { + for (uint k = 0; k < p->size(); k++) { + memops_schedule.push(p->at(k)); + assert(p->at(k)->is_Mem(), "only schedule memops"); + } + } + } + + // Decrement incnt for all successors for (int j = 0; j < out(pid).length(); j++){ int pid_use = out(pid).at(j); int incnt_use = incnt(pid_use) - 1; @@ -2635,9 +2666,12 @@ public: } } } - // Was every pid scheduled? - return worklist.length() == _max_pid; + + // Was every pid scheduled? If not, we found some cycles in the PacksetGraph. + _schedule_success = (worklist.length() == _max_pid); + return memops_schedule; } + // Print the PacksetGraph. // print_nodes = true: print all C2 nodes beloning to PacksetGrahp node. // print_zero_incnt = false: do not print nodes that have no in-edges (any more). @@ -2668,303 +2702,135 @@ public: } }; -//------------------------------remove_cycles--------------------------- -// We now know that we only have independent packs, see verify_packs. -// This is a necessary but not a sufficient condition for an acyclic -// graph (DAG) after scheduling. Thus, we must check if the packs have -// introduced a cycle. The SuperWord paper mentions the need for this -// in "3.7 Scheduling". -// Approach: given all nodes from the _block, we create a new graph. -// The nodes that are not in a pack are their own nodes (scalar-node) -// in that new graph. Every pack is also a node (pack-node). We then -// add the edges according to DepPreds: a scalar-node has all edges -// to its node's DepPreds. A pack-node has all edges from every pack -// member to all their DepPreds. -void SuperWord::remove_cycles() { +// The C2 graph (specifically the memory graph), needs to be re-ordered. +// (1) Build the PacksetGraph. It combines the DepPreds graph with the +// packset. The PacksetGraph gives us the dependencies that must be +// respected after scheduling. +// (2) Schedule the PacksetGraph to the memops_schedule, which represents +// a linear order of all memops in the body. The order respects the +// dependencies of the PacksetGraph. +// (3) If the PacksetGraph has cycles, we cannot schedule. Abort. +// (4) Use the memops_schedule to re-order the memops in all slices. +void SuperWord::schedule() { if (_packset.length() == 0) { return; // empty packset } ResourceMark rm; + // (1) Build the PacksetGraph. PacksetGraph graph(this); - graph.build(); - if (!graph.schedule()) { + // (2) Schedule the PacksetGraph. + Node_List memops_schedule = graph.schedule(); + + // (3) Check if the PacksetGraph schedule succeeded (had no cycles). 
+ // We now know that we only have independent packs, see verify_packs. + // This is a necessary but not a sufficient condition for an acyclic + // graph (DAG) after scheduling. Thus, we must check if the packs have + // introduced a cycle. The SuperWord paper mentions the need for this + // in "3.7 Scheduling". + if (!graph.schedule_success()) { if (TraceSuperWord) { - tty->print_cr("remove_cycles found cycle in PacksetGraph:"); + tty->print_cr("SuperWord::schedule found cycle in PacksetGraph:"); graph.print(true, false); tty->print_cr("removing all packs from packset."); } _packset.clear(); + return; } + +#ifndef PRODUCT + if (TraceSuperWord) { + tty->print_cr("SuperWord::schedule: memops_schedule:"); + memops_schedule.dump(); + } +#endif + + // (4) Use the memops_schedule to re-order the memops in all slices. + schedule_reorder_memops(memops_schedule); } -//------------------------------schedule--------------------------- -// Adjust the memory graph for the packed operations -void SuperWord::schedule() { - // Co-locate in the memory graph the members of each memory pack - for (int i = 0; i < _packset.length(); i++) { - co_locate_pack(_packset.at(i)); +// Reorder the memory graph for all slices in parallel. We walk over the schedule once, +// and track the current memory state of each slice. +void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) { + int max_slices = _phase->C->num_alias_types(); + // When iterating over the memops_schedule, we keep track of the current memory state, + // which is the Phi or a store in the loop. + GrowableArray current_state_in_slice(max_slices, max_slices, nullptr); + // The memory state after the loop is the last store inside the loop. If we reorder the + // loop we may have a different last store, and we need to adjust the uses accordingly. + GrowableArray old_last_store_in_slice(max_slices, max_slices, nullptr); + + // (1) Set up the initial memory state from Phi. And find the old last store. + for (int i = 0; i < _mem_slice_head.length(); i++) { + Node* phi = _mem_slice_head.at(i); + assert(phi->is_Phi(), "must be phi"); + int alias_idx = _phase->C->get_alias_index(phi->adr_type()); + current_state_in_slice.at_put(alias_idx, phi); + + // If we have a memory phi, we have a last store in the loop, find it over backedge. + StoreNode* last_store = phi->in(2)->as_Store(); + old_last_store_in_slice.at_put(alias_idx, last_store); } -} -//-------------------------------remove_and_insert------------------- -// Remove "current" from its current position in the memory graph and insert -// it after the appropriate insertion point (lip or uip). -void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, - Node *uip, Unique_Node_List &sched_before) { - Node* my_mem = current->in(MemNode::Memory); - bool sched_up = sched_before.member(current); + // (2) Walk over memops_schedule, append memops to the current state + // of that slice. If it is a Store, we take it as the new state. + for (uint i = 0; i < memops_schedule.size(); i++) { + MemNode* n = memops_schedule.at(i)->as_Mem(); + assert(n->is_Load() || n->is_Store(), "only loads or stores"); + int alias_idx = _phase->C->get_alias_index(n->adr_type()); + Node* current_state = current_state_in_slice.at(alias_idx); + if (current_state == nullptr) { + // If there are only loads in a slice, we never update the memory + // state in the loop, hence there is no phi for the memory state. + // We just keep the old memory state that was outside the loop. 
+ assert(n->is_Load() && !in_bb(n->in(MemNode::Memory)), + "only loads can have memory state from outside loop"); + } else { + _igvn.replace_input_of(n, MemNode::Memory, current_state); + if (n->is_Store()) { + current_state_in_slice.at_put(alias_idx, n); + } + } + } - // remove current_store from its current position in the memory graph - for (DUIterator i = current->outs(); current->has_out(i); i++) { - Node* use = current->out(i); - if (use->is_Mem()) { - assert(use->in(MemNode::Memory) == current, "must be"); - if (use == prev) { // connect prev to my_mem - _igvn.replace_input_of(use, MemNode::Memory, my_mem); - --i; //deleted this edge; rescan position - } else if (sched_before.member(use)) { - if (!sched_up) { // Will be moved together with current - _igvn.replace_input_of(use, MemNode::Memory, uip); - --i; //deleted this edge; rescan position - } - } else { - if (sched_up) { // Will be moved together with current - _igvn.replace_input_of(use, MemNode::Memory, lip); - --i; //deleted this edge; rescan position + // (3) For each slice, we add the current state to the backedge + // in the Phi. Further, we replace uses of the old last store + // with uses of the new last store (current_state). + Node_List uses_after_loop; + for (int i = 0; i < _mem_slice_head.length(); i++) { + Node* phi = _mem_slice_head.at(i); + int alias_idx = _phase->C->get_alias_index(phi->adr_type()); + Node* current_state = current_state_in_slice.at(alias_idx); + assert(current_state != nullptr, "slice is mapped"); + assert(current_state != phi, "did some work in between"); + assert(current_state->is_Store(), "sanity"); + _igvn.replace_input_of(phi, 2, current_state); + + // Replace uses of old last store with current_state (new last store) + // Do it in two loops: first find all the uses, and change the graph + // in as second loop so that we do not break the iterator. + Node* last_store = old_last_store_in_slice.at(alias_idx); + assert(last_store != nullptr, "we have a old last store"); + uses_after_loop.clear(); + for (DUIterator_Fast kmax, k = last_store->fast_outs(kmax); k < kmax; k++) { + Node* use = last_store->fast_out(k); + if (!in_bb(use)) { + uses_after_loop.push(use); + } + } + for (uint k = 0; k < uses_after_loop.size(); k++) { + Node* use = uses_after_loop.at(k); + for (uint j = 0; j < use->req(); j++) { + Node* def = use->in(j); + if (def == last_store) { + _igvn.replace_input_of(use, j, current_state); } } } } - - Node *insert_pt = sched_up ? 
uip : lip; - - // all uses of insert_pt's memory state should use current's instead - for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) { - Node* use = insert_pt->out(i); - if (use->is_Mem()) { - assert(use->in(MemNode::Memory) == insert_pt, "must be"); - _igvn.replace_input_of(use, MemNode::Memory, current); - --i; //deleted this edge; rescan position - } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) { - uint pos; //lip (lower insert point) must be the last one in the memory slice - for (pos=1; pos < use->req(); pos++) { - if (use->in(pos) == insert_pt) break; - } - _igvn.replace_input_of(use, pos, current); - --i; - } - } - - //connect current to insert_pt - _igvn.replace_input_of(current, MemNode::Memory, insert_pt); -} - -//------------------------------co_locate_pack---------------------------------- -// To schedule a store pack, we need to move any sandwiched memory ops either before -// or after the pack, based upon dependence information: -// (1) If any store in the pack depends on the sandwiched memory op, the -// sandwiched memory op must be scheduled BEFORE the pack; -// (2) If a sandwiched memory op depends on any store in the pack, the -// sandwiched memory op must be scheduled AFTER the pack; -// (3) If a sandwiched memory op (say, memA) depends on another sandwiched -// memory op (say memB), memB must be scheduled before memA. So, if memA is -// scheduled before the pack, memB must also be scheduled before the pack; -// (4) If there is no dependence restriction for a sandwiched memory op, we simply -// schedule this store AFTER the pack -// (5) We know there is no dependence cycle, so there in no other case; -// (6) Finally, all memory ops in another single pack should be moved in the same direction. -// -// To schedule a load pack, we use the memory state of either the first or the last load in -// the pack, based on the dependence constraint. -void SuperWord::co_locate_pack(Node_List* pk) { - if (pk->at(0)->is_Store()) { - MemNode* first = executed_first(pk)->as_Mem(); - MemNode* last = executed_last(pk)->as_Mem(); - Unique_Node_List schedule_before_pack; - Unique_Node_List memops; - - MemNode* current = last->in(MemNode::Memory)->as_Mem(); - MemNode* previous = last; - while (true) { - assert(in_bb(current), "stay in block"); - memops.push(previous); - for (DUIterator i = current->outs(); current->has_out(i); i++) { - Node* use = current->out(i); - if (use->is_Mem() && use != previous) - memops.push(use); - } - if (current == first) break; - previous = current; - current = current->in(MemNode::Memory)->as_Mem(); - } - - // determine which memory operations should be scheduled before the pack - for (uint i = 1; i < memops.size(); i++) { - Node *s1 = memops.at(i); - if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) { - for (uint j = 0; j< i; j++) { - Node *s2 = memops.at(j); - if (!independent(s1, s2)) { - if (in_pack(s2, pk) || schedule_before_pack.member(s2)) { - schedule_before_pack.push(s1); // s1 must be scheduled before - Node_List* mem_pk = my_pack(s1); - if (mem_pk != nullptr) { - for (uint ii = 0; ii < mem_pk->size(); ii++) { - Node* s = mem_pk->at(ii); // follow partner - if (memops.member(s) && !schedule_before_pack.member(s)) - schedule_before_pack.push(s); - } - } - break; - } - } - } - } - } - - Node* upper_insert_pt = first->in(MemNode::Memory); - // Following code moves loads connected to upper_insert_pt below aliased stores. - // Collect such loads here and reconnect them back to upper_insert_pt later. 
- memops.clear(); - for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) { - Node* use = upper_insert_pt->out(i); - if (use->is_Mem() && !use->is_Store()) { - memops.push(use); - } - } - - MemNode* lower_insert_pt = last; - previous = last; //previous store in pk - current = last->in(MemNode::Memory)->as_Mem(); - - // start scheduling from "last" to "first" - while (true) { - assert(in_bb(current), "stay in block"); - assert(in_pack(previous, pk), "previous stays in pack"); - Node* my_mem = current->in(MemNode::Memory); - - if (in_pack(current, pk)) { - // Forward users of my memory state (except "previous) to my input memory state - for (DUIterator i = current->outs(); current->has_out(i); i++) { - Node* use = current->out(i); - if (use->is_Mem() && use != previous) { - assert(use->in(MemNode::Memory) == current, "must be"); - if (schedule_before_pack.member(use)) { - _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt); - } else { - _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt); - } - --i; // deleted this edge; rescan position - } - } - previous = current; - } else { // !in_pack(current, pk) ==> a sandwiched store - remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack); - } - - if (current == first) break; - current = my_mem->as_Mem(); - } // end while - - // Reconnect loads back to upper_insert_pt. - for (uint i = 0; i < memops.size(); i++) { - Node *ld = memops.at(i); - if (ld->in(MemNode::Memory) != upper_insert_pt) { - _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt); - } - } - } else if (pk->at(0)->is_Load()) { // Load pack - // All loads in the pack should have the same memory state. By default, - // we use the memory state of the last load. However, if any load could - // not be moved down due to the dependence constraint, we use the memory - // state of the first load. - Node* mem_input = pick_mem_state(pk); - _igvn.hash_delete(mem_input); - // Give each load the same memory state - for (uint i = 0; i < pk->size(); i++) { - LoadNode* ld = pk->at(i)->as_Load(); - _igvn.replace_input_of(ld, MemNode::Memory, mem_input); - } - } -} - -// Finds the first and last memory state and then picks either of them by checking dependence constraints. -// If a store is dependent on an earlier load then we need to pick the memory state of the first load and cannot -// pick the memory state of the last load. -Node* SuperWord::pick_mem_state(Node_List* pk) { - Node* first_mem = find_first_mem_state(pk); - bool is_dependent = false; - Node* last_mem = find_last_mem_state(pk, first_mem, is_dependent); - - for (uint i = 0; i < pk->size(); i++) { - Node* ld = pk->at(i); - for (Node* current = last_mem; current != ld->in(MemNode::Memory); current = current->in(MemNode::Memory)) { - assert(current->is_Mem() && in_bb(current), "unexpected memory"); - assert(current != first_mem, "corrupted memory graph"); - if (!independent(current, ld)) { - // A later unvectorized store depends on this load, pick the memory state of the first load. This can happen, - // for example, if a load pack has interleaving stores that are part of a store pack which, however, is removed - // at the pack filtering stage. This leaves us with only a load pack for which we cannot take the memory state - // of the last load as the remaining unvectorized stores could interfere since they have a dependency to the loads. - // Some stores could be executed before the load vector resulting in a wrong result. 
We need to take the - // memory state of the first load to prevent this. - if (my_pack(current) != nullptr && is_dependent) { - // For vectorized store pack, when the load pack depends on - // some memory operations locating after first_mem, we still - // take the memory state of the last load. - continue; - } - return first_mem; - } - } - } - return last_mem; -} - -// Walk the memory graph from the current first load until the -// start of the loop and check if nodes on the way are memory -// edges of loads in the pack. The last one we encounter is the -// first load. -Node* SuperWord::find_first_mem_state(Node_List* pk) { - Node* first_mem = pk->at(0)->in(MemNode::Memory); - for (Node* current = first_mem; in_bb(current); current = current->is_Phi() ? current->in(LoopNode::EntryControl) : current->in(MemNode::Memory)) { - assert(current->is_Mem() || (current->is_Phi() && current->in(0) == bb()), "unexpected memory"); - for (uint i = 1; i < pk->size(); i++) { - Node* ld = pk->at(i); - if (ld->in(MemNode::Memory) == current) { - first_mem = current; - break; - } - } - } - return first_mem; -} - -// Find the last load by going over the pack again and walking -// the memory graph from the loads of the pack to the memory of -// the first load. If we encounter the memory of the current last -// load, then we started from further down in the memory graph and -// the load we started from is the last load. At the same time, the -// function also helps determine if some loads in the pack depend on -// early memory operations which locate after first_mem. -Node* SuperWord::find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent) { - Node* last_mem = pk->at(0)->in(MemNode::Memory); - for (uint i = 0; i < pk->size(); i++) { - Node* ld = pk->at(i); - for (Node* current = ld->in(MemNode::Memory); current != first_mem; current = current->in(MemNode::Memory)) { - assert(current->is_Mem() && in_bb(current), "unexpected memory"); - // Determine if the load pack is dependent on some memory operations locating after first_mem. - is_dependent |= !independent(current, ld); - if (current->in(MemNode::Memory) == last_mem) { - last_mem = ld->in(MemNode::Memory); - } - } - } - return last_mem; } #ifndef PRODUCT @@ -3037,12 +2903,14 @@ bool SuperWord::output() { for (int i = 0; i < _block.length(); i++) { Node* n = _block.at(i); Node_List* p = my_pack(n); - if (p && n == executed_last(p)) { + if (p != nullptr && n == p->at(p->size()-1)) { + // After schedule_reorder_memops, we know that the memops have the same order in the pack + // as in the memory slice. Hence, "first" is the first memop in the slice from the pack, + // and "n" is the last node in the slice from the pack. 
+ Node* first = p->at(0); uint vlen = p->size(); uint vlen_in_bytes = 0; Node* vn = nullptr; - Node* low_adr = p->at(0); - Node* first = executed_first(p); if (cl->is_rce_post_loop()) { // override vlen with the main loops vector length vlen = cl->slp_max_unroll(); @@ -3066,7 +2934,7 @@ bool SuperWord::output() { break; // dependent memory } } - Node* adr = low_adr->in(MemNode::Address); + Node* adr = first->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); if (cl->is_rce_post_loop()) { assert(vmask != nullptr, "vector mask should be generated"); @@ -3089,7 +2957,7 @@ bool SuperWord::output() { Node* ctl = n->in(MemNode::Control); Node* mem = first->in(MemNode::Memory); - Node* adr = low_adr->in(MemNode::Address); + Node* adr = first->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); if (cl->is_rce_post_loop()) { assert(vmask != nullptr, "vector mask should be generated"); @@ -3100,8 +2968,8 @@ bool SuperWord::output() { } vlen_in_bytes = vn->as_StoreVector()->memory_size(); } else if (VectorNode::is_scalar_rotate(n)) { - Node* in1 = low_adr->in(1); - Node* in2 = p->at(0)->in(2); + Node* in1 = first->in(1); + Node* in2 = first->in(2); // If rotation count is non-constant or greater than 8bit value create a vector. if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) { in2 = vector_opd(p, 2); @@ -3110,7 +2978,7 @@ bool SuperWord::output() { vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else if (VectorNode::is_roundopD(n)) { Node* in1 = vector_opd(p, 1); - Node* in2 = low_adr->in(2); + Node* in2 = first->in(2); assert(in2->is_Con(), "Constant rounding mode expected."); vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); @@ -3133,7 +3001,7 @@ bool SuperWord::output() { bool node_isa_reduction = is_marked_reduction(n); if (node_isa_reduction) { // the input to the first reduction operation is retained - in1 = low_adr->in(1); + in1 = first->in(1); } else { in1 = vector_opd(p, 1); if (in1 == nullptr) { @@ -3201,7 +3069,7 @@ bool SuperWord::output() { Node* in = vector_opd(p, 1); Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG); _igvn.register_new_node_with_optimizer(longval); - _phase->set_ctrl(longval, _phase->get_ctrl(p->at(0))); + _phase->set_ctrl(longval, _phase->get_ctrl(first)); vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else if (VectorNode::is_convert_opcode(opc)) { @@ -3316,7 +3184,7 @@ bool SuperWord::output() { _block.at_put(i, vn); _igvn.register_new_node_with_optimizer(vn); - _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0))); + _phase->set_ctrl(vn, _phase->get_ctrl(first)); for (uint j = 0; j < p->size(); j++) { Node* pm = p->at(j); _igvn.replace_node(pm, vn); @@ -4196,17 +4064,6 @@ bool SuperWord::in_packset(Node* s1, Node* s2) { return false; } -//------------------------------in_pack--------------------------- -// Is s in pack p? -Node_List* SuperWord::in_pack(Node* s, Node_List* p) { - for (uint i = 0; i < p->size(); i++) { - if (p->at(i) == s) { - return p; - } - } - return nullptr; -} - //------------------------------remove_pack_at--------------------------- // Remove the pack at position pos in the packset void SuperWord::remove_pack_at(int pos) { @@ -4239,39 +4096,6 @@ void SuperWord::packset_sort(int n) { } } -//------------------------------executed_first--------------------------- -// Return the node executed first in pack p. 
Uses the RPO block list -// to determine order. -Node* SuperWord::executed_first(Node_List* p) { - Node* n = p->at(0); - int n_rpo = bb_idx(n); - for (uint i = 1; i < p->size(); i++) { - Node* s = p->at(i); - int s_rpo = bb_idx(s); - if (s_rpo < n_rpo) { - n = s; - n_rpo = s_rpo; - } - } - return n; -} - -//------------------------------executed_last--------------------------- -// Return the node executed last in pack p. -Node* SuperWord::executed_last(Node_List* p) { - Node* n = p->at(0); - int n_rpo = bb_idx(n); - for (uint i = 1; i < p->size(); i++) { - Node* s = p->at(i); - int s_rpo = bb_idx(s); - if (s_rpo > n_rpo) { - n = s; - n_rpo = s_rpo; - } - } - return n; -} - LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) { LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest; for (uint i = 0; i < p->size(); i++) { @@ -4547,16 +4371,6 @@ void SuperWord::print_stmt(Node* s) { #endif } -//------------------------------blank--------------------------- -char* SuperWord::blank(uint depth) { - static char blanks[101]; - assert(depth < 101, "too deep"); - for (uint i = 0; i < depth; i++) blanks[i] = ' '; - blanks[depth] = '\0'; - return blanks; -} - - //==============================SWPointer=========================== #ifndef PRODUCT int SWPointer::Tracer::_depth = 0; diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 70e97e9444c..cf358b5d439 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -458,7 +458,9 @@ class SuperWord : public ResourceObj { bool same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const; // my_pack + public: Node_List* my_pack(Node* n) { return !in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; } + private: void set_my_pack(Node* n, Node_List* p) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; } // is pack good for converting into one vector node replacing bunches of Cmp, Bool, CMov nodes. bool is_cmov_pack(Node_List* p); @@ -618,19 +620,10 @@ private: void merge_packs_to_cmove(); // Verify that for every pack, all nodes are mutually independent DEBUG_ONLY(void verify_packs();) - // Remove cycles in packset. - void remove_cycles(); // Adjust the memory graph for the packed operations void schedule(); - // Remove "current" from its current position in the memory graph and insert - // it after the appropriate insert points (lip or uip); - void remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, Node *uip, Unique_Node_List &schd_before); - // Within a store pack, schedule stores together by moving out the sandwiched memory ops according - // to dependence info; and within a load pack, move loads down to the last executed load. - void co_locate_pack(Node_List* p); - Node* pick_mem_state(Node_List* pk); - Node* find_first_mem_state(Node_List* pk); - Node* find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent); + // Helper function for schedule, that reorders all memops, slice by slice, according to the schedule + void schedule_reorder_memops(Node_List &memops_schedule); // Convert packs into vector node operations bool output(); @@ -662,19 +655,11 @@ private: void compute_vector_element_type(); // Are s1 and s2 in a pack pair and ordered as s1,s2? bool in_packset(Node* s1, Node* s2); - // Is s in pack p? - Node_List* in_pack(Node* s, Node_List* p); // Remove the pack at position pos in the packset void remove_pack_at(int pos); - // Return the node executed first in pack p. 
- Node* executed_first(Node_List* p); - // Return the node executed last in pack p. - Node* executed_last(Node_List* p); static LoadNode::ControlDependency control_dependency(Node_List* p); // Alignment within a vector memory reference int memory_alignment(MemNode* s, int iv_adjust); - // (Start, end] half-open range defining which operands are vector - void vector_opd_range(Node* n, uint* start, uint* end); // Smallest type containing range of values const Type* container_type(Node* n); // Adjust pre-loop limit so that in main loop, a load/store reference @@ -693,7 +678,6 @@ private: void print_pack(Node_List* p); void print_bb(); void print_stmt(Node* s); - char* blank(uint depth); void packset_sort(int n); }; diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java index 870c4be3931..c384253d6e6 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java @@ -27,9 +27,8 @@ * @bug 8304042 * @summary Test some examples with independent packs with cyclic dependency * between the packs. - * @requires vm.compiler2.enabled * @requires vm.bits == 64 - * @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*" + * @requires vm.compiler2.enabled * @modules java.base/jdk.internal.misc * @library /test/lib / * @run driver compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency @@ -94,8 +93,8 @@ public class TestIndependentPacksWithCyclicDependency { test3(goldI3, goldI3, goldF3, goldF3); init(goldI4, goldF4); test4(goldI4, goldI4, goldF4, goldF4); -// init(goldI5, goldF5); -// test5(goldI5, goldI5, goldF5, goldF5); + init(goldI5, goldF5); + test5(goldI5, goldI5, goldF5, goldF5); init(goldI6, goldF6, goldL6); test6(goldI6, goldI6, goldF6, goldF6, goldL6, goldL6); init(goldI7, goldF7, goldL7); @@ -232,29 +231,27 @@ public class TestIndependentPacksWithCyclicDependency { } } -// TODO uncomment after fixing JDK-8304720 -// -// @Run(test = "test5") -// public void runTest5() { -// int[] dataI = new int[RANGE]; -// float[] dataF = new float[RANGE]; -// init(dataI, dataF); -// test5(dataI, dataI, dataF, dataF); -// verify("test5", dataI, goldI5); -// verify("test5", dataF, goldF5); -// } -// -// @Test -// static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { -// for (int i = 0; i < RANGE; i+=2) { -// // same as test2, except that reordering leads to different semantics -// // explanation analogue to test4 -// unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A -// dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X -// dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y -// unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B -// } -// } + @Run(test = "test5") + public void runTest5() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test5(dataI, dataI, dataF, dataF); + verify("test5", dataI, goldI5); + verify("test5", dataF, goldF5); + } + + @Test + static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // same as test2, except that reordering leads to different semantics + // explanation analogue to test4 + 
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A + dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X + dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B + } + } @Run(test = "test6") public void runTest6() { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java index 328719daf58..31a2ba3a0cb 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java @@ -29,8 +29,6 @@ * between the packs. * Before fix, this hit: "assert(!is_visited) failed: visit only once" * @requires vm.compiler2.enabled - * @requires vm.bits == 64 - * @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*" * @modules java.base/jdk.internal.misc * @library /test/lib / * @run main/othervm -XX:LoopUnrollLimit=250 diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java new file mode 100644 index 00000000000..f74c19d1b8b --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + + +/* + * @test + * @bug 8304720 + * @summary Test some examples where non-vectorized memops also need to + * be reordered during SuperWord::schedule. 
+ * @requires vm.compiler2.enabled + * @modules java.base/jdk.internal.misc + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestScheduleReordersScalarMemops + */ + +package compiler.loopopts.superword; + +import jdk.internal.misc.Unsafe; +import jdk.test.lib.Asserts; +import compiler.lib.ir_framework.*; + +public class TestScheduleReordersScalarMemops { + static final int RANGE = 1024; + static final int ITER = 10_000; + static Unsafe unsafe = Unsafe.getUnsafe(); + + int[] goldI0 = new int[RANGE]; + float[] goldF0 = new float[RANGE]; + int[] goldI1 = new int[RANGE]; + float[] goldF1 = new float[RANGE]; + + public static void main(String args[]) { + TestFramework.runWithFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::test*", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::verify", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::init", + "-XX:LoopUnrollLimit=1000", + "-XX:-TieredCompilation", "-Xbatch"); + } + + TestScheduleReordersScalarMemops() { + // compute the gold standard in interpreter mode + init(goldI0, goldF0); + test0(goldI0, goldI0, goldF0, goldF0); + init(goldI1, goldF1); + test1(goldI1, goldI1, goldF1, goldF1); + } + + @Run(test = "test0") + @Warmup(100) + public void runTest0() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test0(dataI, dataI, dataF, dataF); + verify("test0", dataI, goldI0); + verify("test0", dataF, goldF0); + } + + @Test + @IR(counts = {IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) + static void test0(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // We have dependency edges: + // A -> X + // Y -> B + // Still, we can vectorize [X,Y]. + // We do not vectorize A and B, because they are not isomorphic (add vs mul). + // + // Imagine this is unrolled at least 2x. + // We get order: A0 X0 Y0 B0 A1 X1 Y1 B1 + // Vectorized: X0 Y0 X1 Y1 + // Scalar: A0 B0 A1 B1 + // + // However, since the As need to be before, and the Bs after the vector operations, + // we need to have all As before all Bs. This means we need to reorder the scalar + // operations, and not just the vectorized ones. + // + // A correct reordering would be: A0 A1 [X0, Y0, X1, Y1] B0 B1 + // + dataFa[i + 0] = dataIa[i + 0] * 1.3f; // A *1.3 + dataIb[i + 0] = (int)dataFb[i + 0] * 11; // X *11 + dataIb[i + 1] = (int)dataFb[i + 1] * 11; // Y *11 + dataFa[i + 1] = dataIa[i + 1] + 1.2f; // B +1.2 + } + } + + @Run(test = "test1") + @Warmup(100) + public void runTest1() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test1(dataI, dataI, dataF, dataF); + verify("test1", dataI, goldI1); + verify("test1", dataF, goldF1); + } + + @Test + @IR(counts = {IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) + static void test1(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // Do the same as test0, but without int-float conversion. + // This should reproduce on machines where conversion is not implemented. 
+ unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A +1 + dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X + dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] * 11); // B *11 + } + } + + static void init(int[] dataI, float[] dataF) { + for (int i = 0; i < RANGE; i++) { + dataI[i] = i + 1; + dataF[i] = i + 0.1f; + } + } + + static void verify(String name, int[] data, int[] gold) { + for (int i = 0; i < RANGE; i++) { + if (data[i] != gold[i]) { + throw new RuntimeException(" Invalid " + name + " result: data[" + i + "]: " + data[i] + " != " + gold[i]); + } + } + } + + static void verify(String name, float[] data, float[] gold) { + for (int i = 0; i < RANGE; i++) { + int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i); + int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i); + if (datav != goldv) { + throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv); + } + } + } +}
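
The scheduling step introduced above (PacksetGraph::schedule) is a standard worklist topological sort: every node whose remaining in-count is zero is pulled onto the worklist, each scheduled node decrements the in-count of its successors, and if not every node gets scheduled the graph contained a cycle and the packset is dropped. What follows is a minimal standalone sketch of that idea, not the HotSpot code; the names (ToyPacksetGraph, add_edge, main) and the example graph are purely illustrative.

#include <cstdio>
#include <vector>

// Toy version of the pid-based dependency graph: node ids run from 1 to max_pid,
// out[] holds the out-edges, incnt[] the number of not-yet-scheduled in-edges.
struct ToyPacksetGraph {
  std::vector<std::vector<int>> out;
  std::vector<int> incnt;

  explicit ToyPacksetGraph(int max_pid) : out(max_pid + 1), incnt(max_pid + 1, 0) {}

  void add_edge(int pred, int succ) {   // succ depends on pred
    out[pred].push_back(succ);
    incnt[succ]++;
  }

  // Kahn-style topological sort. Returns true iff every node was scheduled,
  // i.e. the graph is acyclic (the analogue of schedule_success()).
  bool schedule(std::vector<int>& order) {
    for (int pid = 1; pid < (int)incnt.size(); pid++) {
      if (incnt[pid] == 0) order.push_back(pid);        // no precedence: schedule directly
    }
    for (size_t i = 0; i < order.size(); i++) {
      for (int use : out[order[i]]) {
        if (--incnt[use] == 0) order.push_back(use);    // all predecessors scheduled
      }
    }
    return order.size() == incnt.size() - 1;
  }
};

int main() {
  ToyPacksetGraph g(4);
  g.add_edge(1, 2);   // 2 depends on 1
  g.add_edge(2, 3);
  g.add_edge(1, 4);
  std::vector<int> order;
  bool acyclic = g.schedule(order);
  std::printf("acyclic=%d order:", acyclic);
  for (int pid : order) std::printf(" %d", pid);
  std::printf("\n");
  return 0;
}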
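
schedule_reorder_memops then walks the linear memop schedule once while tracking a current memory state per slice: each memop is rewired to consume the current state of its slice, a store becomes the new state, and after the walk the last store of each slice feeds the memory Phi's backedge. Below is a small self-contained model of that single pass, under the simplifying assumption that every slice starts from a Phi (the real code handles load-only slices without a Phi separately); the names (MemOp, reorder_memops) and the example schedule are made up for illustration.

#include <cstdio>
#include <string>
#include <vector>

// A memop in the scheduled order: a load or store on one memory slice.
struct MemOp {
  std::string name;
  bool is_store;
  int slice;            // alias index of the slice
  std::string mem_in;   // filled in: the memory state this op consumes
};

// Single pass over the schedule, all slices handled in parallel.
// Returns, per slice, the final state (the new last store for the Phi backedge).
std::vector<std::string> reorder_memops(std::vector<MemOp>& schedule, int num_slices) {
  std::vector<std::string> current_state(num_slices);
  for (int s = 0; s < num_slices; s++) {
    current_state[s] = "phi_" + std::to_string(s);    // state on loop entry
  }
  for (MemOp& op : schedule) {
    op.mem_in = current_state[op.slice];              // consume current state of the slice
    if (op.is_store) {
      current_state[op.slice] = op.name;              // a store defines the new state
    }
  }
  return current_state;
}

int main() {
  // An order like the one discussed in the new test's comments: scalar stores A/B
  // around a vectorizable group X/Y (here modeled as loads on a second slice).
  std::vector<MemOp> schedule = {
    {"A0", true, 0}, {"A1", true, 0},
    {"X0", false, 1}, {"Y0", false, 1}, {"X1", false, 1}, {"Y1", false, 1},
    {"B0", true, 0}, {"B1", true, 0},
  };
  std::vector<std::string> last_store = reorder_memops(schedule, 2);
  for (const MemOp& op : schedule) {
    std::printf("%s consumes %s\n", op.name.c_str(), op.mem_in.c_str());
  }
  std::printf("slice 0 backedge now: %s\n", last_store[0].c_str());
  return 0;
}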