8304720: SuperWord::schedule should rebuild C2-graph from SuperWord dependency-graph
Reviewed-by: kvn, fgao
parent 495f2688d6
commit ad0e5a99ca
@@ -669,8 +669,6 @@ bool SuperWord::SLP_extract() {
DEBUG_ONLY(verify_packs();)
remove_cycles();
schedule();
// Record eventual count of vector packs for checks in post loop vectorization
@@ -1431,11 +1429,13 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
return false;
}
// FIXME - co_locate_pack fails on Stores in different mem-slices, so
// only pack memops that are in the same alias set until that's fixed.
// Adjacent memory references must be on the same slice.
if (!same_memory_slice(s1->as_Mem(), s2->as_Mem())) {
return false;
}
// Adjacent memory references must have the same base, be comparable
// and have the correct distance between them.
SWPointer p1(s1->as_Mem(), this, nullptr, false);
SWPointer p2(s2->as_Mem(), this, nullptr, false);
if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
@@ -2497,10 +2497,13 @@ class PacksetGraph {
private:
// pid: packset graph node id.
GrowableArray<int> _pid; // bb_idx(n) -> pid
GrowableArray<Node*> _pid_to_node; // one node per pid, find rest via my_pack
GrowableArray<GrowableArray<int>> _out; // out-edges
GrowableArray<int> _incnt; // number of (implicit) in-edges
int _max_pid = 0;
bool _schedule_success;
SuperWord* _slp;
public:
PacksetGraph(SuperWord* slp)
@@ -2523,11 +2526,18 @@ public:
assert(poz != 0, "pid should not be zero");
return poz;
}
void set_pid(const Node* n, int pid) {
void set_pid(Node* n, int pid) {
assert(n != nullptr && pid > 0, "sane inputs");
assert(_slp->in_bb(n), "must be");
int idx = _slp->bb_idx(n);
_pid.at_put_grow(idx, pid);
_pid_to_node.at_put_grow(pid - 1, n, nullptr);
}
Node* get_node(int pid) {
assert(pid > 0 && pid <= _pid_to_node.length(), "pid must be mapped");
Node* n = _pid_to_node.at(pid - 1);
assert(n != nullptr, "sanity");
return n;
}
int new_pid() {
_incnt.push(0);
@@ -2537,6 +2547,7 @@ public:
int incnt(int pid) { return _incnt.at(pid - 1); }
void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); }
GrowableArray<int>& out(int pid) { return _out.at(pid - 1); }
bool schedule_success() const { return _schedule_success; }
// Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds.
void build() {
@@ -2550,6 +2561,7 @@ public:
for (uint k = 0; k < p->size(); k++) {
Node* n = p->at(k);
set_pid(n, pid);
assert(_slp->my_pack(n) == p, "matching packset");
}
}
@@ -2565,6 +2577,7 @@ public:
if (pid == 0) {
pid = new_pid();
set_pid(n, pid);
assert(_slp->my_pack(n) == nullptr || UseVectorCmov, "no packset");
}
}
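
Editorial note: the build() step above condenses the C2 dependency graph by the packset. Every pack becomes one PacksetGraph node and every remaining scalar node gets its own node. Below is a minimal standalone sketch of that condensation idea, using plain C++ with invented integer node ids and a toy edge list; it is not HotSpot code.

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Toy sketch: condense a node-level dependency graph into a "packset graph".
// Nodes that belong to the same pack share one pid; every other node gets its
// own pid. Edges between pids are derived from the node-level edges, skipping
// edges whose endpoints end up in the same pack.
int main() {
  const int num_nodes = 6;
  // Invented example: packs {0,1} and {4,5}; nodes 2 and 3 stay scalar.
  std::vector<std::vector<int>> packs = {{0, 1}, {4, 5}};
  // Node-level dependency edges (pred -> succ), analogous to DepPreds.
  std::vector<std::pair<int, int>> dep_edges = {{0, 2}, {1, 3}, {2, 4}, {3, 5}};

  std::vector<int> pid(num_nodes, 0);             // node -> pid (0 = unassigned)
  int max_pid = 0;
  for (auto& p : packs) {                         // one pid per pack
    int id = ++max_pid;
    for (int n : p) pid[n] = id;
  }
  for (int n = 0; n < num_nodes; n++) {           // one pid per remaining scalar
    if (pid[n] == 0) pid[n] = ++max_pid;
  }

  std::vector<std::vector<int>> out(max_pid + 1); // pid -> successor pids
  std::vector<int> incnt(max_pid + 1, 0);         // pid -> number of in-edges
  for (auto& e : dep_edges) {
    int from = pid[e.first];
    int to   = pid[e.second];
    if (from == to) continue;                     // intra-pack edge: dropped
    out[from].push_back(to);
    incnt[to]++;
  }

  for (int p = 1; p <= max_pid; p++) {
    std::printf("pid %d: incnt=%d out:", p, incnt[p]);
    for (int s : out[p]) std::printf(" %d", s);
    std::printf("\n");
  }
  return 0;
}
```

Intra-pack edges are simply dropped in this sketch; in SuperWord that is safe because verify_packs() has already established that the members of each pack are mutually independent.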
@@ -2610,11 +2623,13 @@ public:
}
}
}
// Schedule the graph to worklist. Returns true iff all nodes were scheduled.
// This implies that we return true iff the PacksetGraph is acyclic.
// We schedule with topological sort: schedule any node that has zero incnt.
// Then remove that node, which decrements the incnt of all its uses (outputs).
bool schedule() {
// Schedule nodes of PacksetGraph to worklist, using topsort: schedule a node
// that has zero incnt. If a PacksetGraph node corresponds to memops, then add
// those to the memops_schedule. At the end, we return the memops_schedule, and
// note if topsort was successful.
Node_List schedule() {
Node_List memops_schedule;
GrowableArray<int> worklist;
// Directly schedule all nodes without precedence
for (int pid = 1; pid <= _max_pid; pid++) {
@@ -2625,6 +2640,22 @@ public:
// Continue scheduling via topological sort
for (int i = 0; i < worklist.length(); i++) {
int pid = worklist.at(i);
// Add memops to memops_schedule
Node* n = get_node(pid);
Node_List* p = _slp->my_pack(n);
if (n->is_Mem()) {
if (p == nullptr) {
memops_schedule.push(n);
} else {
for (uint k = 0; k < p->size(); k++) {
memops_schedule.push(p->at(k));
assert(p->at(k)->is_Mem(), "only schedule memops");
}
}
}
// Decrement incnt for all successors
for (int j = 0; j < out(pid).length(); j++){
int pid_use = out(pid).at(j);
int incnt_use = incnt(pid_use) - 1;
@@ -2635,9 +2666,12 @@ public:
}
}
}
// Was every pid scheduled?
return worklist.length() == _max_pid;
// Was every pid scheduled? If not, we found some cycles in the PacksetGraph.
_schedule_success = (worklist.length() == _max_pid);
return memops_schedule;
}
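
Editorial note: the scheduling loop above is a standard worklist topological sort (Kahn's algorithm). The following is a self-contained sketch with an invented four-node graph, not HotSpot code.

```cpp
#include <cstdio>
#include <vector>

// Toy sketch of the scheduling step: Kahn's topological sort over a small
// "packset graph". Nodes with zero in-count are scheduled first; scheduling a
// node decrements the in-count of its successors. If not every node gets
// scheduled, the graph has a cycle and vectorization has to bail out.
int main() {
  const int num_pids = 4;
  // Invented diamond-shaped example: 1 -> 2, 1 -> 3, 2 -> 4, 3 -> 4.
  // Index 0 is unused so that pids can be used directly as indices.
  std::vector<std::vector<int>> out = {{}, {2, 3}, {4}, {4}, {}};
  std::vector<int> incnt = {0, 0, 1, 1, 2};

  std::vector<int> worklist;
  for (int pid = 1; pid <= num_pids; pid++) {
    if (incnt[pid] == 0) worklist.push_back(pid);   // no precedence: schedule now
  }
  for (size_t i = 0; i < worklist.size(); i++) {
    int pid = worklist[i];
    for (int use : out[pid]) {
      if (--incnt[use] == 0) worklist.push_back(use);
    }
  }

  bool schedule_success = (worklist.size() == (size_t)num_pids);
  std::printf("schedule_success=%d order:", schedule_success ? 1 : 0);
  for (int pid : worklist) std::printf(" %d", pid);
  std::printf("\n");
  return 0;
}
```

If the worklist ends up shorter than the number of nodes, some nodes never reached an in-count of zero, i.e. the graph has a cycle; that is exactly what schedule_success reports back to SuperWord::schedule.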
// Print the PacksetGraph.
// print_nodes = true: print all C2 nodes beloning to PacksetGrahp node.
// print_zero_incnt = false: do not print nodes that have no in-edges (any more).
@@ -2668,303 +2702,135 @@ public:
}
};
//------------------------------remove_cycles---------------------------
// We now know that we only have independent packs, see verify_packs.
// This is a necessary but not a sufficient condition for an acyclic
// graph (DAG) after scheduling. Thus, we must check if the packs have
// introduced a cycle. The SuperWord paper mentions the need for this
// in "3.7 Scheduling".
// Approach: given all nodes from the _block, we create a new graph.
// The nodes that are not in a pack are their own nodes (scalar-node)
// in that new graph. Every pack is also a node (pack-node). We then
// add the edges according to DepPreds: a scalar-node has all edges
// to its node's DepPreds. A pack-node has all edges from every pack
// member to all their DepPreds.
void SuperWord::remove_cycles() {
// The C2 graph (specifically the memory graph), needs to be re-ordered.
// (1) Build the PacksetGraph. It combines the DepPreds graph with the
//     packset. The PacksetGraph gives us the dependencies that must be
//     respected after scheduling.
// (2) Schedule the PacksetGraph to the memops_schedule, which represents
//     a linear order of all memops in the body. The order respects the
//     dependencies of the PacksetGraph.
// (3) If the PacksetGraph has cycles, we cannot schedule. Abort.
// (4) Use the memops_schedule to re-order the memops in all slices.
void SuperWord::schedule() {
if (_packset.length() == 0) {
return; // empty packset
}
ResourceMark rm;
// (1) Build the PacksetGraph.
PacksetGraph graph(this);
graph.build();
if (!graph.schedule()) {
// (2) Schedule the PacksetGraph.
Node_List memops_schedule = graph.schedule();
// (3) Check if the PacksetGraph schedule succeeded (had no cycles).
// We now know that we only have independent packs, see verify_packs.
// This is a necessary but not a sufficient condition for an acyclic
// graph (DAG) after scheduling. Thus, we must check if the packs have
// introduced a cycle. The SuperWord paper mentions the need for this
// in "3.7 Scheduling".
if (!graph.schedule_success()) {
if (TraceSuperWord) {
tty->print_cr("remove_cycles found cycle in PacksetGraph:");
tty->print_cr("SuperWord::schedule found cycle in PacksetGraph:");
graph.print(true, false);
tty->print_cr("removing all packs from packset.");
}
_packset.clear();
return;
}
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("SuperWord::schedule: memops_schedule:");
memops_schedule.dump();
}
#endif
// (4) Use the memops_schedule to re-order the memops in all slices.
schedule_reorder_memops(memops_schedule);
}
//------------------------------schedule---------------------------
// Adjust the memory graph for the packed operations
void SuperWord::schedule() {
// Co-locate in the memory graph the members of each memory pack
for (int i = 0; i < _packset.length(); i++) {
co_locate_pack(_packset.at(i));
// Reorder the memory graph for all slices in parallel. We walk over the schedule once,
// and track the current memory state of each slice.
void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
int max_slices = _phase->C->num_alias_types();
// When iterating over the memops_schedule, we keep track of the current memory state,
// which is the Phi or a store in the loop.
GrowableArray<Node*> current_state_in_slice(max_slices, max_slices, nullptr);
// The memory state after the loop is the last store inside the loop. If we reorder the
// loop we may have a different last store, and we need to adjust the uses accordingly.
GrowableArray<Node*> old_last_store_in_slice(max_slices, max_slices, nullptr);
// (1) Set up the initial memory state from Phi. And find the old last store.
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* phi = _mem_slice_head.at(i);
assert(phi->is_Phi(), "must be phi");
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
current_state_in_slice.at_put(alias_idx, phi);
// If we have a memory phi, we have a last store in the loop, find it over backedge.
StoreNode* last_store = phi->in(2)->as_Store();
old_last_store_in_slice.at_put(alias_idx, last_store);
}
}
//-------------------------------remove_and_insert-------------------
// Remove "current" from its current position in the memory graph and insert
// it after the appropriate insertion point (lip or uip).
void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
Node *uip, Unique_Node_List &sched_before) {
Node* my_mem = current->in(MemNode::Memory);
bool sched_up = sched_before.member(current);
// (2) Walk over memops_schedule, append memops to the current state
//     of that slice. If it is a Store, we take it as the new state.
for (uint i = 0; i < memops_schedule.size(); i++) {
MemNode* n = memops_schedule.at(i)->as_Mem();
assert(n->is_Load() || n->is_Store(), "only loads or stores");
int alias_idx = _phase->C->get_alias_index(n->adr_type());
Node* current_state = current_state_in_slice.at(alias_idx);
if (current_state == nullptr) {
// If there are only loads in a slice, we never update the memory
// state in the loop, hence there is no phi for the memory state.
// We just keep the old memory state that was outside the loop.
assert(n->is_Load() && !in_bb(n->in(MemNode::Memory)),
"only loads can have memory state from outside loop");
} else {
_igvn.replace_input_of(n, MemNode::Memory, current_state);
if (n->is_Store()) {
current_state_in_slice.at_put(alias_idx, n);
}
}
}
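
Editorial note: step (2) above can be pictured with a small standalone sketch. Memory slices are tracked independently, every memop in the linear schedule is appended to its slice's current state, and a store advances that state. The names and the schedule below are invented; the real code rewires MemNode::Memory edges through _igvn.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Toy sketch of step (2): walk a linear memop schedule once and "rewire" each
// memop to the current memory state of its slice; stores advance the state.
struct MemOp { std::string name; int slice; bool is_store; };

int main() {
  const int num_slices = 2;
  // Per slice, the state starts at the slice's memory Phi.
  std::vector<std::string> current_state = {"Phi#0", "Phi#1"};
  std::vector<MemOp> schedule = {
    {"LoadA",  0, false}, {"StoreB", 0, true},
    {"LoadC",  1, false}, {"StoreD", 0, true}, {"StoreE", 1, true},
  };

  for (const MemOp& m : schedule) {
    // The memop's memory input becomes the slice's current state.
    std::printf("%s gets memory input %s\n",
                m.name.c_str(), current_state[m.slice].c_str());
    if (m.is_store) {
      current_state[m.slice] = m.name;   // a store becomes the new state
    }
  }
  for (int s = 0; s < num_slices; s++) {
    // The final state per slice is what the backedge of the memory Phi and
    // any uses after the loop must now point at, see step (3).
    std::printf("slice %d final state: %s\n", s, current_state[s].c_str());
  }
  return 0;
}
```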
// remove current_store from its current position in the memory graph
for (DUIterator i = current->outs(); current->has_out(i); i++) {
Node* use = current->out(i);
if (use->is_Mem()) {
assert(use->in(MemNode::Memory) == current, "must be");
if (use == prev) { // connect prev to my_mem
_igvn.replace_input_of(use, MemNode::Memory, my_mem);
--i; //deleted this edge; rescan position
} else if (sched_before.member(use)) {
if (!sched_up) { // Will be moved together with current
_igvn.replace_input_of(use, MemNode::Memory, uip);
--i; //deleted this edge; rescan position
}
} else {
if (sched_up) { // Will be moved together with current
_igvn.replace_input_of(use, MemNode::Memory, lip);
--i; //deleted this edge; rescan position
// (3) For each slice, we add the current state to the backedge
//     in the Phi. Further, we replace uses of the old last store
//     with uses of the new last store (current_state).
Node_List uses_after_loop;
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* phi = _mem_slice_head.at(i);
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
Node* current_state = current_state_in_slice.at(alias_idx);
assert(current_state != nullptr, "slice is mapped");
assert(current_state != phi, "did some work in between");
assert(current_state->is_Store(), "sanity");
_igvn.replace_input_of(phi, 2, current_state);
// Replace uses of old last store with current_state (new last store)
// Do it in two loops: first find all the uses, and change the graph
// in as second loop so that we do not break the iterator.
Node* last_store = old_last_store_in_slice.at(alias_idx);
assert(last_store != nullptr, "we have a old last store");
uses_after_loop.clear();
for (DUIterator_Fast kmax, k = last_store->fast_outs(kmax); k < kmax; k++) {
Node* use = last_store->fast_out(k);
if (!in_bb(use)) {
uses_after_loop.push(use);
}
}
for (uint k = 0; k < uses_after_loop.size(); k++) {
Node* use = uses_after_loop.at(k);
for (uint j = 0; j < use->req(); j++) {
Node* def = use->in(j);
if (def == last_store) {
_igvn.replace_input_of(use, j, current_state);
}
}
}
}
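
Editorial note: the "two loops" comment in step (3) is the usual collect-then-mutate pattern. The out-of-loop uses of the old last store are first gathered into a side list while the def-use edges are being walked, and only afterwards rewritten to the new last store. A toy sketch of the pattern with invented names, not HotSpot code:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Toy sketch of collect-then-mutate: while walking the uses of the old last
// store we only *record* the ones outside the loop, and rewrite them in a
// second pass, so the use list being iterated is never modified mid-iteration.
// The real code walks DUIterator_Fast edges and calls _igvn.replace_input_of.
struct Use { std::string name; bool inside_loop; std::string memory_input; };

int main() {
  std::vector<Use> uses_of_old_last_store = {
    {"StoreInLoop",   true,  "OldLastStore"},
    {"MemoryPhi",     false, "OldLastStore"},
    {"LoadAfterLoop", false, "OldLastStore"},
  };

  // Pass 1: collect uses that live after the loop.
  std::vector<Use*> uses_after_loop;
  for (Use& u : uses_of_old_last_store) {
    if (!u.inside_loop) uses_after_loop.push_back(&u);
  }

  // Pass 2: rewire them to the new last store of the reordered slice.
  for (Use* u : uses_after_loop) {
    u->memory_input = "NewLastStore";
    std::printf("%s -> %s\n", u->name.c_str(), u->memory_input.c_str());
  }
  return 0;
}
```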
Node *insert_pt = sched_up ? uip : lip;
// all uses of insert_pt's memory state should use current's instead
for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
Node* use = insert_pt->out(i);
if (use->is_Mem()) {
assert(use->in(MemNode::Memory) == insert_pt, "must be");
_igvn.replace_input_of(use, MemNode::Memory, current);
--i; //deleted this edge; rescan position
} else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
uint pos; //lip (lower insert point) must be the last one in the memory slice
for (pos=1; pos < use->req(); pos++) {
if (use->in(pos) == insert_pt) break;
}
_igvn.replace_input_of(use, pos, current);
--i;
}
}
//connect current to insert_pt
_igvn.replace_input_of(current, MemNode::Memory, insert_pt);
}
//------------------------------co_locate_pack----------------------------------
// To schedule a store pack, we need to move any sandwiched memory ops either before
// or after the pack, based upon dependence information:
// (1) If any store in the pack depends on the sandwiched memory op, the
//     sandwiched memory op must be scheduled BEFORE the pack;
// (2) If a sandwiched memory op depends on any store in the pack, the
//     sandwiched memory op must be scheduled AFTER the pack;
// (3) If a sandwiched memory op (say, memA) depends on another sandwiched
//     memory op (say memB), memB must be scheduled before memA. So, if memA is
//     scheduled before the pack, memB must also be scheduled before the pack;
// (4) If there is no dependence restriction for a sandwiched memory op, we simply
//     schedule this store AFTER the pack
// (5) We know there is no dependence cycle, so there in no other case;
// (6) Finally, all memory ops in another single pack should be moved in the same direction.
//
// To schedule a load pack, we use the memory state of either the first or the last load in
// the pack, based on the dependence constraint.
void SuperWord::co_locate_pack(Node_List* pk) {
if (pk->at(0)->is_Store()) {
MemNode* first = executed_first(pk)->as_Mem();
MemNode* last = executed_last(pk)->as_Mem();
Unique_Node_List schedule_before_pack;
Unique_Node_List memops;
MemNode* current = last->in(MemNode::Memory)->as_Mem();
MemNode* previous = last;
while (true) {
assert(in_bb(current), "stay in block");
memops.push(previous);
for (DUIterator i = current->outs(); current->has_out(i); i++) {
Node* use = current->out(i);
if (use->is_Mem() && use != previous)
memops.push(use);
}
if (current == first) break;
previous = current;
current = current->in(MemNode::Memory)->as_Mem();
}
// determine which memory operations should be scheduled before the pack
for (uint i = 1; i < memops.size(); i++) {
Node *s1 = memops.at(i);
if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
for (uint j = 0; j< i; j++) {
Node *s2 = memops.at(j);
if (!independent(s1, s2)) {
if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
schedule_before_pack.push(s1); // s1 must be scheduled before
Node_List* mem_pk = my_pack(s1);
if (mem_pk != nullptr) {
for (uint ii = 0; ii < mem_pk->size(); ii++) {
Node* s = mem_pk->at(ii);  // follow partner
if (memops.member(s) && !schedule_before_pack.member(s))
schedule_before_pack.push(s);
}
}
break;
}
}
}
}
}
Node* upper_insert_pt = first->in(MemNode::Memory);
// Following code moves loads connected to upper_insert_pt below aliased stores.
// Collect such loads here and reconnect them back to upper_insert_pt later.
memops.clear();
for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
Node* use = upper_insert_pt->out(i);
if (use->is_Mem() && !use->is_Store()) {
memops.push(use);
}
}
MemNode* lower_insert_pt = last;
previous = last; //previous store in pk
current = last->in(MemNode::Memory)->as_Mem();
// start scheduling from "last" to "first"
while (true) {
assert(in_bb(current), "stay in block");
assert(in_pack(previous, pk), "previous stays in pack");
Node* my_mem = current->in(MemNode::Memory);
if (in_pack(current, pk)) {
// Forward users of my memory state (except "previous) to my input memory state
for (DUIterator i = current->outs(); current->has_out(i); i++) {
Node* use = current->out(i);
if (use->is_Mem() && use != previous) {
assert(use->in(MemNode::Memory) == current, "must be");
if (schedule_before_pack.member(use)) {
_igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
} else {
_igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
}
--i; // deleted this edge; rescan position
}
}
previous = current;
} else { // !in_pack(current, pk) ==> a sandwiched store
remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
}
if (current == first) break;
current = my_mem->as_Mem();
} // end while
// Reconnect loads back to upper_insert_pt.
for (uint i = 0; i < memops.size(); i++) {
Node *ld = memops.at(i);
if (ld->in(MemNode::Memory) != upper_insert_pt) {
_igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
}
}
} else if (pk->at(0)->is_Load()) { // Load pack
// All loads in the pack should have the same memory state. By default,
// we use the memory state of the last load. However, if any load could
// not be moved down due to the dependence constraint, we use the memory
// state of the first load.
Node* mem_input = pick_mem_state(pk);
_igvn.hash_delete(mem_input);
// Give each load the same memory state
for (uint i = 0; i < pk->size(); i++) {
LoadNode* ld = pk->at(i)->as_Load();
_igvn.replace_input_of(ld, MemNode::Memory, mem_input);
}
}
}
// Finds the first and last memory state and then picks either of them by checking dependence constraints.
// If a store is dependent on an earlier load then we need to pick the memory state of the first load and cannot
// pick the memory state of the last load.
Node* SuperWord::pick_mem_state(Node_List* pk) {
Node* first_mem = find_first_mem_state(pk);
bool is_dependent = false;
Node* last_mem = find_last_mem_state(pk, first_mem, is_dependent);
for (uint i = 0; i < pk->size(); i++) {
Node* ld = pk->at(i);
for (Node* current = last_mem; current != ld->in(MemNode::Memory); current = current->in(MemNode::Memory)) {
assert(current->is_Mem() && in_bb(current), "unexpected memory");
assert(current != first_mem, "corrupted memory graph");
if (!independent(current, ld)) {
// A later unvectorized store depends on this load, pick the memory state of the first load. This can happen,
// for example, if a load pack has interleaving stores that are part of a store pack which, however, is removed
// at the pack filtering stage. This leaves us with only a load pack for which we cannot take the memory state
// of the last load as the remaining unvectorized stores could interfere since they have a dependency to the loads.
// Some stores could be executed before the load vector resulting in a wrong result. We need to take the
// memory state of the first load to prevent this.
if (my_pack(current) != nullptr && is_dependent) {
// For vectorized store pack, when the load pack depends on
// some memory operations locating after first_mem, we still
// take the memory state of the last load.
continue;
}
return first_mem;
}
}
}
return last_mem;
}
// Walk the memory graph from the current first load until the
// start of the loop and check if nodes on the way are memory
// edges of loads in the pack. The last one we encounter is the
// first load.
Node* SuperWord::find_first_mem_state(Node_List* pk) {
Node* first_mem = pk->at(0)->in(MemNode::Memory);
for (Node* current = first_mem; in_bb(current); current = current->is_Phi() ? current->in(LoopNode::EntryControl) : current->in(MemNode::Memory)) {
assert(current->is_Mem() || (current->is_Phi() && current->in(0) == bb()), "unexpected memory");
for (uint i = 1; i < pk->size(); i++) {
Node* ld = pk->at(i);
if (ld->in(MemNode::Memory) == current) {
first_mem = current;
break;
}
}
}
return first_mem;
}
// Find the last load by going over the pack again and walking
// the memory graph from the loads of the pack to the memory of
// the first load. If we encounter the memory of the current last
// load, then we started from further down in the memory graph and
// the load we started from is the last load. At the same time, the
// function also helps determine if some loads in the pack depend on
// early memory operations which locate after first_mem.
Node* SuperWord::find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent) {
Node* last_mem = pk->at(0)->in(MemNode::Memory);
for (uint i = 0; i < pk->size(); i++) {
Node* ld = pk->at(i);
for (Node* current = ld->in(MemNode::Memory); current != first_mem; current = current->in(MemNode::Memory)) {
assert(current->is_Mem() && in_bb(current), "unexpected memory");
// Determine if the load pack is dependent on some memory operations locating after first_mem.
is_dependent |= !independent(current, ld);
if (current->in(MemNode::Memory) == last_mem) {
last_mem = ld->in(MemNode::Memory);
}
}
}
return last_mem;
}
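
Editorial note: the three helpers above belong to the co_locate_pack path that this patch removes. The idea they implement is simply: find where on the slice's memory chain the loads of a pack hang, and decide which single state is safe to give all of them. A toy sketch of that choice, with an invented index-based memory chain instead of MemNode::Memory edges, not HotSpot code:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Toy sketch of what pick_mem_state chooses between: the memory chain of one
// slice is a linear order of states, and every load in a pack hangs off one
// of them. The "first" state is the earliest such state, the "last" is the
// latest; the real code walks MemNode::Memory edges instead of indices and
// falls back to the first state when an intervening memop that is not
// independent of the loads would make the last state unsafe.
int main() {
  // Invented chain: index 0 is the state at loop entry, larger = later.
  std::vector<const char*> chain = {"Phi", "Store1", "Store2", "Store3"};
  // Memory-input index of each load in the (invented) load pack.
  std::vector<int> load_mem_idx = {1, 1, 2, 2};

  int first_mem = *std::min_element(load_mem_idx.begin(), load_mem_idx.end());
  int last_mem  = *std::max_element(load_mem_idx.begin(), load_mem_idx.end());

  std::printf("first mem state: %s\n", chain[first_mem]);
  std::printf("last  mem state: %s\n", chain[last_mem]);

  // Dependence check (sketch): if any memop strictly between first_mem and
  // last_mem is not independent of the loads, the pack must use first_mem.
  bool dependent_memop_in_between = false;   // assume independence here
  int picked = dependent_memop_in_between ? first_mem : last_mem;
  std::printf("picked: %s\n", chain[picked]);
  return 0;
}
```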
#ifndef PRODUCT
@@ -3037,12 +2903,14 @@ bool SuperWord::output() {
for (int i = 0; i < _block.length(); i++) {
Node* n = _block.at(i);
Node_List* p = my_pack(n);
if (p && n == executed_last(p)) {
if (p != nullptr && n == p->at(p->size()-1)) {
// After schedule_reorder_memops, we know that the memops have the same order in the pack
// as in the memory slice. Hence, "first" is the first memop in the slice from the pack,
// and "n" is the last node in the slice from the pack.
Node* first = p->at(0);
uint vlen = p->size();
uint vlen_in_bytes = 0;
Node* vn = nullptr;
Node* low_adr = p->at(0);
Node* first = executed_first(p);
if (cl->is_rce_post_loop()) {
// override vlen with the main loops vector length
vlen = cl->slp_max_unroll();
@@ -3066,7 +2934,7 @@ bool SuperWord::output() {
break; // dependent memory
}
}
Node* adr = low_adr->in(MemNode::Address);
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
if (cl->is_rce_post_loop()) {
assert(vmask != nullptr, "vector mask should be generated");
@@ -3089,7 +2957,7 @@ bool SuperWord::output() {
Node* ctl = n->in(MemNode::Control);
Node* mem = first->in(MemNode::Memory);
Node* adr = low_adr->in(MemNode::Address);
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
if (cl->is_rce_post_loop()) {
assert(vmask != nullptr, "vector mask should be generated");
@@ -3100,8 +2968,8 @@ bool SuperWord::output() {
}
vlen_in_bytes = vn->as_StoreVector()->memory_size();
} else if (VectorNode::is_scalar_rotate(n)) {
Node* in1 = low_adr->in(1);
Node* in2 = p->at(0)->in(2);
Node* in1 = first->in(1);
Node* in2 = first->in(2);
// If rotation count is non-constant or greater than 8bit value create a vector.
if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) {
in2 = vector_opd(p, 2);
@@ -3110,7 +2978,7 @@ bool SuperWord::output() {
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_roundopD(n)) {
Node* in1 = vector_opd(p, 1);
Node* in2 = low_adr->in(2);
Node* in2 = first->in(2);
assert(in2->is_Con(), "Constant rounding mode expected.");
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
@@ -3133,7 +3001,7 @@ bool SuperWord::output() {
bool node_isa_reduction = is_marked_reduction(n);
if (node_isa_reduction) {
// the input to the first reduction operation is retained
in1 = low_adr->in(1);
in1 = first->in(1);
} else {
in1 = vector_opd(p, 1);
if (in1 == nullptr) {
@@ -3201,7 +3069,7 @@ bool SuperWord::output() {
Node* in = vector_opd(p, 1);
Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
_igvn.register_new_node_with_optimizer(longval);
_phase->set_ctrl(longval, _phase->get_ctrl(p->at(0)));
_phase->set_ctrl(longval, _phase->get_ctrl(first));
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_convert_opcode(opc)) {
@@ -3316,7 +3184,7 @@ bool SuperWord::output() {
_block.at_put(i, vn);
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
_phase->set_ctrl(vn, _phase->get_ctrl(first));
for (uint j = 0; j < p->size(); j++) {
Node* pm = p->at(j);
_igvn.replace_node(pm, vn);
@@ -4196,17 +4064,6 @@ bool SuperWord::in_packset(Node* s1, Node* s2) {
return false;
}
//------------------------------in_pack---------------------------
// Is s in pack p?
Node_List* SuperWord::in_pack(Node* s, Node_List* p) {
for (uint i = 0; i < p->size(); i++) {
if (p->at(i) == s) {
return p;
}
}
return nullptr;
}
//------------------------------remove_pack_at---------------------------
// Remove the pack at position pos in the packset
void SuperWord::remove_pack_at(int pos) {
@@ -4239,39 +4096,6 @@ void SuperWord::packset_sort(int n) {
}
}
//------------------------------executed_first---------------------------
// Return the node executed first in pack p. Uses the RPO block list
// to determine order.
Node* SuperWord::executed_first(Node_List* p) {
Node* n = p->at(0);
int n_rpo = bb_idx(n);
for (uint i = 1; i < p->size(); i++) {
Node* s = p->at(i);
int s_rpo = bb_idx(s);
if (s_rpo < n_rpo) {
n = s;
n_rpo = s_rpo;
}
}
return n;
}
//------------------------------executed_last---------------------------
// Return the node executed last in pack p.
Node* SuperWord::executed_last(Node_List* p) {
Node* n = p->at(0);
int n_rpo = bb_idx(n);
for (uint i = 1; i < p->size(); i++) {
Node* s = p->at(i);
int s_rpo = bb_idx(s);
if (s_rpo > n_rpo) {
n = s;
n_rpo = s_rpo;
}
}
return n;
}
LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
for (uint i = 0; i < p->size(); i++) {
@@ -4547,16 +4371,6 @@ void SuperWord::print_stmt(Node* s) {
#endif
}
//------------------------------blank---------------------------
char* SuperWord::blank(uint depth) {
static char blanks[101];
assert(depth < 101, "too deep");
for (uint i = 0; i < depth; i++) blanks[i] = ' ';
blanks[depth] = '\0';
return blanks;
}
//==============================SWPointer===========================
#ifndef PRODUCT
int SWPointer::Tracer::_depth = 0;
@@ -458,7 +458,9 @@ class SuperWord : public ResourceObj {
bool same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const;
// my_pack
public:
Node_List* my_pack(Node* n) { return !in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; }
private:
void set_my_pack(Node* n, Node_List* p) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; }
// is pack good for converting into one vector node replacing bunches of Cmp, Bool, CMov nodes.
bool is_cmov_pack(Node_List* p);
@@ -618,19 +620,10 @@ private:
void merge_packs_to_cmove();
// Verify that for every pack, all nodes are mutually independent
DEBUG_ONLY(void verify_packs();)
// Remove cycles in packset.
void remove_cycles();
// Adjust the memory graph for the packed operations
void schedule();
// Remove "current" from its current position in the memory graph and insert
// it after the appropriate insert points (lip or uip);
void remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, Node *uip, Unique_Node_List &schd_before);
// Within a store pack, schedule stores together by moving out the sandwiched memory ops according
// to dependence info; and within a load pack, move loads down to the last executed load.
void co_locate_pack(Node_List* p);
Node* pick_mem_state(Node_List* pk);
Node* find_first_mem_state(Node_List* pk);
Node* find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent);
// Helper function for schedule, that reorders all memops, slice by slice, according to the schedule
void schedule_reorder_memops(Node_List &memops_schedule);
// Convert packs into vector node operations
bool output();
@@ -662,19 +655,11 @@ private:
void compute_vector_element_type();
// Are s1 and s2 in a pack pair and ordered as s1,s2?
bool in_packset(Node* s1, Node* s2);
// Is s in pack p?
Node_List* in_pack(Node* s, Node_List* p);
// Remove the pack at position pos in the packset
void remove_pack_at(int pos);
// Return the node executed first in pack p.
Node* executed_first(Node_List* p);
// Return the node executed last in pack p.
Node* executed_last(Node_List* p);
static LoadNode::ControlDependency control_dependency(Node_List* p);
// Alignment within a vector memory reference
int memory_alignment(MemNode* s, int iv_adjust);
// (Start, end] half-open range defining which operands are vector
void vector_opd_range(Node* n, uint* start, uint* end);
// Smallest type containing range of values
const Type* container_type(Node* n);
// Adjust pre-loop limit so that in main loop, a load/store reference
@@ -693,7 +678,6 @@ private:
void print_pack(Node_List* p);
void print_bb();
void print_stmt(Node* s);
char* blank(uint depth);
void packset_sort(int n);
};
@@ -27,9 +27,8 @@
* @bug 8304042
* @summary Test some examples with independent packs with cyclic dependency
* between the packs.
* @requires vm.compiler2.enabled
* @requires vm.bits == 64
* @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*"
* @requires vm.compiler2.enabled
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency
@@ -94,8 +93,8 @@ public class TestIndependentPacksWithCyclicDependency {
test3(goldI3, goldI3, goldF3, goldF3);
init(goldI4, goldF4);
test4(goldI4, goldI4, goldF4, goldF4);
// init(goldI5, goldF5);
// test5(goldI5, goldI5, goldF5, goldF5);
init(goldI5, goldF5);
test5(goldI5, goldI5, goldF5, goldF5);
init(goldI6, goldF6, goldL6);
test6(goldI6, goldI6, goldF6, goldF6, goldL6, goldL6);
init(goldI7, goldF7, goldL7);
@@ -232,29 +231,27 @@ public class TestIndependentPacksWithCyclicDependency {
}
}
// TODO uncomment after fixing JDK-8304720
//
// @Run(test = "test5")
// public void runTest5() {
//     int[] dataI = new int[RANGE];
//     float[] dataF = new float[RANGE];
//     init(dataI, dataF);
//     test5(dataI, dataI, dataF, dataF);
//     verify("test5", dataI, goldI5);
//     verify("test5", dataF, goldF5);
// }
//
// @Test
// static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
//     for (int i = 0; i < RANGE; i+=2) {
//         // same as test2, except that reordering leads to different semantics
//         // explanation analogue to test4
//         unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A
//         dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
//         dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
//         unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B
//     }
// }
@Run(test = "test5")
public void runTest5() {
int[] dataI = new int[RANGE];
float[] dataF = new float[RANGE];
init(dataI, dataF);
test5(dataI, dataI, dataF, dataF);
verify("test5", dataI, goldI5);
verify("test5", dataF, goldF5);
}
@Test
static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// same as test2, except that reordering leads to different semantics
// explanation analogue to test4
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B
}
}
@Run(test = "test6")
public void runTest6() {
@@ -29,8 +29,6 @@
* between the packs.
* Before fix, this hit: "assert(!is_visited) failed: visit only once"
* @requires vm.compiler2.enabled
* @requires vm.bits == 64
* @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*"
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run main/othervm -XX:LoopUnrollLimit=250
@@ -0,0 +1,159 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/*
* @test
* @bug 8304720
* @summary Test some examples where non-vectorized memops also need to
* be reordered during SuperWord::schedule.
* @requires vm.compiler2.enabled
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestScheduleReordersScalarMemops
*/
package compiler.loopopts.superword;
import jdk.internal.misc.Unsafe;
import jdk.test.lib.Asserts;
import compiler.lib.ir_framework.*;
public class TestScheduleReordersScalarMemops {
static final int RANGE = 1024;
static final int ITER = 10_000;
static Unsafe unsafe = Unsafe.getUnsafe();
int[] goldI0 = new int[RANGE];
float[] goldF0 = new float[RANGE];
int[] goldI1 = new int[RANGE];
float[] goldF1 = new float[RANGE];
public static void main(String args[]) {
TestFramework.runWithFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::test*",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::verify",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::init",
"-XX:LoopUnrollLimit=1000",
"-XX:-TieredCompilation", "-Xbatch");
}
TestScheduleReordersScalarMemops() {
// compute the gold standard in interpreter mode
init(goldI0, goldF0);
test0(goldI0, goldI0, goldF0, goldF0);
init(goldI1, goldF1);
test1(goldI1, goldI1, goldF1, goldF1);
}
@Run(test = "test0")
@Warmup(100)
public void runTest0() {
int[] dataI = new int[RANGE];
float[] dataF = new float[RANGE];
init(dataI, dataF);
test0(dataI, dataI, dataF, dataF);
verify("test0", dataI, goldI0);
verify("test0", dataF, goldF0);
}
@Test
@IR(counts = {IRNode.MUL_VI, "> 0"},
applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
static void test0(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// We have dependency edges:
// A -> X
// Y -> B
// Still, we can vectorize [X,Y].
// We do not vectorize A and B, because they are not isomorphic (add vs mul).
//
// Imagine this is unrolled at least 2x.
// We get order: A0 X0 Y0 B0 A1 X1 Y1 B1
// Vectorized:   X0 Y0 X1 Y1
// Scalar:       A0 B0 A1 B1
//
// However, since the As need to be before, and the Bs after the vector operations,
// we need to have all As before all Bs. This means we need to reorder the scalar
// operations, and not just the vectorized ones.
//
// A correct reordering would be: A0 A1 [X0, Y0, X1, Y1] B0 B1
//
dataFa[i + 0] = dataIa[i + 0] * 1.3f; // A *1.3
dataIb[i + 0] = (int)dataFb[i + 0] * 11; // X *11
dataIb[i + 1] = (int)dataFb[i + 1] * 11; // Y *11
dataFa[i + 1] = dataIa[i + 1] + 1.2f; // B +1.2
}
}
@Run(test = "test1")
@Warmup(100)
public void runTest1() {
int[] dataI = new int[RANGE];
float[] dataF = new float[RANGE];
init(dataI, dataF);
test1(dataI, dataI, dataF, dataF);
verify("test1", dataI, goldI1);
verify("test1", dataF, goldF1);
}
@Test
@IR(counts = {IRNode.MUL_VI, "> 0"},
applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
static void test1(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// Do the same as test0, but without int-float conversion.
// This should reproduce on machines where conversion is not implemented.
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A +1
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] * 11); // B *11
}
}
static void init(int[] dataI, float[] dataF) {
for (int i = 0; i < RANGE; i++) {
dataI[i] = i + 1;
dataF[i] = i + 0.1f;
}
}
static void verify(String name, int[] data, int[] gold) {
for (int i = 0; i < RANGE; i++) {
if (data[i] != gold[i]) {
throw new RuntimeException(" Invalid " + name + " result: data[" + i + "]: " + data[i] + " != " + gold[i]);
}
}
}
static void verify(String name, float[] data, float[] gold) {
for (int i = 0; i < RANGE; i++) {
int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
if (datav != goldv) {
throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv);
}
}
}
}