From ad0e5a99ca1ad9dd04105f502985735a3536c3f4 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 8 May 2023 06:09:10 +0000 Subject: [PATCH] 8304720: SuperWord::schedule should rebuild C2-graph from SuperWord dependency-graph Reviewed-by: kvn, fgao --- src/hotspot/share/opto/superword.cpp | 514 ++++++------------ src/hotspot/share/opto/superword.hpp | 24 +- ...tIndependentPacksWithCyclicDependency.java | 51 +- ...IndependentPacksWithCyclicDependency2.java | 2 - .../TestScheduleReordersScalarMemops.java | 159 ++++++ 5 files changed, 351 insertions(+), 399 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index af8f08f722b..b1e4677a7cb 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -669,8 +669,6 @@ bool SuperWord::SLP_extract() { DEBUG_ONLY(verify_packs();) - remove_cycles(); - schedule(); // Record eventual count of vector packs for checks in post loop vectorization @@ -1431,11 +1429,13 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) { return false; } - // FIXME - co_locate_pack fails on Stores in different mem-slices, so - // only pack memops that are in the same alias set until that's fixed. + // Adjacent memory references must be on the same slice. if (!same_memory_slice(s1->as_Mem(), s2->as_Mem())) { return false; } + + // Adjacent memory references must have the same base, be comparable + // and have the correct distance between them. SWPointer p1(s1->as_Mem(), this, nullptr, false); SWPointer p2(s2->as_Mem(), this, nullptr, false); if (p1.base() != p2.base() || !p1.comparable(p2)) return false; @@ -2497,10 +2497,13 @@ class PacksetGraph { private: // pid: packset graph node id. GrowableArray _pid; // bb_idx(n) -> pid + GrowableArray _pid_to_node; // one node per pid, find rest via my_pack GrowableArray> _out; // out-edges GrowableArray _incnt; // number of (implicit) in-edges int _max_pid = 0; + bool _schedule_success; + SuperWord* _slp; public: PacksetGraph(SuperWord* slp) @@ -2523,11 +2526,18 @@ public: assert(poz != 0, "pid should not be zero"); return poz; } - void set_pid(const Node* n, int pid) { + void set_pid(Node* n, int pid) { assert(n != nullptr && pid > 0, "sane inputs"); assert(_slp->in_bb(n), "must be"); int idx = _slp->bb_idx(n); _pid.at_put_grow(idx, pid); + _pid_to_node.at_put_grow(pid - 1, n, nullptr); + } + Node* get_node(int pid) { + assert(pid > 0 && pid <= _pid_to_node.length(), "pid must be mapped"); + Node* n = _pid_to_node.at(pid - 1); + assert(n != nullptr, "sanity"); + return n; } int new_pid() { _incnt.push(0); @@ -2537,6 +2547,7 @@ public: int incnt(int pid) { return _incnt.at(pid - 1); } void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); } GrowableArray& out(int pid) { return _out.at(pid - 1); } + bool schedule_success() const { return _schedule_success; } // Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds. void build() { @@ -2550,6 +2561,7 @@ public: for (uint k = 0; k < p->size(); k++) { Node* n = p->at(k); set_pid(n, pid); + assert(_slp->my_pack(n) == p, "matching packset"); } } @@ -2565,6 +2577,7 @@ public: if (pid == 0) { pid = new_pid(); set_pid(n, pid); + assert(_slp->my_pack(n) == nullptr || UseVectorCmov, "no packset"); } } @@ -2610,11 +2623,13 @@ public: } } } - // Schedule the graph to worklist. Returns true iff all nodes were scheduled. 
- // This implies that we return true iff the PacksetGraph is acyclic. - // We schedule with topological sort: schedule any node that has zero incnt. - // Then remove that node, which decrements the incnt of all its uses (outputs). - bool schedule() { + + // Schedule nodes of PacksetGraph to worklist, using topsort: schedule a node + // that has zero incnt. If a PacksetGraph node corresponds to memops, then add + // those to the memops_schedule. At the end, we return the memops_schedule, and + // note if topsort was successful. + Node_List schedule() { + Node_List memops_schedule; GrowableArray worklist; // Directly schedule all nodes without precedence for (int pid = 1; pid <= _max_pid; pid++) { @@ -2625,6 +2640,22 @@ public: // Continue scheduling via topological sort for (int i = 0; i < worklist.length(); i++) { int pid = worklist.at(i); + + // Add memops to memops_schedule + Node* n = get_node(pid); + Node_List* p = _slp->my_pack(n); + if (n->is_Mem()) { + if (p == nullptr) { + memops_schedule.push(n); + } else { + for (uint k = 0; k < p->size(); k++) { + memops_schedule.push(p->at(k)); + assert(p->at(k)->is_Mem(), "only schedule memops"); + } + } + } + + // Decrement incnt for all successors for (int j = 0; j < out(pid).length(); j++){ int pid_use = out(pid).at(j); int incnt_use = incnt(pid_use) - 1; @@ -2635,9 +2666,12 @@ public: } } } - // Was every pid scheduled? - return worklist.length() == _max_pid; + + // Was every pid scheduled? If not, we found some cycles in the PacksetGraph. + _schedule_success = (worklist.length() == _max_pid); + return memops_schedule; } + // Print the PacksetGraph. // print_nodes = true: print all C2 nodes beloning to PacksetGrahp node. // print_zero_incnt = false: do not print nodes that have no in-edges (any more). @@ -2668,303 +2702,135 @@ public: } }; -//------------------------------remove_cycles--------------------------- -// We now know that we only have independent packs, see verify_packs. -// This is a necessary but not a sufficient condition for an acyclic -// graph (DAG) after scheduling. Thus, we must check if the packs have -// introduced a cycle. The SuperWord paper mentions the need for this -// in "3.7 Scheduling". -// Approach: given all nodes from the _block, we create a new graph. -// The nodes that are not in a pack are their own nodes (scalar-node) -// in that new graph. Every pack is also a node (pack-node). We then -// add the edges according to DepPreds: a scalar-node has all edges -// to its node's DepPreds. A pack-node has all edges from every pack -// member to all their DepPreds. -void SuperWord::remove_cycles() { +// The C2 graph (specifically the memory graph), needs to be re-ordered. +// (1) Build the PacksetGraph. It combines the DepPreds graph with the +// packset. The PacksetGraph gives us the dependencies that must be +// respected after scheduling. +// (2) Schedule the PacksetGraph to the memops_schedule, which represents +// a linear order of all memops in the body. The order respects the +// dependencies of the PacksetGraph. +// (3) If the PacksetGraph has cycles, we cannot schedule. Abort. +// (4) Use the memops_schedule to re-order the memops in all slices. +void SuperWord::schedule() { if (_packset.length() == 0) { return; // empty packset } ResourceMark rm; + // (1) Build the PacksetGraph. PacksetGraph graph(this); - graph.build(); - if (!graph.schedule()) { + // (2) Schedule the PacksetGraph. + Node_List memops_schedule = graph.schedule(); + + // (3) Check if the PacksetGraph schedule succeeded (had no cycles). 
+ // We now know that we only have independent packs, see verify_packs. + // This is a necessary but not a sufficient condition for an acyclic + // graph (DAG) after scheduling. Thus, we must check if the packs have + // introduced a cycle. The SuperWord paper mentions the need for this + // in "3.7 Scheduling". + if (!graph.schedule_success()) { if (TraceSuperWord) { - tty->print_cr("remove_cycles found cycle in PacksetGraph:"); + tty->print_cr("SuperWord::schedule found cycle in PacksetGraph:"); graph.print(true, false); tty->print_cr("removing all packs from packset."); } _packset.clear(); + return; } + +#ifndef PRODUCT + if (TraceSuperWord) { + tty->print_cr("SuperWord::schedule: memops_schedule:"); + memops_schedule.dump(); + } +#endif + + // (4) Use the memops_schedule to re-order the memops in all slices. + schedule_reorder_memops(memops_schedule); } -//------------------------------schedule--------------------------- -// Adjust the memory graph for the packed operations -void SuperWord::schedule() { - // Co-locate in the memory graph the members of each memory pack - for (int i = 0; i < _packset.length(); i++) { - co_locate_pack(_packset.at(i)); +// Reorder the memory graph for all slices in parallel. We walk over the schedule once, +// and track the current memory state of each slice. +void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) { + int max_slices = _phase->C->num_alias_types(); + // When iterating over the memops_schedule, we keep track of the current memory state, + // which is the Phi or a store in the loop. + GrowableArray current_state_in_slice(max_slices, max_slices, nullptr); + // The memory state after the loop is the last store inside the loop. If we reorder the + // loop we may have a different last store, and we need to adjust the uses accordingly. + GrowableArray old_last_store_in_slice(max_slices, max_slices, nullptr); + + // (1) Set up the initial memory state from Phi. And find the old last store. + for (int i = 0; i < _mem_slice_head.length(); i++) { + Node* phi = _mem_slice_head.at(i); + assert(phi->is_Phi(), "must be phi"); + int alias_idx = _phase->C->get_alias_index(phi->adr_type()); + current_state_in_slice.at_put(alias_idx, phi); + + // If we have a memory phi, we have a last store in the loop, find it over backedge. + StoreNode* last_store = phi->in(2)->as_Store(); + old_last_store_in_slice.at_put(alias_idx, last_store); } -} -//-------------------------------remove_and_insert------------------- -// Remove "current" from its current position in the memory graph and insert -// it after the appropriate insertion point (lip or uip). -void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, - Node *uip, Unique_Node_List &sched_before) { - Node* my_mem = current->in(MemNode::Memory); - bool sched_up = sched_before.member(current); + // (2) Walk over memops_schedule, append memops to the current state + // of that slice. If it is a Store, we take it as the new state. + for (uint i = 0; i < memops_schedule.size(); i++) { + MemNode* n = memops_schedule.at(i)->as_Mem(); + assert(n->is_Load() || n->is_Store(), "only loads or stores"); + int alias_idx = _phase->C->get_alias_index(n->adr_type()); + Node* current_state = current_state_in_slice.at(alias_idx); + if (current_state == nullptr) { + // If there are only loads in a slice, we never update the memory + // state in the loop, hence there is no phi for the memory state. + // We just keep the old memory state that was outside the loop. 
+ assert(n->is_Load() && !in_bb(n->in(MemNode::Memory)), + "only loads can have memory state from outside loop"); + } else { + _igvn.replace_input_of(n, MemNode::Memory, current_state); + if (n->is_Store()) { + current_state_in_slice.at_put(alias_idx, n); + } + } + } - // remove current_store from its current position in the memory graph - for (DUIterator i = current->outs(); current->has_out(i); i++) { - Node* use = current->out(i); - if (use->is_Mem()) { - assert(use->in(MemNode::Memory) == current, "must be"); - if (use == prev) { // connect prev to my_mem - _igvn.replace_input_of(use, MemNode::Memory, my_mem); - --i; //deleted this edge; rescan position - } else if (sched_before.member(use)) { - if (!sched_up) { // Will be moved together with current - _igvn.replace_input_of(use, MemNode::Memory, uip); - --i; //deleted this edge; rescan position - } - } else { - if (sched_up) { // Will be moved together with current - _igvn.replace_input_of(use, MemNode::Memory, lip); - --i; //deleted this edge; rescan position + // (3) For each slice, we add the current state to the backedge + // in the Phi. Further, we replace uses of the old last store + // with uses of the new last store (current_state). + Node_List uses_after_loop; + for (int i = 0; i < _mem_slice_head.length(); i++) { + Node* phi = _mem_slice_head.at(i); + int alias_idx = _phase->C->get_alias_index(phi->adr_type()); + Node* current_state = current_state_in_slice.at(alias_idx); + assert(current_state != nullptr, "slice is mapped"); + assert(current_state != phi, "did some work in between"); + assert(current_state->is_Store(), "sanity"); + _igvn.replace_input_of(phi, 2, current_state); + + // Replace uses of old last store with current_state (new last store) + // Do it in two loops: first find all the uses, and change the graph + // in as second loop so that we do not break the iterator. + Node* last_store = old_last_store_in_slice.at(alias_idx); + assert(last_store != nullptr, "we have a old last store"); + uses_after_loop.clear(); + for (DUIterator_Fast kmax, k = last_store->fast_outs(kmax); k < kmax; k++) { + Node* use = last_store->fast_out(k); + if (!in_bb(use)) { + uses_after_loop.push(use); + } + } + for (uint k = 0; k < uses_after_loop.size(); k++) { + Node* use = uses_after_loop.at(k); + for (uint j = 0; j < use->req(); j++) { + Node* def = use->in(j); + if (def == last_store) { + _igvn.replace_input_of(use, j, current_state); } } } } - - Node *insert_pt = sched_up ? 
uip : lip; - - // all uses of insert_pt's memory state should use current's instead - for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) { - Node* use = insert_pt->out(i); - if (use->is_Mem()) { - assert(use->in(MemNode::Memory) == insert_pt, "must be"); - _igvn.replace_input_of(use, MemNode::Memory, current); - --i; //deleted this edge; rescan position - } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) { - uint pos; //lip (lower insert point) must be the last one in the memory slice - for (pos=1; pos < use->req(); pos++) { - if (use->in(pos) == insert_pt) break; - } - _igvn.replace_input_of(use, pos, current); - --i; - } - } - - //connect current to insert_pt - _igvn.replace_input_of(current, MemNode::Memory, insert_pt); -} - -//------------------------------co_locate_pack---------------------------------- -// To schedule a store pack, we need to move any sandwiched memory ops either before -// or after the pack, based upon dependence information: -// (1) If any store in the pack depends on the sandwiched memory op, the -// sandwiched memory op must be scheduled BEFORE the pack; -// (2) If a sandwiched memory op depends on any store in the pack, the -// sandwiched memory op must be scheduled AFTER the pack; -// (3) If a sandwiched memory op (say, memA) depends on another sandwiched -// memory op (say memB), memB must be scheduled before memA. So, if memA is -// scheduled before the pack, memB must also be scheduled before the pack; -// (4) If there is no dependence restriction for a sandwiched memory op, we simply -// schedule this store AFTER the pack -// (5) We know there is no dependence cycle, so there in no other case; -// (6) Finally, all memory ops in another single pack should be moved in the same direction. -// -// To schedule a load pack, we use the memory state of either the first or the last load in -// the pack, based on the dependence constraint. -void SuperWord::co_locate_pack(Node_List* pk) { - if (pk->at(0)->is_Store()) { - MemNode* first = executed_first(pk)->as_Mem(); - MemNode* last = executed_last(pk)->as_Mem(); - Unique_Node_List schedule_before_pack; - Unique_Node_List memops; - - MemNode* current = last->in(MemNode::Memory)->as_Mem(); - MemNode* previous = last; - while (true) { - assert(in_bb(current), "stay in block"); - memops.push(previous); - for (DUIterator i = current->outs(); current->has_out(i); i++) { - Node* use = current->out(i); - if (use->is_Mem() && use != previous) - memops.push(use); - } - if (current == first) break; - previous = current; - current = current->in(MemNode::Memory)->as_Mem(); - } - - // determine which memory operations should be scheduled before the pack - for (uint i = 1; i < memops.size(); i++) { - Node *s1 = memops.at(i); - if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) { - for (uint j = 0; j< i; j++) { - Node *s2 = memops.at(j); - if (!independent(s1, s2)) { - if (in_pack(s2, pk) || schedule_before_pack.member(s2)) { - schedule_before_pack.push(s1); // s1 must be scheduled before - Node_List* mem_pk = my_pack(s1); - if (mem_pk != nullptr) { - for (uint ii = 0; ii < mem_pk->size(); ii++) { - Node* s = mem_pk->at(ii); // follow partner - if (memops.member(s) && !schedule_before_pack.member(s)) - schedule_before_pack.push(s); - } - } - break; - } - } - } - } - } - - Node* upper_insert_pt = first->in(MemNode::Memory); - // Following code moves loads connected to upper_insert_pt below aliased stores. - // Collect such loads here and reconnect them back to upper_insert_pt later. 
- memops.clear(); - for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) { - Node* use = upper_insert_pt->out(i); - if (use->is_Mem() && !use->is_Store()) { - memops.push(use); - } - } - - MemNode* lower_insert_pt = last; - previous = last; //previous store in pk - current = last->in(MemNode::Memory)->as_Mem(); - - // start scheduling from "last" to "first" - while (true) { - assert(in_bb(current), "stay in block"); - assert(in_pack(previous, pk), "previous stays in pack"); - Node* my_mem = current->in(MemNode::Memory); - - if (in_pack(current, pk)) { - // Forward users of my memory state (except "previous) to my input memory state - for (DUIterator i = current->outs(); current->has_out(i); i++) { - Node* use = current->out(i); - if (use->is_Mem() && use != previous) { - assert(use->in(MemNode::Memory) == current, "must be"); - if (schedule_before_pack.member(use)) { - _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt); - } else { - _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt); - } - --i; // deleted this edge; rescan position - } - } - previous = current; - } else { // !in_pack(current, pk) ==> a sandwiched store - remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack); - } - - if (current == first) break; - current = my_mem->as_Mem(); - } // end while - - // Reconnect loads back to upper_insert_pt. - for (uint i = 0; i < memops.size(); i++) { - Node *ld = memops.at(i); - if (ld->in(MemNode::Memory) != upper_insert_pt) { - _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt); - } - } - } else if (pk->at(0)->is_Load()) { // Load pack - // All loads in the pack should have the same memory state. By default, - // we use the memory state of the last load. However, if any load could - // not be moved down due to the dependence constraint, we use the memory - // state of the first load. - Node* mem_input = pick_mem_state(pk); - _igvn.hash_delete(mem_input); - // Give each load the same memory state - for (uint i = 0; i < pk->size(); i++) { - LoadNode* ld = pk->at(i)->as_Load(); - _igvn.replace_input_of(ld, MemNode::Memory, mem_input); - } - } -} - -// Finds the first and last memory state and then picks either of them by checking dependence constraints. -// If a store is dependent on an earlier load then we need to pick the memory state of the first load and cannot -// pick the memory state of the last load. -Node* SuperWord::pick_mem_state(Node_List* pk) { - Node* first_mem = find_first_mem_state(pk); - bool is_dependent = false; - Node* last_mem = find_last_mem_state(pk, first_mem, is_dependent); - - for (uint i = 0; i < pk->size(); i++) { - Node* ld = pk->at(i); - for (Node* current = last_mem; current != ld->in(MemNode::Memory); current = current->in(MemNode::Memory)) { - assert(current->is_Mem() && in_bb(current), "unexpected memory"); - assert(current != first_mem, "corrupted memory graph"); - if (!independent(current, ld)) { - // A later unvectorized store depends on this load, pick the memory state of the first load. This can happen, - // for example, if a load pack has interleaving stores that are part of a store pack which, however, is removed - // at the pack filtering stage. This leaves us with only a load pack for which we cannot take the memory state - // of the last load as the remaining unvectorized stores could interfere since they have a dependency to the loads. - // Some stores could be executed before the load vector resulting in a wrong result. 
We need to take the - // memory state of the first load to prevent this. - if (my_pack(current) != nullptr && is_dependent) { - // For vectorized store pack, when the load pack depends on - // some memory operations locating after first_mem, we still - // take the memory state of the last load. - continue; - } - return first_mem; - } - } - } - return last_mem; -} - -// Walk the memory graph from the current first load until the -// start of the loop and check if nodes on the way are memory -// edges of loads in the pack. The last one we encounter is the -// first load. -Node* SuperWord::find_first_mem_state(Node_List* pk) { - Node* first_mem = pk->at(0)->in(MemNode::Memory); - for (Node* current = first_mem; in_bb(current); current = current->is_Phi() ? current->in(LoopNode::EntryControl) : current->in(MemNode::Memory)) { - assert(current->is_Mem() || (current->is_Phi() && current->in(0) == bb()), "unexpected memory"); - for (uint i = 1; i < pk->size(); i++) { - Node* ld = pk->at(i); - if (ld->in(MemNode::Memory) == current) { - first_mem = current; - break; - } - } - } - return first_mem; -} - -// Find the last load by going over the pack again and walking -// the memory graph from the loads of the pack to the memory of -// the first load. If we encounter the memory of the current last -// load, then we started from further down in the memory graph and -// the load we started from is the last load. At the same time, the -// function also helps determine if some loads in the pack depend on -// early memory operations which locate after first_mem. -Node* SuperWord::find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent) { - Node* last_mem = pk->at(0)->in(MemNode::Memory); - for (uint i = 0; i < pk->size(); i++) { - Node* ld = pk->at(i); - for (Node* current = ld->in(MemNode::Memory); current != first_mem; current = current->in(MemNode::Memory)) { - assert(current->is_Mem() && in_bb(current), "unexpected memory"); - // Determine if the load pack is dependent on some memory operations locating after first_mem. - is_dependent |= !independent(current, ld); - if (current->in(MemNode::Memory) == last_mem) { - last_mem = ld->in(MemNode::Memory); - } - } - } - return last_mem; } #ifndef PRODUCT @@ -3037,12 +2903,14 @@ bool SuperWord::output() { for (int i = 0; i < _block.length(); i++) { Node* n = _block.at(i); Node_List* p = my_pack(n); - if (p && n == executed_last(p)) { + if (p != nullptr && n == p->at(p->size()-1)) { + // After schedule_reorder_memops, we know that the memops have the same order in the pack + // as in the memory slice. Hence, "first" is the first memop in the slice from the pack, + // and "n" is the last node in the slice from the pack. 
+ Node* first = p->at(0); uint vlen = p->size(); uint vlen_in_bytes = 0; Node* vn = nullptr; - Node* low_adr = p->at(0); - Node* first = executed_first(p); if (cl->is_rce_post_loop()) { // override vlen with the main loops vector length vlen = cl->slp_max_unroll(); @@ -3066,7 +2934,7 @@ bool SuperWord::output() { break; // dependent memory } } - Node* adr = low_adr->in(MemNode::Address); + Node* adr = first->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); if (cl->is_rce_post_loop()) { assert(vmask != nullptr, "vector mask should be generated"); @@ -3089,7 +2957,7 @@ bool SuperWord::output() { Node* ctl = n->in(MemNode::Control); Node* mem = first->in(MemNode::Memory); - Node* adr = low_adr->in(MemNode::Address); + Node* adr = first->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); if (cl->is_rce_post_loop()) { assert(vmask != nullptr, "vector mask should be generated"); @@ -3100,8 +2968,8 @@ bool SuperWord::output() { } vlen_in_bytes = vn->as_StoreVector()->memory_size(); } else if (VectorNode::is_scalar_rotate(n)) { - Node* in1 = low_adr->in(1); - Node* in2 = p->at(0)->in(2); + Node* in1 = first->in(1); + Node* in2 = first->in(2); // If rotation count is non-constant or greater than 8bit value create a vector. if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) { in2 = vector_opd(p, 2); @@ -3110,7 +2978,7 @@ bool SuperWord::output() { vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else if (VectorNode::is_roundopD(n)) { Node* in1 = vector_opd(p, 1); - Node* in2 = low_adr->in(2); + Node* in2 = first->in(2); assert(in2->is_Con(), "Constant rounding mode expected."); vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); @@ -3133,7 +3001,7 @@ bool SuperWord::output() { bool node_isa_reduction = is_marked_reduction(n); if (node_isa_reduction) { // the input to the first reduction operation is retained - in1 = low_adr->in(1); + in1 = first->in(1); } else { in1 = vector_opd(p, 1); if (in1 == nullptr) { @@ -3201,7 +3069,7 @@ bool SuperWord::output() { Node* in = vector_opd(p, 1); Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG); _igvn.register_new_node_with_optimizer(longval); - _phase->set_ctrl(longval, _phase->get_ctrl(p->at(0))); + _phase->set_ctrl(longval, _phase->get_ctrl(first)); vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else if (VectorNode::is_convert_opcode(opc)) { @@ -3316,7 +3184,7 @@ bool SuperWord::output() { _block.at_put(i, vn); _igvn.register_new_node_with_optimizer(vn); - _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0))); + _phase->set_ctrl(vn, _phase->get_ctrl(first)); for (uint j = 0; j < p->size(); j++) { Node* pm = p->at(j); _igvn.replace_node(pm, vn); @@ -4196,17 +4064,6 @@ bool SuperWord::in_packset(Node* s1, Node* s2) { return false; } -//------------------------------in_pack--------------------------- -// Is s in pack p? -Node_List* SuperWord::in_pack(Node* s, Node_List* p) { - for (uint i = 0; i < p->size(); i++) { - if (p->at(i) == s) { - return p; - } - } - return nullptr; -} - //------------------------------remove_pack_at--------------------------- // Remove the pack at position pos in the packset void SuperWord::remove_pack_at(int pos) { @@ -4239,39 +4096,6 @@ void SuperWord::packset_sort(int n) { } } -//------------------------------executed_first--------------------------- -// Return the node executed first in pack p. 
Uses the RPO block list -// to determine order. -Node* SuperWord::executed_first(Node_List* p) { - Node* n = p->at(0); - int n_rpo = bb_idx(n); - for (uint i = 1; i < p->size(); i++) { - Node* s = p->at(i); - int s_rpo = bb_idx(s); - if (s_rpo < n_rpo) { - n = s; - n_rpo = s_rpo; - } - } - return n; -} - -//------------------------------executed_last--------------------------- -// Return the node executed last in pack p. -Node* SuperWord::executed_last(Node_List* p) { - Node* n = p->at(0); - int n_rpo = bb_idx(n); - for (uint i = 1; i < p->size(); i++) { - Node* s = p->at(i); - int s_rpo = bb_idx(s); - if (s_rpo > n_rpo) { - n = s; - n_rpo = s_rpo; - } - } - return n; -} - LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) { LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest; for (uint i = 0; i < p->size(); i++) { @@ -4547,16 +4371,6 @@ void SuperWord::print_stmt(Node* s) { #endif } -//------------------------------blank--------------------------- -char* SuperWord::blank(uint depth) { - static char blanks[101]; - assert(depth < 101, "too deep"); - for (uint i = 0; i < depth; i++) blanks[i] = ' '; - blanks[depth] = '\0'; - return blanks; -} - - //==============================SWPointer=========================== #ifndef PRODUCT int SWPointer::Tracer::_depth = 0; diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 70e97e9444c..cf358b5d439 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -458,7 +458,9 @@ class SuperWord : public ResourceObj { bool same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const; // my_pack + public: Node_List* my_pack(Node* n) { return !in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; } + private: void set_my_pack(Node* n, Node_List* p) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; } // is pack good for converting into one vector node replacing bunches of Cmp, Bool, CMov nodes. bool is_cmov_pack(Node_List* p); @@ -618,19 +620,10 @@ private: void merge_packs_to_cmove(); // Verify that for every pack, all nodes are mutually independent DEBUG_ONLY(void verify_packs();) - // Remove cycles in packset. - void remove_cycles(); // Adjust the memory graph for the packed operations void schedule(); - // Remove "current" from its current position in the memory graph and insert - // it after the appropriate insert points (lip or uip); - void remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, Node *uip, Unique_Node_List &schd_before); - // Within a store pack, schedule stores together by moving out the sandwiched memory ops according - // to dependence info; and within a load pack, move loads down to the last executed load. - void co_locate_pack(Node_List* p); - Node* pick_mem_state(Node_List* pk); - Node* find_first_mem_state(Node_List* pk); - Node* find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent); + // Helper function for schedule, that reorders all memops, slice by slice, according to the schedule + void schedule_reorder_memops(Node_List &memops_schedule); // Convert packs into vector node operations bool output(); @@ -662,19 +655,11 @@ private: void compute_vector_element_type(); // Are s1 and s2 in a pack pair and ordered as s1,s2? bool in_packset(Node* s1, Node* s2); - // Is s in pack p? - Node_List* in_pack(Node* s, Node_List* p); // Remove the pack at position pos in the packset void remove_pack_at(int pos); - // Return the node executed first in pack p. 
- Node* executed_first(Node_List* p); - // Return the node executed last in pack p. - Node* executed_last(Node_List* p); static LoadNode::ControlDependency control_dependency(Node_List* p); // Alignment within a vector memory reference int memory_alignment(MemNode* s, int iv_adjust); - // (Start, end] half-open range defining which operands are vector - void vector_opd_range(Node* n, uint* start, uint* end); // Smallest type containing range of values const Type* container_type(Node* n); // Adjust pre-loop limit so that in main loop, a load/store reference @@ -693,7 +678,6 @@ private: void print_pack(Node_List* p); void print_bb(); void print_stmt(Node* s); - char* blank(uint depth); void packset_sort(int n); }; diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java index 870c4be3931..c384253d6e6 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java @@ -27,9 +27,8 @@ * @bug 8304042 * @summary Test some examples with independent packs with cyclic dependency * between the packs. - * @requires vm.compiler2.enabled * @requires vm.bits == 64 - * @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*" + * @requires vm.compiler2.enabled * @modules java.base/jdk.internal.misc * @library /test/lib / * @run driver compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency @@ -94,8 +93,8 @@ public class TestIndependentPacksWithCyclicDependency { test3(goldI3, goldI3, goldF3, goldF3); init(goldI4, goldF4); test4(goldI4, goldI4, goldF4, goldF4); -// init(goldI5, goldF5); -// test5(goldI5, goldI5, goldF5, goldF5); + init(goldI5, goldF5); + test5(goldI5, goldI5, goldF5, goldF5); init(goldI6, goldF6, goldL6); test6(goldI6, goldI6, goldF6, goldF6, goldL6, goldL6); init(goldI7, goldF7, goldL7); @@ -232,29 +231,27 @@ public class TestIndependentPacksWithCyclicDependency { } } -// TODO uncomment after fixing JDK-8304720 -// -// @Run(test = "test5") -// public void runTest5() { -// int[] dataI = new int[RANGE]; -// float[] dataF = new float[RANGE]; -// init(dataI, dataF); -// test5(dataI, dataI, dataF, dataF); -// verify("test5", dataI, goldI5); -// verify("test5", dataF, goldF5); -// } -// -// @Test -// static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { -// for (int i = 0; i < RANGE; i+=2) { -// // same as test2, except that reordering leads to different semantics -// // explanation analogue to test4 -// unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A -// dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X -// dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y -// unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B -// } -// } + @Run(test = "test5") + public void runTest5() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test5(dataI, dataI, dataF, dataF); + verify("test5", dataI, goldI5); + verify("test5", dataF, goldF5); + } + + @Test + static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // same as test2, except that reordering leads to different semantics + // explanation analogue to test4 + 
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A + dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X + dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B + } + } @Run(test = "test6") public void runTest6() { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java index 328719daf58..31a2ba3a0cb 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java @@ -29,8 +29,6 @@ * between the packs. * Before fix, this hit: "assert(!is_visited) failed: visit only once" * @requires vm.compiler2.enabled - * @requires vm.bits == 64 - * @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*" * @modules java.base/jdk.internal.misc * @library /test/lib / * @run main/othervm -XX:LoopUnrollLimit=250 diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java new file mode 100644 index 00000000000..f74c19d1b8b --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + + +/* + * @test + * @bug 8304720 + * @summary Test some examples where non-vectorized memops also need to + * be reordered during SuperWord::schedule. 
+ * @requires vm.compiler2.enabled + * @modules java.base/jdk.internal.misc + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestScheduleReordersScalarMemops + */ + +package compiler.loopopts.superword; + +import jdk.internal.misc.Unsafe; +import jdk.test.lib.Asserts; +import compiler.lib.ir_framework.*; + +public class TestScheduleReordersScalarMemops { + static final int RANGE = 1024; + static final int ITER = 10_000; + static Unsafe unsafe = Unsafe.getUnsafe(); + + int[] goldI0 = new int[RANGE]; + float[] goldF0 = new float[RANGE]; + int[] goldI1 = new int[RANGE]; + float[] goldF1 = new float[RANGE]; + + public static void main(String args[]) { + TestFramework.runWithFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::test*", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::verify", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::init", + "-XX:LoopUnrollLimit=1000", + "-XX:-TieredCompilation", "-Xbatch"); + } + + TestScheduleReordersScalarMemops() { + // compute the gold standard in interpreter mode + init(goldI0, goldF0); + test0(goldI0, goldI0, goldF0, goldF0); + init(goldI1, goldF1); + test1(goldI1, goldI1, goldF1, goldF1); + } + + @Run(test = "test0") + @Warmup(100) + public void runTest0() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test0(dataI, dataI, dataF, dataF); + verify("test0", dataI, goldI0); + verify("test0", dataF, goldF0); + } + + @Test + @IR(counts = {IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) + static void test0(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // We have dependency edges: + // A -> X + // Y -> B + // Still, we can vectorize [X,Y]. + // We do not vectorize A and B, because they are not isomorphic (add vs mul). + // + // Imagine this is unrolled at least 2x. + // We get order: A0 X0 Y0 B0 A1 X1 Y1 B1 + // Vectorized: X0 Y0 X1 Y1 + // Scalar: A0 B0 A1 B1 + // + // However, since the As need to be before, and the Bs after the vector operations, + // we need to have all As before all Bs. This means we need to reorder the scalar + // operations, and not just the vectorized ones. + // + // A correct reordering would be: A0 A1 [X0, Y0, X1, Y1] B0 B1 + // + dataFa[i + 0] = dataIa[i + 0] * 1.3f; // A *1.3 + dataIb[i + 0] = (int)dataFb[i + 0] * 11; // X *11 + dataIb[i + 1] = (int)dataFb[i + 1] * 11; // Y *11 + dataFa[i + 1] = dataIa[i + 1] + 1.2f; // B +1.2 + } + } + + @Run(test = "test1") + @Warmup(100) + public void runTest1() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test1(dataI, dataI, dataF, dataF); + verify("test1", dataI, goldI1); + verify("test1", dataF, goldF1); + } + + @Test + @IR(counts = {IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) + static void test1(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // Do the same as test0, but without int-float conversion. + // This should reproduce on machines where conversion is not implemented. 
+ unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A +1 + dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X + dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] * 11); // B *11 + } + } + + static void init(int[] dataI, float[] dataF) { + for (int i = 0; i < RANGE; i++) { + dataI[i] = i + 1; + dataF[i] = i + 0.1f; + } + } + + static void verify(String name, int[] data, int[] gold) { + for (int i = 0; i < RANGE; i++) { + if (data[i] != gold[i]) { + throw new RuntimeException(" Invalid " + name + " result: data[" + i + "]: " + data[i] + " != " + gold[i]); + } + } + } + + static void verify(String name, float[] data, float[] gold) { + for (int i = 0; i < RANGE; i++) { + int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i); + int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i); + if (datav != goldv) { + throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv); + } + } + } +}
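
The scheduling step introduced above (PacksetGraph::schedule) is a standard worklist topological sort: every node whose remaining in-count is zero is pulled onto the worklist, each scheduled node decrements the in-count of its successors, and if not every node gets scheduled the graph contained a cycle and the packset is dropped. What follows is a minimal standalone sketch of that idea, not the HotSpot code; the names (ToyPacksetGraph, add_edge, main) and the example graph are purely illustrative.

#include <cstdio>
#include <vector>

// Toy version of the pid-based dependency graph: node ids run from 1 to max_pid,
// out[] holds the out-edges, incnt[] the number of not-yet-scheduled in-edges.
struct ToyPacksetGraph {
  std::vector<std::vector<int>> out;
  std::vector<int> incnt;

  explicit ToyPacksetGraph(int max_pid) : out(max_pid + 1), incnt(max_pid + 1, 0) {}

  void add_edge(int pred, int succ) {   // succ depends on pred
    out[pred].push_back(succ);
    incnt[succ]++;
  }

  // Kahn-style topological sort. Returns true iff every node was scheduled,
  // i.e. the graph is acyclic (the analogue of schedule_success()).
  bool schedule(std::vector<int>& order) {
    for (int pid = 1; pid < (int)incnt.size(); pid++) {
      if (incnt[pid] == 0) order.push_back(pid);        // no precedence: schedule directly
    }
    for (size_t i = 0; i < order.size(); i++) {
      for (int use : out[order[i]]) {
        if (--incnt[use] == 0) order.push_back(use);    // all predecessors scheduled
      }
    }
    return order.size() == incnt.size() - 1;
  }
};

int main() {
  ToyPacksetGraph g(4);
  g.add_edge(1, 2);   // 2 depends on 1
  g.add_edge(2, 3);
  g.add_edge(1, 4);
  std::vector<int> order;
  bool acyclic = g.schedule(order);
  std::printf("acyclic=%d order:", acyclic);
  for (int pid : order) std::printf(" %d", pid);
  std::printf("\n");
  return 0;
}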
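
schedule_reorder_memops then walks the linear memop schedule once while tracking a current memory state per slice: each memop is rewired to consume the current state of its slice, a store becomes the new state, and after the walk the last store of each slice feeds the memory Phi's backedge. Below is a small self-contained model of that single pass, under the simplifying assumption that every slice starts from a Phi (the real code handles load-only slices without a Phi separately); the names (MemOp, reorder_memops) and the example schedule are made up for illustration.

#include <cstdio>
#include <string>
#include <vector>

// A memop in the scheduled order: a load or store on one memory slice.
struct MemOp {
  std::string name;
  bool is_store;
  int slice;            // alias index of the slice
  std::string mem_in;   // filled in: the memory state this op consumes
};

// Single pass over the schedule, all slices handled in parallel.
// Returns, per slice, the final state (the new last store for the Phi backedge).
std::vector<std::string> reorder_memops(std::vector<MemOp>& schedule, int num_slices) {
  std::vector<std::string> current_state(num_slices);
  for (int s = 0; s < num_slices; s++) {
    current_state[s] = "phi_" + std::to_string(s);    // state on loop entry
  }
  for (MemOp& op : schedule) {
    op.mem_in = current_state[op.slice];              // consume current state of the slice
    if (op.is_store) {
      current_state[op.slice] = op.name;              // a store defines the new state
    }
  }
  return current_state;
}

int main() {
  // An order like the one discussed in the new test's comments: scalar stores A/B
  // around a vectorizable group X/Y (here modeled as loads on a second slice).
  std::vector<MemOp> schedule = {
    {"A0", true, 0}, {"A1", true, 0},
    {"X0", false, 1}, {"Y0", false, 1}, {"X1", false, 1}, {"Y1", false, 1},
    {"B0", true, 0}, {"B1", true, 0},
  };
  std::vector<std::string> last_store = reorder_memops(schedule, 2);
  for (const MemOp& op : schedule) {
    std::printf("%s consumes %s\n", op.name.c_str(), op.mem_in.c_str());
  }
  std::printf("slice 0 backedge now: %s\n", last_store[0].c_str());
  return 0;
}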