8304720: SuperWord::schedule should rebuild C2-graph from SuperWord dependency-graph

Reviewed-by: kvn, fgao
This commit is contained in:
Emanuel Peter 2023-05-08 06:09:10 +00:00
parent 495f2688d6
commit ad0e5a99ca
5 changed files with 351 additions and 399 deletions


@@ -669,8 +669,6 @@ bool SuperWord::SLP_extract() {
DEBUG_ONLY(verify_packs();)
remove_cycles();
schedule();
// Record eventual count of vector packs for checks in post loop vectorization
@@ -1431,11 +1429,13 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
return false;
}
// FIXME - co_locate_pack fails on Stores in different mem-slices, so
// only pack memops that are in the same alias set until that's fixed.
// Adjacent memory references must be on the same slice.
if (!same_memory_slice(s1->as_Mem(), s2->as_Mem())) {
return false;
}
// Adjacent memory references must have the same base, be comparable
// and have the correct distance between them.
SWPointer p1(s1->as_Mem(), this, nullptr, false);
SWPointer p2(s2->as_Mem(), this, nullptr, false);
if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
@@ -2497,10 +2497,13 @@ class PacksetGraph {
private:
// pid: packset graph node id.
GrowableArray<int> _pid; // bb_idx(n) -> pid
GrowableArray<Node*> _pid_to_node; // one node per pid, find rest via my_pack
GrowableArray<GrowableArray<int>> _out; // out-edges
GrowableArray<int> _incnt; // number of (implicit) in-edges
int _max_pid = 0;
bool _schedule_success;
SuperWord* _slp;
public:
PacksetGraph(SuperWord* slp)
@@ -2523,11 +2526,18 @@ public:
assert(poz != 0, "pid should not be zero");
return poz;
}
void set_pid(const Node* n, int pid) {
void set_pid(Node* n, int pid) {
assert(n != nullptr && pid > 0, "sane inputs");
assert(_slp->in_bb(n), "must be");
int idx = _slp->bb_idx(n);
_pid.at_put_grow(idx, pid);
_pid_to_node.at_put_grow(pid - 1, n, nullptr);
}
Node* get_node(int pid) {
assert(pid > 0 && pid <= _pid_to_node.length(), "pid must be mapped");
Node* n = _pid_to_node.at(pid - 1);
assert(n != nullptr, "sanity");
return n;
}
int new_pid() {
_incnt.push(0);
@@ -2537,6 +2547,7 @@ public:
int incnt(int pid) { return _incnt.at(pid - 1); }
void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); }
GrowableArray<int>& out(int pid) { return _out.at(pid - 1); }
bool schedule_success() const { return _schedule_success; }
// Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds.
void build() {
@@ -2550,6 +2561,7 @@ public:
for (uint k = 0; k < p->size(); k++) {
Node* n = p->at(k);
set_pid(n, pid);
assert(_slp->my_pack(n) == p, "matching packset");
}
}
@@ -2565,6 +2577,7 @@ public:
if (pid == 0) {
pid = new_pid();
set_pid(n, pid);
assert(_slp->my_pack(n) == nullptr || UseVectorCmov, "no packset");
}
}
@@ -2610,11 +2623,13 @@ public:
}
}
}
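For orientation, here is a minimal worked example of the mapping that build() produces. The node names and the single two-element pack are invented for illustration; the rules follow the code above and the scheduling comments below: one pid per pack, one pid per remaining scalar node, and edges along DepPreds.

  // Loop body: stores s1, s2 form a pack; ld is a scalar load; DepPreds(s2) contains ld.
  // PacksetGraph: pid 1 = [s1, s2]   (pack-node, set_pid is called for both members)
  //               pid 2 = ld         (scalar-node)
  // The pack depends on ld, so ld's out-list points at the pack-node:
  //               out(2) = {1}, incnt(1) = 1, incnt(2) = 0
  // The topological sort below must therefore schedule pid 2 (ld) before pid 1 (the pack).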
// Schedule the graph to worklist. Returns true iff all nodes were scheduled.
// This implies that we return true iff the PacksetGraph is acyclic.
// We schedule with topological sort: schedule any node that has zero incnt.
// Then remove that node, which decrements the incnt of all its uses (outputs).
bool schedule() {
// Schedule nodes of PacksetGraph to worklist, using topsort: schedule a node
// that has zero incnt. If a PacksetGraph node corresponds to memops, then add
// those to the memops_schedule. At the end, we return the memops_schedule, and
// note if topsort was successful.
Node_List schedule() {
Node_List memops_schedule;
GrowableArray<int> worklist;
// Directly schedule all nodes without precedence
for (int pid = 1; pid <= _max_pid; pid++) {
@@ -2625,6 +2640,22 @@ public:
// Continue scheduling via topological sort
for (int i = 0; i < worklist.length(); i++) {
int pid = worklist.at(i);
// Add memops to memops_schedule
Node* n = get_node(pid);
Node_List* p = _slp->my_pack(n);
if (n->is_Mem()) {
if (p == nullptr) {
memops_schedule.push(n);
} else {
for (uint k = 0; k < p->size(); k++) {
memops_schedule.push(p->at(k));
assert(p->at(k)->is_Mem(), "only schedule memops");
}
}
}
// Decrement incnt for all successors
for (int j = 0; j < out(pid).length(); j++){
int pid_use = out(pid).at(j);
int incnt_use = incnt(pid_use) - 1;
@@ -2635,9 +2666,12 @@ public:
}
}
}
// Was every pid scheduled?
return worklist.length() == _max_pid;
// Was every pid scheduled? If not, we found some cycles in the PacksetGraph.
_schedule_success = (worklist.length() == _max_pid);
return memops_schedule;
}
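The loop above is a Kahn-style topological sort over pid in-degree counts, with the worklist doubling as the output order. Below is a self-contained sketch of the same idea, detached from HotSpot types; the node ids, adjacency lists, and main() are illustrative only.

  #include <cstdio>
  #include <vector>

  // Kahn-style topological sort: repeatedly emit a node whose remaining
  // in-degree is zero and decrement the in-degree of its successors.
  // If not every node is emitted, the graph contains a cycle.
  static bool topo_schedule(const std::vector<std::vector<int>>& out,  // out[i] = successors of node i
                            std::vector<int>& order) {
    const int n = (int)out.size();
    std::vector<int> incnt(n, 0);
    for (int i = 0; i < n; i++) {
      for (int succ : out[i]) {
        incnt[succ]++;
      }
    }
    // Directly schedule all nodes without precedence.
    for (int i = 0; i < n; i++) {
      if (incnt[i] == 0) order.push_back(i);
    }
    // Continue scheduling: the worklist doubles as the output order.
    for (size_t k = 0; k < order.size(); k++) {
      for (int succ : out[order[k]]) {
        if (--incnt[succ] == 0) order.push_back(succ);
      }
    }
    return (int)order.size() == n;  // false => cycle, scheduling fails
  }

  int main() {
    std::vector<std::vector<int>> dag = {{1, 2}, {2}, {}};  // edges: 0->1, 0->2, 1->2
    std::vector<int> order;
    bool ok = topo_schedule(dag, order);
    std::printf("acyclic=%d order:", ok);                   // prints: acyclic=1 order: 0 1 2
    for (int id : order) std::printf(" %d", id);
    std::printf("\n");
    return 0;
  }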
// Print the PacksetGraph.
// print_nodes = true: print all C2 nodes belonging to the PacksetGraph node.
// print_zero_incnt = false: do not print nodes that have no in-edges (any more).
@@ -2668,303 +2702,135 @@ public:
}
};
//------------------------------remove_cycles---------------------------
// We now know that we only have independent packs, see verify_packs.
// This is a necessary but not a sufficient condition for an acyclic
// graph (DAG) after scheduling. Thus, we must check if the packs have
// introduced a cycle. The SuperWord paper mentions the need for this
// in "3.7 Scheduling".
// Approach: given all nodes from the _block, we create a new graph.
// The nodes that are not in a pack are their own nodes (scalar-node)
// in that new graph. Every pack is also a node (pack-node). We then
// add the edges according to DepPreds: a scalar-node has all edges
// to its node's DepPreds. A pack-node has all edges from every pack
// member to all their DepPreds.
void SuperWord::remove_cycles() {
// The C2 graph (specifically the memory graph) needs to be re-ordered.
// (1) Build the PacksetGraph. It combines the DepPreds graph with the
// packset. The PacksetGraph gives us the dependencies that must be
// respected after scheduling.
// (2) Schedule the PacksetGraph to the memops_schedule, which represents
// a linear order of all memops in the body. The order respects the
// dependencies of the PacksetGraph.
// (3) If the PacksetGraph has cycles, we cannot schedule. Abort.
// (4) Use the memops_schedule to re-order the memops in all slices.
void SuperWord::schedule() {
if (_packset.length() == 0) {
return; // empty packset
}
ResourceMark rm;
// (1) Build the PacksetGraph.
PacksetGraph graph(this);
graph.build();
if (!graph.schedule()) {
// (2) Schedule the PacksetGraph.
Node_List memops_schedule = graph.schedule();
// (3) Check if the PacksetGraph schedule succeeded (had no cycles).
// We now know that we only have independent packs, see verify_packs.
// This is a necessary but not a sufficient condition for an acyclic
// graph (DAG) after scheduling. Thus, we must check if the packs have
// introduced a cycle. The SuperWord paper mentions the need for this
// in "3.7 Scheduling".
if (!graph.schedule_success()) {
if (TraceSuperWord) {
tty->print_cr("remove_cycles found cycle in PacksetGraph:");
tty->print_cr("SuperWord::schedule found cycle in PacksetGraph:");
graph.print(true, false);
tty->print_cr("removing all packs from packset.");
}
_packset.clear();
return;
}
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("SuperWord::schedule: memops_schedule:");
memops_schedule.dump();
}
#endif
// (4) Use the memops_schedule to re-order the memops in all slices.
schedule_reorder_memops(memops_schedule);
}
//------------------------------schedule---------------------------
// Adjust the memory graph for the packed operations
void SuperWord::schedule() {
// Co-locate in the memory graph the members of each memory pack
for (int i = 0; i < _packset.length(); i++) {
co_locate_pack(_packset.at(i));
// Reorder the memory graph for all slices in parallel. We walk over the schedule once,
// and track the current memory state of each slice.
void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
int max_slices = _phase->C->num_alias_types();
// When iterating over the memops_schedule, we keep track of the current memory state,
// which is the Phi or a store in the loop.
GrowableArray<Node*> current_state_in_slice(max_slices, max_slices, nullptr);
// The memory state after the loop is the last store inside the loop. If we reorder the
// loop we may have a different last store, and we need to adjust the uses accordingly.
GrowableArray<Node*> old_last_store_in_slice(max_slices, max_slices, nullptr);
// (1) Set up the initial memory state from Phi. And find the old last store.
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* phi = _mem_slice_head.at(i);
assert(phi->is_Phi(), "must be phi");
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
current_state_in_slice.at_put(alias_idx, phi);
// If we have a memory phi, we have a last store in the loop, find it over backedge.
StoreNode* last_store = phi->in(2)->as_Store();
old_last_store_in_slice.at_put(alias_idx, last_store);
}
}
//-------------------------------remove_and_insert-------------------
// Remove "current" from its current position in the memory graph and insert
// it after the appropriate insertion point (lip or uip).
void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
Node *uip, Unique_Node_List &sched_before) {
Node* my_mem = current->in(MemNode::Memory);
bool sched_up = sched_before.member(current);
// (2) Walk over memops_schedule, append memops to the current state
// of that slice. If it is a Store, we take it as the new state.
for (uint i = 0; i < memops_schedule.size(); i++) {
MemNode* n = memops_schedule.at(i)->as_Mem();
assert(n->is_Load() || n->is_Store(), "only loads or stores");
int alias_idx = _phase->C->get_alias_index(n->adr_type());
Node* current_state = current_state_in_slice.at(alias_idx);
if (current_state == nullptr) {
// If there are only loads in a slice, we never update the memory
// state in the loop, hence there is no phi for the memory state.
// We just keep the old memory state that was outside the loop.
assert(n->is_Load() && !in_bb(n->in(MemNode::Memory)),
"only loads can have memory state from outside loop");
} else {
_igvn.replace_input_of(n, MemNode::Memory, current_state);
if (n->is_Store()) {
current_state_in_slice.at_put(alias_idx, n);
}
}
}
// remove current_store from its current position in the memory graph
for (DUIterator i = current->outs(); current->has_out(i); i++) {
Node* use = current->out(i);
if (use->is_Mem()) {
assert(use->in(MemNode::Memory) == current, "must be");
if (use == prev) { // connect prev to my_mem
_igvn.replace_input_of(use, MemNode::Memory, my_mem);
--i; //deleted this edge; rescan position
} else if (sched_before.member(use)) {
if (!sched_up) { // Will be moved together with current
_igvn.replace_input_of(use, MemNode::Memory, uip);
--i; //deleted this edge; rescan position
}
} else {
if (sched_up) { // Will be moved together with current
_igvn.replace_input_of(use, MemNode::Memory, lip);
--i; //deleted this edge; rescan position
// (3) For each slice, we add the current state to the backedge
// in the Phi. Further, we replace uses of the old last store
// with uses of the new last store (current_state).
Node_List uses_after_loop;
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* phi = _mem_slice_head.at(i);
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
Node* current_state = current_state_in_slice.at(alias_idx);
assert(current_state != nullptr, "slice is mapped");
assert(current_state != phi, "did some work in between");
assert(current_state->is_Store(), "sanity");
_igvn.replace_input_of(phi, 2, current_state);
// Replace uses of old last store with current_state (new last store)
// Do it in two loops: first find all the uses, and change the graph
// in a second loop so that we do not break the iterator.
Node* last_store = old_last_store_in_slice.at(alias_idx);
assert(last_store != nullptr, "we have an old last store");
uses_after_loop.clear();
for (DUIterator_Fast kmax, k = last_store->fast_outs(kmax); k < kmax; k++) {
Node* use = last_store->fast_out(k);
if (!in_bb(use)) {
uses_after_loop.push(use);
}
}
for (uint k = 0; k < uses_after_loop.size(); k++) {
Node* use = uses_after_loop.at(k);
for (uint j = 0; j < use->req(); j++) {
Node* def = use->in(j);
if (def == last_store) {
_igvn.replace_input_of(use, j, current_state);
}
}
}
}
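To make steps (2) and (3) concrete, here is a small trace of one slice. The names (phi, s1, s2, ld) and the chosen schedule are invented for illustration; the rewiring rules are the ones implemented above: every memop takes the slice's current state as its Memory input, stores advance that state, and the Phi backedge plus any uses after the loop end up on the new last store.

  // Slice before reordering: phi -> s1 -> s2 (memory inputs), phi->in(2) == s2, ld reads s1.
  // Suppose the memops_schedule lists this slice's memops in the order: s2, ld, s1.
  //
  //   s2->Memory := phi   // first memop takes the Phi as current state; s2 becomes the state
  //   ld->Memory := s2    // loads consume the current state but do not advance it
  //   s1->Memory := s2    // s1 becomes the new (last) state
  //   phi->in(2) := s1    // backedge now sees the new last store
  //   // uses of the old last store (s2) after the loop are redirected to s1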
Node *insert_pt = sched_up ? uip : lip;
// all uses of insert_pt's memory state should use current's instead
for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
Node* use = insert_pt->out(i);
if (use->is_Mem()) {
assert(use->in(MemNode::Memory) == insert_pt, "must be");
_igvn.replace_input_of(use, MemNode::Memory, current);
--i; //deleted this edge; rescan position
} else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
uint pos; //lip (lower insert point) must be the last one in the memory slice
for (pos=1; pos < use->req(); pos++) {
if (use->in(pos) == insert_pt) break;
}
_igvn.replace_input_of(use, pos, current);
--i;
}
}
//connect current to insert_pt
_igvn.replace_input_of(current, MemNode::Memory, insert_pt);
}
//------------------------------co_locate_pack----------------------------------
// To schedule a store pack, we need to move any sandwiched memory ops either before
// or after the pack, based upon dependence information:
// (1) If any store in the pack depends on the sandwiched memory op, the
// sandwiched memory op must be scheduled BEFORE the pack;
// (2) If a sandwiched memory op depends on any store in the pack, the
// sandwiched memory op must be scheduled AFTER the pack;
// (3) If a sandwiched memory op (say, memA) depends on another sandwiched
// memory op (say memB), memB must be scheduled before memA. So, if memA is
// scheduled before the pack, memB must also be scheduled before the pack;
// (4) If there is no dependence restriction for a sandwiched memory op, we simply
// schedule this store AFTER the pack
// (5) We know there is no dependence cycle, so there is no other case;
// (6) Finally, all memory ops in another single pack should be moved in the same direction.
//
// To schedule a load pack, we use the memory state of either the first or the last load in
// the pack, based on the dependence constraint.
void SuperWord::co_locate_pack(Node_List* pk) {
if (pk->at(0)->is_Store()) {
MemNode* first = executed_first(pk)->as_Mem();
MemNode* last = executed_last(pk)->as_Mem();
Unique_Node_List schedule_before_pack;
Unique_Node_List memops;
MemNode* current = last->in(MemNode::Memory)->as_Mem();
MemNode* previous = last;
while (true) {
assert(in_bb(current), "stay in block");
memops.push(previous);
for (DUIterator i = current->outs(); current->has_out(i); i++) {
Node* use = current->out(i);
if (use->is_Mem() && use != previous)
memops.push(use);
}
if (current == first) break;
previous = current;
current = current->in(MemNode::Memory)->as_Mem();
}
// determine which memory operations should be scheduled before the pack
for (uint i = 1; i < memops.size(); i++) {
Node *s1 = memops.at(i);
if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
for (uint j = 0; j< i; j++) {
Node *s2 = memops.at(j);
if (!independent(s1, s2)) {
if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
schedule_before_pack.push(s1); // s1 must be scheduled before
Node_List* mem_pk = my_pack(s1);
if (mem_pk != nullptr) {
for (uint ii = 0; ii < mem_pk->size(); ii++) {
Node* s = mem_pk->at(ii); // follow partner
if (memops.member(s) && !schedule_before_pack.member(s))
schedule_before_pack.push(s);
}
}
break;
}
}
}
}
}
Node* upper_insert_pt = first->in(MemNode::Memory);
// Following code moves loads connected to upper_insert_pt below aliased stores.
// Collect such loads here and reconnect them back to upper_insert_pt later.
memops.clear();
for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
Node* use = upper_insert_pt->out(i);
if (use->is_Mem() && !use->is_Store()) {
memops.push(use);
}
}
MemNode* lower_insert_pt = last;
previous = last; //previous store in pk
current = last->in(MemNode::Memory)->as_Mem();
// start scheduling from "last" to "first"
while (true) {
assert(in_bb(current), "stay in block");
assert(in_pack(previous, pk), "previous stays in pack");
Node* my_mem = current->in(MemNode::Memory);
if (in_pack(current, pk)) {
// Forward users of my memory state (except "previous") to my input memory state
for (DUIterator i = current->outs(); current->has_out(i); i++) {
Node* use = current->out(i);
if (use->is_Mem() && use != previous) {
assert(use->in(MemNode::Memory) == current, "must be");
if (schedule_before_pack.member(use)) {
_igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
} else {
_igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
}
--i; // deleted this edge; rescan position
}
}
previous = current;
} else { // !in_pack(current, pk) ==> a sandwiched store
remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
}
if (current == first) break;
current = my_mem->as_Mem();
} // end while
// Reconnect loads back to upper_insert_pt.
for (uint i = 0; i < memops.size(); i++) {
Node *ld = memops.at(i);
if (ld->in(MemNode::Memory) != upper_insert_pt) {
_igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
}
}
} else if (pk->at(0)->is_Load()) { // Load pack
// All loads in the pack should have the same memory state. By default,
// we use the memory state of the last load. However, if any load could
// not be moved down due to the dependence constraint, we use the memory
// state of the first load.
Node* mem_input = pick_mem_state(pk);
_igvn.hash_delete(mem_input);
// Give each load the same memory state
for (uint i = 0; i < pk->size(); i++) {
LoadNode* ld = pk->at(i)->as_Load();
_igvn.replace_input_of(ld, MemNode::Memory, mem_input);
}
}
}
// Finds the first and last memory state and then picks either of them by checking dependence constraints.
// If a store is dependent on an earlier load then we need to pick the memory state of the first load and cannot
// pick the memory state of the last load.
Node* SuperWord::pick_mem_state(Node_List* pk) {
Node* first_mem = find_first_mem_state(pk);
bool is_dependent = false;
Node* last_mem = find_last_mem_state(pk, first_mem, is_dependent);
for (uint i = 0; i < pk->size(); i++) {
Node* ld = pk->at(i);
for (Node* current = last_mem; current != ld->in(MemNode::Memory); current = current->in(MemNode::Memory)) {
assert(current->is_Mem() && in_bb(current), "unexpected memory");
assert(current != first_mem, "corrupted memory graph");
if (!independent(current, ld)) {
// A later unvectorized store depends on this load, pick the memory state of the first load. This can happen,
// for example, if a load pack has interleaving stores that are part of a store pack which, however, is removed
// at the pack filtering stage. This leaves us with only a load pack for which we cannot take the memory state
// of the last load as the remaining unvectorized stores could interfere since they have a dependency to the loads.
// Some stores could be executed before the load vector resulting in a wrong result. We need to take the
// memory state of the first load to prevent this.
if (my_pack(current) != nullptr && is_dependent) {
// For vectorized store pack, when the load pack depends on
// some memory operations locating after first_mem, we still
// take the memory state of the last load.
continue;
}
return first_mem;
}
}
}
return last_mem;
}
// Walk the memory graph from the current first load until the
// start of the loop and check if nodes on the way are memory
// edges of loads in the pack. The last one we encounter is the
// first load.
Node* SuperWord::find_first_mem_state(Node_List* pk) {
Node* first_mem = pk->at(0)->in(MemNode::Memory);
for (Node* current = first_mem; in_bb(current); current = current->is_Phi() ? current->in(LoopNode::EntryControl) : current->in(MemNode::Memory)) {
assert(current->is_Mem() || (current->is_Phi() && current->in(0) == bb()), "unexpected memory");
for (uint i = 1; i < pk->size(); i++) {
Node* ld = pk->at(i);
if (ld->in(MemNode::Memory) == current) {
first_mem = current;
break;
}
}
}
return first_mem;
}
// Find the last load by going over the pack again and walking
// the memory graph from the loads of the pack to the memory of
// the first load. If we encounter the memory of the current last
// load, then we started from further down in the memory graph and
// the load we started from is the last load. At the same time, the
// function also helps determine if some loads in the pack depend on
// early memory operations which locate after first_mem.
Node* SuperWord::find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent) {
Node* last_mem = pk->at(0)->in(MemNode::Memory);
for (uint i = 0; i < pk->size(); i++) {
Node* ld = pk->at(i);
for (Node* current = ld->in(MemNode::Memory); current != first_mem; current = current->in(MemNode::Memory)) {
assert(current->is_Mem() && in_bb(current), "unexpected memory");
// Determine if the load pack is dependent on some memory operations locating after first_mem.
is_dependent |= !independent(current, ld);
if (current->in(MemNode::Memory) == last_mem) {
last_mem = ld->in(MemNode::Memory);
}
}
}
return last_mem;
}
#ifndef PRODUCT
@@ -3037,12 +2903,14 @@ bool SuperWord::output() {
for (int i = 0; i < _block.length(); i++) {
Node* n = _block.at(i);
Node_List* p = my_pack(n);
if (p && n == executed_last(p)) {
if (p != nullptr && n == p->at(p->size()-1)) {
// After schedule_reorder_memops, we know that the memops have the same order in the pack
// as in the memory slice. Hence, "first" is the first memop in the slice from the pack,
// and "n" is the last node in the slice from the pack.
Node* first = p->at(0);
uint vlen = p->size();
uint vlen_in_bytes = 0;
Node* vn = nullptr;
Node* low_adr = p->at(0);
Node* first = executed_first(p);
if (cl->is_rce_post_loop()) {
// override vlen with the main loops vector length
vlen = cl->slp_max_unroll();
@@ -3066,7 +2934,7 @@ bool SuperWord::output() {
break; // dependent memory
}
}
Node* adr = low_adr->in(MemNode::Address);
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
if (cl->is_rce_post_loop()) {
assert(vmask != nullptr, "vector mask should be generated");
@@ -3089,7 +2957,7 @@ bool SuperWord::output() {
Node* ctl = n->in(MemNode::Control);
Node* mem = first->in(MemNode::Memory);
Node* adr = low_adr->in(MemNode::Address);
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
if (cl->is_rce_post_loop()) {
assert(vmask != nullptr, "vector mask should be generated");
@@ -3100,8 +2968,8 @@ bool SuperWord::output() {
}
vlen_in_bytes = vn->as_StoreVector()->memory_size();
} else if (VectorNode::is_scalar_rotate(n)) {
Node* in1 = low_adr->in(1);
Node* in2 = p->at(0)->in(2);
Node* in1 = first->in(1);
Node* in2 = first->in(2);
// If rotation count is non-constant or greater than 8bit value create a vector.
if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) {
in2 = vector_opd(p, 2);
@@ -3110,7 +2978,7 @@ bool SuperWord::output() {
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_roundopD(n)) {
Node* in1 = vector_opd(p, 1);
Node* in2 = low_adr->in(2);
Node* in2 = first->in(2);
assert(in2->is_Con(), "Constant rounding mode expected.");
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
@@ -3133,7 +3001,7 @@ bool SuperWord::output() {
bool node_isa_reduction = is_marked_reduction(n);
if (node_isa_reduction) {
// the input to the first reduction operation is retained
in1 = low_adr->in(1);
in1 = first->in(1);
} else {
in1 = vector_opd(p, 1);
if (in1 == nullptr) {
@@ -3201,7 +3069,7 @@ bool SuperWord::output() {
Node* in = vector_opd(p, 1);
Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
_igvn.register_new_node_with_optimizer(longval);
_phase->set_ctrl(longval, _phase->get_ctrl(p->at(0)));
_phase->set_ctrl(longval, _phase->get_ctrl(first));
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_convert_opcode(opc)) {
@@ -3316,7 +3184,7 @@ bool SuperWord::output() {
_block.at_put(i, vn);
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
_phase->set_ctrl(vn, _phase->get_ctrl(first));
for (uint j = 0; j < p->size(); j++) {
Node* pm = p->at(j);
_igvn.replace_node(pm, vn);
@@ -4196,17 +4064,6 @@ bool SuperWord::in_packset(Node* s1, Node* s2) {
return false;
}
//------------------------------in_pack---------------------------
// Is s in pack p?
Node_List* SuperWord::in_pack(Node* s, Node_List* p) {
for (uint i = 0; i < p->size(); i++) {
if (p->at(i) == s) {
return p;
}
}
return nullptr;
}
//------------------------------remove_pack_at---------------------------
// Remove the pack at position pos in the packset
void SuperWord::remove_pack_at(int pos) {
@@ -4239,39 +4096,6 @@ void SuperWord::packset_sort(int n) {
}
}
//------------------------------executed_first---------------------------
// Return the node executed first in pack p. Uses the RPO block list
// to determine order.
Node* SuperWord::executed_first(Node_List* p) {
Node* n = p->at(0);
int n_rpo = bb_idx(n);
for (uint i = 1; i < p->size(); i++) {
Node* s = p->at(i);
int s_rpo = bb_idx(s);
if (s_rpo < n_rpo) {
n = s;
n_rpo = s_rpo;
}
}
return n;
}
//------------------------------executed_last---------------------------
// Return the node executed last in pack p.
Node* SuperWord::executed_last(Node_List* p) {
Node* n = p->at(0);
int n_rpo = bb_idx(n);
for (uint i = 1; i < p->size(); i++) {
Node* s = p->at(i);
int s_rpo = bb_idx(s);
if (s_rpo > n_rpo) {
n = s;
n_rpo = s_rpo;
}
}
return n;
}
LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
for (uint i = 0; i < p->size(); i++) {
@@ -4547,16 +4371,6 @@ void SuperWord::print_stmt(Node* s) {
#endif
}
//------------------------------blank---------------------------
char* SuperWord::blank(uint depth) {
static char blanks[101];
assert(depth < 101, "too deep");
for (uint i = 0; i < depth; i++) blanks[i] = ' ';
blanks[depth] = '\0';
return blanks;
}
//==============================SWPointer===========================
#ifndef PRODUCT
int SWPointer::Tracer::_depth = 0;


@@ -458,7 +458,9 @@ class SuperWord : public ResourceObj {
bool same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const;
// my_pack
public:
Node_List* my_pack(Node* n) { return !in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; }
private:
void set_my_pack(Node* n, Node_List* p) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; }
// is pack good for converting into one vector node replacing bunches of Cmp, Bool, CMov nodes.
bool is_cmov_pack(Node_List* p);
@@ -618,19 +620,10 @@ private:
void merge_packs_to_cmove();
// Verify that for every pack, all nodes are mutually independent
DEBUG_ONLY(void verify_packs();)
// Remove cycles in packset.
void remove_cycles();
// Adjust the memory graph for the packed operations
void schedule();
// Remove "current" from its current position in the memory graph and insert
// it after the appropriate insert points (lip or uip);
void remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, Node *uip, Unique_Node_List &schd_before);
// Within a store pack, schedule stores together by moving out the sandwiched memory ops according
// to dependence info; and within a load pack, move loads down to the last executed load.
void co_locate_pack(Node_List* p);
Node* pick_mem_state(Node_List* pk);
Node* find_first_mem_state(Node_List* pk);
Node* find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent);
// Helper function for schedule that reorders all memops, slice by slice, according to the schedule
void schedule_reorder_memops(Node_List &memops_schedule);
// Convert packs into vector node operations
bool output();
@@ -662,19 +655,11 @@ private:
void compute_vector_element_type();
// Are s1 and s2 in a pack pair and ordered as s1,s2?
bool in_packset(Node* s1, Node* s2);
// Is s in pack p?
Node_List* in_pack(Node* s, Node_List* p);
// Remove the pack at position pos in the packset
void remove_pack_at(int pos);
// Return the node executed first in pack p.
Node* executed_first(Node_List* p);
// Return the node executed last in pack p.
Node* executed_last(Node_List* p);
static LoadNode::ControlDependency control_dependency(Node_List* p);
// Alignment within a vector memory reference
int memory_alignment(MemNode* s, int iv_adjust);
// (Start, end] half-open range defining which operands are vector
void vector_opd_range(Node* n, uint* start, uint* end);
// Smallest type containing range of values
const Type* container_type(Node* n);
// Adjust pre-loop limit so that in main loop, a load/store reference
@@ -693,7 +678,6 @@ private:
void print_pack(Node_List* p);
void print_bb();
void print_stmt(Node* s);
char* blank(uint depth);
void packset_sort(int n);
};


@@ -27,9 +27,8 @@
* @bug 8304042
* @summary Test some examples with independent packs with cyclic dependency
* between the packs.
* @requires vm.compiler2.enabled
* @requires vm.bits == 64
* @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*"
* @requires vm.compiler2.enabled
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency
@@ -94,8 +93,8 @@ public class TestIndependentPacksWithCyclicDependency {
test3(goldI3, goldI3, goldF3, goldF3);
init(goldI4, goldF4);
test4(goldI4, goldI4, goldF4, goldF4);
// init(goldI5, goldF5);
// test5(goldI5, goldI5, goldF5, goldF5);
init(goldI5, goldF5);
test5(goldI5, goldI5, goldF5, goldF5);
init(goldI6, goldF6, goldL6);
test6(goldI6, goldI6, goldF6, goldF6, goldL6, goldL6);
init(goldI7, goldF7, goldL7);
@@ -232,29 +231,27 @@ public class TestIndependentPacksWithCyclicDependency {
}
}
// TODO uncomment after fixing JDK-8304720
//
// @Run(test = "test5")
// public void runTest5() {
// int[] dataI = new int[RANGE];
// float[] dataF = new float[RANGE];
// init(dataI, dataF);
// test5(dataI, dataI, dataF, dataF);
// verify("test5", dataI, goldI5);
// verify("test5", dataF, goldF5);
// }
//
// @Test
// static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
// for (int i = 0; i < RANGE; i+=2) {
// // same as test2, except that reordering leads to different semantics
// // explanation analogue to test4
// unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A
// dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
// dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
// unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B
// }
// }
@Run(test = "test5")
public void runTest5() {
int[] dataI = new int[RANGE];
float[] dataF = new float[RANGE];
init(dataI, dataF);
test5(dataI, dataI, dataF, dataF);
verify("test5", dataI, goldI5);
verify("test5", dataF, goldF5);
}
@Test
static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// same as test2, except that reordering leads to different semantics
// explanation analogous to test4
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B
}
}
@Run(test = "test6")
public void runTest6() {


@@ -29,8 +29,6 @@
* between the packs.
* Before fix, this hit: "assert(!is_visited) failed: visit only once"
* @requires vm.compiler2.enabled
* @requires vm.bits == 64
* @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*"
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run main/othervm -XX:LoopUnrollLimit=250


@@ -0,0 +1,159 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/*
* @test
* @bug 8304720
* @summary Test some examples where non-vectorized memops also need to
* be reordered during SuperWord::schedule.
* @requires vm.compiler2.enabled
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestScheduleReordersScalarMemops
*/
package compiler.loopopts.superword;
import jdk.internal.misc.Unsafe;
import jdk.test.lib.Asserts;
import compiler.lib.ir_framework.*;
public class TestScheduleReordersScalarMemops {
static final int RANGE = 1024;
static final int ITER = 10_000;
static Unsafe unsafe = Unsafe.getUnsafe();
int[] goldI0 = new int[RANGE];
float[] goldF0 = new float[RANGE];
int[] goldI1 = new int[RANGE];
float[] goldF1 = new float[RANGE];
public static void main(String args[]) {
TestFramework.runWithFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::test*",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::verify",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::init",
"-XX:LoopUnrollLimit=1000",
"-XX:-TieredCompilation", "-Xbatch");
}
TestScheduleReordersScalarMemops() {
// compute the gold standard in interpreter mode
init(goldI0, goldF0);
test0(goldI0, goldI0, goldF0, goldF0);
init(goldI1, goldF1);
test1(goldI1, goldI1, goldF1, goldF1);
}
@Run(test = "test0")
@Warmup(100)
public void runTest0() {
int[] dataI = new int[RANGE];
float[] dataF = new float[RANGE];
init(dataI, dataF);
test0(dataI, dataI, dataF, dataF);
verify("test0", dataI, goldI0);
verify("test0", dataF, goldF0);
}
@Test
@IR(counts = {IRNode.MUL_VI, "> 0"},
applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
static void test0(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// We have dependency edges:
// A -> X
// Y -> B
// Still, we can vectorize [X,Y].
// We do not vectorize A and B, because they are not isomorphic (add vs mul).
//
// Imagine this is unrolled at least 2x.
// We get order: A0 X0 Y0 B0 A1 X1 Y1 B1
// Vectorized: X0 Y0 X1 Y1
// Scalar: A0 B0 A1 B1
//
// However, since the As need to be before, and the Bs after the vector operations,
// we need to have all As before all Bs. This means we need to reorder the scalar
// operations, and not just the vectorized ones.
//
// A correct reordering would be: A0 A1 [X0, Y0, X1, Y1] B0 B1
//
dataFa[i + 0] = dataIa[i + 0] * 1.3f; // A *1.3
dataIb[i + 0] = (int)dataFb[i + 0] * 11; // X *11
dataIb[i + 1] = (int)dataFb[i + 1] * 11; // Y *11
dataFa[i + 1] = dataIa[i + 1] + 1.2f; // B +1.2
}
}
@Run(test = "test1")
@Warmup(100)
public void runTest1() {
int[] dataI = new int[RANGE];
float[] dataF = new float[RANGE];
init(dataI, dataF);
test1(dataI, dataI, dataF, dataF);
verify("test1", dataI, goldI1);
verify("test1", dataF, goldF1);
}
@Test
@IR(counts = {IRNode.MUL_VI, "> 0"},
applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
static void test1(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// Do the same as test0, but without int-float conversion.
// This should reproduce on machines where conversion is not implemented.
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A +1
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] * 11); // B *11
}
}
static void init(int[] dataI, float[] dataF) {
for (int i = 0; i < RANGE; i++) {
dataI[i] = i + 1;
dataF[i] = i + 0.1f;
}
}
static void verify(String name, int[] data, int[] gold) {
for (int i = 0; i < RANGE; i++) {
if (data[i] != gold[i]) {
throw new RuntimeException(" Invalid " + name + " result: data[" + i + "]: " + data[i] + " != " + gold[i]);
}
}
}
static void verify(String name, float[] data, float[] gold) {
for (int i = 0; i < RANGE; i++) {
int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
if (datav != goldv) {
throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv);
}
}
}
}