8304720: SuperWord::schedule should rebuild C2-graph from SuperWord dependency-graph
Reviewed-by: kvn, fgao
parent 495f2688d6
commit ad0e5a99ca
@@ -669,8 +669,6 @@ bool SuperWord::SLP_extract() {
DEBUG_ONLY(verify_packs();)
remove_cycles();
schedule();
// Record eventual count of vector packs for checks in post loop vectorization
@@ -1431,11 +1429,13 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
return false;
}
// FIXME - co_locate_pack fails on Stores in different mem-slices, so
// only pack memops that are in the same alias set until that's fixed.
// Adjacent memory references must be on the same slice.
if (!same_memory_slice(s1->as_Mem(), s2->as_Mem())) {
return false;
}
// Adjacent memory references must have the same base, be comparable
// and have the correct distance between them.
SWPointer p1(s1->as_Mem(), this, nullptr, false);
SWPointer p2(s2->as_Mem(), this, nullptr, false);
if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
@@ -2497,10 +2497,13 @@ class PacksetGraph {
private:
// pid: packset graph node id.
GrowableArray<int> _pid; // bb_idx(n) -> pid
GrowableArray<Node*> _pid_to_node; // one node per pid, find rest via my_pack
GrowableArray<GrowableArray<int>> _out; // out-edges
GrowableArray<int> _incnt; // number of (implicit) in-edges
int _max_pid = 0;
bool _schedule_success;
SuperWord* _slp;
public:
PacksetGraph(SuperWord* slp)
@@ -2523,11 +2526,18 @@ public:
assert(poz != 0, "pid should not be zero");
return poz;
}
void set_pid(const Node* n, int pid) {
void set_pid(Node* n, int pid) {
assert(n != nullptr && pid > 0, "sane inputs");
assert(_slp->in_bb(n), "must be");
int idx = _slp->bb_idx(n);
_pid.at_put_grow(idx, pid);
_pid_to_node.at_put_grow(pid - 1, n, nullptr);
}
Node* get_node(int pid) {
assert(pid > 0 && pid <= _pid_to_node.length(), "pid must be mapped");
Node* n = _pid_to_node.at(pid - 1);
assert(n != nullptr, "sanity");
return n;
}
int new_pid() {
_incnt.push(0);
@@ -2537,6 +2547,7 @@ public:
int incnt(int pid) { return _incnt.at(pid - 1); }
void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); }
GrowableArray<int>& out(int pid) { return _out.at(pid - 1); }
bool schedule_success() const { return _schedule_success; }
// Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds.
void build() {
@@ -2550,6 +2561,7 @@ public:
for (uint k = 0; k < p->size(); k++) {
Node* n = p->at(k);
set_pid(n, pid);
assert(_slp->my_pack(n) == p, "matching packset");
}
}
@@ -2565,6 +2577,7 @@ public:
if (pid == 0) {
pid = new_pid();
set_pid(n, pid);
assert(_slp->my_pack(n) == nullptr || UseVectorCmov, "no packset");
}
}
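
Editorial note: the build() step above condenses the C2 dependency graph by the packset. Every pack becomes one PacksetGraph node and every remaining scalar node gets its own node. Below is a minimal standalone sketch of that condensation idea, using plain C++ with invented integer node ids and a toy edge list; it is not HotSpot code.

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Toy sketch: condense a node-level dependency graph into a "packset graph".
// Nodes that belong to the same pack share one pid; every other node gets its
// own pid. Edges between pids are derived from the node-level edges, skipping
// edges whose endpoints end up in the same pack.
int main() {
  const int num_nodes = 6;
  // Invented example: packs {0,1} and {4,5}; nodes 2 and 3 stay scalar.
  std::vector<std::vector<int>> packs = {{0, 1}, {4, 5}};
  // Node-level dependency edges (pred -> succ), analogous to DepPreds.
  std::vector<std::pair<int, int>> dep_edges = {{0, 2}, {1, 3}, {2, 4}, {3, 5}};

  std::vector<int> pid(num_nodes, 0);             // node -> pid (0 = unassigned)
  int max_pid = 0;
  for (auto& p : packs) {                         // one pid per pack
    int id = ++max_pid;
    for (int n : p) pid[n] = id;
  }
  for (int n = 0; n < num_nodes; n++) {           // one pid per remaining scalar
    if (pid[n] == 0) pid[n] = ++max_pid;
  }

  std::vector<std::vector<int>> out(max_pid + 1); // pid -> successor pids
  std::vector<int> incnt(max_pid + 1, 0);         // pid -> number of in-edges
  for (auto& e : dep_edges) {
    int from = pid[e.first];
    int to   = pid[e.second];
    if (from == to) continue;                     // intra-pack edge: dropped
    out[from].push_back(to);
    incnt[to]++;
  }

  for (int p = 1; p <= max_pid; p++) {
    std::printf("pid %d: incnt=%d out:", p, incnt[p]);
    for (int s : out[p]) std::printf(" %d", s);
    std::printf("\n");
  }
  return 0;
}
```

Intra-pack edges are simply dropped in this sketch; in SuperWord that is safe because verify_packs() has already established that the members of each pack are mutually independent.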
@@ -2610,11 +2623,13 @@ public:
}
}
}
// Schedule the graph to worklist. Returns true iff all nodes were scheduled.
// This implies that we return true iff the PacksetGraph is acyclic.
// We schedule with topological sort: schedule any node that has zero incnt.
// Then remove that node, which decrements the incnt of all its uses (outputs).
bool schedule() {
// Schedule nodes of PacksetGraph to worklist, using topsort: schedule a node
// that has zero incnt. If a PacksetGraph node corresponds to memops, then add
// those to the memops_schedule. At the end, we return the memops_schedule, and
// note if topsort was successful.
Node_List schedule() {
Node_List memops_schedule;
GrowableArray<int> worklist;
// Directly schedule all nodes without precedence
for (int pid = 1; pid <= _max_pid; pid++) {
@@ -2625,6 +2640,22 @@ public:
// Continue scheduling via topological sort
for (int i = 0; i < worklist.length(); i++) {
int pid = worklist.at(i);
// Add memops to memops_schedule
Node* n = get_node(pid);
Node_List* p = _slp->my_pack(n);
if (n->is_Mem()) {
if (p == nullptr) {
memops_schedule.push(n);
} else {
for (uint k = 0; k < p->size(); k++) {
memops_schedule.push(p->at(k));
assert(p->at(k)->is_Mem(), "only schedule memops");
}
}
}
// Decrement incnt for all successors
for (int j = 0; j < out(pid).length(); j++){
int pid_use = out(pid).at(j);
int incnt_use = incnt(pid_use) - 1;
@@ -2635,9 +2666,12 @@ public:
}
}
}
// Was every pid scheduled?
return worklist.length() == _max_pid;
// Was every pid scheduled? If not, we found some cycles in the PacksetGraph.
_schedule_success = (worklist.length() == _max_pid);
return memops_schedule;
}
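
Editorial note: the scheduling loop above is a standard worklist topological sort (Kahn's algorithm). The following is a self-contained sketch with an invented four-node graph, not HotSpot code.

```cpp
#include <cstdio>
#include <vector>

// Toy sketch of the scheduling step: Kahn's topological sort over a small
// "packset graph". Nodes with zero in-count are scheduled first; scheduling a
// node decrements the in-count of its successors. If not every node gets
// scheduled, the graph has a cycle and vectorization has to bail out.
int main() {
  const int num_pids = 4;
  // Invented diamond-shaped example: 1 -> 2, 1 -> 3, 2 -> 4, 3 -> 4.
  // Index 0 is unused so that pids can be used directly as indices.
  std::vector<std::vector<int>> out = {{}, {2, 3}, {4}, {4}, {}};
  std::vector<int> incnt = {0, 0, 1, 1, 2};

  std::vector<int> worklist;
  for (int pid = 1; pid <= num_pids; pid++) {
    if (incnt[pid] == 0) worklist.push_back(pid);   // no precedence: schedule now
  }
  for (size_t i = 0; i < worklist.size(); i++) {
    int pid = worklist[i];
    for (int use : out[pid]) {
      if (--incnt[use] == 0) worklist.push_back(use);
    }
  }

  bool schedule_success = (worklist.size() == (size_t)num_pids);
  std::printf("schedule_success=%d order:", schedule_success ? 1 : 0);
  for (int pid : worklist) std::printf(" %d", pid);
  std::printf("\n");
  return 0;
}
```

If the worklist ends up shorter than the number of nodes, some nodes never reached an in-count of zero, i.e. the graph has a cycle; that is exactly what schedule_success reports back to SuperWord::schedule.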
// Print the PacksetGraph.
// print_nodes = true: print all C2 nodes beloning to PacksetGrahp node.
// print_zero_incnt = false: do not print nodes that have no in-edges (any more).
@@ -2668,303 +2702,135 @@ public:
}
};
//------------------------------remove_cycles---------------------------
// We now know that we only have independent packs, see verify_packs.
// This is a necessary but not a sufficient condition for an acyclic
// graph (DAG) after scheduling. Thus, we must check if the packs have
// introduced a cycle. The SuperWord paper mentions the need for this
// in "3.7 Scheduling".
// Approach: given all nodes from the _block, we create a new graph.
// The nodes that are not in a pack are their own nodes (scalar-node)
// in that new graph. Every pack is also a node (pack-node). We then
// add the edges according to DepPreds: a scalar-node has all edges
// to its node's DepPreds. A pack-node has all edges from every pack
// member to all their DepPreds.
void SuperWord::remove_cycles() {
// The C2 graph (specifically the memory graph), needs to be re-ordered.
// (1) Build the PacksetGraph. It combines the DepPreds graph with the
//     packset. The PacksetGraph gives us the dependencies that must be
//     respected after scheduling.
// (2) Schedule the PacksetGraph to the memops_schedule, which represents
//     a linear order of all memops in the body. The order respects the
//     dependencies of the PacksetGraph.
// (3) If the PacksetGraph has cycles, we cannot schedule. Abort.
// (4) Use the memops_schedule to re-order the memops in all slices.
void SuperWord::schedule() {
if (_packset.length() == 0) {
return; // empty packset
}
ResourceMark rm;
// (1) Build the PacksetGraph.
PacksetGraph graph(this);
graph.build();
if (!graph.schedule()) {
// (2) Schedule the PacksetGraph.
Node_List memops_schedule = graph.schedule();
// (3) Check if the PacksetGraph schedule succeeded (had no cycles).
// We now know that we only have independent packs, see verify_packs.
// This is a necessary but not a sufficient condition for an acyclic
// graph (DAG) after scheduling. Thus, we must check if the packs have
// introduced a cycle. The SuperWord paper mentions the need for this
// in "3.7 Scheduling".
if (!graph.schedule_success()) {
if (TraceSuperWord) {
tty->print_cr("remove_cycles found cycle in PacksetGraph:");
tty->print_cr("SuperWord::schedule found cycle in PacksetGraph:");
graph.print(true, false);
tty->print_cr("removing all packs from packset.");
}
_packset.clear();
return;
}
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("SuperWord::schedule: memops_schedule:");
memops_schedule.dump();
}
#endif
// (4) Use the memops_schedule to re-order the memops in all slices.
schedule_reorder_memops(memops_schedule);
}
//------------------------------schedule---------------------------
// Adjust the memory graph for the packed operations
void SuperWord::schedule() {
// Co-locate in the memory graph the members of each memory pack
for (int i = 0; i < _packset.length(); i++) {
co_locate_pack(_packset.at(i));
// Reorder the memory graph for all slices in parallel. We walk over the schedule once,
// and track the current memory state of each slice.
void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
int max_slices = _phase->C->num_alias_types();
// When iterating over the memops_schedule, we keep track of the current memory state,
// which is the Phi or a store in the loop.
GrowableArray<Node*> current_state_in_slice(max_slices, max_slices, nullptr);
// The memory state after the loop is the last store inside the loop. If we reorder the
// loop we may have a different last store, and we need to adjust the uses accordingly.
GrowableArray<Node*> old_last_store_in_slice(max_slices, max_slices, nullptr);
// (1) Set up the initial memory state from Phi. And find the old last store.
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* phi = _mem_slice_head.at(i);
assert(phi->is_Phi(), "must be phi");
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
current_state_in_slice.at_put(alias_idx, phi);
// If we have a memory phi, we have a last store in the loop, find it over backedge.
StoreNode* last_store = phi->in(2)->as_Store();
old_last_store_in_slice.at_put(alias_idx, last_store);
}
}
//-------------------------------remove_and_insert-------------------
// Remove "current" from its current position in the memory graph and insert
// it after the appropriate insertion point (lip or uip).
void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
Node *uip, Unique_Node_List &sched_before) {
Node* my_mem = current->in(MemNode::Memory);
bool sched_up = sched_before.member(current);
// (2) Walk over memops_schedule, append memops to the current state
//     of that slice. If it is a Store, we take it as the new state.
for (uint i = 0; i < memops_schedule.size(); i++) {
MemNode* n = memops_schedule.at(i)->as_Mem();
assert(n->is_Load() || n->is_Store(), "only loads or stores");
int alias_idx = _phase->C->get_alias_index(n->adr_type());
Node* current_state = current_state_in_slice.at(alias_idx);
if (current_state == nullptr) {
// If there are only loads in a slice, we never update the memory
// state in the loop, hence there is no phi for the memory state.
// We just keep the old memory state that was outside the loop.
assert(n->is_Load() && !in_bb(n->in(MemNode::Memory)),
"only loads can have memory state from outside loop");
} else {
_igvn.replace_input_of(n, MemNode::Memory, current_state);
if (n->is_Store()) {
current_state_in_slice.at_put(alias_idx, n);
}
}
}
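
Editorial note: step (2) above can be pictured with a small standalone sketch. Memory slices are tracked independently, every memop in the linear schedule is appended to its slice's current state, and a store advances that state. The names and the schedule below are invented; the real code rewires MemNode::Memory edges through _igvn.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Toy sketch of step (2): walk a linear memop schedule once and "rewire" each
// memop to the current memory state of its slice; stores advance the state.
struct MemOp { std::string name; int slice; bool is_store; };

int main() {
  const int num_slices = 2;
  // Per slice, the state starts at the slice's memory Phi.
  std::vector<std::string> current_state = {"Phi#0", "Phi#1"};
  std::vector<MemOp> schedule = {
    {"LoadA",  0, false}, {"StoreB", 0, true},
    {"LoadC",  1, false}, {"StoreD", 0, true}, {"StoreE", 1, true},
  };

  for (const MemOp& m : schedule) {
    // The memop's memory input becomes the slice's current state.
    std::printf("%s gets memory input %s\n",
                m.name.c_str(), current_state[m.slice].c_str());
    if (m.is_store) {
      current_state[m.slice] = m.name;   // a store becomes the new state
    }
  }
  for (int s = 0; s < num_slices; s++) {
    // The final state per slice is what the backedge of the memory Phi and
    // any uses after the loop must now point at, see step (3).
    std::printf("slice %d final state: %s\n", s, current_state[s].c_str());
  }
  return 0;
}
```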
// remove current_store from its current position in the memory graph
for (DUIterator i = current->outs(); current->has_out(i); i++) {
Node* use = current->out(i);
if (use->is_Mem()) {
assert(use->in(MemNode::Memory) == current, "must be");
if (use == prev) { // connect prev to my_mem
_igvn.replace_input_of(use, MemNode::Memory, my_mem);
--i; //deleted this edge; rescan position
} else if (sched_before.member(use)) {
if (!sched_up) { // Will be moved together with current
_igvn.replace_input_of(use, MemNode::Memory, uip);
--i; //deleted this edge; rescan position
}
} else {
if (sched_up) { // Will be moved together with current
_igvn.replace_input_of(use, MemNode::Memory, lip);
--i; //deleted this edge; rescan position
// (3) For each slice, we add the current state to the backedge
//     in the Phi. Further, we replace uses of the old last store
//     with uses of the new last store (current_state).
Node_List uses_after_loop;
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* phi = _mem_slice_head.at(i);
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
Node* current_state = current_state_in_slice.at(alias_idx);
assert(current_state != nullptr, "slice is mapped");
assert(current_state != phi, "did some work in between");
assert(current_state->is_Store(), "sanity");
_igvn.replace_input_of(phi, 2, current_state);
// Replace uses of old last store with current_state (new last store)
// Do it in two loops: first find all the uses, and change the graph
// in as second loop so that we do not break the iterator.
Node* last_store = old_last_store_in_slice.at(alias_idx);
assert(last_store != nullptr, "we have a old last store");
uses_after_loop.clear();
for (DUIterator_Fast kmax, k = last_store->fast_outs(kmax); k < kmax; k++) {
Node* use = last_store->fast_out(k);
if (!in_bb(use)) {
uses_after_loop.push(use);
}
}
for (uint k = 0; k < uses_after_loop.size(); k++) {
Node* use = uses_after_loop.at(k);
for (uint j = 0; j < use->req(); j++) {
Node* def = use->in(j);
if (def == last_store) {
_igvn.replace_input_of(use, j, current_state);
}
}
}
}
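
Editorial note: the "two loops" comment in step (3) is the usual collect-then-mutate pattern. The out-of-loop uses of the old last store are first gathered into a side list while the def-use edges are being walked, and only afterwards rewritten to the new last store. A toy sketch of the pattern with invented names, not HotSpot code:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Toy sketch of collect-then-mutate: while walking the uses of the old last
// store we only *record* the ones outside the loop, and rewrite them in a
// second pass, so the use list being iterated is never modified mid-iteration.
// The real code walks DUIterator_Fast edges and calls _igvn.replace_input_of.
struct Use { std::string name; bool inside_loop; std::string memory_input; };

int main() {
  std::vector<Use> uses_of_old_last_store = {
    {"StoreInLoop",   true,  "OldLastStore"},
    {"MemoryPhi",     false, "OldLastStore"},
    {"LoadAfterLoop", false, "OldLastStore"},
  };

  // Pass 1: collect uses that live after the loop.
  std::vector<Use*> uses_after_loop;
  for (Use& u : uses_of_old_last_store) {
    if (!u.inside_loop) uses_after_loop.push_back(&u);
  }

  // Pass 2: rewire them to the new last store of the reordered slice.
  for (Use* u : uses_after_loop) {
    u->memory_input = "NewLastStore";
    std::printf("%s -> %s\n", u->name.c_str(), u->memory_input.c_str());
  }
  return 0;
}
```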
Node *insert_pt = sched_up ? uip : lip;
// all uses of insert_pt's memory state should use current's instead
for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
Node* use = insert_pt->out(i);
if (use->is_Mem()) {
assert(use->in(MemNode::Memory) == insert_pt, "must be");
_igvn.replace_input_of(use, MemNode::Memory, current);
--i; //deleted this edge; rescan position
} else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
uint pos; //lip (lower insert point) must be the last one in the memory slice
for (pos=1; pos < use->req(); pos++) {
if (use->in(pos) == insert_pt) break;
}
_igvn.replace_input_of(use, pos, current);
--i;
}
}
//connect current to insert_pt
_igvn.replace_input_of(current, MemNode::Memory, insert_pt);
}
//------------------------------co_locate_pack----------------------------------
// To schedule a store pack, we need to move any sandwiched memory ops either before
// or after the pack, based upon dependence information:
// (1) If any store in the pack depends on the sandwiched memory op, the
//     sandwiched memory op must be scheduled BEFORE the pack;
// (2) If a sandwiched memory op depends on any store in the pack, the
//     sandwiched memory op must be scheduled AFTER the pack;
// (3) If a sandwiched memory op (say, memA) depends on another sandwiched
//     memory op (say memB), memB must be scheduled before memA. So, if memA is
//     scheduled before the pack, memB must also be scheduled before the pack;
// (4) If there is no dependence restriction for a sandwiched memory op, we simply
//     schedule this store AFTER the pack
// (5) We know there is no dependence cycle, so there in no other case;
// (6) Finally, all memory ops in another single pack should be moved in the same direction.
//
// To schedule a load pack, we use the memory state of either the first or the last load in
// the pack, based on the dependence constraint.
void SuperWord::co_locate_pack(Node_List* pk) {
if (pk->at(0)->is_Store()) {
MemNode* first = executed_first(pk)->as_Mem();
MemNode* last = executed_last(pk)->as_Mem();
Unique_Node_List schedule_before_pack;
Unique_Node_List memops;
MemNode* current = last->in(MemNode::Memory)->as_Mem();
MemNode* previous = last;
while (true) {
assert(in_bb(current), "stay in block");
memops.push(previous);
for (DUIterator i = current->outs(); current->has_out(i); i++) {
Node* use = current->out(i);
if (use->is_Mem() && use != previous)
memops.push(use);
}
if (current == first) break;
previous = current;
current = current->in(MemNode::Memory)->as_Mem();
}
// determine which memory operations should be scheduled before the pack
for (uint i = 1; i < memops.size(); i++) {
Node *s1 = memops.at(i);
if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
for (uint j = 0; j< i; j++) {
Node *s2 = memops.at(j);
if (!independent(s1, s2)) {
if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
schedule_before_pack.push(s1); // s1 must be scheduled before
Node_List* mem_pk = my_pack(s1);
if (mem_pk != nullptr) {
for (uint ii = 0; ii < mem_pk->size(); ii++) {
Node* s = mem_pk->at(ii);  // follow partner
if (memops.member(s) && !schedule_before_pack.member(s))
schedule_before_pack.push(s);
}
}
break;
}
}
}
}
}
Node* upper_insert_pt = first->in(MemNode::Memory);
// Following code moves loads connected to upper_insert_pt below aliased stores.
// Collect such loads here and reconnect them back to upper_insert_pt later.
memops.clear();
for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
Node* use = upper_insert_pt->out(i);
if (use->is_Mem() && !use->is_Store()) {
memops.push(use);
}
}
MemNode* lower_insert_pt = last;
previous = last; //previous store in pk
current = last->in(MemNode::Memory)->as_Mem();
// start scheduling from "last" to "first"
while (true) {
assert(in_bb(current), "stay in block");
assert(in_pack(previous, pk), "previous stays in pack");
Node* my_mem = current->in(MemNode::Memory);
if (in_pack(current, pk)) {
// Forward users of my memory state (except "previous) to my input memory state
for (DUIterator i = current->outs(); current->has_out(i); i++) {
Node* use = current->out(i);
if (use->is_Mem() && use != previous) {
assert(use->in(MemNode::Memory) == current, "must be");
if (schedule_before_pack.member(use)) {
_igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
} else {
_igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
}
--i; // deleted this edge; rescan position
}
}
previous = current;
} else { // !in_pack(current, pk) ==> a sandwiched store
remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
}
if (current == first) break;
current = my_mem->as_Mem();
} // end while
// Reconnect loads back to upper_insert_pt.
for (uint i = 0; i < memops.size(); i++) {
Node *ld = memops.at(i);
if (ld->in(MemNode::Memory) != upper_insert_pt) {
_igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
}
}
} else if (pk->at(0)->is_Load()) { // Load pack
// All loads in the pack should have the same memory state. By default,
// we use the memory state of the last load. However, if any load could
// not be moved down due to the dependence constraint, we use the memory
// state of the first load.
Node* mem_input = pick_mem_state(pk);
_igvn.hash_delete(mem_input);
// Give each load the same memory state
for (uint i = 0; i < pk->size(); i++) {
LoadNode* ld = pk->at(i)->as_Load();
_igvn.replace_input_of(ld, MemNode::Memory, mem_input);
}
}
}
// Finds the first and last memory state and then picks either of them by checking dependence constraints.
// If a store is dependent on an earlier load then we need to pick the memory state of the first load and cannot
// pick the memory state of the last load.
Node* SuperWord::pick_mem_state(Node_List* pk) {
Node* first_mem = find_first_mem_state(pk);
bool is_dependent = false;
Node* last_mem = find_last_mem_state(pk, first_mem, is_dependent);
for (uint i = 0; i < pk->size(); i++) {
Node* ld = pk->at(i);
for (Node* current = last_mem; current != ld->in(MemNode::Memory); current = current->in(MemNode::Memory)) {
assert(current->is_Mem() && in_bb(current), "unexpected memory");
assert(current != first_mem, "corrupted memory graph");
if (!independent(current, ld)) {
// A later unvectorized store depends on this load, pick the memory state of the first load. This can happen,
// for example, if a load pack has interleaving stores that are part of a store pack which, however, is removed
// at the pack filtering stage. This leaves us with only a load pack for which we cannot take the memory state
// of the last load as the remaining unvectorized stores could interfere since they have a dependency to the loads.
// Some stores could be executed before the load vector resulting in a wrong result. We need to take the
// memory state of the first load to prevent this.
if (my_pack(current) != nullptr && is_dependent) {
// For vectorized store pack, when the load pack depends on
// some memory operations locating after first_mem, we still
// take the memory state of the last load.
continue;
}
return first_mem;
}
}
}
return last_mem;
}
// Walk the memory graph from the current first load until the
// start of the loop and check if nodes on the way are memory
// edges of loads in the pack. The last one we encounter is the
// first load.
Node* SuperWord::find_first_mem_state(Node_List* pk) {
Node* first_mem = pk->at(0)->in(MemNode::Memory);
for (Node* current = first_mem; in_bb(current); current = current->is_Phi() ? current->in(LoopNode::EntryControl) : current->in(MemNode::Memory)) {
assert(current->is_Mem() || (current->is_Phi() && current->in(0) == bb()), "unexpected memory");
for (uint i = 1; i < pk->size(); i++) {
Node* ld = pk->at(i);
if (ld->in(MemNode::Memory) == current) {
first_mem = current;
break;
}
}
}
return first_mem;
}
// Find the last load by going over the pack again and walking
// the memory graph from the loads of the pack to the memory of
// the first load. If we encounter the memory of the current last
// load, then we started from further down in the memory graph and
// the load we started from is the last load. At the same time, the
// function also helps determine if some loads in the pack depend on
// early memory operations which locate after first_mem.
Node* SuperWord::find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent) {
Node* last_mem = pk->at(0)->in(MemNode::Memory);
for (uint i = 0; i < pk->size(); i++) {
Node* ld = pk->at(i);
for (Node* current = ld->in(MemNode::Memory); current != first_mem; current = current->in(MemNode::Memory)) {
assert(current->is_Mem() && in_bb(current), "unexpected memory");
// Determine if the load pack is dependent on some memory operations locating after first_mem.
is_dependent |= !independent(current, ld);
if (current->in(MemNode::Memory) == last_mem) {
last_mem = ld->in(MemNode::Memory);
}
}
}
return last_mem;
}
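
Editorial note: the three helpers above belong to the co_locate_pack path that this patch removes. The idea they implement is simply: find where on the slice's memory chain the loads of a pack hang, and decide which single state is safe to give all of them. A toy sketch of that choice, with an invented index-based memory chain instead of MemNode::Memory edges, not HotSpot code:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Toy sketch of what pick_mem_state chooses between: the memory chain of one
// slice is a linear order of states, and every load in a pack hangs off one
// of them. The "first" state is the earliest such state, the "last" is the
// latest; the real code walks MemNode::Memory edges instead of indices and
// falls back to the first state when an intervening memop that is not
// independent of the loads would make the last state unsafe.
int main() {
  // Invented chain: index 0 is the state at loop entry, larger = later.
  std::vector<const char*> chain = {"Phi", "Store1", "Store2", "Store3"};
  // Memory-input index of each load in the (invented) load pack.
  std::vector<int> load_mem_idx = {1, 1, 2, 2};

  int first_mem = *std::min_element(load_mem_idx.begin(), load_mem_idx.end());
  int last_mem  = *std::max_element(load_mem_idx.begin(), load_mem_idx.end());

  std::printf("first mem state: %s\n", chain[first_mem]);
  std::printf("last  mem state: %s\n", chain[last_mem]);

  // Dependence check (sketch): if any memop strictly between first_mem and
  // last_mem is not independent of the loads, the pack must use first_mem.
  bool dependent_memop_in_between = false;   // assume independence here
  int picked = dependent_memop_in_between ? first_mem : last_mem;
  std::printf("picked: %s\n", chain[picked]);
  return 0;
}
```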
#ifndef PRODUCT
@@ -3037,12 +2903,14 @@ bool SuperWord::output() {
for (int i = 0; i < _block.length(); i++) {
Node* n = _block.at(i);
Node_List* p = my_pack(n);
if (p && n == executed_last(p)) {
if (p != nullptr && n == p->at(p->size()-1)) {
// After schedule_reorder_memops, we know that the memops have the same order in the pack
// as in the memory slice. Hence, "first" is the first memop in the slice from the pack,
// and "n" is the last node in the slice from the pack.
Node* first = p->at(0);
uint vlen = p->size();
uint vlen_in_bytes = 0;
Node* vn = nullptr;
Node* low_adr = p->at(0);
Node* first = executed_first(p);
if (cl->is_rce_post_loop()) {
// override vlen with the main loops vector length
vlen = cl->slp_max_unroll();
@@ -3066,7 +2934,7 @@ bool SuperWord::output() {
break; // dependent memory
}
}
Node* adr = low_adr->in(MemNode::Address);
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
if (cl->is_rce_post_loop()) {
assert(vmask != nullptr, "vector mask should be generated");
@@ -3089,7 +2957,7 @@ bool SuperWord::output() {
Node* ctl = n->in(MemNode::Control);
Node* mem = first->in(MemNode::Memory);
Node* adr = low_adr->in(MemNode::Address);
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
if (cl->is_rce_post_loop()) {
assert(vmask != nullptr, "vector mask should be generated");
@@ -3100,8 +2968,8 @@ bool SuperWord::output() {
}
vlen_in_bytes = vn->as_StoreVector()->memory_size();
} else if (VectorNode::is_scalar_rotate(n)) {
Node* in1 = low_adr->in(1);
Node* in2 = p->at(0)->in(2);
Node* in1 = first->in(1);
Node* in2 = first->in(2);
// If rotation count is non-constant or greater than 8bit value create a vector.
if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) {
in2 = vector_opd(p, 2);
@@ -3110,7 +2978,7 @@ bool SuperWord::output() {
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_roundopD(n)) {
Node* in1 = vector_opd(p, 1);
Node* in2 = low_adr->in(2);
Node* in2 = first->in(2);
assert(in2->is_Con(), "Constant rounding mode expected.");
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
@@ -3133,7 +3001,7 @@ bool SuperWord::output() {
bool node_isa_reduction = is_marked_reduction(n);
if (node_isa_reduction) {
// the input to the first reduction operation is retained
in1 = low_adr->in(1);
in1 = first->in(1);
} else {
in1 = vector_opd(p, 1);
if (in1 == nullptr) {
@@ -3201,7 +3069,7 @@ bool SuperWord::output() {
Node* in = vector_opd(p, 1);
Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
_igvn.register_new_node_with_optimizer(longval);
_phase->set_ctrl(longval, _phase->get_ctrl(p->at(0)));
_phase->set_ctrl(longval, _phase->get_ctrl(first));
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_convert_opcode(opc)) {
@@ -3316,7 +3184,7 @@ bool SuperWord::output() {
_block.at_put(i, vn);
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
_phase->set_ctrl(vn, _phase->get_ctrl(first));
for (uint j = 0; j < p->size(); j++) {
Node* pm = p->at(j);
_igvn.replace_node(pm, vn);
@@ -4196,17 +4064,6 @@ bool SuperWord::in_packset(Node* s1, Node* s2) {
return false;
}
//------------------------------in_pack---------------------------
// Is s in pack p?
Node_List* SuperWord::in_pack(Node* s, Node_List* p) {
for (uint i = 0; i < p->size(); i++) {
if (p->at(i) == s) {
return p;
}
}
return nullptr;
}
//------------------------------remove_pack_at---------------------------
// Remove the pack at position pos in the packset
void SuperWord::remove_pack_at(int pos) {
@@ -4239,39 +4096,6 @@ void SuperWord::packset_sort(int n) {
}
}
//------------------------------executed_first---------------------------
// Return the node executed first in pack p. Uses the RPO block list
// to determine order.
Node* SuperWord::executed_first(Node_List* p) {
Node* n = p->at(0);
int n_rpo = bb_idx(n);
for (uint i = 1; i < p->size(); i++) {
Node* s = p->at(i);
int s_rpo = bb_idx(s);
if (s_rpo < n_rpo) {
n = s;
n_rpo = s_rpo;
}
}
return n;
}
//------------------------------executed_last---------------------------
// Return the node executed last in pack p.
Node* SuperWord::executed_last(Node_List* p) {
Node* n = p->at(0);
int n_rpo = bb_idx(n);
for (uint i = 1; i < p->size(); i++) {
Node* s = p->at(i);
int s_rpo = bb_idx(s);
if (s_rpo > n_rpo) {
n = s;
n_rpo = s_rpo;
}
}
return n;
}
LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
for (uint i = 0; i < p->size(); i++) {
@@ -4547,16 +4371,6 @@ void SuperWord::print_stmt(Node* s) {
#endif
}
//------------------------------blank---------------------------
char* SuperWord::blank(uint depth) {
static char blanks[101];
assert(depth < 101, "too deep");
for (uint i = 0; i < depth; i++) blanks[i] = ' ';
blanks[depth] = '\0';
return blanks;
}
//==============================SWPointer===========================
#ifndef PRODUCT
int SWPointer::Tracer::_depth = 0;
@@ -458,7 +458,9 @@ class SuperWord : public ResourceObj {
bool same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const;
// my_pack
public:
Node_List* my_pack(Node* n) { return !in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; }
private:
void set_my_pack(Node* n, Node_List* p) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; }
// is pack good for converting into one vector node replacing bunches of Cmp, Bool, CMov nodes.
bool is_cmov_pack(Node_List* p);
@@ -618,19 +620,10 @@ private:
void merge_packs_to_cmove();
// Verify that for every pack, all nodes are mutually independent
DEBUG_ONLY(void verify_packs();)
// Remove cycles in packset.
void remove_cycles();
// Adjust the memory graph for the packed operations
void schedule();
// Remove "current" from its current position in the memory graph and insert
// it after the appropriate insert points (lip or uip);
void remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip, Node *uip, Unique_Node_List &schd_before);
// Within a store pack, schedule stores together by moving out the sandwiched memory ops according
// to dependence info; and within a load pack, move loads down to the last executed load.
void co_locate_pack(Node_List* p);
Node* pick_mem_state(Node_List* pk);
Node* find_first_mem_state(Node_List* pk);
Node* find_last_mem_state(Node_List* pk, Node* first_mem, bool &is_dependent);
// Helper function for schedule, that reorders all memops, slice by slice, according to the schedule
void schedule_reorder_memops(Node_List &memops_schedule);
// Convert packs into vector node operations
bool output();
@@ -662,19 +655,11 @@ private:
void compute_vector_element_type();
// Are s1 and s2 in a pack pair and ordered as s1,s2?
bool in_packset(Node* s1, Node* s2);
// Is s in pack p?
Node_List* in_pack(Node* s, Node_List* p);
// Remove the pack at position pos in the packset
void remove_pack_at(int pos);
// Return the node executed first in pack p.
Node* executed_first(Node_List* p);
// Return the node executed last in pack p.
Node* executed_last(Node_List* p);
static LoadNode::ControlDependency control_dependency(Node_List* p);
// Alignment within a vector memory reference
int memory_alignment(MemNode* s, int iv_adjust);
// (Start, end] half-open range defining which operands are vector
void vector_opd_range(Node* n, uint* start, uint* end);
// Smallest type containing range of values
const Type* container_type(Node* n);
// Adjust pre-loop limit so that in main loop, a load/store reference
@@ -693,7 +678,6 @@ private:
void print_pack(Node_List* p);
void print_bb();
void print_stmt(Node* s);
char* blank(uint depth);
void packset_sort(int n);
};
@@ -27,9 +27,8 @@
* @bug 8304042
* @summary Test some examples with independent packs with cyclic dependency
* between the packs.
* @requires vm.compiler2.enabled
* @requires vm.bits == 64
* @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*"
* @requires vm.compiler2.enabled
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency
@@ -94,8 +93,8 @@ public class TestIndependentPacksWithCyclicDependency {
test3(goldI3, goldI3, goldF3, goldF3);
init(goldI4, goldF4);
test4(goldI4, goldI4, goldF4, goldF4);
// init(goldI5, goldF5);
// test5(goldI5, goldI5, goldF5, goldF5);
init(goldI5, goldF5);
test5(goldI5, goldI5, goldF5, goldF5);
init(goldI6, goldF6, goldL6);
test6(goldI6, goldI6, goldF6, goldF6, goldL6, goldL6);
init(goldI7, goldF7, goldL7);
@@ -232,29 +231,27 @@ public class TestIndependentPacksWithCyclicDependency {
}
}
// TODO uncomment after fixing JDK-8304720
//
// @Run(test = "test5")
// public void runTest5() {
//     int[] dataI = new int[RANGE];
//     float[] dataF = new float[RANGE];
//     init(dataI, dataF);
//     test5(dataI, dataI, dataF, dataF);
//     verify("test5", dataI, goldI5);
//     verify("test5", dataF, goldF5);
// }
//
// @Test
// static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
//     for (int i = 0; i < RANGE; i+=2) {
//         // same as test2, except that reordering leads to different semantics
//         // explanation analogue to test4
//         unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A
//         dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
//         dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
//         unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B
//     }
// }
@Run(test = "test5")
public void runTest5() {
int[] dataI = new int[RANGE];
float[] dataF = new float[RANGE];
init(dataI, dataF);
test5(dataI, dataI, dataF, dataF);
verify("test5", dataI, goldI5);
verify("test5", dataF, goldF5);
}
@Test
static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// same as test2, except that reordering leads to different semantics
// explanation analogue to test4
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B
}
}
@Run(test = "test6")
public void runTest6() {
@@ -29,8 +29,6 @@
* between the packs.
* Before fix, this hit: "assert(!is_visited) failed: visit only once"
* @requires vm.compiler2.enabled
* @requires vm.bits == 64
* @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*"
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run main/othervm -XX:LoopUnrollLimit=250
@@ -0,0 +1,159 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/*
* @test
* @bug 8304720
* @summary Test some examples where non-vectorized memops also need to
* be reordered during SuperWord::schedule.
* @requires vm.compiler2.enabled
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestScheduleReordersScalarMemops
*/
package compiler.loopopts.superword;
import jdk.internal.misc.Unsafe;
import jdk.test.lib.Asserts;
import compiler.lib.ir_framework.*;
public class TestScheduleReordersScalarMemops {
static final int RANGE = 1024;
static final int ITER = 10_000;
static Unsafe unsafe = Unsafe.getUnsafe();
int[] goldI0 = new int[RANGE];
float[] goldF0 = new float[RANGE];
int[] goldI1 = new int[RANGE];
float[] goldF1 = new float[RANGE];
public static void main(String args[]) {
TestFramework.runWithFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::test*",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::verify",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestScheduleReordersScalarMemops::init",
"-XX:LoopUnrollLimit=1000",
"-XX:-TieredCompilation", "-Xbatch");
}
TestScheduleReordersScalarMemops() {
// compute the gold standard in interpreter mode
init(goldI0, goldF0);
test0(goldI0, goldI0, goldF0, goldF0);
init(goldI1, goldF1);
test1(goldI1, goldI1, goldF1, goldF1);
}
@Run(test = "test0")
@Warmup(100)
public void runTest0() {
int[] dataI = new int[RANGE];
float[] dataF = new float[RANGE];
init(dataI, dataF);
test0(dataI, dataI, dataF, dataF);
verify("test0", dataI, goldI0);
verify("test0", dataF, goldF0);
}
@Test
@IR(counts = {IRNode.MUL_VI, "> 0"},
applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
static void test0(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// We have dependency edges:
// A -> X
// Y -> B
// Still, we can vectorize [X,Y].
// We do not vectorize A and B, because they are not isomorphic (add vs mul).
//
// Imagine this is unrolled at least 2x.
// We get order: A0 X0 Y0 B0 A1 X1 Y1 B1
// Vectorized:   X0 Y0 X1 Y1
// Scalar:       A0 B0 A1 B1
//
// However, since the As need to be before, and the Bs after the vector operations,
// we need to have all As before all Bs. This means we need to reorder the scalar
// operations, and not just the vectorized ones.
//
// A correct reordering would be: A0 A1 [X0, Y0, X1, Y1] B0 B1
//
dataFa[i + 0] = dataIa[i + 0] * 1.3f; // A *1.3
dataIb[i + 0] = (int)dataFb[i + 0] * 11; // X *11
dataIb[i + 1] = (int)dataFb[i + 1] * 11; // Y *11
dataFa[i + 1] = dataIa[i + 1] + 1.2f; // B +1.2
}
}
@Run(test = "test1")
@Warmup(100)
public void runTest1() {
int[] dataI = new int[RANGE];
float[] dataF = new float[RANGE];
init(dataI, dataF);
test1(dataI, dataI, dataF, dataF);
verify("test1", dataI, goldI1);
verify("test1", dataF, goldF1);
}
@Test
@IR(counts = {IRNode.MUL_VI, "> 0"},
applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
static void test1(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// Do the same as test0, but without int-float conversion.
// This should reproduce on machines where conversion is not implemented.
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A +1
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] * 11); // B *11
}
}
static void init(int[] dataI, float[] dataF) {
for (int i = 0; i < RANGE; i++) {
dataI[i] = i + 1;
dataF[i] = i + 0.1f;
}
}
static void verify(String name, int[] data, int[] gold) {
for (int i = 0; i < RANGE; i++) {
if (data[i] != gold[i]) {
throw new RuntimeException(" Invalid " + name + " result: data[" + i + "]: " + data[i] + " != " + gold[i]);
}
}
}
static void verify(String name, float[] data, float[] gold) {
for (int i = 0; i < RANGE; i++) {
int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
if (datav != goldv) {
throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv);
}
}
}
}