8325651: C2 SuperWord: refactor the dependency graph

Reviewed-by: chagedorn, vlivanov
This commit is contained in:
Emanuel Peter 2024-03-11 07:12:15 +00:00
parent d451f818cf
commit ca5ca85d24
5 changed files with 309 additions and 428 deletions

View File

@ -46,7 +46,6 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
_node_info(arena(), _vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
_clone_map(phase()->C->clone_map()), // map of nodes created in cloning
_align_to_ref(nullptr), // memory reference to align vectors to
_dg(arena()), // dependence graph
_race_possible(false), // cases where SDMU is true
_do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style
_num_work_vecs(0), // amount of vector work we have
@ -452,12 +451,6 @@ bool SuperWord::SLP_extract() {
// Ensure extra info is allocated.
initialize_node_info();
// build _dg
dependence_graph();
// compute function depth(Node*)
compute_max_depth();
// Attempt vectorization
find_adjacent_refs();
@ -749,86 +742,6 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
return iv_adjustment;
}
//---------------------------dependence_graph---------------------------
// Construct dependency graph.
// Add dependence edges to load/store nodes for memory dependence
// A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
// Build the memory part of the dependence graph (_dg) for the loop body.
// Every memory node and memory phi in the body gets a DepMem node. Then, per
// memory slice, edges are added between pairs of memory ops whose addresses
// cannot be proven different (Load-Load pairs are skipped).
void SuperWord::dependence_graph() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
assert(cl->is_main_loop(), "SLP should only work on main loops");
// First, assign a dependence node to each memory node
for (int i = 0; i < body().length(); i++ ) {
Node* n = body().at(i);
if (n->is_Mem() || n->is_memory_phi()) {
_dg.make_node(n);
}
}
const GrowableArray<PhiNode*>& mem_slice_head = _vloop_analyzer.memory_slices().heads();
const GrowableArray<MemNode*>& mem_slice_tail = _vloop_analyzer.memory_slices().tails();
ResourceMark rm;
// Scratch list, refilled for every slice and cleared at the end of each iteration.
GrowableArray<Node*> slice_nodes;
// For each memory slice, create the dependences
for (int i = 0; i < mem_slice_head.length(); i++) {
PhiNode* head = mem_slice_head.at(i);
MemNode* tail = mem_slice_tail.at(i);
// Get slice in predecessor order (last is first)
_vloop_analyzer.memory_slices().get_slice_in_reverse_order(head, tail, slice_nodes);
// Make the slice dependent on the root
DepMem* slice = _dg.dep(head);
_dg.make_edge(_dg.root(), slice);
// Create a sink for the slice
// (make_node(nullptr) creates a DepMem without an ideal node; it is not entered into the map)
DepMem* slice_sink = _dg.make_node(nullptr);
_dg.make_edge(slice_sink, _dg.tail());
// Now visit each pair of memory ops, creating the edges
// s1 walks from the highest index down to 0; s2 walks all lower indices.
for (int j = slice_nodes.length() - 1; j >= 0 ; j--) {
Node* s1 = slice_nodes.at(j);
// If no dependency yet, use slice
if (_dg.dep(s1)->in_cnt() == 0) {
_dg.make_edge(slice, s1);
}
VPointer p1(s1->as_Mem(), _vloop);
// Stays true only if s1 gets no outgoing edge to another memory op of the slice.
bool sink_dependent = true;
for (int k = j - 1; k >= 0; k--) {
Node* s2 = slice_nodes.at(k);
// Two loads never depend on each other.
if (s1->is_Load() && s2->is_Load())
continue;
VPointer p2(s2->as_Mem(), _vloop);
int cmp = p1.cmp(p2);
if (!VPointer::not_equal(cmp)) {
// Possibly same address
_dg.make_edge(s1, s2);
sink_dependent = false;
}
}
if (sink_dependent) {
_dg.make_edge(s1, slice_sink);
}
}
#ifndef PRODUCT
// Tracing only: dump the dependence node of every memory op in this slice.
if (is_trace_superword_dependence_graph()) {
tty->print_cr("\nDependence graph for slice: %d", head->_idx);
for (int q = 0; q < slice_nodes.length(); q++) {
_dg.print(slice_nodes.at(q));
}
tty->cr();
}
#endif
// get_slice_in_reverse_order asserts that the list starts empty.
slice_nodes.clear();
}
}
void VLoopMemorySlices::find_memory_slices() {
assert(_heads.is_empty(), "not yet computed");
assert(_tails.is_empty(), "not yet computed");
@ -861,7 +774,7 @@ void VLoopMemorySlices::print() const {
#endif
// Get all memory nodes of a slice, in reverse order
void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<Node*> &slice) const {
void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<MemNode*> &slice) const {
assert(slice.is_empty(), "start empty");
Node* n = tail;
Node* prev = nullptr;
@ -871,7 +784,7 @@ void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail,
Node* out = n->fast_out(i);
if (out->is_Load()) {
if (_vloop.in_bb(out)) {
slice.push(out);
slice.push(out->as_Load());
}
} else {
// FIXME
@ -889,7 +802,7 @@ void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail,
}//else
}//for
if (n == head) { break; }
slice.push(n);
slice.push(n->as_Mem());
prev = n;
assert(n->is_Mem(), "unexpected node %s", n->Name());
n = n->in(MemNode::Memory);
@ -1001,9 +914,8 @@ bool SuperWord::isomorphic(Node* s1, Node* s2) {
}
}
//------------------------------independent---------------------------
// Is there no data path from s1 to s2 or s2 to s1?
bool SuperWord::independent(Node* s1, Node* s2) {
bool VLoopDependencyGraph::independent(Node* s1, Node* s2) const {
int d1 = depth(s1);
int d2 = depth(s2);
@ -1024,9 +936,9 @@ bool SuperWord::independent(Node* s1, Node* s2) {
worklist.push(deep);
for (uint i = 0; i < worklist.size(); i++) {
Node* n = worklist.at(i);
for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
for (PredsIterator preds(*this, n); !preds.done(); preds.next()) {
Node* pred = preds.current();
if (in_bb(pred) && depth(pred) >= min_d) {
if (_vloop.in_bb(pred) && depth(pred) >= min_d) {
if (pred == shallow) {
return false; // found it -> dependent
}
@ -1045,7 +957,7 @@ bool SuperWord::independent(Node* s1, Node* s2) {
// is the smallest depth of all nodes from the nodes list. Once we have
// traversed all those nodes, and have not found another node from the
// nodes list, we know that all nodes in the nodes list are independent.
bool SuperWord::mutually_independent(const Node_List* nodes) const {
bool VLoopDependencyGraph::mutually_independent(const Node_List* nodes) const {
ResourceMark rm;
Unique_Node_List worklist;
VectorSet nodes_set;
@ -1054,14 +966,14 @@ bool SuperWord::mutually_independent(const Node_List* nodes) const {
Node* n = nodes->at(k);
min_d = MIN2(min_d, depth(n));
worklist.push(n); // start traversal at all nodes in nodes list
nodes_set.set(bb_idx(n));
nodes_set.set(_body.bb_idx(n));
}
for (uint i = 0; i < worklist.size(); i++) {
Node* n = worklist.at(i);
for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
for (PredsIterator preds(*this, n); !preds.done(); preds.next()) {
Node* pred = preds.current();
if (in_bb(pred) && depth(pred) >= min_d) {
if (nodes_set.test(bb_idx(pred))) {
if (_vloop.in_bb(pred) && depth(pred) >= min_d) {
if (nodes_set.test(_body.bb_idx(pred))) {
return false; // found one -> dependent
}
worklist.push(pred);
@ -1982,16 +1894,16 @@ void SuperWord::verify_packs() {
}
#endif
// The PacksetGraph combines the DepPreds graph with the packset. In the PackSet
// The PacksetGraph combines the dependency graph with the packset. In the PackSet
// graph, we have two kinds of nodes:
// (1) pack-node: Represents all nodes of some pack p in a single node, which
// shall later become a vector node.
// (2) scalar-node: Represents a node that is not in any pack.
// For any edge (n1, n2) in DepPreds, we add an edge to the PacksetGraph for the
// PacksetGraph nodes corresponding to n1 and n2.
// We work from the DepPreds graph, because it gives us all the data-dependencies,
// as well as more refined memory-dependencies than the C2 graph. DepPreds does
// not have cycles. But packing nodes can introduce cyclic dependencies. Example:
// For any edge (n1, n2) in the dependency graph, we add an edge to the PacksetGraph for
// the PacksetGraph nodes corresponding to n1 and n2.
// We work from the dependency graph, because it gives us all the data-dependencies,
// as well as more refined memory-dependencies than the C2 graph. The dependency graph
// does not have cycles. But packing nodes can introduce cyclic dependencies. Example:
//
// +--------+
// A -> X | v
@ -2055,11 +1967,10 @@ public:
GrowableArray<int>& out(int pid) { return _out.at(pid - 1); }
bool schedule_success() const { return _schedule_success; }
// Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds.
// Create nodes (from packs and scalar-nodes), and add edges, based on the dependency graph.
void build() {
const GrowableArray<Node_List*>& packset = _slp->packset();
const GrowableArray<Node*>& body = _slp->body();
const DepGraph& dg = _slp->dg();
// Map nodes in packsets
for (int i = 0; i < packset.length(); i++) {
Node_List* p = packset.at(i);
@ -2096,7 +2007,7 @@ public:
for (uint k = 0; k < p->size(); k++) {
Node* n = p->at(k);
assert(pid == get_pid(n), "all nodes in pack have same pid");
for (DepPreds preds(n, dg); !preds.done(); preds.next()) {
for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) {
Node* pred = preds.current();
int pred_pid = get_pid_or_zero(pred);
if (pred_pid == pid && _slp->is_marked_reduction(n)) {
@ -2118,7 +2029,7 @@ public:
if (pid <= max_pid_packset) {
continue; // Only scalar-nodes
}
for (DepPreds preds(n, dg); !preds.done(); preds.next()) {
for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) {
Node* pred = preds.current();
int pred_pid = get_pid_or_zero(pred);
// Only add edges for mapped nodes (in body)
@ -2209,7 +2120,7 @@ public:
};
// The C2 graph (specifically the memory graph), needs to be re-ordered.
// (1) Build the PacksetGraph. It combines the DepPreds graph with the
// (1) Build the PacksetGraph. It combines the dependency graph with the
// packset. The PacksetGraph gives us the dependencies that must be
// respected after scheduling.
// (2) Schedule the PacksetGraph to the memops_schedule, which represents
@ -3042,41 +2953,6 @@ void SuperWord::initialize_node_info() {
grow_node_info(bb_idx(last));
}
//------------------------------compute_max_depth---------------------------
// Compute max depth for expressions from beginning of block
// Use to prune search paths during test for independence.
void SuperWord::compute_max_depth() {
int ct = 0;
bool again;
do {
again = false;
for (int i = 0; i < body().length(); i++) {
Node* n = body().at(i);
if (!n->is_Phi()) {
int d_orig = depth(n);
int d_in = 0;
for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
Node* pred = preds.current();
if (in_bb(pred)) {
d_in = MAX2(d_in, depth(pred));
}
}
if (d_in + 1 != d_orig) {
set_depth(n, d_in + 1);
again = true;
}
}
}
ct++;
} while (again);
#ifndef PRODUCT
if (is_trace_superword_dependence_graph()) {
tty->print_cr("compute_max_depth iterated: %d times", ct);
}
#endif
}
BasicType SuperWord::longer_type_for_conversion(Node* n) {
if (!(VectorNode::is_convert_opcode(n->Opcode()) ||
requires_long_to_int_conversion(n->Opcode())) ||
@ -3734,141 +3610,6 @@ void SuperWord::print_stmt(Node* s) {
const SWNodeInfo SWNodeInfo::initial;
// ============================ DepGraph ===========================
//------------------------------make_node---------------------------
// Allocate a dependence graph node for 'node' in the graph's arena.
// A non-null ideal node is also registered in the _idx -> DepMem map; a null
// 'node' yields an unregistered node.
DepMem* DepGraph::make_node(Node* node) {
  DepMem* dep_node = new (_arena) DepMem(node);
  if (node == nullptr) {
    return dep_node;
  }
  assert(_map.at_grow(node->_idx) == nullptr, "one init only");
  _map.at_put_grow(node->_idx, dep_node);
  return dep_node;
}
//------------------------------make_edge---------------------------
// Create the edge dpred -> dsucc and push it onto the front of both threaded
// lists: the out-list of dpred and the in-list of dsucc.
DepEdge* DepGraph::make_edge(DepMem* dpred, DepMem* dsucc) {
  DepEdge* const prev_in_head  = dsucc->in_head();
  DepEdge* const prev_out_head = dpred->out_head();
  DepEdge* edge = new (_arena) DepEdge(dpred, dsucc, prev_in_head, prev_out_head);
  dpred->set_out_head(edge);
  dsucc->set_in_head(edge);
  return edge;
}
// ========================== DepMem ========================
//------------------------------in_cnt---------------------------
int DepMem::in_cnt() {
int ct = 0;
for (DepEdge* e = _in_head; e != nullptr; e = e->next_in()) ct++;
return ct;
}
//------------------------------out_cnt---------------------------
int DepMem::out_cnt() {
int ct = 0;
for (DepEdge* e = _out_head; e != nullptr; e = e->next_out()) ct++;
return ct;
}
//------------------------------print-----------------------------
// Print this dependence node as " DepNode <idx> ( <pred idx>... ) [ <succ idx>... ]".
// Nodes without an ideal node (root, tail, slice sinks) are printed with
// index 0, both for this node and for its neighbours, so that printing such a
// node does not dereference null.
void DepMem::print() {
#ifndef PRODUCT
  tty->print(" DepNode %d (", _node != nullptr ? _node->_idx : 0);
  for (DepEdge* p = _in_head; p != nullptr; p = p->next_in()) {
    Node* pred = p->pred()->node();
    tty->print(" %d", pred != nullptr ? pred->_idx : 0);
  }
  tty->print(") [");
  for (DepEdge* s = _out_head; s != nullptr; s = s->next_out()) {
    Node* succ = s->succ()->node();
    tty->print(" %d", succ != nullptr ? succ->_idx : 0);
  }
  tty->print_cr(" ]");
#endif
}
// =========================== DepEdge =========================
//------------------------------print---------------------------
void DepEdge::print() {
#ifndef PRODUCT
tty->print_cr("DepEdge: %d [ %d ]", _pred->node()->_idx, _succ->node()->_idx);
#endif
}
// =========================== DepPreds =========================
// Iterator over predecessor edges in the dependence graph.
//------------------------------DepPreds---------------------------
// Set up iteration over the predecessors of n:
//  - Load/Store: dependence graph in-edges, then inputs from MemNode::Address up to req().
//  - other Mem node: dependence graph in-edges only.
//  - any other node: inputs 1 .. req()-1 only.
// The constructor already advances to the first predecessor.
DepPreds::DepPreds(Node* n, const DepGraph& dg) : _n(n), _done(false) {
  const bool load_or_store = n->is_Store() || n->is_Load();
  if (!load_or_store && !n->is_Mem()) {
    _next_idx = 1;
    _end_idx  = n->req();
    _dep_next = nullptr;
  } else {
    _dep_next = dg.dep(n)->in_head();
    if (load_or_store) {
      _next_idx = MemNode::Address;
      _end_idx  = n->req();
    } else {
      _next_idx = 0;
      _end_idx  = 0;
    }
  }
  next();
}
//------------------------------next---------------------------
void DepPreds::next() {
if (_dep_next != nullptr) {
_current = _dep_next->pred()->node();
_dep_next = _dep_next->next_in();
} else if (_next_idx < _end_idx) {
_current = _n->in(_next_idx++);
} else {
_done = true;
}
}
// =========================== DepSuccs =========================
// Iterator over successor edges in the dependence graph.
//------------------------------DepSuccs---------------------------
// Set up iteration over the successors of n:
//  - Load: dependence graph out-edges, then all raw outputs.
//  - other Mem node or memory phi: dependence graph out-edges only.
//  - any other node: all raw outputs only.
// The constructor already advances to the first successor.
DepSuccs::DepSuccs(Node* n, DepGraph& dg) : _n(n), _done(false) {
  const bool is_load       = n->is_Load();
  const bool in_dep_graph  = is_load || n->is_Mem() || n->is_memory_phi();
  const bool walk_raw_outs = is_load || !in_dep_graph;

  _next_idx = 0;
  if (walk_raw_outs) {
    _end_idx = n->outcnt();
  } else {
    _end_idx = 0;
  }
  if (in_dep_graph) {
    _dep_next = dg.dep(n)->out_head();
  } else {
    _dep_next = nullptr;
  }
  next();
}
//-------------------------------next---------------------------
void DepSuccs::next() {
if (_dep_next != nullptr) {
_current = _dep_next->succ()->node();
_dep_next = _dep_next->next_out();
} else if (_next_idx < _end_idx) {
_current = _n->raw_out(_next_idx++);
} else {
_done = true;
}
}
//
// --------------------------------- vectorization/simd -----------------------------------
//

View File

@ -57,128 +57,6 @@
class VPointer;
// ========================= Dependence Graph =====================
class DepMem;
//------------------------------DepEdge---------------------------
// An edge in the dependence graph. The edges incident to a dependence
// node are threaded through _next_in for incoming edges and _next_out
// for outgoing edges.
// Directed edge _pred -> _succ of the dependence graph. Each edge is a link in
// two singly linked lists at once: the in-list of _succ (via _next_in) and the
// out-list of _pred (via _next_out).
class DepEdge : public ArenaObj {
protected:
DepMem* _pred; // source of the edge
DepMem* _succ; // target of the edge
DepEdge* _next_in; // list of in edges, null terminated
DepEdge* _next_out; // list of out edges, null terminated
public:
// next_in / next_out are the current list heads the new edge is linked in front of.
DepEdge(DepMem* pred, DepMem* succ, DepEdge* next_in, DepEdge* next_out) :
_pred(pred), _succ(succ), _next_in(next_in), _next_out(next_out) {}
DepEdge* next_in() { return _next_in; }
DepEdge* next_out() { return _next_out; }
DepMem* pred() { return _pred; }
DepMem* succ() { return _succ; }
// Debug printing; the body is compiled only in non-PRODUCT builds.
void print();
};
//------------------------------DepMem---------------------------
// A node in the dependence graph. _in_head starts the threaded list of
// incoming edges, and _out_head starts the list of outgoing edges.
// Dependence graph node. _node may be null: DepGraph creates its root and tail
// with DepMem(nullptr).
class DepMem : public ArenaObj {
protected:
Node* _node; // Corresponding ideal node
DepEdge* _in_head; // Head of list of in edges, null terminated
DepEdge* _out_head; // Head of list of out edges, null terminated
public:
// A new node starts without any edges.
DepMem(Node* node) : _node(node), _in_head(nullptr), _out_head(nullptr) {}
Node* node() { return _node; }
DepEdge* in_head() { return _in_head; }
DepEdge* out_head() { return _out_head; }
void set_in_head(DepEdge* hd) { _in_head = hd; }
void set_out_head(DepEdge* hd) { _out_head = hd; }
// Both counts walk the corresponding edge list, so they are linear in its length.
int in_cnt(); // Incoming edge count
int out_cnt(); // Outgoing edge count
// Debug printing; the body is compiled only in non-PRODUCT builds.
void print();
};
//------------------------------DepGraph---------------------------
// Dependence graph over memory nodes. Nodes and edges are allocated in _arena;
// _map finds the DepMem of an ideal node by its _idx.
class DepGraph {
protected:
Arena* _arena;
GrowableArray<DepMem*> _map; // node->_idx -> DepMem*, nullptr where no node was made
DepMem* _root; // artificial node without an ideal node
DepMem* _tail; // artificial node without an ideal node
public:
DepGraph(Arena* a) : _arena(a), _map(a, 8, 0, nullptr) {
_root = new (_arena) DepMem(nullptr);
_tail = new (_arena) DepMem(nullptr);
}
DepMem* root() { return _root; }
DepMem* tail() { return _tail; }
// Return dependence node corresponding to an ideal node
DepMem* dep(Node* node) const { return _map.at(node->_idx); }
// Make a new dependence graph node for an ideal node.
DepMem* make_node(Node* node);
// Make a new dependence graph edge dpred->dsucc
DepEdge* make_edge(DepMem* dpred, DepMem* dsucc);
// Convenience overloads: ideal nodes are first translated with dep().
DepEdge* make_edge(Node* pred, Node* succ) { return make_edge(dep(pred), dep(succ)); }
DepEdge* make_edge(DepMem* pred, Node* succ) { return make_edge(pred, dep(succ)); }
DepEdge* make_edge(Node* pred, DepMem* succ) { return make_edge(dep(pred), succ); }
void print(Node* n) { dep(n)->print(); }
void print(DepMem* d) { d->print(); }
};
//------------------------------DepPreds---------------------------
// Iterator over predecessors in the dependence graph and
// non-memory-graph inputs of ideal nodes.
// Usage: for (DepPreds preds(n, dg); !preds.done(); preds.next()) { ... preds.current() ... }
class DepPreds : public StackObj {
private:
Node* _n; // node whose predecessors are enumerated
int _next_idx, _end_idx; // half-open range of ideal input indices still to visit
DepEdge* _dep_next; // next dependence graph in-edge to visit, or null
Node* _current; // predecessor returned by current()
bool _done; // set by next() once everything has been visited
public:
// The constructor already advances to the first predecessor.
DepPreds(Node* n, const DepGraph& dg);
Node* current() { return _current; }
bool done() { return _done; }
void next();
};
//------------------------------DepSuccs---------------------------
// Iterator over successors in the dependence graph and
// non-memory-graph outputs of ideal nodes.
// Usage: for (DepSuccs succs(n, dg); !succs.done(); succs.next()) { ... succs.current() ... }
class DepSuccs : public StackObj {
private:
Node* _n; // node whose successors are enumerated
int _next_idx, _end_idx; // half-open range of raw output indices still to visit
DepEdge* _dep_next; // next dependence graph out-edge to visit, or null
Node* _current; // successor returned by current()
bool _done; // set by next() once everything has been visited
public:
// The constructor already advances to the first successor.
DepSuccs(Node* n, DepGraph& dg);
Node* current() { return _current; }
bool done() { return _done; }
void next();
};
// ========================= SuperWord =====================
// -----------------------------SWNodeInfo---------------------------------
@ -186,10 +64,9 @@ public:
class SWNodeInfo {
public:
int _alignment; // memory alignment for a node
int _depth; // Max expression (DAG) depth from block start
Node_List* _my_pack; // pack containing this node
SWNodeInfo() : _alignment(-1), _depth(0), _my_pack(nullptr) {}
SWNodeInfo() : _alignment(-1), _my_pack(nullptr) {}
static const SWNodeInfo initial;
};
@ -212,8 +89,6 @@ class SuperWord : public ResourceObj {
CloneMap& _clone_map; // map of nodes created in cloning
MemNode const* _align_to_ref; // Memory reference that pre-loop will align to
DepGraph _dg; // Dependence graph
public:
SuperWord(const VLoopAnalyzer &vloop_analyzer);
@ -280,6 +155,19 @@ class SuperWord : public ResourceObj {
return _vloop_analyzer.types().vector_width_in_bytes(n);
}
// VLoopDependencyGraph Accessors
// The dependency graph is owned by the VLoopAnalyzer; SuperWord only forwards to it.
const VLoopDependencyGraph& dependency_graph() const {
return _vloop_analyzer.dependency_graph();
}
// Forwards to VLoopDependencyGraph::independent for the pair (n1, n2).
bool independent(Node* n1, Node* n2) const {
  const VLoopDependencyGraph& graph = dependency_graph();
  return graph.independent(n1, n2);
}
bool mutually_independent(const Node_List* nodes) const {
return _vloop_analyzer.dependency_graph().mutually_independent(nodes);
}
#ifndef PRODUCT
// TraceAutoVectorization and TraceSuperWord
bool is_trace_superword_alignment() const {
@ -287,11 +175,6 @@ class SuperWord : public ResourceObj {
return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
}
// True if dependence graph tracing is requested, either through TraceSuperWord
// or through the SW_DEPENDENCE_GRAPH tag of TraceAutoVectorization.
bool is_trace_superword_dependence_graph() const {
return TraceSuperWord ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
}
bool is_trace_superword_adjacent_memops() const {
return TraceSuperWord ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
@ -321,7 +204,6 @@ class SuperWord : public ResourceObj {
return TraceSuperWord ||
is_trace_align_vector() ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
@ -338,7 +220,6 @@ class SuperWord : public ResourceObj {
bool do_vector_loop() { return _do_vector_loop; }
const GrowableArray<Node_List*>& packset() const { return _packset; }
const DepGraph& dg() const { return _dg; }
private:
bool _race_possible; // In cases where SDMU is true
bool _do_vector_loop; // whether to do vectorization/simd style
@ -362,10 +243,6 @@ class SuperWord : public ResourceObj {
int alignment(Node* n) { return _node_info.adr_at(bb_idx(n))->_alignment; }
void set_alignment(Node* n, int a) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_alignment = a; }
// Max expression (DAG) depth from beginning of the block for each node
int depth(Node* n) const { return _node_info.adr_at(bb_idx(n))->_depth; }
void set_depth(Node* n, int d) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_depth = d; }
// my_pack
public:
Node_List* my_pack(Node* n) { return !in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; }
@ -387,8 +264,6 @@ private:
MemNode* find_align_to_ref(Node_List &memops, int &idx);
// Calculate loop's iv adjustment for this memory ops.
int get_iv_adjustment(MemNode* mem);
// Construct dependency graph.
void dependence_graph();
// Can s1 and s2 be in a pack with s1 immediately preceding s2 and s1 aligned at "align"
bool stmts_can_pack(Node* s1, Node* s2, int align);
@ -398,10 +273,6 @@ private:
bool are_adjacent_refs(Node* s1, Node* s2);
// Are s1 and s2 similar?
bool isomorphic(Node* s1, Node* s2);
// Is there no data path from s1 to s2 or s2 to s1?
bool independent(Node* s1, Node* s2);
// Are all nodes in nodes list mutually independent?
bool mutually_independent(const Node_List* nodes) const;
// For a node pair (s1, s2) which is isomorphic and independent,
// do s1 and s2 have similar input edges?
bool have_similar_inputs(Node* s1, Node* s2);

View File

@ -35,8 +35,8 @@
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
flags(BODY, "Trace VLoopBody") \
flags(TYPES, "Trace VLoopTypes") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \
flags(SW_DEPENDENCE_GRAPH, "Trace SuperWord::dependence_graph") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_refs") \
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
@ -115,14 +115,12 @@ class TraceAutoVectorizationTagValidator {
_tags.set_range(0, TRACE_AUTO_VECTORIZATION_TAG_NUM);
} else if (SW_VERBOSE == tag) {
_tags.at_put(SW_ALIGNMENT, set_bit);
_tags.at_put(SW_DEPENDENCE_GRAPH, set_bit);
_tags.at_put(SW_ADJACENT_MEMOPS, set_bit);
_tags.at_put(SW_REJECTIONS, set_bit);
_tags.at_put(SW_PACKSET, set_bit);
_tags.at_put(SW_INFO, set_bit);
_tags.at_put(SW_VERBOSE, set_bit);
} else if (SW_INFO == tag) {
_tags.at_put(SW_DEPENDENCE_GRAPH, set_bit);
_tags.at_put(SW_ADJACENT_MEMOPS, set_bit);
_tags.at_put(SW_REJECTIONS, set_bit);
_tags.at_put(SW_PACKSET, set_bit);

View File

@ -161,9 +161,170 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
_types.compute_vector_element_type();
_dependency_graph.construct();
return VStatus::make_success();
}
// Construct the dependency graph:
// - Data-dependencies: implicit (taken from C2 node inputs).
// - Memory-dependencies:
// - No edges between different slices.
// - No Load-Load edges.
// - Inside a slice, add all Store-Load, Load-Store, Store-Store edges,
// except if we can prove that the memory does not overlap.
void VLoopDependencyGraph::construct() {
const GrowableArray<PhiNode*>& mem_slice_heads = _memory_slices.heads();
const GrowableArray<MemNode*>& mem_slice_tails = _memory_slices.tails();
ResourceMark rm;
GrowableArray<MemNode*> slice_nodes;
GrowableArray<int> memory_pred_edges;
// For each memory slice, create the memory subgraph
for (int i = 0; i < mem_slice_heads.length(); i++) {
PhiNode* head = mem_slice_heads.at(i);
MemNode* tail = mem_slice_tails.at(i);
_memory_slices.get_slice_in_reverse_order(head, tail, slice_nodes);
// In forward order (reverse of reverse), visit all memory nodes in the slice.
for (int j = slice_nodes.length() - 1; j >= 0 ; j--) {
MemNode* n1 = slice_nodes.at(j);
memory_pred_edges.clear();
VPointer p1(n1, _vloop);
// For all memory nodes before it, check if we need to add a memory edge.
for (int k = slice_nodes.length() - 1; k > j; k--) {
MemNode* n2 = slice_nodes.at(k);
// Ignore Load-Load dependencies:
if (n1->is_Load() && n2->is_Load()) { continue; }
VPointer p2(n2, _vloop);
if (!VPointer::not_equal(p1.cmp(p2))) {
// Possibly overlapping memory
memory_pred_edges.append(_body.bb_idx(n2));
}
}
if (memory_pred_edges.is_nonempty()) {
// Data edges are taken implicitly from the C2 graph, thus we only add
// a dependency node if we have memory edges.
add_node(n1, memory_pred_edges);
}
}
slice_nodes.clear();
}
compute_depth();
NOT_PRODUCT( if (_vloop.is_trace_dependency_graph()) { print(); } )
}
// Create the DependencyNode for n, holding the given (non-empty) list of memory
// predecessors as bb_idx values, and store it at n's bb_idx.
void VLoopDependencyGraph::add_node(MemNode* n, GrowableArray<int>& memory_pred_edges) {
  const int idx = _body.bb_idx(n);
  assert(_dependency_nodes.at_grow(idx, nullptr) == nullptr, "not yet created");
  assert(!memory_pred_edges.is_empty(), "no need to create a node without edges");
  DependencyNode* created = new (_arena) DependencyNode(n, memory_pred_edges, _arena);
  _dependency_nodes.at_put_grow(idx, created, nullptr);
}
// Compute the depth of every node in the body: one more than the largest depth
// among its predecessors inside the loop (data inputs and memory pred edges, as
// enumerated by PredsIterator), or 1 if it has none.
//
// We iterate over the body, which is already ordered by the dependencies, i.e. pred comes
// before use. With a single pass, we can compute the depth of every node, since we can
// assume that the depth of all preds is already computed when we compute the depth of use.
//
// Phis are the exception: their predecessors are not inspected and they always get
// depth 1, as in the former SuperWord::compute_max_depth. Presumably this is because a
// Phi's backedge input comes later in the body, so its depth is not yet known.
//
// The depths are what independent() and mutually_independent() rely on when they prune
// their predecessor walks with "depth(pred) >= min_d".
void VLoopDependencyGraph::compute_depth() {
  for (int i = 0; i < _body.body().length(); i++) {
    Node* n = _body.body().at(i);
    int max_pred_depth = 0;
    if (!n->is_Phi()) { // ignore backedge
      for (PredsIterator it(*this, n); !it.done(); it.next()) {
        Node* pred = it.current();
        if (_vloop.in_bb(pred)) {
          max_pred_depth = MAX2(max_pred_depth, depth(pred));
        }
      }
    }
    set_depth(n, max_pred_depth + 1);
  }
}
#ifndef PRODUCT
void VLoopDependencyGraph::print() const {
tty->print_cr("\nVLoopDependencyGraph::print:");
tty->print_cr(" Memory pred edges:");
for (int i = 0; i < _body.body().length(); i++) {
Node* n = _body.body().at(i);
const DependencyNode* dn = dependency_node(n);
if (dn != nullptr) {
tty->print(" DependencyNode[%d %s:", n->_idx, n->Name());
for (uint j = 0; j < dn->memory_pred_edges_length(); j++) {
Node* pred = _body.body().at(dn->memory_pred_edge(j));
tty->print(" %d %s", pred->_idx, pred->Name());
}
tty->print_cr("]");
}
}
tty->cr();
tty->print_cr(" Complete dependency graph:");
for (int i = 0; i < _body.body().length(); i++) {
Node* n = _body.body().at(i);
tty->print(" d%02d Dependencies[%d %s:", depth(n), n->_idx, n->Name());
for (PredsIterator it(*this, n); !it.done(); it.next()) {
Node* pred = it.current();
tty->print(" %d %s", pred->_idx, pred->Name());
}
tty->print_cr("]");
}
}
#endif
// Copy the (non-empty) list of memory predecessors into an int array allocated
// from 'arena', so the node does not depend on the caller's GrowableArray.
VLoopDependencyGraph::DependencyNode::DependencyNode(MemNode* n,
                                                     GrowableArray<int>& memory_pred_edges,
                                                     Arena* arena) :
    _node(n),
    _memory_pred_edges_length(memory_pred_edges.length()),
    _memory_pred_edges(nullptr)
{
  assert(memory_pred_edges.is_nonempty(), "not empty");
  const int count = memory_pred_edges.length();
  uint bytes = count * sizeof(int);
  _memory_pred_edges = (int*)arena->Amalloc(bytes);
  for (int i = 0; i < count; i++) {
    _memory_pred_edges[i] = memory_pred_edges.at(i);
  }
}
// Set up iteration over all predecessors of 'node':
//  - inputs of the C2 node, starting at MemNode::Address for loads and stores
//    (Load: address; Store: address, value) and at 1 for all other nodes (skip control);
//  - then the memory pred edges of its DependencyNode, if it has one.
// The constructor already advances to the first predecessor.
VLoopDependencyGraph::PredsIterator::PredsIterator(const VLoopDependencyGraph& dependency_graph,
                                                   const Node* node) :
    _dependency_graph(dependency_graph),
    _node(node),
    _dependency_node(dependency_graph.dependency_node(node)),
    _current(nullptr),
    _next_pred((node->is_Store() || node->is_Load()) ? MemNode::Address : 1),
    _end_pred(node->req()),
    _next_memory_pred(0),
    _end_memory_pred((_dependency_node != nullptr) ? _dependency_node->memory_pred_edges_length() : 0)
{
  assert(_node->is_Store() || _node->is_Load() || !_node->is_Mem(),
         "only loads and stores are expected mem nodes");
  next();
}
// Advance to the next predecessor: C2 inputs first, then memory pred edges
// (translated from bb_idx back to the node). _current becomes nullptr when done.
void VLoopDependencyGraph::PredsIterator::next() {
  if (_next_pred < _end_pred) {
    _current = _node->in(_next_pred);
    _next_pred++;
    return;
  }
  if (_next_memory_pred < _end_memory_pred) {
    const int pred_bb_idx = _dependency_node->memory_pred_edge(_next_memory_pred);
    _next_memory_pred++;
    _current = _dependency_graph._body.body().at(pred_bb_idx);
    return;
  }
  _current = nullptr; // done
}
#ifndef PRODUCT
int VPointer::Tracer::_depth = 0;
#endif

View File

@ -150,6 +150,10 @@ public:
return _vtrace.is_trace(TraceAutoVectorizationTag::TYPES);
}
// True if the DEPENDENCY_GRAPH tag of TraceAutoVectorization is set.
bool is_trace_dependency_graph() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::DEPENDENCY_GRAPH);
}
bool is_trace_pointer_analysis() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
}
@ -308,7 +312,7 @@ public:
const GrowableArray<MemNode*>& tails() const { return _tails; }
// Get all memory nodes of a slice, in reverse order
void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<Node*>& slice) const;
void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<MemNode*>& slice) const;
bool same_memory_slice(MemNode* m1, MemNode* m2) const;
@ -441,6 +445,109 @@ private:
const Type* container_type(Node* n) const;
};
// Submodule of VLoopAnalyzer.
// The dependency graph is used to determine if nodes are independent, and can thus potentially
// be executed in parallel. That is a prerequisite for packing nodes into vector operations.
// The dependency graph is a combination:
// - Data-dependencies: they can directly be taken from the C2 node inputs.
// - Memory-dependencies: the edges in the C2 memory-slice are too restrictive: for example all
// stores are serialized, even if their memory does not overlap. Thus,
// we refine the memory-dependencies (see construct method).
class VLoopDependencyGraph : public StackObj {
private:
class DependencyNode;
Arena* _arena; // DependencyNodes and their edge arrays are allocated here
const VLoop& _vloop;
const VLoopBody& _body;
const VLoopMemorySlices& _memory_slices;
// bb_idx -> DependencyNode*
// (nullptr for nodes without memory pred edges)
GrowableArray<DependencyNode*> _dependency_nodes;
// Node depth in DAG: bb_idx -> depth
GrowableArray<int> _depths;
public:
// Both arrays are created with length estimated_body_length(), filled with
// nullptr and 0 respectively. The graph itself is built later by construct().
VLoopDependencyGraph(Arena* arena,
const VLoop& vloop,
const VLoopBody& body,
const VLoopMemorySlices& memory_slices) :
_arena(arena),
_vloop(vloop),
_body(body),
_memory_slices(memory_slices),
_dependency_nodes(arena,
vloop.estimated_body_length(),
vloop.estimated_body_length(),
nullptr),
_depths(arena,
vloop.estimated_body_length(),
vloop.estimated_body_length(),
0) {}
NONCOPYABLE(VLoopDependencyGraph);
// Build the memory edges and compute the depths.
void construct();
// Is there no path in the dependency graph from s1 to s2 or from s2 to s1?
bool independent(Node* s1, Node* s2) const;
// Are all nodes in the list independent of each other?
bool mutually_independent(const Node_List* nodes) const;
private:
void add_node(MemNode* n, GrowableArray<int>& memory_pred_edges);
int depth(const Node* n) const { return _depths.at(_body.bb_idx(n)); }
void set_depth(const Node* n, int d) { _depths.at_put(_body.bb_idx(n), d); }
void compute_depth();
NOT_PRODUCT( void print() const; )
// May return nullptr: only nodes with memory pred edges have a DependencyNode.
const DependencyNode* dependency_node(const Node* n) const {
return _dependency_nodes.at(_body.bb_idx(n));
}
// Holds the explicit memory pred edges of one memory node. Data edges are not
// stored here; PredsIterator reads them from the C2 node inputs.
class DependencyNode : public ArenaObj {
private:
MemNode* _node; // Corresponding ideal node
const uint _memory_pred_edges_length;
int* _memory_pred_edges; // memory pred-edges, mapping to bb_idx
public:
DependencyNode(MemNode* n, GrowableArray<int>& memory_pred_edges, Arena* arena);
NONCOPYABLE(DependencyNode);
uint memory_pred_edges_length() const { return _memory_pred_edges_length; }
// Returns the bb_idx of the i-th memory predecessor.
int memory_pred_edge(uint i) const {
assert(i < _memory_pred_edges_length, "bounds check");
return _memory_pred_edges[i];
}
};
public:
// Iterator for dependency graph predecessors of a node.
// Usage: for (PredsIterator it(graph, n); !it.done(); it.next()) { ... it.current() ... }
class PredsIterator : public StackObj {
private:
const VLoopDependencyGraph& _dependency_graph;
const Node* _node;
const DependencyNode* _dependency_node; // nullptr if _node has no memory pred edges
Node* _current; // nullptr once the iteration is done
// Iterate in node->in(i)
int _next_pred;
int _end_pred;
// Iterate in dependency_node->memory_pred_edge(i)
int _next_memory_pred;
int _end_memory_pred;
public:
PredsIterator(const VLoopDependencyGraph& dependency_graph, const Node* node);
NONCOPYABLE(PredsIterator);
void next();
bool done() const { return _current == nullptr; }
Node* current() const {
assert(!done(), "not done yet");
return _current;
}
};
};
// Analyze the loop in preparation for auto-vectorization. This class is
// deliberately structured into many submodules, which are as independent
// as possible, though some submodules do require other submodules.
@ -463,6 +570,7 @@ private:
VLoopMemorySlices _memory_slices;
VLoopBody _body;
VLoopTypes _types;
VLoopDependencyGraph _dependency_graph;
public:
VLoopAnalyzer(const VLoop& vloop, VSharedData& vshared) :
@ -472,7 +580,8 @@ public:
_reductions (&_arena, vloop),
_memory_slices (&_arena, vloop),
_body (&_arena, vloop, vshared),
_types (&_arena, vloop, _body)
_types (&_arena, vloop, _body),
_dependency_graph(&_arena, vloop, _body, _memory_slices)
{
_success = setup_submodules();
}
@ -486,6 +595,7 @@ public:
const VLoopMemorySlices& memory_slices() const { return _memory_slices; }
const VLoopBody& body() const { return _body; }
const VLoopTypes& types() const { return _types; }
const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; }
private:
bool setup_submodules();