8332163: C2 SuperWord: refactor PacksetGraph and SuperWord::output into VTransformGraph
Reviewed-by: chagedorn, kvn
This commit is contained in:
parent
3f37c5718d
commit
02956ab6e1
@ -22,22 +22,13 @@
|
||||
*/
|
||||
|
||||
#include "precompiled.hpp"
|
||||
#include "libadt/vectset.hpp"
|
||||
#include "memory/allocation.inline.hpp"
|
||||
#include "memory/resourceArea.hpp"
|
||||
#include "opto/addnode.hpp"
|
||||
#include "opto/c2compiler.hpp"
|
||||
#include "opto/castnode.hpp"
|
||||
#include "opto/convertnode.hpp"
|
||||
#include "opto/matcher.hpp"
|
||||
#include "opto/memnode.hpp"
|
||||
#include "opto/opcodes.hpp"
|
||||
#include "opto/opaquenode.hpp"
|
||||
#include "opto/rootnode.hpp"
|
||||
#include "opto/superword.hpp"
|
||||
#include "opto/superwordVTransformBuilder.hpp"
|
||||
#include "opto/vectornode.hpp"
|
||||
#include "opto/movenode.hpp"
|
||||
#include "utilities/powerOfTwo.hpp"
|
||||
|
||||
SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
|
||||
_vloop_analyzer(vloop_analyzer),
|
||||
@ -707,7 +698,7 @@ bool SuperWord::can_pack_into_pair(Node* s1, Node* s2) {
|
||||
}
|
||||
|
||||
// Forbid anything that looks like a PopulateIndex to be packed. It does not need to be packed,
|
||||
// and will still be vectorized by SuperWord::vector_opd.
|
||||
// and will still be vectorized by SuperWordVTransformBuilder::get_or_make_vtnode_vector_input_at_index.
|
||||
if (isomorphic(s1, s2) && !is_populate_index(s1, s2)) {
|
||||
if ((independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2)) {
|
||||
if (!_pairset.is_left(s1) && !_pairset.is_right(s2)) {
|
||||
@ -769,8 +760,9 @@ bool SuperWord::isomorphic(Node* s1, Node* s2) {
|
||||
}
|
||||
}
|
||||
|
||||
// Look for pattern n1 = (iv + c) and n2 = (iv + c + 1), which may lead to PopulateIndex vector node.
|
||||
// We skip the pack creation of these nodes. They will be vectorized by SuperWord::vector_opd.
|
||||
// Look for pattern n1 = (iv + c) and n2 = (iv + c + 1), which may lead to
|
||||
// PopulateIndex vector node. We skip the pack creation of these nodes. They
|
||||
// will be vectorized by SuperWordVTransformBuilder::get_or_make_vtnode_vector_input_at_index.
|
||||
bool SuperWord::is_populate_index(const Node* n1, const Node* n2) const {
|
||||
return n1->is_Add() &&
|
||||
n2->is_Add() &&
|
||||
@ -1858,307 +1850,74 @@ void PackSet::verify() const {
|
||||
}
|
||||
#endif
|
||||
|
||||
// The PacksetGraph combines the dependency graph with the packset. In the PackSet
|
||||
// graph, we have two kinds of nodes:
|
||||
// (1) pack-node: Represents all nodes of some pack p in a single node, which
|
||||
// shall later become a vector node.
|
||||
// (2) scalar-node: Represents a node that is not in any pack.
|
||||
// For any edge (n1, n2) in the dependency graph, we add an edge to the PacksetGraph for
|
||||
// the PacksetGraph nodes corresponding to n1 and n2.
|
||||
// We work from the dependency graph, because it gives us all the data-dependencies,
|
||||
// as well as more refined memory-dependencies than the C2 graph. The dependency graph
|
||||
// does not have cycles. But packing nodes can introduce cyclic dependencies. Example:
|
||||
//
|
||||
// +--------+
|
||||
// A -> X | v
|
||||
// Pack [A,B] and [X,Y] [A,B] [X,Y]
|
||||
// Y -> B ^ |
|
||||
// +--------+
|
||||
//
|
||||
class PacksetGraph {
|
||||
private:
|
||||
// pid: packset graph node id.
|
||||
GrowableArray<int> _pid; // bb_idx(n) -> pid
|
||||
GrowableArray<Node*> _pid_to_node; // one node per pid, find rest via _packset.pack
|
||||
GrowableArray<GrowableArray<int>> _out; // out-edges
|
||||
GrowableArray<int> _incnt; // number of (implicit) in-edges
|
||||
int _max_pid = 0;
|
||||
bool SuperWord::schedule_and_apply() const {
|
||||
if (_packset.is_empty()) { return false; }
|
||||
|
||||
bool _schedule_success;
|
||||
|
||||
SuperWord* _slp;
|
||||
public:
|
||||
PacksetGraph(SuperWord* slp)
|
||||
: _pid(8, 0, /* default */ 0), _slp(slp) {
|
||||
}
|
||||
// Get pid, if there is a packset node that n belongs to. Else return 0.
|
||||
int get_pid_or_zero(const Node* n) const {
|
||||
if (!_slp->in_bb(n)) {
|
||||
return 0;
|
||||
}
|
||||
int idx = _slp->bb_idx(n);
|
||||
if (idx >= _pid.length()) {
|
||||
return 0;
|
||||
} else {
|
||||
return _pid.at(idx);
|
||||
}
|
||||
}
|
||||
int get_pid(const Node* n) {
|
||||
int poz = get_pid_or_zero(n);
|
||||
assert(poz != 0, "pid should not be zero");
|
||||
return poz;
|
||||
}
|
||||
// Record that node n belongs to PacksetGraph node pid.
// Updates both directions of the mapping: bb_idx(n) -> pid, and
// pid -> n (stored at slot pid - 1, since pids start at 1). If several
// nodes are assigned the same pid, the most recent one is the node kept
// in _pid_to_node.
void set_pid(Node* n, int pid) {
  assert(n != nullptr && pid > 0, "sane inputs");
  assert(_slp->in_bb(n), "must be");
  const int bb_index = _slp->bb_idx(n);
  _pid.at_put_grow(bb_index, pid);
  _pid_to_node.at_put_grow(pid - 1, n, nullptr);
}
|
||||
// Return the node stored for pid in _pid_to_node (slot pid - 1).
// Asserts that pid is within the mapped range and that the slot is non-null.
Node* get_node(int pid) {
  assert(pid > 0 && pid <= _pid_to_node.length(), "pid must be mapped");
  Node* representative = _pid_to_node.at(pid - 1);
  assert(representative != nullptr, "sanity");
  return representative;
}
|
||||
int new_pid() {
|
||||
_incnt.push(0);
|
||||
_out.push(GrowableArray<int>());
|
||||
return ++_max_pid;
|
||||
}
|
||||
int incnt(int pid) { return _incnt.at(pid - 1); }
|
||||
void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); }
|
||||
GrowableArray<int>& out(int pid) { return _out.at(pid - 1); }
|
||||
bool schedule_success() const { return _schedule_success; }
|
||||
|
||||
// Create nodes (from packs and scalar-nodes), and add edges, based on the dependency graph.
|
||||
void build() {
|
||||
const PackSet& packset = _slp->packset();
|
||||
const GrowableArray<Node*>& body = _slp->body();
|
||||
// Map nodes in packsets
|
||||
for (int i = 0; i < packset.length(); i++) {
|
||||
Node_List* p = packset.at(i);
|
||||
int pid = new_pid();
|
||||
for (uint k = 0; k < p->size(); k++) {
|
||||
Node* n = p->at(k);
|
||||
set_pid(n, pid);
|
||||
assert(packset.get_pack(n) == p, "matching packset");
|
||||
}
|
||||
}
|
||||
|
||||
int max_pid_packset = _max_pid;
|
||||
|
||||
// Map nodes not in packset
|
||||
for (int i = 0; i < body.length(); i++) {
|
||||
Node* n = body.at(i);
|
||||
if (n->is_Phi() || n->is_CFG()) {
|
||||
continue; // ignore control flow
|
||||
}
|
||||
int pid = get_pid_or_zero(n);
|
||||
if (pid == 0) {
|
||||
pid = new_pid();
|
||||
set_pid(n, pid);
|
||||
assert(packset.get_pack(n) == nullptr, "no packset");
|
||||
}
|
||||
}
|
||||
|
||||
// Map edges for packset nodes
|
||||
VectorSet set;
|
||||
for (int i = 0; i < packset.length(); i++) {
|
||||
Node_List* p = packset.at(i);
|
||||
set.clear();
|
||||
int pid = get_pid(p->at(0));
|
||||
for (uint k = 0; k < p->size(); k++) {
|
||||
Node* n = p->at(k);
|
||||
assert(pid == get_pid(n), "all nodes in pack have same pid");
|
||||
for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) {
|
||||
Node* pred = preds.current();
|
||||
int pred_pid = get_pid_or_zero(pred);
|
||||
if (pred_pid == pid && _slp->is_marked_reduction(n)) {
|
||||
continue; // reduction -> self-cycle is not a cyclic dependency
|
||||
}
|
||||
// Only add edges once, and only for mapped nodes (in body)
|
||||
if (pred_pid > 0 && !set.test_set(pred_pid)) {
|
||||
incnt_set(pid, incnt(pid) + 1); // increment
|
||||
out(pred_pid).push(pid);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Map edges for nodes not in packset
|
||||
for (int i = 0; i < body.length(); i++) {
|
||||
Node* n = body.at(i);
|
||||
int pid = get_pid_or_zero(n); // zero for Phi or CFG
|
||||
if (pid <= max_pid_packset) {
|
||||
continue; // Only scalar-nodes
|
||||
}
|
||||
for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) {
|
||||
Node* pred = preds.current();
|
||||
int pred_pid = get_pid_or_zero(pred);
|
||||
// Only add edges for mapped nodes (in body)
|
||||
if (pred_pid > 0) {
|
||||
incnt_set(pid, incnt(pid) + 1); // increment
|
||||
out(pred_pid).push(pid);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Schedule nodes of PacksetGraph to worklist, using topsort: schedule a node
|
||||
// that has zero incnt. If a PacksetGraph node corresponds to memops, then add
|
||||
// those to the memops_schedule. At the end, we return the memops_schedule, and
|
||||
// note if topsort was successful.
|
||||
Node_List schedule() {
|
||||
Node_List memops_schedule;
|
||||
GrowableArray<int> worklist;
|
||||
// Directly schedule all nodes without precedence
|
||||
for (int pid = 1; pid <= _max_pid; pid++) {
|
||||
if (incnt(pid) == 0) {
|
||||
worklist.push(pid);
|
||||
}
|
||||
}
|
||||
// Continue scheduling via topological sort
|
||||
for (int i = 0; i < worklist.length(); i++) {
|
||||
int pid = worklist.at(i);
|
||||
|
||||
// Add memops to memops_schedule
|
||||
Node* n = get_node(pid);
|
||||
Node_List* p = _slp->packset().get_pack(n);
|
||||
if (n->is_Mem()) {
|
||||
if (p == nullptr) {
|
||||
memops_schedule.push(n);
|
||||
} else {
|
||||
for (uint k = 0; k < p->size(); k++) {
|
||||
memops_schedule.push(p->at(k));
|
||||
assert(p->at(k)->is_Mem(), "only schedule memops");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Decrement incnt for all successors
|
||||
for (int j = 0; j < out(pid).length(); j++){
|
||||
int pid_use = out(pid).at(j);
|
||||
int incnt_use = incnt(pid_use) - 1;
|
||||
incnt_set(pid_use, incnt_use);
|
||||
// Did use lose its last input?
|
||||
if (incnt_use == 0) {
|
||||
worklist.push(pid_use);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Was every pid scheduled? If not, we found some cycles in the PacksetGraph.
|
||||
_schedule_success = (worklist.length() == _max_pid);
|
||||
return memops_schedule;
|
||||
}
|
||||
|
||||
// Print the PacksetGraph.
|
||||
// print_nodes = true: print all C2 nodes belonging to PacksetGraph node.
|
||||
// print_zero_incnt = false: do not print nodes that have no in-edges (any more).
|
||||
void print(bool print_nodes, bool print_zero_incnt) {
|
||||
const GrowableArray<Node*> &body = _slp->body();
|
||||
tty->print_cr("PacksetGraph");
|
||||
for (int pid = 1; pid <= _max_pid; pid++) {
|
||||
if (incnt(pid) == 0 && !print_zero_incnt) {
|
||||
continue;
|
||||
}
|
||||
tty->print("Node %d. incnt %d [", pid, incnt(pid));
|
||||
for (int j = 0; j < out(pid).length(); j++) {
|
||||
tty->print("%d ", out(pid).at(j));
|
||||
}
|
||||
tty->print_cr("]");
|
||||
// Make an empty transform.
|
||||
#ifndef PRODUCT
|
||||
if (print_nodes) {
|
||||
for (int i = 0; i < body.length(); i++) {
|
||||
Node* n = body.at(i);
|
||||
if (get_pid_or_zero(n) == pid) {
|
||||
tty->print(" ");
|
||||
n->dump();
|
||||
}
|
||||
}
|
||||
}
|
||||
VTransformTrace trace(_vloop.vtrace(),
|
||||
is_trace_superword_rejections(),
|
||||
is_trace_align_vector(),
|
||||
is_trace_superword_info());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
VTransform vtransform(_vloop_analyzer,
|
||||
_mem_ref_for_main_loop_alignment,
|
||||
_aw_for_main_loop_alignment
|
||||
NOT_PRODUCT(COMMA trace)
|
||||
);
|
||||
|
||||
// We want to take the packed scalars from the PackSet and replace them
|
||||
// with vector operations. This requires scheduling and re-ordering the memory
|
||||
// graph. We take these steps:
|
||||
// (1) Build the PacksetGraph. It combines the dependency graph with the
|
||||
// packset. The PacksetGraph gives us the dependencies that must be
|
||||
// respected after scheduling.
|
||||
// (2) Schedule the PacksetGraph to the memops_schedule, which represents
|
||||
// a linear order of all memops in the body. The order respects the
|
||||
// dependencies of the PacksetGraph.
|
||||
// (3) If the PacksetGraph has cycles, we cannot schedule. Abort.
|
||||
// (4) Apply the vectorization, including re-ordering the memops and replacing
|
||||
// packed scalars with vector operations.
|
||||
bool SuperWord::schedule_and_apply() {
|
||||
if (_packset.is_empty()) {
|
||||
return false;
|
||||
}
|
||||
ResourceMark rm;
|
||||
|
||||
// (1) Build the PacksetGraph.
|
||||
PacksetGraph graph(this);
|
||||
graph.build();
|
||||
|
||||
// (2) Schedule the PacksetGraph.
|
||||
Node_List memops_schedule = graph.schedule();
|
||||
|
||||
// (3) Check if the PacksetGraph schedule succeeded (had no cycles).
|
||||
// We now know that we only have independent packs, see verify_packs.
|
||||
// This is a necessary but not a sufficient condition for an acyclic
|
||||
// graph (DAG) after scheduling. Thus, we must check if the packs have
|
||||
// introduced a cycle. The SuperWord paper mentions the need for this
|
||||
// in "3.7 Scheduling".
|
||||
if (!graph.schedule_success()) {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_rejections()) {
|
||||
tty->print_cr("SuperWord::schedule found cycle in PacksetGraph:");
|
||||
graph.print(true, false);
|
||||
tty->print_cr("removing all packs from packset.");
|
||||
}
|
||||
#endif
|
||||
_packset.clear();
|
||||
return false;
|
||||
// Build the transform from the packset.
|
||||
{
|
||||
ResourceMark rm;
|
||||
SuperWordVTransformBuilder builder(_packset, vtransform);
|
||||
}
|
||||
|
||||
// (4) Apply the vectorization, including re-ordering the memops.
|
||||
return apply(memops_schedule);
|
||||
if (!vtransform.schedule()) { return false; }
|
||||
vtransform.apply();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SuperWord::apply(Node_List& memops_schedule) {
|
||||
Compile* C = phase()->C;
|
||||
CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION1_BEFORE_APPLY, 4, cl);
|
||||
// Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all
|
||||
// correctness and profitability checks have passed, and the graph was successfully scheduled.
|
||||
void VTransform::apply() {
|
||||
#ifndef PRODUCT
|
||||
if (_trace._info || TraceLoopOpts) {
|
||||
tty->print_cr("\nVTransform::apply:");
|
||||
lpt()->dump_head();
|
||||
lpt()->head()->dump();
|
||||
}
|
||||
assert(cl()->is_main_loop(), "auto vectorization only for main loops");
|
||||
assert(_graph.is_scheduled(), "must already be scheduled");
|
||||
#endif
|
||||
|
||||
apply_memops_reordering_with_schedule(memops_schedule);
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION2_AFTER_REORDER, 4, cl);
|
||||
Compile* C = phase()->C;
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION1_BEFORE_APPLY, 4, cl());
|
||||
|
||||
_graph.apply_memops_reordering_with_schedule();
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION2_AFTER_REORDER, 4, cl());
|
||||
|
||||
adjust_pre_loop_limit_to_align_main_loop_vectors();
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, cl);
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, cl());
|
||||
|
||||
bool is_success = apply_vectorization();
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, cl);
|
||||
|
||||
return is_success;
|
||||
apply_vectorization();
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, cl());
|
||||
}
|
||||
|
||||
// Reorder the memory graph for all slices in parallel. We walk over the schedule once,
|
||||
// and track the current memory state of each slice.
|
||||
void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule) {
|
||||
// We prepare the memory graph for the replacement of scalar memops with vector memops.
|
||||
// We reorder all slices in parallel, ensuring that the memops inside each slice are
|
||||
// ordered according to the _schedule. This means that all packed memops are consecutive
|
||||
// in the memory graph after the reordering.
|
||||
void VTransformGraph::apply_memops_reordering_with_schedule() const {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_info()) {
|
||||
tty->print_cr("\nSuperWord::apply_memops_reordering_with_schedule:");
|
||||
memops_schedule.dump();
|
||||
assert(is_scheduled(), "must be already scheduled");
|
||||
if (_trace._info) {
|
||||
print_memops_schedule();
|
||||
}
|
||||
#endif
|
||||
|
||||
ResourceMark rm;
|
||||
int max_slices = phase()->C->num_alias_types();
|
||||
// When iterating over the memops_schedule, we keep track of the current memory state,
|
||||
// When iterating over the schedule, we keep track of the current memory state,
|
||||
// which is the Phi or a store in the loop.
|
||||
GrowableArray<Node*> current_state_in_slice(max_slices, max_slices, nullptr);
|
||||
// The memory state after the loop is the last store inside the loop. If we reorder the
|
||||
@ -2179,10 +1938,9 @@ void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule
|
||||
old_last_store_in_slice.at_put(alias_idx, last_store);
|
||||
}
|
||||
|
||||
// (2) Walk over memops_schedule, append memops to the current state
|
||||
// (2) Walk over schedule, append memops to the current state
|
||||
// of that slice. If it is a Store, we take it as the new state.
|
||||
for (uint i = 0; i < memops_schedule.size(); i++) {
|
||||
MemNode* n = memops_schedule.at(i)->as_Mem();
|
||||
for_each_memop_in_schedule([&] (MemNode* n) {
|
||||
assert(n->is_Load() || n->is_Store(), "only loads or stores");
|
||||
int alias_idx = phase()->C->get_alias_index(n->adr_type());
|
||||
Node* current_state = current_state_in_slice.at(alias_idx);
|
||||
@ -2198,12 +1956,12 @@ void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule
|
||||
current_state_in_slice.at_put(alias_idx, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// (3) For each slice, we add the current state to the backedge
|
||||
// in the Phi. Further, we replace uses of the old last store
|
||||
// with uses of the new last store (current_state).
|
||||
Node_List uses_after_loop;
|
||||
GrowableArray<Node*> uses_after_loop;
|
||||
for (int i = 0; i < mem_slice_head.length(); i++) {
|
||||
Node* phi = mem_slice_head.at(i);
|
||||
int alias_idx = phase()->C->get_alias_index(phi->adr_type());
|
||||
@ -2225,7 +1983,7 @@ void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule
|
||||
uses_after_loop.push(use);
|
||||
}
|
||||
}
|
||||
for (uint k = 0; k < uses_after_loop.size(); k++) {
|
||||
for (int k = 0; k < uses_after_loop.length(); k++) {
|
||||
Node* use = uses_after_loop.at(k);
|
||||
for (uint j = 0; j < use->req(); j++) {
|
||||
Node* def = use->in(j);
|
||||
@ -2237,396 +1995,65 @@ void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule
|
||||
}
|
||||
}
|
||||
|
||||
// Convert packs into vector node operations
|
||||
// At this point, all correctness and profitability checks have passed.
|
||||
// We start the irreversible process of editing the C2 graph. Should
|
||||
// there be an unexpected situation (assert fails), then we can only
|
||||
// bail out of the compilation, as the graph has already been partially
|
||||
// modified. We bail out, and retry without SuperWord.
|
||||
bool SuperWord::apply_vectorization() {
|
||||
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
|
||||
assert(cl->is_main_loop(), "SLP should only work on main loops");
|
||||
// Call "apply" on every VTransformNode, in schedule order, and fold the
// vector length / vector width reported by each result into the two
// in-out maxima passed by the caller.
void VTransformGraph::apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const {
  ResourceMark rm;

  // Table from VTransformNode::_idx to the Node produced by that vtnode's
  // "apply". Since "apply" is called on defs before uses, a use can look up
  // the already generated def (input) nodes here.
  const int num_vtnodes = _vtnodes.length();
  GrowableArray<Node*> vtnode_idx_to_transformed_node(num_vtnodes, num_vtnodes, nullptr);

  const int num_scheduled = _schedule.length();
  for (int pos = 0; pos < num_scheduled; pos++) {
    VTransformNode* vtn = _schedule.at(pos);
    VTransformApplyResult result = vtn->apply(_vloop_analyzer,
                                              vtnode_idx_to_transformed_node);
    NOT_PRODUCT( if (_trace._verbose) { result.trace(vtn); } )

    // Publish the generated node for later uses, then update the maxima.
    vtnode_idx_to_transformed_node.at_put(vtn->_idx, result.node());
    max_vector_length = MAX2(max_vector_length, result.vector_length());
    max_vector_width  = MAX2(max_vector_width,  result.vector_width());
  }
}
|
||||
|
||||
// We call "apply" on every VTransformNode, which replaces the packed scalar nodes with vector nodes.
|
||||
void VTransform::apply_vectorization() const {
|
||||
Compile* C = phase()->C;
|
||||
assert(!_packset.is_empty(), "vectorization requires non-empty packset");
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (TraceLoopOpts) {
|
||||
tty->print("SuperWord::apply_vectorization ");
|
||||
lpt()->dump_head();
|
||||
if (_trace._verbose) {
|
||||
tty->print_cr("\nVTransform::apply_vectorization:");
|
||||
}
|
||||
#endif
|
||||
|
||||
uint max_vlen_in_bytes = 0;
|
||||
uint max_vlen = 0;
|
||||
uint max_vector_length = 0; // number of elements
|
||||
uint max_vector_width = 0; // total width in bytes
|
||||
_graph.apply_vectorization_for_each_vtnode(max_vector_length, max_vector_width);
|
||||
|
||||
for (int i = 0; i < body().length(); i++) {
|
||||
Node* n = body().at(i);
|
||||
Node_List* p = get_pack(n);
|
||||
if (p != nullptr && n == p->at(p->size()-1)) {
|
||||
// After apply_memops_reordering_with_schedule, we know that the memops have the same order in the pack
|
||||
// as in the memory slice. Hence, "first" is the first memop in the slice from the pack,
|
||||
// and "n" is the last node in the slice from the pack.
|
||||
Node* first = p->at(0);
|
||||
uint vlen = p->size();
|
||||
uint vlen_in_bytes = 0;
|
||||
Node* vn = nullptr;
|
||||
int opc = n->Opcode();
|
||||
if (n->is_Load()) {
|
||||
Node* ctl = n->in(MemNode::Control);
|
||||
Node* mem = first->in(MemNode::Memory);
|
||||
// Set the memory dependency of the LoadVector as early as possible.
|
||||
// Walk up the memory chain, and ignore any StoreVector that provably
|
||||
// does not have any memory dependency.
|
||||
while (mem->is_StoreVector()) {
|
||||
VPointer p_store(mem->as_Mem(), _vloop);
|
||||
if (p_store.overlap_possible_with_any_in(p)) {
|
||||
break;
|
||||
} else {
|
||||
mem = mem->in(MemNode::Memory);
|
||||
}
|
||||
}
|
||||
Node* adr = first->in(MemNode::Address);
|
||||
const TypePtr* atyp = n->adr_type();
|
||||
vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
|
||||
vlen_in_bytes = vn->as_LoadVector()->memory_size();
|
||||
} else if (n->is_Store()) {
|
||||
// Promote value to be stored to vector
|
||||
Node* val = vector_opd(p, MemNode::ValueIn);
|
||||
if (val == nullptr) {
|
||||
assert(false, "input to vector store was not created");
|
||||
C->record_failure(C2Compiler::retry_no_superword());
|
||||
return false; // bailout
|
||||
}
|
||||
assert(max_vector_length > 0 && max_vector_width > 0, "must have vectorized");
|
||||
cl()->mark_loop_vectorized();
|
||||
|
||||
Node* ctl = n->in(MemNode::Control);
|
||||
Node* mem = first->in(MemNode::Memory);
|
||||
Node* adr = first->in(MemNode::Address);
|
||||
const TypePtr* atyp = n->adr_type();
|
||||
vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
|
||||
vlen_in_bytes = vn->as_StoreVector()->memory_size();
|
||||
} else if (VectorNode::is_scalar_rotate(n)) {
|
||||
Node* in1 = vector_opd(p, 1);
|
||||
Node* in2 = first->in(2);
|
||||
// If rotation count is non-constant or greater than 8bit value create a vector.
|
||||
if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) {
|
||||
in2 = vector_opd(p, 2);
|
||||
}
|
||||
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
} else if (VectorNode::is_roundopD(n)) {
|
||||
Node* in1 = vector_opd(p, 1);
|
||||
Node* in2 = first->in(2);
|
||||
assert(in2->is_Con(), "Constant rounding mode expected.");
|
||||
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
} else if (VectorNode::is_muladds2i(n)) {
|
||||
assert(n->req() == 5u, "MulAddS2I should have 4 operands.");
|
||||
Node* in1 = vector_opd(p, 1);
|
||||
Node* in2 = vector_opd(p, 2);
|
||||
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
} else if (opc == Op_SignumF || opc == Op_SignumD) {
|
||||
assert(n->req() == 4, "four inputs expected");
|
||||
Node* in = vector_opd(p, 1);
|
||||
Node* zero = vector_opd(p, 2);
|
||||
Node* one = vector_opd(p, 3);
|
||||
vn = VectorNode::make(opc, in, zero, one, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
} else if (n->is_Cmp()) {
|
||||
// Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
|
||||
continue;
|
||||
} else if (n->is_Bool()) {
|
||||
// Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
|
||||
continue;
|
||||
} else if (n->is_CMove()) {
|
||||
// Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
|
||||
|
||||
BoolNode* bol = n->in(1)->as_Bool();
|
||||
assert(bol != nullptr, "must have Bool above CMove");
|
||||
Node_List* bool_pack = get_pack(bol);
|
||||
assert(bool_pack != nullptr, "CMove must have matching Bool pack");
|
||||
|
||||
CmpNode* cmp = bol->in(1)->as_Cmp();
|
||||
assert(cmp != nullptr, "must have cmp above CMove");
|
||||
Node_List* cmp_pack = get_pack(cmp);
|
||||
assert(cmp_pack != nullptr, "Bool must have matching Cmp pack");
|
||||
|
||||
Node* cmp_in1 = vector_opd(cmp_pack, 1);
|
||||
Node* cmp_in2 = vector_opd(cmp_pack, 2);
|
||||
|
||||
Node* blend_in1 = vector_opd(p, 2);
|
||||
Node* blend_in2 = vector_opd(p, 3);
|
||||
|
||||
VTransformBoolTest bool_test = _packset.get_bool_test(bool_pack);
|
||||
BoolTest::mask test_mask = bool_test._mask;
|
||||
if (bool_test._is_negated) {
|
||||
// We can cancel out the negation by swapping the blend inputs.
|
||||
swap(blend_in1, blend_in2);
|
||||
}
|
||||
|
||||
// VectorMaskCmp
|
||||
ConINode* test_mask_node = igvn().intcon((int)test_mask);
|
||||
BasicType bt = velt_basic_type(cmp);
|
||||
const TypeVect* vt = TypeVect::make(bt, vlen);
|
||||
VectorNode* mask = new VectorMaskCmpNode(test_mask, cmp_in1, cmp_in2, test_mask_node, vt);
|
||||
phase()->register_new_node_with_ctrl_of(mask, p->at(0));
|
||||
igvn()._worklist.push(mask);
|
||||
|
||||
// VectorBlend
|
||||
vn = new VectorBlendNode(blend_in1, blend_in2, mask);
|
||||
} else if (n->req() == 3) {
|
||||
// Promote operands to vector
|
||||
Node* in1 = nullptr;
|
||||
bool node_isa_reduction = is_marked_reduction(n);
|
||||
if (node_isa_reduction) {
|
||||
// the input to the first reduction operation is retained
|
||||
in1 = first->in(1);
|
||||
} else {
|
||||
in1 = vector_opd(p, 1);
|
||||
if (in1 == nullptr) {
|
||||
assert(false, "input in1 to vector operand was not created");
|
||||
C->record_failure(C2Compiler::retry_no_superword());
|
||||
return false; // bailout
|
||||
}
|
||||
}
|
||||
Node* in2 = vector_opd(p, 2);
|
||||
if (in2 == nullptr) {
|
||||
assert(false, "input in2 to vector operand was not created");
|
||||
C->record_failure(C2Compiler::retry_no_superword());
|
||||
return false; // bailout
|
||||
}
|
||||
if (in1->Opcode() == Op_Replicate && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
|
||||
// Move invariant vector input into second position to avoid register spilling.
|
||||
Node* tmp = in1;
|
||||
in1 = in2;
|
||||
in2 = tmp;
|
||||
}
|
||||
if (node_isa_reduction) {
|
||||
const Type *arith_type = n->bottom_type();
|
||||
vn = ReductionNode::make(opc, nullptr, in1, in2, arith_type->basic_type());
|
||||
if (in2->is_Load()) {
|
||||
vlen_in_bytes = in2->as_LoadVector()->memory_size();
|
||||
} else {
|
||||
vlen_in_bytes = in2->as_Vector()->length_in_bytes();
|
||||
}
|
||||
} else {
|
||||
if (VectorNode::can_use_RShiftI_instead_of_URShiftI(n, velt_basic_type(n))) {
|
||||
opc = Op_RShiftI;
|
||||
}
|
||||
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
}
|
||||
} else if (VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc)) {
|
||||
assert(n->req() == 2, "only one input expected");
|
||||
Node* in = vector_opd(p, 1);
|
||||
vn = VectorNode::make(opc, in, nullptr, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
} else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) {
|
||||
assert(n->req() == 2, "only one input expected");
|
||||
Node* in = vector_opd(p, 1);
|
||||
Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
|
||||
phase()->register_new_node_with_ctrl_of(longval, first);
|
||||
// Requires extra vector long -> int conversion.
|
||||
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
} else if (VectorNode::is_convert_opcode(opc)) {
|
||||
assert(n->req() == 2, "only one input expected");
|
||||
BasicType bt = velt_basic_type(n);
|
||||
Node* in = vector_opd(p, 1);
|
||||
int vopc = VectorCastNode::opcode(opc, in->bottom_type()->is_vect()->element_basic_type());
|
||||
vn = VectorCastNode::make(vopc, in, bt, vlen);
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
} else if (opc == Op_FmaD || opc == Op_FmaF) {
|
||||
// Promote operands to vector
|
||||
Node* in1 = vector_opd(p, 1);
|
||||
Node* in2 = vector_opd(p, 2);
|
||||
Node* in3 = vector_opd(p, 3);
|
||||
vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
} else {
|
||||
assert(false, "Unhandled scalar opcode (%s)", NodeClassNames[opc]);
|
||||
C->record_failure(C2Compiler::retry_no_superword());
|
||||
return false; // bailout
|
||||
}
|
||||
|
||||
if (vn == nullptr) {
|
||||
assert(false, "got null node instead of vector node");
|
||||
C->record_failure(C2Compiler::retry_no_superword());
|
||||
return false; // bailout
|
||||
}
|
||||
|
||||
#ifdef ASSERT
|
||||
// Mark Load/Store Vector for alignment verification
|
||||
if (VerifyAlignVector) {
|
||||
if (vn->Opcode() == Op_LoadVector) {
|
||||
vn->as_LoadVector()->set_must_verify_alignment();
|
||||
} else if (vn->Opcode() == Op_StoreVector) {
|
||||
vn->as_StoreVector()->set_must_verify_alignment();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
phase()->register_new_node_with_ctrl_of(vn, first);
|
||||
for (uint j = 0; j < p->size(); j++) {
|
||||
Node* pm = p->at(j);
|
||||
igvn().replace_node(pm, vn);
|
||||
}
|
||||
igvn()._worklist.push(vn);
|
||||
|
||||
if (vlen > max_vlen) {
|
||||
max_vlen = vlen;
|
||||
}
|
||||
if (vlen_in_bytes > max_vlen_in_bytes) {
|
||||
max_vlen_in_bytes = vlen_in_bytes;
|
||||
}
|
||||
VectorNode::trace_new_vector(vn, "SuperWord");
|
||||
}
|
||||
}//for (int i = 0; i < body().length(); i++)
|
||||
|
||||
if (max_vlen_in_bytes > C->max_vector_size()) {
|
||||
C->set_max_vector_size(max_vlen_in_bytes);
|
||||
}
|
||||
if (max_vlen_in_bytes > 0) {
|
||||
cl->mark_loop_vectorized();
|
||||
if (max_vector_width > C->max_vector_size()) {
|
||||
C->set_max_vector_size(max_vector_width);
|
||||
}
|
||||
|
||||
if (SuperWordLoopUnrollAnalysis) {
|
||||
if (cl->has_passed_slp()) {
|
||||
uint slp_max_unroll_factor = cl->slp_max_unroll();
|
||||
if (slp_max_unroll_factor == max_vlen) {
|
||||
if (cl()->has_passed_slp()) {
|
||||
uint slp_max_unroll_factor = cl()->slp_max_unroll();
|
||||
if (slp_max_unroll_factor == max_vector_length) {
|
||||
#ifndef PRODUCT
|
||||
if (TraceSuperWordLoopUnrollAnalysis) {
|
||||
tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
|
||||
tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vector_length, max_vector_width * BitsPerByte);
|
||||
}
|
||||
#endif
|
||||
// For atomic unrolled loops which are vector mapped, instigate more unrolling
|
||||
cl->set_notpassed_slp();
|
||||
cl()->set_notpassed_slp();
|
||||
// if vector resources are limited, do not allow additional unrolling
|
||||
if (Matcher::float_pressure_limit() > 8) {
|
||||
C->set_major_progress();
|
||||
cl->mark_do_unroll_only();
|
||||
cl()->mark_do_unroll_only();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//------------------------------vector_opd---------------------------
|
||||
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
|
||||
Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
|
||||
Node* p0 = p->at(0);
|
||||
uint vlen = p->size();
|
||||
Node* opd = p0->in(opd_idx);
|
||||
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
|
||||
Node* same_input = _packset.same_inputs_at_index_or_null(p, opd_idx);
|
||||
|
||||
// Insert index population operation to create a vector of increasing
|
||||
// indices starting from the iv value. In some special unrolled loops
|
||||
// (see JDK-8286125), we need scalar replications of the iv value if
|
||||
// all inputs are the same iv, so we do a same inputs check here.
|
||||
if (opd == iv() && same_input == nullptr) {
|
||||
BasicType p0_bt = velt_basic_type(p0);
|
||||
BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
|
||||
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
|
||||
const TypeVect* vt = TypeVect::make(iv_bt, vlen);
|
||||
Node* vn = new PopulateIndexNode(iv(), igvn().intcon(1), vt);
|
||||
VectorNode::trace_new_vector(vn, "SuperWord");
|
||||
phase()->register_new_node_with_ctrl_of(vn, opd);
|
||||
return vn;
|
||||
}
|
||||
|
||||
if (same_input != nullptr) {
|
||||
if (opd->is_Vector() || opd->is_LoadVector()) {
|
||||
if (opd_idx == 2 && VectorNode::is_shift(p0)) {
|
||||
assert(false, "shift's count can't be vector");
|
||||
return nullptr;
|
||||
}
|
||||
return opd; // input is matching vector
|
||||
}
|
||||
if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
|
||||
Node* cnt = opd;
|
||||
// Vector instructions do not mask shift count, do it here.
|
||||
juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
|
||||
const TypeInt* t = opd->find_int_type();
|
||||
if (t != nullptr && t->is_con()) {
|
||||
juint shift = t->get_con();
|
||||
if (shift > mask) { // Unsigned cmp
|
||||
cnt = igvn().intcon(shift & mask);
|
||||
phase()->set_ctrl(cnt, phase()->C->root());
|
||||
}
|
||||
} else {
|
||||
if (t == nullptr || t->_lo < 0 || t->_hi > (int)mask) {
|
||||
cnt = igvn().intcon(mask);
|
||||
cnt = new AndINode(opd, cnt);
|
||||
phase()->register_new_node_with_ctrl_of(cnt, opd);
|
||||
}
|
||||
if (!opd->bottom_type()->isa_int()) {
|
||||
assert(false, "int type only");
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
// Move shift count into vector register.
|
||||
cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0));
|
||||
phase()->register_new_node_with_ctrl_of(cnt, opd);
|
||||
return cnt;
|
||||
}
|
||||
if (opd->is_StoreVector()) {
|
||||
assert(false, "StoreVector is not expected here");
|
||||
return nullptr;
|
||||
}
|
||||
// Convert scalar input to vector with the same number of elements as
|
||||
// p0's vector. Use p0's type because size of operand's container in
|
||||
// vector should match p0's size regardless operand's size.
|
||||
const Type* p0_t = nullptr;
|
||||
VectorNode* vn = nullptr;
|
||||
if (opd_idx == 2 && VectorNode::is_scalar_rotate(p0)) {
|
||||
Node* conv = opd;
|
||||
p0_t = TypeInt::INT;
|
||||
if (p0->bottom_type()->isa_long()) {
|
||||
p0_t = TypeLong::LONG;
|
||||
conv = new ConvI2LNode(opd);
|
||||
phase()->register_new_node_with_ctrl_of(conv, opd);
|
||||
}
|
||||
vn = VectorNode::scalar2vector(conv, vlen, p0_t);
|
||||
} else {
|
||||
p0_t = velt_type(p0);
|
||||
vn = VectorNode::scalar2vector(opd, vlen, p0_t);
|
||||
}
|
||||
|
||||
phase()->register_new_node_with_ctrl_of(vn, opd);
|
||||
VectorNode::trace_new_vector(vn, "SuperWord");
|
||||
return vn;
|
||||
}
|
||||
|
||||
// Insert pack operation
|
||||
BasicType bt = velt_basic_type(p0);
|
||||
PackNode* pk = PackNode::make(opd, vlen, bt);
|
||||
DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
|
||||
|
||||
for (uint i = 1; i < vlen; i++) {
|
||||
Node* pi = p->at(i);
|
||||
Node* in = pi->in(opd_idx);
|
||||
if (get_pack(in) != nullptr) {
|
||||
assert(false, "Should already have been unpacked");
|
||||
return nullptr;
|
||||
}
|
||||
assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
|
||||
pk->add_opd(in);
|
||||
if (VectorNode::is_muladds2i(pi)) {
|
||||
Node* in2 = pi->in(opd_idx + 2);
|
||||
if (get_pack(in2) != nullptr) {
|
||||
assert(false, "Should already have been unpacked");
|
||||
return nullptr;
|
||||
}
|
||||
assert(opd_bt == in2->bottom_type()->basic_type(), "all same type");
|
||||
pk->add_opd(in2);
|
||||
}
|
||||
}
|
||||
phase()->register_new_node_with_ctrl_of(pk, opd);
|
||||
VectorNode::trace_new_vector(pk, "SuperWord");
|
||||
return pk;
|
||||
}
|
||||
|
||||
#ifdef ASSERT
|
||||
@ -2797,18 +2224,7 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) const {
|
||||
return _packset.is_muladds2i_pack_with_pack_inputs(u_pk);
|
||||
}
|
||||
|
||||
if (u_pk->size() != d_pk->size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (uint i = 0; i < u_pk->size(); i++) {
|
||||
Node* ui = u_pk->at(i);
|
||||
Node* di = d_pk->at(i);
|
||||
if (ui->in(u_idx) != di) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return _packset.pack_input_at_index_or_null(u_pk, u_idx) != nullptr;
|
||||
}
|
||||
|
||||
// MulAddS2I takes 4 shorts and produces an int. We can reinterpret
|
||||
@ -3182,10 +2598,10 @@ bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const {
|
||||
_vloop.phase()->C->get_alias_index(m2->adr_type());
|
||||
}
|
||||
|
||||
LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
|
||||
LoadNode::ControlDependency VTransformLoadVectorNode::control_dependency() const {
|
||||
LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
|
||||
for (uint i = 0; i < p->size(); i++) {
|
||||
Node* n = p->at(i);
|
||||
for (int i = 0; i < nodes().length(); i++) {
|
||||
Node* n = nodes().at(i);
|
||||
assert(n->is_Load(), "only meaningful for loads");
|
||||
if (!n->depends_only_on_test()) {
|
||||
if (n->as_Load()->has_unknown_control_dependency() &&
|
||||
@ -3202,8 +2618,8 @@ LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
|
||||
}
|
||||
|
||||
// Find the memop pack with the maximum vector width, unless they were already
|
||||
// determined by SuperWord::filter_packs_for_alignment().
|
||||
void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() {
|
||||
// determined (e.g. by SuperWord::filter_packs_for_alignment()).
|
||||
void VTransform::determine_mem_ref_and_aw_for_main_loop_alignment() {
|
||||
if (_mem_ref_for_main_loop_alignment != nullptr) {
|
||||
assert(VLoop::vectors_should_be_aligned(), "mem_ref only set if filtered for alignment");
|
||||
return;
|
||||
@ -3211,15 +2627,18 @@ void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() {
|
||||
|
||||
MemNode const* mem_ref = nullptr;
|
||||
int max_aw = 0;
|
||||
for (int i = 0; i < _packset.length(); i++) {
|
||||
Node_List* pack = _packset.at(i);
|
||||
MemNode* first = pack->at(0)->isa_Mem();
|
||||
if (first == nullptr) { continue; }
|
||||
|
||||
int vw = first->memory_size() * pack->size();
|
||||
const GrowableArray<VTransformNode*>& vtnodes = _graph.vtnodes();
|
||||
for (int i = 0; i < vtnodes.length(); i++) {
|
||||
VTransformVectorNode* vtn = vtnodes.at(i)->isa_Vector();
|
||||
if (vtn == nullptr) { continue; }
|
||||
MemNode* p0 = vtn->nodes().at(0)->isa_Mem();
|
||||
if (p0 == nullptr) { continue; }
|
||||
|
||||
int vw = p0->memory_size() * vtn->nodes().length();
|
||||
if (vw > max_aw) {
|
||||
max_aw = vw;
|
||||
mem_ref = first;
|
||||
mem_ref = p0;
|
||||
}
|
||||
}
|
||||
assert(mem_ref != nullptr && max_aw > 0, "found mem_ref and aw");
|
||||
@ -3229,7 +2648,7 @@ void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() {
|
||||
|
||||
#define TRACE_ALIGN_VECTOR_NODE(node) { \
|
||||
DEBUG_ONLY( \
|
||||
if (is_trace_align_vector()) { \
|
||||
if (_trace._align_vector) { \
|
||||
tty->print(" " #node ": "); \
|
||||
node->dump(); \
|
||||
} \
|
||||
@ -3240,7 +2659,7 @@ void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() {
|
||||
// the address of "_mem_ref_for_main_loop_alignment" to "_aw_for_main_loop_alignment", which is a
|
||||
// sufficiently large alignment width. We adjust the pre-loop iteration count by adjusting the
|
||||
// pre-loop limit.
|
||||
void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
determine_mem_ref_and_aw_for_main_loop_alignment();
|
||||
const MemNode* align_to_ref = _mem_ref_for_main_loop_alignment;
|
||||
const int aw = _aw_for_main_loop_alignment;
|
||||
@ -3397,8 +2816,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
Node* invar = align_to_ref_p.invar();
|
||||
|
||||
#ifdef ASSERT
|
||||
if (is_trace_align_vector()) {
|
||||
tty->print_cr("\nadjust_pre_loop_limit_to_align_main_loop_vectors:");
|
||||
if (_trace._align_vector) {
|
||||
tty->print_cr("\nVTransform::adjust_pre_loop_limit_to_align_main_loop_vectors:");
|
||||
tty->print(" align_to_ref:");
|
||||
align_to_ref->dump();
|
||||
tty->print_cr(" aw: %d", aw);
|
||||
@ -3424,7 +2843,7 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
scale == 0 || !is_power_of_2(abs(scale)) ||
|
||||
abs(scale) >= aw) {
|
||||
#ifdef ASSERT
|
||||
if (is_trace_align_vector()) {
|
||||
if (_trace._align_vector) {
|
||||
tty->print_cr(" Alignment cannot be affected by changing pre-loop limit because");
|
||||
tty->print_cr(" stride or scale are not power of 2, or abs(scale) >= aw.");
|
||||
}
|
||||
@ -3440,7 +2859,7 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
const int AW = aw / abs(scale);
|
||||
|
||||
#ifdef ASSERT
|
||||
if (is_trace_align_vector()) {
|
||||
if (_trace._align_vector) {
|
||||
tty->print_cr(" AW = aw(%d) / abs(scale(%d)) = %d", aw, scale, AW);
|
||||
}
|
||||
#endif
|
||||
@ -3595,10 +3014,10 @@ void PackSet::print_pack(Node_List* pack) {
|
||||
|
||||
#ifndef PRODUCT
|
||||
void VLoopBody::print() const {
|
||||
tty->print_cr("\nBlock");
|
||||
tty->print_cr("\nVLoopBody::print");
|
||||
for (int i = 0; i < body().length(); i++) {
|
||||
Node* n = body().at(i);
|
||||
tty->print("%d ", i);
|
||||
tty->print("%4d ", i);
|
||||
if (n != nullptr) {
|
||||
n->dump();
|
||||
}
|
||||
@ -3615,3 +3034,4 @@ bool SuperWord::same_origin_idx(Node* a, Node* b) const {
|
||||
// Returns true only if both nodes are non-null and the clone map reports
// that their node indices belong to the same generation.
bool SuperWord::same_generation(Node* a, Node* b) const {
  if (a == nullptr || b == nullptr) {
    return false;
  }
  return _clone_map.same_gen(a->_idx, b->_idx);
}
|
||||
|
||||
|
@ -25,6 +25,7 @@
|
||||
#define SHARE_OPTO_SUPERWORD_HPP
|
||||
|
||||
#include "opto/vectorization.hpp"
|
||||
#include "opto/vtransform.hpp"
|
||||
#include "utilities/growableArray.hpp"
|
||||
|
||||
//
|
||||
@ -367,6 +368,10 @@ public:
|
||||
Node* same_inputs_at_index_or_null(const Node_List* pack, const int index) const;
|
||||
VTransformBoolTest get_bool_test(const Node_List* bool_pack) const;
|
||||
|
||||
// Non-strided variant of strided_pack_input_at_index_or_null: forwards with
// stride 1 and offset 0.
Node_List* pack_input_at_index_or_null(const Node_List* pack, const int index) const {
  const int stride = 1;
  const int offset = 0;
  return strided_pack_input_at_index_or_null(pack, index, stride, offset);
}
|
||||
|
||||
private:
|
||||
SplitStatus split_pack(const char* split_name, Node_List* pack, SplitTask task);
|
||||
public:
|
||||
@ -599,13 +604,6 @@ private:
|
||||
|
||||
DEBUG_ONLY(void verify_packs() const;)
|
||||
|
||||
bool schedule_and_apply();
|
||||
bool apply(Node_List& memops_schedule);
|
||||
void apply_memops_reordering_with_schedule(Node_List& memops_schedule);
|
||||
bool apply_vectorization();
|
||||
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
|
||||
Node* vector_opd(Node_List* p, int opd_idx);
|
||||
|
||||
// Can code be generated for the pack, restricted to size nodes?
|
||||
bool implemented(const Node_List* pack, const uint size) const;
|
||||
// Find the maximal implemented size smaller or equal to the packs size
|
||||
@ -630,11 +628,7 @@ private:
|
||||
|
||||
bool is_velt_basic_type_compatible_use_def(Node* use, Node* def) const;
|
||||
|
||||
static LoadNode::ControlDependency control_dependency(Node_List* p);
|
||||
|
||||
// Ensure that the main loop vectors are aligned by adjusting the pre loop limit.
|
||||
void determine_mem_ref_and_aw_for_main_loop_alignment();
|
||||
void adjust_pre_loop_limit_to_align_main_loop_vectors();
|
||||
bool schedule_and_apply() const;
|
||||
};
|
||||
|
||||
#endif // SHARE_OPTO_SUPERWORD_HPP
|
||||
|
308
src/hotspot/share/opto/superwordVTransformBuilder.cpp
Normal file
308
src/hotspot/share/opto/superwordVTransformBuilder.cpp
Normal file
@ -0,0 +1,308 @@
|
||||
/*
|
||||
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
#include "precompiled.hpp"
|
||||
#include "opto/superwordVTransformBuilder.hpp"
|
||||
#include "opto/vectornode.hpp"
|
||||
|
||||
void SuperWordVTransformBuilder::build() {
|
||||
assert(!_packset.is_empty(), "must have non-empty packset");
|
||||
assert(!_vtransform.has_graph(), "start with empty vtransform");
|
||||
|
||||
// Create vtnodes for all nodes in the loop.
|
||||
build_vector_vtnodes_for_packed_nodes();
|
||||
build_scalar_vtnodes_for_non_packed_nodes();
|
||||
|
||||
// Connect all vtnodes with their inputs. Possibly create vtnodes for input
|
||||
// nodes that are outside the loop.
|
||||
VectorSet vtn_dependencies; // Shared, but cleared for every vtnode.
|
||||
build_inputs_for_vector_vtnodes(vtn_dependencies);
|
||||
build_inputs_for_scalar_vtnodes(vtn_dependencies);
|
||||
}
|
||||
|
||||
void SuperWordVTransformBuilder::build_vector_vtnodes_for_packed_nodes() {
|
||||
for (int i = 0; i < _packset.length(); i++) {
|
||||
Node_List* pack = _packset.at(i);
|
||||
VTransformVectorNode* vtn = make_vector_vtnode_for_pack(pack);
|
||||
for (uint k = 0; k < pack->size(); k++) {
|
||||
map_node_to_vtnode(pack->at(k), vtn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SuperWordVTransformBuilder::build_scalar_vtnodes_for_non_packed_nodes() {
|
||||
for (int i = 0; i < _vloop_analyzer.body().body().length(); i++) {
|
||||
Node* n = _vloop_analyzer.body().body().at(i);
|
||||
if (_packset.get_pack(n) != nullptr) { continue; }
|
||||
VTransformScalarNode* vtn = new (_vtransform.arena()) VTransformScalarNode(_vtransform, n);
|
||||
map_node_to_vtnode(n, vtn);
|
||||
}
|
||||
}
|
||||
|
||||
// Connect every vector vtnode (one per pack) with its inputs.
//
// Which inputs are vectors and which stay scalar depends on the kind of pack:
//   Load:       scalar address.
//   Store:      scalar address, vector value.
//   Reduction:  scalar init at input 1, vector at input 2.
//   Otherwise:  element-wise, with scalar second input for constant rotates
//               and for RoundDoubleMode-like ops (rounding mode constant).
// After the req edges are set, the remaining dependencies of all pack members
// are added. vtn_dependencies is scratch state: it is cleared per vtnode and
// ensures that every dependency is added only once per vtnode.
void SuperWordVTransformBuilder::build_inputs_for_vector_vtnodes(VectorSet& vtn_dependencies) {
  for (int i = 0; i < _packset.length(); i++) {
    Node_List* pack = _packset.at(i);
    Node* p0 = pack->at(0);

    VTransformVectorNode* vtn = get_vtnode(p0)->isa_Vector();
    assert(vtn != nullptr, "all packs must have vector vtnodes");
    vtn_dependencies.clear(); // Add every dependency only once per vtn.

    if (p0->is_Load()) {
      set_req_with_scalar(p0,   vtn, vtn_dependencies, MemNode::Address);
    } else if (p0->is_Store()) {
      set_req_with_scalar(p0,   vtn, vtn_dependencies, MemNode::Address);
      set_req_with_vector(pack, vtn, vtn_dependencies, MemNode::ValueIn);
    } else if (vtn->isa_ReductionVector() != nullptr) {
      set_req_with_scalar(p0,   vtn, vtn_dependencies, 1); // scalar init
      set_req_with_vector(pack, vtn, vtn_dependencies, 2); // vector
    } else {
      assert(vtn->isa_ElementWiseVector() != nullptr, "all other vtnodes are handled above");
      if (VectorNode::is_scalar_rotate(p0) &&
          p0->in(2)->is_Con() &&
          Matcher::supports_vector_constant_rotates(p0->in(2)->get_int())) {
        set_req_with_vector(pack, vtn, vtn_dependencies, 1);
        set_req_with_scalar(p0,   vtn, vtn_dependencies, 2); // constant rotation
      } else if (VectorNode::is_roundopD(p0)) {
        set_req_with_vector(pack, vtn, vtn_dependencies, 1);
        set_req_with_scalar(p0,   vtn, vtn_dependencies, 2); // constant rounding mode
      } else if (p0->is_CMove()) {
        // Cmp + Bool + CMove -> VectorMaskCmp + VectorBlend.
        set_all_req_with_vectors(pack, vtn, vtn_dependencies);
        VTransformBoolVectorNode* vtn_mask_cmp = vtn->in(1)->isa_BoolVector();
        // isa_BoolVector returns nullptr for any other kind of vtnode, so
        // check before dereferencing.
        assert(vtn_mask_cmp != nullptr, "CMove pack must have a BoolVector as mask input");
        if (vtn_mask_cmp->test()._is_negated) {
          vtn->swap_req(2, 3); // swap if test was negated.
        }
      } else {
        set_all_req_with_vectors(pack, vtn, vtn_dependencies);
      }
    }

    // Add the dependencies of all pack members that are not yet covered by
    // the req edges set above.
    for (uint k = 0; k < pack->size(); k++) {
      add_dependencies_of_node_to_vtnode(pack->at(k), vtn, vtn_dependencies);
    }
  }
}
|
||||
|
||||
// Connect every scalar vtnode of the loop body with its inputs. Nodes whose
// vtnode is not a scalar vtnode are skipped. vtn_dependencies is scratch state
// that is cleared for every vtnode, so that each dependency is added only once.
void SuperWordVTransformBuilder::build_inputs_for_scalar_vtnodes(VectorSet& vtn_dependencies) {
  for (int body_idx = 0; body_idx < _vloop_analyzer.body().body().length(); body_idx++) {
    Node* node = _vloop_analyzer.body().body().at(body_idx);
    VTransformScalarNode* scalar_vtn = get_vtnode(node)->isa_Scalar();
    if (scalar_vtn == nullptr) {
      continue;
    }
    vtn_dependencies.clear(); // Add every dependency only once per vtn.

    if (node->is_Load()) {
      // Load: only the address is a req.
      set_req_with_scalar(node, scalar_vtn, vtn_dependencies, MemNode::Address);
    } else if (node->is_Store()) {
      // Store: address and value are reqs.
      set_req_with_scalar(node, scalar_vtn, vtn_dependencies, MemNode::Address);
      set_req_with_scalar(node, scalar_vtn, vtn_dependencies, MemNode::ValueIn);
    } else if (node->is_CountedLoop()) {
      // Is "root", has no dependency.
      continue;
    } else if (node->is_Phi()) {
      // CountedLoop Phi's: ignore backedge (and entry value), only the
      // control input is connected and no further dependencies are added.
      assert(node->in(0) == _vloop.cl(), "only Phi's from the CountedLoop allowed");
      set_req_with_scalar(node, scalar_vtn, vtn_dependencies, 0);
      continue;
    } else {
      set_all_req_with_scalars(node, scalar_vtn, vtn_dependencies);
    }

    add_dependencies_of_node_to_vtnode(node, scalar_vtn, vtn_dependencies);
  }
}
|
||||
|
||||
// Create a vtnode for each pack. No in/out edges set yet.
|
||||
VTransformVectorNode* SuperWordVTransformBuilder::make_vector_vtnode_for_pack(const Node_List* pack) const {
|
||||
uint pack_size = pack->size();
|
||||
Node* p0 = pack->at(0);
|
||||
int opc = p0->Opcode();
|
||||
VTransformVectorNode* vtn = nullptr;
|
||||
|
||||
if (p0->is_Load()) {
|
||||
vtn = new (_vtransform.arena()) VTransformLoadVectorNode(_vtransform, pack_size);
|
||||
} else if (p0->is_Store()) {
|
||||
vtn = new (_vtransform.arena()) VTransformStoreVectorNode(_vtransform, pack_size);
|
||||
} else if (p0->is_Bool()) {
|
||||
VTransformBoolTest kind = _packset.get_bool_test(pack);
|
||||
vtn = new (_vtransform.arena()) VTransformBoolVectorNode(_vtransform, pack_size, kind);
|
||||
} else if (_vloop_analyzer.reductions().is_marked_reduction(p0)) {
|
||||
vtn = new (_vtransform.arena()) VTransformReductionVectorNode(_vtransform, pack_size);
|
||||
} else if (VectorNode::is_muladds2i(p0)) {
|
||||
// A special kind of binary element-wise vector op: the inputs are "ints" a and b,
|
||||
// but reinterpreted as two "shorts" [a0, a1] and [b0, b1]:
|
||||
// v = MulAddS2I(a, b) = a0 * b0 + a1 + b1
|
||||
assert(p0->req() == 5, "MulAddS2I should have 4 operands");
|
||||
vtn = new (_vtransform.arena()) VTransformElementWiseVectorNode(_vtransform, 3, pack_size);
|
||||
} else {
|
||||
assert(p0->req() == 3 ||
|
||||
p0->is_CMove() ||
|
||||
VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc) ||
|
||||
VectorNode::is_convert_opcode(opc) ||
|
||||
VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc) ||
|
||||
opc == Op_FmaD ||
|
||||
opc == Op_FmaF ||
|
||||
opc == Op_SignumF ||
|
||||
opc == Op_SignumD,
|
||||
"pack type must be in this list");
|
||||
vtn = new (_vtransform.arena()) VTransformElementWiseVectorNode(_vtransform, p0->req(), pack_size);
|
||||
}
|
||||
vtn->set_nodes(pack);
|
||||
return vtn;
|
||||
}
|
||||
|
||||
// Set the req of vtn at index to the scalar vtnode of n->in(index), wrapping
// the input as an input scalar if it has no vtnode yet. The input is also
// recorded in vtn_dependencies.
void SuperWordVTransformBuilder::set_req_with_scalar(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies, const int index) {
  Node* scalar_input = n->in(index);
  VTransformNode* input_vtn = get_vtnode_or_wrap_as_input_scalar(scalar_input);
  vtn->set_req(index, input_vtn);
  vtn_dependencies.set(input_vtn->_idx);
}
|
||||
|
||||
// Either get the existing vtnode vector input (when input is a pack), or else make a
|
||||
// new vector vtnode for the input (e.g. for Replicate or PopulateIndex).
|
||||
VTransformNode* SuperWordVTransformBuilder::get_or_make_vtnode_vector_input_at_index(const Node_List* pack, const int index) {
|
||||
Node* p0 = pack->at(0);
|
||||
|
||||
Node_List* pack_in = _packset.pack_input_at_index_or_null(pack, index);
|
||||
if (pack_in != nullptr) {
|
||||
// Input is a matching pack -> vtnode already exists.
|
||||
assert(index != 2 || !VectorNode::is_shift(p0), "shift's count cannot be vector");
|
||||
return get_vtnode(pack_in->at(0));
|
||||
}
|
||||
|
||||
if (VectorNode::is_muladds2i(p0)) {
|
||||
assert(_packset.is_muladds2i_pack_with_pack_inputs(pack), "inputs must all be packs");
|
||||
// All inputs are strided (stride = 2), either with offset 0 or 1.
|
||||
Node_List* pack_in0 = _packset.strided_pack_input_at_index_or_null(pack, index, 2, 0);
|
||||
if (pack_in0 != nullptr) {
|
||||
return get_vtnode(pack_in0->at(0));
|
||||
}
|
||||
Node_List* pack_in1 = _packset.strided_pack_input_at_index_or_null(pack, index, 2, 1);
|
||||
if (pack_in1 != nullptr) {
|
||||
return get_vtnode(pack_in1->at(0));
|
||||
}
|
||||
}
|
||||
|
||||
Node* same_input = _packset.same_inputs_at_index_or_null(pack, index);
|
||||
if (same_input == nullptr && p0->in(index) == _vloop.iv()) {
|
||||
// PopulateIndex: [iv+0, iv+1, iv+2, ...]
|
||||
VTransformNode* iv_vtn = get_vtnode_or_wrap_as_input_scalar(_vloop.iv());
|
||||
BasicType p0_bt = _vloop_analyzer.types().velt_basic_type(p0);
|
||||
// If we have subword type, take that type directly. If p0 is some ConvI2L/F/D,
|
||||
// then the p0_bt can also be L/F/D but we need to produce ints for the input of
|
||||
// the ConvI2L/F/D.
|
||||
BasicType element_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
|
||||
VTransformNode* populate_index = new (_vtransform.arena()) VTransformPopulateIndexNode(_vtransform, pack->size(), element_bt);
|
||||
populate_index->set_req(1, iv_vtn);
|
||||
return populate_index;
|
||||
}
|
||||
|
||||
if (same_input != nullptr) {
|
||||
VTransformNode* same_input_vtn = get_vtnode_or_wrap_as_input_scalar(same_input);
|
||||
if (index == 2 && VectorNode::is_shift(p0)) {
|
||||
// Scalar shift count for vector shift operation: vec2 = shiftV(vec1, scalar_count)
|
||||
// Scalar shift operations masks the shift count, but the vector shift does not, so
|
||||
// create a special ShiftCount node.
|
||||
BasicType element_bt = _vloop_analyzer.types().velt_basic_type(p0);
|
||||
juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
|
||||
VTransformNode* shift_count = new (_vtransform.arena()) VTransformShiftCountNode(_vtransform, pack->size(), element_bt, mask, p0->Opcode());
|
||||
shift_count->set_req(1, same_input_vtn);
|
||||
return shift_count;
|
||||
} else {
|
||||
// Replicate the scalar same_input to every vector element.
|
||||
const Type* element_type = _vloop_analyzer.types().velt_type(p0);
|
||||
if (index == 2 && VectorNode::is_scalar_rotate(p0) && element_type->isa_long()) {
|
||||
// Scalar rotate has int rotation value, but the scalar rotate expects longs.
|
||||
assert(same_input->bottom_type()->isa_int(), "scalar rotate expects int rotation");
|
||||
VTransformNode* conv = new (_vtransform.arena()) VTransformConvI2LNode(_vtransform);
|
||||
conv->set_req(1, same_input_vtn);
|
||||
same_input_vtn = conv;
|
||||
}
|
||||
VTransformNode* replicate = new (_vtransform.arena()) VTransformReplicateNode(_vtransform, pack->size(), element_type);
|
||||
replicate->set_req(1, same_input_vtn);
|
||||
return replicate;
|
||||
}
|
||||
}
|
||||
|
||||
// The input is neither a pack not a same_input node. SuperWord::profitable does not allow
|
||||
// any other case. In the future, we could insert a PackNode.
|
||||
#ifdef ASSERT
|
||||
tty->print_cr("\nSuperWordVTransformBuilder::get_or_make_vtnode_vector_input_at_index: index=%d", index);
|
||||
pack->dump();
|
||||
assert(false, "Pack input was neither a pack nor a same_input node");
|
||||
#endif
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
|
||||
// Return the vtnode mapped to n. If there is none yet, n is expected to be
// outside the loop: wrap it in a new input scalar vtnode, register the
// mapping, and return the new vtnode.
VTransformNode* SuperWordVTransformBuilder::get_vtnode_or_wrap_as_input_scalar(Node* n) {
  VTransformNode* existing = get_vtnode_or_null(n);
  if (existing == nullptr) {
    assert(!_vloop.in_bb(n), "only nodes outside the loop can be input nodes to the loop");
    VTransformNode* wrapper = new (_vtransform.arena()) VTransformInputScalarNode(_vtransform, n);
    map_node_to_vtnode(n, wrapper);
    return wrapper;
  }
  return existing;
}
|
||||
|
||||
void SuperWordVTransformBuilder::set_req_with_vector(const Node_List* pack, VTransformNode* vtn, VectorSet& vtn_dependencies, int j) {
|
||||
VTransformNode* req = get_or_make_vtnode_vector_input_at_index(pack, j);
|
||||
vtn->set_req(j, req);
|
||||
vtn_dependencies.set(req->_idx);
|
||||
}
|
||||
|
||||
// Set every req of vtn to the scalar vtnode of the corresponding input of n.
// Inputs of n that are nullptr are left unset.
void SuperWordVTransformBuilder::set_all_req_with_scalars(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies) {
  assert(vtn->req() == n->req(), "scalars must have same number of reqs");
  for (uint input_idx = 0; input_idx < n->req(); input_idx++) {
    if (n->in(input_idx) != nullptr) {
      set_req_with_scalar(n, vtn, vtn_dependencies, input_idx);
    }
  }
}
|
||||
|
||||
void SuperWordVTransformBuilder::set_all_req_with_vectors(const Node_List* pack, VTransformNode* vtn, VectorSet& vtn_dependencies) {
|
||||
Node* p0 = pack->at(0);
|
||||
assert(vtn->req() <= p0->req(), "must have at at most as many reqs");
|
||||
// Vectors have no ctrl, so ignore it.
|
||||
for (uint j = 1; j < vtn->req(); j++) {
|
||||
Node* def = p0->in(j);
|
||||
if (def == nullptr) { continue; }
|
||||
set_req_with_vector(pack, vtn, vtn_dependencies, j);
|
||||
}
|
||||
}
|
||||
|
||||
// Add the dependencies of n, as given by the dependency graph, to vtn.
// Skipped are:
//   - predecessors outside the loop,
//   - non-memory predecessors of memory nodes,
//   - the self-cycle of a marked reduction,
//   - dependencies already recorded in vtn_dependencies.
void SuperWordVTransformBuilder::add_dependencies_of_node_to_vtnode(Node*n, VTransformNode* vtn, VectorSet& vtn_dependencies) {
  VLoopDependencyGraph::PredsIterator preds(_vloop_analyzer.dependency_graph(), n);
  for (; !preds.done(); preds.next()) {
    Node* pred = preds.current();

    // Only predecessors inside the loop are considered. And only memory
    // dependencies are added to memory nodes, all others are taken care of
    // with the req.
    if (!_vloop.in_bb(pred) || (n->is_Mem() && !pred->is_Mem())) {
      continue;
    }

    VTransformNode* pred_vtn = get_vtnode(pred);

    // Reduction self-cycle?
    const bool is_self = (vtn == pred_vtn);
    if (is_self && _vloop_analyzer.reductions().is_marked_reduction(n)) {
      continue;
    }

    // Add every dependency only once per vtn: test_set also records it.
    if (!vtn_dependencies.test_set(pred_vtn->_idx)) {
      vtn->add_dependency(pred_vtn);
    }
  }
}
|
||||
|
87
src/hotspot/share/opto/superwordVTransformBuilder.hpp
Normal file
87
src/hotspot/share/opto/superwordVTransformBuilder.hpp
Normal file
@ -0,0 +1,87 @@
|
||||
/*
|
||||
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
#ifndef SHARE_OPTO_SUPERWORD_VTRANSFORM_BUILDER_HPP
#define SHARE_OPTO_SUPERWORD_VTRANSFORM_BUILDER_HPP

#include "opto/superword.hpp"
#include "opto/vtransform.hpp"

// Facility class that builds a VTransform from a SuperWord PackSet.
//
// The whole build happens in the constructor: it expects a vtransform without
// a graph, and leaves it with a graph that contains the vtnodes for the loop.
class SuperWordVTransformBuilder : public StackObj {
private:
  const VLoopAnalyzer& _vloop_analyzer;
  const VLoop& _vloop;
  const PackSet& _packset;
  VTransform& _vtransform;

  // Maps Node::_idx to the vtnode created for that node.
  ResourceHashtable</* Node::_idx*/ int, VTransformNode* /* or null*/> _idx_to_vtnode;

public:
  SuperWordVTransformBuilder(const PackSet& packset,
                             VTransform& vtransform) :
      _vloop_analyzer(vtransform.vloop_analyzer()),
      _vloop(_vloop_analyzer.vloop()),
      _packset(packset),
      _vtransform(vtransform)
  {
    assert(!_vtransform.has_graph(), "constructor is passed an empty vtransform");
    build();
    assert(_vtransform.has_graph(), "vtransform must contain some vtnodes now");
  }

private:
  // Build steps, called in this order by build().
  void build();
  void build_vector_vtnodes_for_packed_nodes();
  void build_scalar_vtnodes_for_non_packed_nodes();
  void build_inputs_for_vector_vtnodes(VectorSet& vtn_dependencies);
  void build_inputs_for_scalar_vtnodes(VectorSet& vtn_dependencies);

  // Helper methods for building VTransform.

  // Look up the vtnode mapped to n, or nullptr if n has no mapping.
  VTransformNode* get_vtnode_or_null(Node* n) const {
    VTransformNode** ptr = _idx_to_vtnode.get(n->_idx);
    return (ptr == nullptr) ? nullptr : *ptr;
  }

  // Look up the vtnode mapped to n, which must exist.
  VTransformNode* get_vtnode(Node* n) const {
    VTransformNode* vtn = get_vtnode_or_null(n);
    assert(vtn != nullptr, "expect non-null vtnode");
    return vtn;
  }

  // Record that n is represented by vtn.
  void map_node_to_vtnode(Node* n, VTransformNode* vtn) {
    assert(vtn != nullptr, "only set non-null vtnodes");
    _idx_to_vtnode.put_when_absent(n->_idx, vtn);
  }

  VTransformVectorNode* make_vector_vtnode_for_pack(const Node_List* pack) const;
  VTransformNode* get_or_make_vtnode_vector_input_at_index(const Node_List* pack, const int index);
  VTransformNode* get_vtnode_or_wrap_as_input_scalar(Node* n);
  void set_req_with_scalar(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies, const int index);
  void set_req_with_vector(const Node_List* pack, VTransformNode* vtn, VectorSet& vtn_dependencies, const int index);
  void set_all_req_with_scalars(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies);
  void set_all_req_with_vectors(const Node_List* pack, VTransformNode* vtn, VectorSet& vtn_dependencies);
  void add_dependencies_of_node_to_vtnode(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies);
};

#endif // SHARE_OPTO_SUPERWORD_VTRANSFORM_BUILDER_HPP
|
@ -43,6 +43,7 @@
|
||||
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
|
||||
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
|
||||
flags(ALIGN_VECTOR, "Trace AlignVector") \
|
||||
flags(VTRANSFORM, "Trace VTransform Graph") \
|
||||
flags(ALL, "Trace everything (very verbose)")
|
||||
|
||||
#define table_entry(name, description) name,
|
||||
|
@ -26,7 +26,6 @@
|
||||
#include "opto/addnode.hpp"
|
||||
#include "opto/connode.hpp"
|
||||
#include "opto/convertnode.hpp"
|
||||
#include "opto/matcher.hpp"
|
||||
#include "opto/mulnode.hpp"
|
||||
#include "opto/rootnode.hpp"
|
||||
#include "opto/vectorization.hpp"
|
||||
|
@ -25,7 +25,7 @@
|
||||
#ifndef SHARE_OPTO_VECTORIZATION_HPP
|
||||
#define SHARE_OPTO_VECTORIZATION_HPP
|
||||
|
||||
#include "opto/node.hpp"
|
||||
#include "opto/matcher.hpp"
|
||||
#include "opto/loopnode.hpp"
|
||||
#include "opto/traceAutoVectorizationTag.hpp"
|
||||
#include "utilities/pair.hpp"
|
||||
@ -763,9 +763,9 @@ class VPointer : public ArenaObj {
|
||||
}
|
||||
}
|
||||
|
||||
bool overlap_possible_with_any_in(const Node_List* p) const {
|
||||
for (uint k = 0; k < p->size(); k++) {
|
||||
MemNode* mem = p->at(k)->as_Mem();
|
||||
bool overlap_possible_with_any_in(const GrowableArray<Node*>& nodes) const {
|
||||
for (int i = 0; i < nodes.length(); i++) {
|
||||
MemNode* mem = nodes.at(i)->as_Mem();
|
||||
VPointer p_mem(mem, _vloop);
|
||||
// Only if we know that we have Less or Greater can we
|
||||
// be sure that there can never be an overlap between
|
||||
@ -1323,12 +1323,4 @@ private:
|
||||
#endif
|
||||
};
|
||||
|
||||
struct VTransformBoolTest {
|
||||
const BoolTest::mask _mask;
|
||||
const bool _is_negated;
|
||||
|
||||
VTransformBoolTest(const BoolTest::mask mask, bool is_negated) :
|
||||
_mask(mask), _is_negated(is_negated) {}
|
||||
};
|
||||
|
||||
#endif // SHARE_OPTO_VECTORIZATION_HPP
|
||||
|
450
src/hotspot/share/opto/vtransform.cpp
Normal file
450
src/hotspot/share/opto/vtransform.cpp
Normal file
@ -0,0 +1,450 @@
|
||||
/*
|
||||
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
#include "precompiled.hpp"
|
||||
#include "opto/vtransform.hpp"
|
||||
#include "opto/vectornode.hpp"
|
||||
#include "opto/convertnode.hpp"
|
||||
|
||||
// Append a vtnode to the graph. The node's _idx must be equal to the position
// it gets in _vtnodes, i.e. the current length of the array.
void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
  assert(_vtnodes.length() == vtnode->_idx, "position must match idx");
  _vtnodes.push(vtnode);
}
|
||||
|
||||
// Compute a linearization of the graph. We do this with a reverse-post-order of a DFS.
|
||||
// This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and
|
||||
// the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the
|
||||
// graph has additional constraints which can introduce cycles. Example:
|
||||
//
|
||||
// +--------+
|
||||
// A -> X | v
|
||||
// Pack [A,B] and [X,Y] [A,B] [X,Y]
|
||||
// Y -> B ^ |
|
||||
// +--------+
|
||||
//
|
||||
// We return "true" IFF we find no cycle, i.e. if the linearization succeeds.
|
||||
bool VTransformGraph::schedule() {
|
||||
assert(!is_scheduled(), "not yet scheduled");
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (_trace._verbose) {
|
||||
print_vtnodes();
|
||||
}
|
||||
#endif
|
||||
|
||||
ResourceMark rm;
|
||||
GrowableArray<VTransformNode*> stack;
|
||||
VectorSet pre_visited;
|
||||
VectorSet post_visited;
|
||||
|
||||
collect_nodes_without_req_or_dependency(stack);
|
||||
|
||||
// We create a reverse-post-visit order. This gives us a linearization, if there are
|
||||
// no cycles. Then, we simply reverse the order, and we have a schedule.
|
||||
int rpo_idx = _vtnodes.length() - 1;
|
||||
while (!stack.is_empty()) {
|
||||
VTransformNode* vtn = stack.top();
|
||||
if (!pre_visited.test_set(vtn->_idx)) {
|
||||
// Forward arc in graph (pre-visit).
|
||||
} else if (!post_visited.test(vtn->_idx)) {
|
||||
// Forward arc in graph. Check if all uses were already visited:
|
||||
// Yes -> post-visit.
|
||||
// No -> we are mid-visit.
|
||||
bool all_uses_already_visited = true;
|
||||
|
||||
for (int i = 0; i < vtn->outs(); i++) {
|
||||
VTransformNode* use = vtn->out(i);
|
||||
if (post_visited.test(use->_idx)) { continue; }
|
||||
if (pre_visited.test(use->_idx)) {
|
||||
// Cycle detected!
|
||||
// The nodes that are pre_visited but not yet post_visited form a path from
|
||||
// the "root" to the current vtn. Now, we are looking at an edge (vtn, use),
|
||||
// and discover that use is also pre_visited but not post_visited. Thus, use
|
||||
// lies on that path from "root" to vtn, and the edge (vtn, use) closes a
|
||||
// cycle.
|
||||
NOT_PRODUCT(if (_trace._rejections) { trace_schedule_cycle(stack, pre_visited, post_visited); } )
|
||||
return false;
|
||||
}
|
||||
stack.push(use);
|
||||
all_uses_already_visited = false;
|
||||
}
|
||||
|
||||
if (all_uses_already_visited) {
|
||||
stack.pop();
|
||||
post_visited.set(vtn->_idx); // post-visit
|
||||
_schedule.at_put_grow(rpo_idx--, vtn); // assign rpo_idx
|
||||
}
|
||||
} else {
|
||||
stack.pop(); // Already post-visited. Ignore secondary edge.
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (_trace._verbose) {
|
||||
print_schedule();
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(rpo_idx == -1, "used up all rpo_idx, rpo_idx=%d", rpo_idx);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Push all "root" nodes, i.e. those that have no inputs (req or dependency):
|
||||
void VTransformGraph::collect_nodes_without_req_or_dependency(GrowableArray<VTransformNode*>& stack) const {
|
||||
for (int i = 0; i < _vtnodes.length(); i++) {
|
||||
VTransformNode* vtn = _vtnodes.at(i);
|
||||
if (!vtn->has_req_or_dependency()) {
|
||||
stack.push(vtn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
|
||||
const VectorSet& pre_visited,
|
||||
const VectorSet& post_visited) const {
|
||||
tty->print_cr("\nVTransform::schedule found a cycle on path (P), vectorization attempt fails.");
|
||||
for (int j = 0; j < stack.length(); j++) {
|
||||
VTransformNode* n = stack.at(j);
|
||||
bool on_path = pre_visited.test(n->_idx) && !post_visited.test(n->_idx);
|
||||
tty->print(" %s ", on_path ? "P" : "_");
|
||||
n->print();
|
||||
}
|
||||
}
|
||||
|
||||
// Print which vtnode was applied, and the C2 node it produced ("nullptr" if
// this result carries no node).
void VTransformApplyResult::trace(VTransformNode* vtnode) const {
  tty->print(" apply: ");
  vtnode->print();
  tty->print(" -> ");
  if (_node != nullptr) {
    _node->dump();
    return;
  }
  tty->print_cr("nullptr");
}
|
||||
#endif
|
||||
|
||||
// Return the C2 node recorded for the i-th input vtnode of this vtnode.
// The table is indexed by the input vtnode's _idx; the entry must be non-null.
Node* VTransformNode::find_transformed_input(int i, const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  const VTransformNodeIDX input_idx = in(i)->_idx;
  Node* transformed = vnode_idx_to_transformed_node.at(input_idx);
  assert(transformed != nullptr, "must find input IR node");
  return transformed;
}
|
||||
|
||||
// A scalar vtnode only wraps an existing C2 node. Applying it simply unwraps
// that node again, without touching the inputs.
VTransformApplyResult VTransformScalarNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                  const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  return VTransformApplyResult::make_scalar(_node);
}
|
||||
|
||||
// Take the transformed scalar input (in(1)) and create a vector node from it with
// VectorNode::scalar2vector, using _vlen elements of type _element_type.
VTransformApplyResult VTransformReplicateNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                     const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  Node* scalar_in = find_transformed_input(1, vnode_idx_to_transformed_node);
  VectorNode* replicate = VectorNode::scalar2vector(scalar_in, _vlen, _element_type);
  // The new node gets its ctrl from the scalar input.
  register_new_node_from_vectorization(vloop_analyzer, replicate, scalar_in);
  return VTransformApplyResult::make_vector(replicate, _vlen, replicate->length_in_bytes());
}
|
||||
|
||||
// Create a new scalar ConvI2L node on top of the transformed input (in(1)).
VTransformApplyResult VTransformConvI2LNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                   const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  Node* int_in = find_transformed_input(1, vnode_idx_to_transformed_node);
  Node* conv = new ConvI2LNode(int_in);
  // The new node gets its ctrl from the input.
  register_new_node_from_vectorization(vloop_analyzer, conv, int_in);
  return VTransformApplyResult::make_scalar(conv);
}
|
||||
|
||||
// Create the shift count vector for a vector shift: mask the scalar int shift
// count with _mask, then turn the masked value into a shift count vector node.
VTransformApplyResult VTransformShiftCountNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                      const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
  Node* count_in = find_transformed_input(1, vnode_idx_to_transformed_node);
  assert(count_in->bottom_type()->isa_int(), "int type only for shift count");

  // The shift count would be automatically truncated to the lowest _mask
  // bits in a scalar shift operation. But vector shift does not truncate, so
  // we must apply the mask now.
  Node* mask_con = phase->igvn().intcon(_mask);
  Node* count_masked = new AndINode(count_in, mask_con);
  register_new_node_from_vectorization(vloop_analyzer, count_masked, count_in);

  // Now the masked value is "broadcast" (some platforms only set the lowest element).
  VectorNode* count_vector = VectorNode::shift_count(_shift_opcode, count_masked, _vlen, _element_bt);
  register_new_node_from_vectorization(vloop_analyzer, count_vector, count_in);

  return VTransformApplyResult::make_vector(count_vector, _vlen, count_vector->length_in_bytes());
}
|
||||
|
||||
|
||||
// Create a PopulateIndex vector node from the transformed input (in(1)), which is
// expected to be the iv Phi. The second input of the new node is the int constant 1.
VTransformApplyResult VTransformPopulateIndexNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                         const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
  Node* iv = find_transformed_input(1, vnode_idx_to_transformed_node);
  assert(iv->is_Phi(), "expected to be iv");
  assert(VectorNode::is_populate_index_supported(_element_bt), "should support");

  const TypeVect* vector_type = TypeVect::make(_element_bt, _vlen);
  Node* one = phase->igvn().intcon(1);
  VectorNode* populate_index = new PopulateIndexNode(iv, one, vector_type);
  register_new_node_from_vectorization(vloop_analyzer, populate_index, iv);

  return VTransformApplyResult::make_vector(populate_index, _vlen, populate_index->length_in_bytes());
}
|
||||
|
||||
// Create the vector node for a pack of element-wise scalar nodes. The kind of vector
// node is selected from the first scalar node of the pack (opcode, element type).
// A pack of Cmp nodes produces no node here, see the comment below.
VTransformApplyResult VTransformElementWiseVectorNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                             const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  Node* p0 = nodes().at(0);
  const uint vlen = nodes().length();
  const int opc = p0->Opcode();
  const BasicType bt = vloop_analyzer.types().velt_basic_type(p0);

  if (p0->is_Cmp()) {
    // Cmp + Bool -> VectorMaskCmp
    // Handled by Bool / VTransformBoolVectorNode, so we do not generate any nodes here.
    return VTransformApplyResult::make_empty();
  }

  assert(2 <= req() && req() <= 4, "Must have 1-3 inputs");

  // in(1) always exists, in(2) and in(3) only if this vtnode has that many req.
  Node* in1 = find_transformed_input(1, vnode_idx_to_transformed_node);
  Node* in2 = nullptr;
  if (req() >= 3) {
    in2 = find_transformed_input(2, vnode_idx_to_transformed_node);
  }
  Node* in3 = nullptr;
  if (req() >= 4) {
    in3 = find_transformed_input(3, vnode_idx_to_transformed_node);
  }

  VectorNode* vn = nullptr;
  if (p0->is_CMove()) {
    assert(req() == 4, "three inputs expected: mask, blend1, blend2");
    vn = new VectorBlendNode(/* blend1 */ in2, /* blend2 */ in3, /* mask */ in1);
  } else if (VectorNode::is_convert_opcode(opc)) {
    assert(p0->req() == 2 && req() == 2, "only one input expected");
    const BasicType in_bt = in1->bottom_type()->is_vect()->element_basic_type();
    int vopc = VectorCastNode::opcode(opc, in_bt);
    vn = VectorCastNode::make(vopc, in1, bt, vlen);
  } else if (VectorNode::can_use_RShiftI_instead_of_URShiftI(p0, bt)) {
    // Use the RShiftI opcode instead of the opcode of the scalar node.
    vn = VectorNode::make(Op_RShiftI, in1, in2, vlen, bt);
  } else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) {
    // The scalar operation was a long -> int operation.
    // However, the vector operation is long -> long.
    VectorNode* long_vn = VectorNode::make(opc, in1, nullptr, vlen, T_LONG);
    register_new_node_from_vectorization(vloop_analyzer, long_vn, p0);
    // Cast long -> int, to mimic the scalar long -> int operation.
    vn = VectorCastNode::make(Op_VectorCastL2X, long_vn, T_INT, vlen);
  } else if (req() == 3 ||
             VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc)) {
    assert(!VectorNode::is_roundopD(p0) || in2->is_Con(), "rounding mode must be constant");
    vn = VectorNode::make(opc, in1, in2, vlen, bt); // unary and binary
  } else {
    assert(req() == 4, "three inputs expected");
    assert(opc == Op_FmaD  ||
           opc == Op_FmaF  ||
           opc == Op_SignumF ||
           opc == Op_SignumD,
           "element wise operation must be from this list");
    vn = VectorNode::make(opc, in1, in2, in3, vlen, bt); // ternary
  }

  register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, vn);
  return VTransformApplyResult::make_vector(vn, vlen, vn->length_in_bytes());
}
|
||||
|
||||
// Cmp + Bool -> VectorMaskCmp
// The input vtnode in(1) must be the element-wise vtnode of the Cmp pack. That vtnode
// creates no node itself, so we take its two transformed inputs and create the
// VectorMaskCmp here, with the mask of this vtnode's test.
VTransformApplyResult VTransformBoolVectorNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                      const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  BoolNode* bool0 = nodes().at(0)->as_Bool();
  const uint vlen = nodes().length();
  const BasicType bt = vloop_analyzer.types().velt_basic_type(bool0);

  VTransformElementWiseVectorNode* cmp_vtn = in(1)->isa_ElementWiseVector();
  assert(cmp_vtn != nullptr && cmp_vtn->nodes().at(0)->is_Cmp(),
         "bool vtn expects cmp vtn as input");

  Node* lhs = cmp_vtn->find_transformed_input(1, vnode_idx_to_transformed_node);
  Node* rhs = cmp_vtn->find_transformed_input(2, vnode_idx_to_transformed_node);
  const BoolTest::mask mask = test()._mask;

  PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
  ConINode* mask_con = phase->igvn().intcon((int)mask);
  const TypeVect* vector_type = TypeVect::make(bt, vlen);
  VectorNode* mask_cmp = new VectorMaskCmpNode(mask, lhs, rhs, mask_con, vector_type);
  register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, mask_cmp);

  return VTransformApplyResult::make_vector(mask_cmp, vlen, mask_cmp->vect_type()->length_in_bytes());
}
|
||||
|
||||
// Create the ReductionNode for a reduction pack. Opcode and basic type are taken
// from the first scalar node of the pack; in(1) is the init value, in(2) the vector.
VTransformApplyResult VTransformReductionVectorNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                           const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  Node* p0 = nodes().at(0);
  const uint vlen = nodes().length();
  const int opc = p0->Opcode();
  const BasicType bt = p0->bottom_type()->basic_type();

  Node* init_in = find_transformed_input(1, vnode_idx_to_transformed_node);
  Node* vector_in = find_transformed_input(2, vnode_idx_to_transformed_node);

  ReductionNode* reduction = ReductionNode::make(opc, nullptr, init_in, vector_in, bt);
  register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, reduction);

  return VTransformApplyResult::make_vector(reduction, vlen, reduction->vect_type()->length_in_bytes());
}
|
||||
|
||||
// Create the LoadVector for a pack of loads. Control, address, address type and
// opcode are taken from the first load of the pack. The memory input starts at the
// first load's memory input and is moved up the memory chain where possible.
VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                      const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  LoadNode* load0 = nodes().at(0)->as_Load();
  const uint vlen = nodes().length();
  const int opc = load0->Opcode();
  const BasicType bt = vloop_analyzer.types().velt_basic_type(load0);
  const TypePtr* adr_type = load0->adr_type();
  Node* ctrl = load0->in(MemNode::Control);
  Node* adr = load0->in(MemNode::Address);
  Node* mem = load0->in(MemNode::Memory);

  // Set the memory dependency of the LoadVector as early as possible.
  // Walk up the memory chain, and ignore any StoreVector that provably
  // does not have any memory dependency.
  while (mem->is_StoreVector()) {
    VPointer p_store(mem->as_Mem(), vloop_analyzer.vloop());
    if (p_store.overlap_possible_with_any_in(nodes())) {
      break;
    }
    mem = mem->in(MemNode::Memory);
  }

  LoadVectorNode* load_vector = LoadVectorNode::make(opc, ctrl, mem, adr, adr_type, vlen, bt,
                                                     control_dependency());
  DEBUG_ONLY( if (VerifyAlignVector) { load_vector->set_must_verify_alignment(); } )
  register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, load_vector);

  return VTransformApplyResult::make_vector(load_vector, vlen, load_vector->memory_size());
}
|
||||
|
||||
// Create the StoreVector for a pack of stores. Control, memory, address, address
// type and opcode are taken from the first store of the pack; the stored value is
// the transformed input at MemNode::ValueIn.
VTransformApplyResult VTransformStoreVectorNode::apply(const VLoopAnalyzer& vloop_analyzer,
                                                       const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
  StoreNode* store0 = nodes().at(0)->as_Store();
  const uint vlen = nodes().length();
  const int opc = store0->Opcode();
  const TypePtr* adr_type = store0->adr_type();
  Node* ctrl = store0->in(MemNode::Control);
  Node* mem = store0->in(MemNode::Memory);
  Node* adr = store0->in(MemNode::Address);

  Node* value = find_transformed_input(MemNode::ValueIn, vnode_idx_to_transformed_node);
  StoreVectorNode* store_vector = StoreVectorNode::make(opc, ctrl, mem, adr, adr_type, value, vlen);
  DEBUG_ONLY( if (VerifyAlignVector) { store_vector->set_must_verify_alignment(); } )
  register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, store_vector);

  return VTransformApplyResult::make_vector(store_vector, vlen, store_vector->memory_size());
}
|
||||
|
||||
void VTransformVectorNode::register_new_node_from_vectorization_and_replace_scalar_nodes(const VLoopAnalyzer& vloop_analyzer, Node* vn) const {
|
||||
PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
|
||||
Node* first = nodes().at(0);
|
||||
|
||||
register_new_node_from_vectorization(vloop_analyzer, vn, first);
|
||||
|
||||
for (int i = 0; i < _nodes.length(); i++) {
|
||||
Node* n = _nodes.at(i);
|
||||
phase->igvn().replace_node(n, vn);
|
||||
}
|
||||
}
|
||||
|
||||
void VTransformNode::register_new_node_from_vectorization(const VLoopAnalyzer& vloop_analyzer, Node* vn, Node* old_node) const {
|
||||
PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
|
||||
phase->register_new_node_with_ctrl_of(vn, old_node);
|
||||
phase->igvn()._worklist.push(vn);
|
||||
VectorNode::trace_new_vector(vn, "AutoVectorization");
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
void VTransformGraph::print_vtnodes() const {
|
||||
tty->print_cr("\nVTransformGraph::print_vtnodes:");
|
||||
for (int i = 0; i < _vtnodes.length(); i++) {
|
||||
_vtnodes.at(i)->print();
|
||||
}
|
||||
}
|
||||
|
||||
void VTransformGraph::print_schedule() const {
|
||||
tty->print_cr("\nVTransformGraph::print_schedule:");
|
||||
for (int i = 0; i < _schedule.length(); i++) {
|
||||
tty->print(" %3d: ", i);
|
||||
VTransformNode* vtn = _schedule.at(i);
|
||||
if (vtn == nullptr) {
|
||||
tty->print_cr("nullptr");
|
||||
} else {
|
||||
vtn->print();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Print a header line, followed by every memop that for_each_memop_in_schedule
// visits, each prefixed with a running counter that starts at 0.
void VTransformGraph::print_memops_schedule() const {
  tty->print_cr("\nVTransformGraph::print_memops_schedule:");
  int position = 0;
  for_each_memop_in_schedule([&] (MemNode* mem) {
    tty->print(" %3d: ", position);
    position++;
    mem->dump();
  });
}
|
||||
|
||||
void VTransformNode::print() const {
|
||||
tty->print("%3d %s (", _idx, name());
|
||||
for (uint i = 0; i < _req; i++) {
|
||||
print_node_idx(_in.at(i));
|
||||
}
|
||||
if ((uint)_in.length() > _req) {
|
||||
tty->print(" |");
|
||||
for (int i = _req; i < _in.length(); i++) {
|
||||
print_node_idx(_in.at(i));
|
||||
}
|
||||
}
|
||||
tty->print(") [");
|
||||
for (int i = 0; i < _out.length(); i++) {
|
||||
print_node_idx(_out.at(i));
|
||||
}
|
||||
tty->print("] ");
|
||||
print_spec();
|
||||
tty->cr();
|
||||
}
|
||||
|
||||
// Print the idx of vtn with a leading space, or " _" if vtn is null.
void VTransformNode::print_node_idx(const VTransformNode* vtn) {
  if (vtn != nullptr) {
    tty->print(" %d", vtn->_idx);
    return;
  }
  tty->print(" _");
}
|
||||
|
||||
// Print idx and name of the wrapped C2 node.
void VTransformScalarNode::print_spec() const {
  tty->print("node[%d %s]",
             _node->_idx, _node->Name());
}
|
||||
|
||||
// Print the vector length, and then dump the element type.
void VTransformReplicateNode::print_spec() const {
  tty->print("vlen=%d element_type=",
             _vlen);
  _element_type->dump();
}
|
||||
|
||||
void VTransformShiftCountNode::print_spec() const {
|
||||
tty->print("vlen=%d element_bt=%s mask=%d shift_opcode=%s",
|
||||
_vlen, type2name(_element_bt), _mask,
|
||||
NodeClassNames[_shift_opcode]);
|
||||
}
|
||||
|
||||
// Print vector length and element basic type.
void VTransformPopulateIndexNode::print_spec() const {
  const char* element_bt_name = type2name(_element_bt);
  tty->print("vlen=%d element_bt=%s", _vlen, element_bt_name);
}
|
||||
|
||||
void VTransformVectorNode::print_spec() const {
|
||||
tty->print("%d-pack[", _nodes.length());
|
||||
for (int i = 0; i < _nodes.length(); i++) {
|
||||
Node* n = _nodes.at(i);
|
||||
if (i > 0) {
|
||||
tty->print(", ");
|
||||
}
|
||||
tty->print("%d %s", n->_idx, n->Name());
|
||||
}
|
||||
tty->print("]");
|
||||
}
|
||||
#endif
|
515
src/hotspot/share/opto/vtransform.hpp
Normal file
515
src/hotspot/share/opto/vtransform.hpp
Normal file
@ -0,0 +1,515 @@
|
||||
/*
|
||||
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
#ifndef SHARE_OPTO_VTRANSFORM_HPP
|
||||
#define SHARE_OPTO_VTRANSFORM_HPP
|
||||
|
||||
#include "opto/node.hpp"
|
||||
#include "opto/vectorization.hpp"
|
||||
|
||||
// VTransform:
|
||||
// - Models the transformation of the scalar loop to vectorized loop:
|
||||
// It is a "C2 subgraph" -> "C2 subgraph" mapping.
|
||||
// - The VTransform contains a graph (VTransformGraph), which consists of
|
||||
// many vtnodes (VTransformNode).
|
||||
// - Each vtnode models a part of the transformation, and is supposed
|
||||
// to represent the output C2 nodes after the vectorization as closely
|
||||
// as possible.
|
||||
//
|
||||
// This is the life-cycle of a VTransform:
|
||||
// - Construction:
|
||||
// - From SuperWord, with the SuperWordVTransformBuilder.
|
||||
//
|
||||
// - Future Plans: optimize, if-conversion, etc.
|
||||
//
|
||||
// - Schedule:
|
||||
// - Compute linearization of the VTransformGraph, into an order that respects
|
||||
// all edges in the graph (bailout if cycle detected).
|
||||
//
|
||||
// - Apply:
|
||||
// - Changes to the C2 IR are only made once the "apply" method is called.
|
||||
// - Each vtnode generates its corresponding scalar and vector C2 nodes,
|
||||
// possibly replacing old scalar C2 nodes.
|
||||
//
|
||||
// Future Plans with VTransform:
|
||||
// - Cost model: estimate if vectorization is profitable.
|
||||
// - Optimizations: moving unordered reductions out of the loop, which decreases cost.
|
||||
// - Pack/Unpack/Shuffle: introduce additional nodes not present in the scalar loop.
|
||||
// This is difficult to do with the SuperWord packset approach.
|
||||
// - If-conversion: convert predicated nodes into CFG.
|
||||
|
||||
typedef int VTransformNodeIDX;
|
||||
class VTransformNode;
|
||||
class VTransformScalarNode;
|
||||
class VTransformInputScalarNode;
|
||||
class VTransformVectorNode;
|
||||
class VTransformElementWiseVectorNode;
|
||||
class VTransformBoolVectorNode;
|
||||
class VTransformReductionVectorNode;
|
||||
|
||||
// Result from VTransformNode::apply
|
||||
class VTransformApplyResult {
|
||||
private:
|
||||
Node* const _node;
|
||||
const uint _vector_length; // number of elements
|
||||
const uint _vector_width; // total width in bytes
|
||||
|
||||
VTransformApplyResult(Node* n, uint vector_length, uint vector_width) :
|
||||
_node(n),
|
||||
_vector_length(vector_length),
|
||||
_vector_width(vector_width) {}
|
||||
|
||||
public:
|
||||
static VTransformApplyResult make_scalar(Node* n) {
|
||||
return VTransformApplyResult(n, 0, 0);
|
||||
}
|
||||
|
||||
static VTransformApplyResult make_vector(Node* n, uint vector_length, uint vector_width) {
|
||||
assert(vector_length > 0 && vector_width > 0, "must have nonzero size");
|
||||
return VTransformApplyResult(n, vector_length, vector_width);
|
||||
}
|
||||
|
||||
static VTransformApplyResult make_empty() {
|
||||
return VTransformApplyResult(nullptr, 0, 0);
|
||||
}
|
||||
|
||||
Node* node() const { return _node; }
|
||||
uint vector_length() const { return _vector_length; }
|
||||
uint vector_width() const { return _vector_width; }
|
||||
NOT_PRODUCT( void trace(VTransformNode* vtnode) const; )
|
||||
};
|
||||
|
||||
#ifndef PRODUCT
|
||||
// Convenience class for tracing flags.
|
||||
class VTransformTrace {
|
||||
public:
|
||||
const bool _verbose;
|
||||
const bool _rejections;
|
||||
const bool _align_vector;
|
||||
const bool _info;
|
||||
|
||||
VTransformTrace(const VTrace& vtrace,
|
||||
const bool is_trace_rejections,
|
||||
const bool is_trace_align_vector,
|
||||
const bool is_trace_info) :
|
||||
_verbose (vtrace.is_trace(TraceAutoVectorizationTag::ALL)),
|
||||
_rejections (_verbose | is_trace_vtransform(vtrace) | is_trace_rejections),
|
||||
_align_vector(_verbose | is_trace_vtransform(vtrace) | is_trace_align_vector),
|
||||
_info (_verbose | is_trace_vtransform(vtrace) | is_trace_info) {}
|
||||
|
||||
static bool is_trace_vtransform(const VTrace& vtrace) {
|
||||
return vtrace.is_trace(TraceAutoVectorizationTag::VTRANSFORM);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
// VTransformGraph: component of VTransform
|
||||
// See description at top of this file.
|
||||
class VTransformGraph : public StackObj {
|
||||
private:
|
||||
const VLoopAnalyzer& _vloop_analyzer;
|
||||
const VLoop& _vloop;
|
||||
|
||||
NOT_PRODUCT(const VTransformTrace _trace;)
|
||||
|
||||
VTransformNodeIDX _next_idx;
|
||||
GrowableArray<VTransformNode*> _vtnodes;
|
||||
|
||||
// Schedule (linearization) of the graph. We use this to reorder the memory graph
|
||||
// before inserting vector operations.
|
||||
GrowableArray<VTransformNode*> _schedule;
|
||||
|
||||
public:
|
||||
VTransformGraph(const VLoopAnalyzer& vloop_analyzer,
|
||||
Arena& arena
|
||||
NOT_PRODUCT( COMMA const VTransformTrace trace)) :
|
||||
_vloop_analyzer(vloop_analyzer),
|
||||
_vloop(vloop_analyzer.vloop()),
|
||||
NOT_PRODUCT(_trace(trace) COMMA)
|
||||
_next_idx(0),
|
||||
_vtnodes(&arena, _vloop.estimated_body_length(), 0, nullptr),
|
||||
_schedule(&arena, _vloop.estimated_body_length(), 0, nullptr) {}
|
||||
|
||||
VTransformNodeIDX new_idx() { return _next_idx++; }
|
||||
void add_vtnode(VTransformNode* vtnode);
|
||||
DEBUG_ONLY( bool is_empty() const { return _vtnodes.is_empty(); } )
|
||||
DEBUG_ONLY( bool is_scheduled() const { return _schedule.is_nonempty(); } )
|
||||
const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
|
||||
|
||||
bool schedule();
|
||||
void apply_memops_reordering_with_schedule() const;
|
||||
void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
|
||||
|
||||
private:
|
||||
// VLoop accessors
|
||||
PhaseIdealLoop* phase() const { return _vloop.phase(); }
|
||||
PhaseIterGVN& igvn() const { return _vloop.phase()->igvn(); }
|
||||
bool in_bb(const Node* n) const { return _vloop.in_bb(n); }
|
||||
|
||||
void collect_nodes_without_req_or_dependency(GrowableArray<VTransformNode*>& stack) const;
|
||||
|
||||
template<typename Callback>
|
||||
void for_each_memop_in_schedule(Callback callback) const;
|
||||
|
||||
#ifndef PRODUCT
|
||||
void print_vtnodes() const;
|
||||
void print_schedule() const;
|
||||
void print_memops_schedule() const;
|
||||
void trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
|
||||
const VectorSet& pre_visited,
|
||||
const VectorSet& post_visited) const;
|
||||
#endif
|
||||
};
|
||||
|
||||
// VTransform: models the transformation of the scalar loop to vectorized loop.
|
||||
// It is a "C2 subgraph" to "C2 subgraph" mapping.
|
||||
// See description at top of this file.
|
||||
class VTransform : public StackObj {
|
||||
private:
|
||||
const VLoopAnalyzer& _vloop_analyzer;
|
||||
const VLoop& _vloop;
|
||||
|
||||
NOT_PRODUCT(const VTransformTrace _trace;)
|
||||
|
||||
// Everything in the vtransform is allocated from this arena, including all vtnodes.
|
||||
Arena _arena;
|
||||
|
||||
VTransformGraph _graph;
|
||||
|
||||
// Memory reference, and the alignment width (aw) for which we align the main-loop,
|
||||
// by adjusting the pre-loop limit.
|
||||
MemNode const* _mem_ref_for_main_loop_alignment;
|
||||
int _aw_for_main_loop_alignment;
|
||||
|
||||
public:
|
||||
VTransform(const VLoopAnalyzer& vloop_analyzer,
|
||||
MemNode const* mem_ref_for_main_loop_alignment,
|
||||
int aw_for_main_loop_alignment
|
||||
NOT_PRODUCT( COMMA const VTransformTrace trace)
|
||||
) :
|
||||
_vloop_analyzer(vloop_analyzer),
|
||||
_vloop(vloop_analyzer.vloop()),
|
||||
NOT_PRODUCT(_trace(trace) COMMA)
|
||||
_arena(mtCompiler),
|
||||
_graph(_vloop_analyzer, _arena NOT_PRODUCT(COMMA _trace)),
|
||||
_mem_ref_for_main_loop_alignment(mem_ref_for_main_loop_alignment),
|
||||
_aw_for_main_loop_alignment(aw_for_main_loop_alignment) {}
|
||||
|
||||
const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; }
|
||||
Arena* arena() { return &_arena; }
|
||||
DEBUG_ONLY( bool has_graph() const { return !_graph.is_empty(); } )
|
||||
VTransformGraph& graph() { return _graph; }
|
||||
|
||||
bool schedule() { return _graph.schedule(); }
|
||||
void apply();
|
||||
|
||||
private:
|
||||
// VLoop accessors
|
||||
PhaseIdealLoop* phase() const { return _vloop.phase(); }
|
||||
PhaseIterGVN& igvn() const { return _vloop.phase()->igvn(); }
|
||||
IdealLoopTree* lpt() const { return _vloop.lpt(); }
|
||||
CountedLoopNode* cl() const { return _vloop.cl(); }
|
||||
int iv_stride() const { return cl()->stride_con(); }
|
||||
|
||||
// VLoopVPointers accessors
|
||||
const VPointer& vpointer(const MemNode* mem) const {
|
||||
return _vloop_analyzer.vpointers().vpointer(mem);
|
||||
}
|
||||
|
||||
// Ensure that the main loop vectors are aligned by adjusting the pre loop limit.
|
||||
void determine_mem_ref_and_aw_for_main_loop_alignment();
|
||||
void adjust_pre_loop_limit_to_align_main_loop_vectors();
|
||||
|
||||
void apply_vectorization() const;
|
||||
};
|
||||
|
||||
// The vtnodes (VTransformNode) resemble the C2 IR Nodes, and model a part of the
|
||||
// VTransform. Many such vtnodes make up the VTransformGraph. The vtnodes represent
|
||||
// the resulting scalar and vector nodes as closely as possible.
|
||||
// See description at top of this file.
|
||||
class VTransformNode : public ArenaObj {
|
||||
public:
|
||||
const VTransformNodeIDX _idx;
|
||||
|
||||
private:
|
||||
// _in is split into required inputs (_req), and additional dependencies.
|
||||
const uint _req;
|
||||
GrowableArray<VTransformNode*> _in;
|
||||
GrowableArray<VTransformNode*> _out;
|
||||
|
||||
public:
|
||||
VTransformNode(VTransform& vtransform, const uint req) :
|
||||
_idx(vtransform.graph().new_idx()),
|
||||
_req(req),
|
||||
_in(vtransform.arena(), req, req, nullptr),
|
||||
_out(vtransform.arena(), 4, 0, nullptr)
|
||||
{
|
||||
vtransform.graph().add_vtnode(this);
|
||||
}
|
||||
|
||||
void set_req(uint i, VTransformNode* n) {
|
||||
assert(i < _req, "must be a req");
|
||||
assert(_in.at(i) == nullptr && n != nullptr, "only set once");
|
||||
_in.at_put(i, n);
|
||||
n->add_out(this);
|
||||
}
|
||||
|
||||
void swap_req(uint i, uint j) {
|
||||
assert(i < _req, "must be a req");
|
||||
assert(j < _req, "must be a req");
|
||||
VTransformNode* tmp = _in.at(i);
|
||||
_in.at_put(i, _in.at(j));
|
||||
_in.at_put(j, tmp);
|
||||
}
|
||||
|
||||
void add_dependency(VTransformNode* n) {
|
||||
assert(n != nullptr, "no need to add nullptr");
|
||||
_in.push(n);
|
||||
n->add_out(this);
|
||||
}
|
||||
|
||||
void add_out(VTransformNode* n) {
|
||||
_out.push(n);
|
||||
}
|
||||
|
||||
uint req() const { return _req; }
|
||||
VTransformNode* in(int i) const { return _in.at(i); }
|
||||
int outs() const { return _out.length(); }
|
||||
VTransformNode* out(int i) const { return _out.at(i); }
|
||||
|
||||
bool has_req_or_dependency() const {
|
||||
for (int i = 0; i < _in.length(); i++) {
|
||||
if (_in.at(i) != nullptr) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual VTransformScalarNode* isa_Scalar() { return nullptr; }
|
||||
virtual VTransformInputScalarNode* isa_InputScalar() { return nullptr; }
|
||||
virtual VTransformVectorNode* isa_Vector() { return nullptr; }
|
||||
virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() { return nullptr; }
|
||||
virtual VTransformBoolVectorNode* isa_BoolVector() { return nullptr; }
|
||||
virtual VTransformReductionVectorNode* isa_ReductionVector() { return nullptr; }
|
||||
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const = 0;
|
||||
|
||||
Node* find_transformed_input(int i, const GrowableArray<Node*>& vnode_idx_to_transformed_node) const;
|
||||
|
||||
void register_new_node_from_vectorization(const VLoopAnalyzer& vloop_analyzer, Node* vn, Node* old_node) const;
|
||||
|
||||
NOT_PRODUCT(virtual const char* name() const = 0;)
|
||||
NOT_PRODUCT(void print() const;)
|
||||
NOT_PRODUCT(virtual void print_spec() const {};)
|
||||
NOT_PRODUCT(static void print_node_idx(const VTransformNode* vtn);)
|
||||
};
|
||||
|
||||
// Identity transform for scalar nodes.
|
||||
class VTransformScalarNode : public VTransformNode {
|
||||
private:
|
||||
Node* _node;
|
||||
public:
|
||||
VTransformScalarNode(VTransform& vtransform, Node* n) :
|
||||
VTransformNode(vtransform, n->req()), _node(n) {}
|
||||
Node* node() const { return _node; }
|
||||
virtual VTransformScalarNode* isa_Scalar() override { return this; }
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "Scalar"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
};
|
||||
|
||||
// Wrapper node for nodes outside the loop that are inputs to nodes in the loop.
|
||||
// Since we want the loop-internal nodes to be able to reference all inputs as vtnodes,
|
||||
// we must wrap the inputs that are outside the loop into special vtnodes, too.
|
||||
class VTransformInputScalarNode : public VTransformScalarNode {
|
||||
public:
|
||||
VTransformInputScalarNode(VTransform& vtransform, Node* n) :
|
||||
VTransformScalarNode(vtransform, n) {}
|
||||
virtual VTransformInputScalarNode* isa_InputScalar() override { return this; }
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "InputScalar"; };)
|
||||
};
|
||||
|
||||
// Transform produces a ReplicateNode, replicating the input to all vector lanes.
|
||||
class VTransformReplicateNode : public VTransformNode {
|
||||
private:
|
||||
int _vlen;
|
||||
const Type* _element_type;
|
||||
public:
|
||||
VTransformReplicateNode(VTransform& vtransform, int vlen, const Type* element_type) :
|
||||
VTransformNode(vtransform, 2), _vlen(vlen), _element_type(element_type) {}
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "Replicate"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
};
|
||||
|
||||
// Transform introduces a scalar ConvI2LNode that was not previously in the C2 graph.
|
||||
class VTransformConvI2LNode : public VTransformNode {
|
||||
public:
|
||||
VTransformConvI2LNode(VTransform& vtransform) : VTransformNode(vtransform, 2) {}
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ConvI2L"; };)
|
||||
};
|
||||
|
||||
// Transform introduces a shift-count node that truncates the shift count for a vector shift.
|
||||
class VTransformShiftCountNode : public VTransformNode {
|
||||
private:
|
||||
int _vlen;
|
||||
const BasicType _element_bt;
|
||||
juint _mask;
|
||||
int _shift_opcode;
|
||||
public:
|
||||
VTransformShiftCountNode(VTransform& vtransform, int vlen, BasicType element_bt, juint mask, int shift_opcode) :
|
||||
VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt), _mask(mask), _shift_opcode(shift_opcode) {}
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ShiftCount"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
};
|
||||
|
||||
// Transform introduces a PopulateIndex node: [phi, phi+1, phi+2, phi+3, ...].
|
||||
class VTransformPopulateIndexNode : public VTransformNode {
|
||||
private:
|
||||
int _vlen;
|
||||
const BasicType _element_bt;
|
||||
public:
|
||||
VTransformPopulateIndexNode(VTransform& vtransform, int vlen, const BasicType element_bt) :
|
||||
VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt) {}
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "PopulateIndex"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
};
|
||||
|
||||
// Base class for all vector vtnodes.
|
||||
class VTransformVectorNode : public VTransformNode {
|
||||
private:
|
||||
GrowableArray<Node*> _nodes;
|
||||
public:
|
||||
VTransformVectorNode(VTransform& vtransform, const uint req, const uint number_of_nodes) :
|
||||
VTransformNode(vtransform, req), _nodes(vtransform.arena(), number_of_nodes, number_of_nodes, nullptr) {}
|
||||
|
||||
void set_nodes(const Node_List* pack) {
|
||||
for (uint k = 0; k < pack->size(); k++) {
|
||||
_nodes.at_put(k, pack->at(k));
|
||||
}
|
||||
}
|
||||
|
||||
const GrowableArray<Node*>& nodes() const { return _nodes; }
|
||||
virtual VTransformVectorNode* isa_Vector() override { return this; }
|
||||
void register_new_node_from_vectorization_and_replace_scalar_nodes(const VLoopAnalyzer& vloop_analyzer, Node* vn) const;
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
};
|
||||
|
||||
// Catch all for all element-wise vector operations.
|
||||
class VTransformElementWiseVectorNode : public VTransformVectorNode {
|
||||
public:
|
||||
VTransformElementWiseVectorNode(VTransform& vtransform, uint req, uint number_of_nodes) :
|
||||
VTransformVectorNode(vtransform, req, number_of_nodes) {}
|
||||
virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() override { return this; }
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseVector"; };)
|
||||
};
|
||||
|
||||
struct VTransformBoolTest {
|
||||
const BoolTest::mask _mask;
|
||||
const bool _is_negated;
|
||||
|
||||
VTransformBoolTest(const BoolTest::mask mask, bool is_negated) :
|
||||
_mask(mask), _is_negated(is_negated) {}
|
||||
};
|
||||
|
||||
class VTransformBoolVectorNode : public VTransformElementWiseVectorNode {
|
||||
private:
|
||||
const VTransformBoolTest _test;
|
||||
public:
|
||||
VTransformBoolVectorNode(VTransform& vtransform, uint number_of_nodes, VTransformBoolTest test) :
|
||||
VTransformElementWiseVectorNode(vtransform, 2, number_of_nodes), _test(test) {}
|
||||
VTransformBoolTest test() const { return _test; }
|
||||
virtual VTransformBoolVectorNode* isa_BoolVector() override { return this; }
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "BoolVector"; };)
|
||||
};
|
||||
|
||||
class VTransformReductionVectorNode : public VTransformVectorNode {
|
||||
public:
|
||||
// req = 3 -> [ctrl, scalar init, vector]
|
||||
VTransformReductionVectorNode(VTransform& vtransform, uint number_of_nodes) :
|
||||
VTransformVectorNode(vtransform, 3, number_of_nodes) {}
|
||||
virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; }
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };)
|
||||
};
|
||||
|
||||
class VTransformLoadVectorNode : public VTransformVectorNode {
|
||||
public:
|
||||
// req = 3 -> [ctrl, mem, adr]
|
||||
VTransformLoadVectorNode(VTransform& vtransform, uint number_of_nodes) :
|
||||
VTransformVectorNode(vtransform, 3, number_of_nodes) {}
|
||||
LoadNode::ControlDependency control_dependency() const;
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };)
|
||||
};
|
||||
|
||||
class VTransformStoreVectorNode : public VTransformVectorNode {
|
||||
public:
|
||||
// req = 4 -> [ctrl, mem, adr, val]
|
||||
VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes) :
|
||||
VTransformVectorNode(vtransform, 4, number_of_nodes) {}
|
||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };)
|
||||
};
|
||||
|
||||
// Invoke callback on all memops, in the order of the schedule.
|
||||
template<typename Callback>
|
||||
void VTransformGraph::for_each_memop_in_schedule(Callback callback) const {
|
||||
assert(_schedule.length() == _vtnodes.length(), "schedule was computed");
|
||||
|
||||
for (int i = 0; i < _schedule.length(); i++) {
|
||||
VTransformNode* vtn = _schedule.at(i);
|
||||
|
||||
// We can ignore input nodes, they are outside the loop.
|
||||
if (vtn->isa_InputScalar() != nullptr) { continue; }
|
||||
|
||||
VTransformScalarNode* scalar = vtn->isa_Scalar();
|
||||
if (scalar != nullptr && scalar->node()->is_Mem()) {
|
||||
callback(scalar->node()->as_Mem());
|
||||
}
|
||||
|
||||
VTransformVectorNode* vector = vtn->isa_Vector();
|
||||
if (vector != nullptr && vector->nodes().at(0)->is_Mem()) {
|
||||
for (int j = 0; j < vector->nodes().length(); j++) {
|
||||
callback(vector->nodes().at(j)->as_Mem());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif // SHARE_OPTO_VTRANSFORM_HPP
|
Loading…
x
Reference in New Issue
Block a user