8324890: C2 SuperWord: refactor out VLoop, make unrolling_analysis static, remove init/reset mechanism
Reviewed-by: kvn, roland
This commit is contained in:
parent
71d2dbd0b6
commit
232d136885
@ -1104,12 +1104,9 @@ void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLo
|
||||
if (!cl->was_slp_analyzed()) {
|
||||
Compile::TracePhase tp("autoVectorize", &Phase::timers[Phase::_t_autoVectorize]);
|
||||
|
||||
SuperWord sw(phase);
|
||||
sw.transform_loop(this, false);
|
||||
|
||||
// If the loop is slp canonical analyze it
|
||||
if (sw.early_return() == false) {
|
||||
sw.unrolling_analysis(_local_loop_unroll_factor);
|
||||
VLoop vloop(this, true);
|
||||
if (vloop.check_preconditions()) {
|
||||
SuperWord::unrolling_analysis(vloop, _local_loop_unroll_factor);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -45,7 +45,7 @@
|
||||
#include "opto/predicates.hpp"
|
||||
#include "opto/rootnode.hpp"
|
||||
#include "opto/runtime.hpp"
|
||||
#include "opto/superword.hpp"
|
||||
#include "opto/vectorization.hpp"
|
||||
#include "runtime/sharedRuntime.hpp"
|
||||
#include "utilities/checkedCast.hpp"
|
||||
#include "utilities/powerOfTwo.hpp"
|
||||
@ -4863,30 +4863,30 @@ void PhaseIdealLoop::build_and_optimize() {
|
||||
C->set_major_progress();
|
||||
}
|
||||
|
||||
// Convert scalar to superword operations at the end of all loop opts.
|
||||
// Auto-vectorize main-loop
|
||||
if (C->do_superword() && C->has_loops() && !C->major_progress()) {
|
||||
Compile::TracePhase tp("autoVectorize", &timers[_t_autoVectorize]);
|
||||
// SuperWord transform
|
||||
SuperWord sw(this);
|
||||
|
||||
// Shared data structures for all AutoVectorizations, to reduce allocations
|
||||
// of large arrays.
|
||||
VSharedData vshared;
|
||||
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
|
||||
IdealLoopTree* lpt = iter.current();
|
||||
if (lpt->is_counted()) {
|
||||
CountedLoopNode *cl = lpt->_head->as_CountedLoop();
|
||||
if (cl->is_main_loop()) {
|
||||
if (!sw.transform_loop(lpt, true)) {
|
||||
// Instigate more unrolling for optimization when vectorization fails.
|
||||
if (cl->has_passed_slp()) {
|
||||
C->set_major_progress();
|
||||
cl->set_notpassed_slp();
|
||||
cl->mark_do_unroll_only();
|
||||
}
|
||||
}
|
||||
AutoVectorizeStatus status = auto_vectorize(lpt, vshared);
|
||||
|
||||
if (status == AutoVectorizeStatus::TriedAndFailed) {
|
||||
// We tried vectorization, but failed. From now on only unroll the loop.
|
||||
CountedLoopNode* cl = lpt->_head->as_CountedLoop();
|
||||
if (cl->has_passed_slp()) {
|
||||
C->set_major_progress();
|
||||
cl->set_notpassed_slp();
|
||||
cl->mark_do_unroll_only();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Move UnorderedReduction out of counted loop. Can be introduced by SuperWord.
|
||||
// Move UnorderedReduction out of counted loop. Can be introduced by AutoVectorization.
|
||||
if (C->has_loops() && !C->major_progress()) {
|
||||
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
|
||||
IdealLoopTree* lpt = iter.current();
|
||||
@ -5963,30 +5963,6 @@ CountedLoopEndNode* CountedLoopNode::find_pre_loop_end() {
|
||||
return pre_end;
|
||||
}
|
||||
|
||||
// Returns the head of the pre loop, obtained via the cached pre-loop end
// (_pre_loop_end->loopnode()).
// Asserts that this node is a main loop and that both the cached pre-loop end
// and the loop node it reports are non-null.
CountedLoopNode* CountedLoopNode::pre_loop_head() const {
|
||||
assert(is_main_loop(), "Only main loop has pre loop");
|
||||
assert(_pre_loop_end != nullptr && _pre_loop_end->loopnode() != nullptr,
|
||||
"should find head from pre loop end");
|
||||
return _pre_loop_end->loopnode();
|
||||
}
|
||||
|
||||
// Returns the cached CountedLoopEndNode of the pre loop (_pre_loop_end).
// In ASSERT builds this additionally verifies that:
//  - this node is a main loop,
//  - the cache has been set (is non-null),
//  - the cached value equals both the result of find_pre_loop_end() and
//    pre_loop_head()->loopexit().
CountedLoopEndNode* CountedLoopNode::pre_loop_end() {
|
||||
#ifdef ASSERT
|
||||
assert(is_main_loop(), "Only main loop has pre loop");
|
||||
assert(_pre_loop_end != nullptr, "should be set when fetched");
|
||||
// Recompute the pre-loop end so the cached value can be cross-checked below.
Node* found_pre_end = find_pre_loop_end();
|
||||
assert(_pre_loop_end == found_pre_end && _pre_loop_end == pre_loop_head()->loopexit(),
|
||||
"should find the pre loop end and must be the same result");
|
||||
#endif
|
||||
return _pre_loop_end;
|
||||
}
|
||||
|
||||
// Stores pre_loop_end in the _pre_loop_end cache.
// Asserts that this node is a main loop and that pre_loop_end is non-null.
void CountedLoopNode::set_pre_loop_end(CountedLoopEndNode* pre_loop_end) {
|
||||
assert(is_main_loop(), "Only main loop has pre loop");
|
||||
assert(pre_loop_end, "must be valid");
|
||||
_pre_loop_end = pre_loop_end;
|
||||
}
|
||||
|
||||
//------------------------------get_late_ctrl----------------------------------
|
||||
// Compute latest legal control.
|
||||
Node *PhaseIdealLoop::get_late_ctrl( Node *n, Node *early ) {
|
||||
|
@ -43,6 +43,7 @@ class PredicateBlock;
|
||||
class PathFrequency;
|
||||
class PhaseIdealLoop;
|
||||
class VectorSet;
|
||||
class VSharedData;
|
||||
class Invariance;
|
||||
struct small_cache;
|
||||
|
||||
@ -231,14 +232,11 @@ class CountedLoopNode : public BaseCountedLoopNode {
|
||||
// vector mapped unroll factor here
|
||||
int _slp_maximum_unroll_factor;
|
||||
|
||||
// Cached CountedLoopEndNode of pre loop for main loops
|
||||
CountedLoopEndNode* _pre_loop_end;
|
||||
|
||||
public:
|
||||
CountedLoopNode(Node *entry, Node *backedge)
|
||||
: BaseCountedLoopNode(entry, backedge), _main_idx(0), _trip_count(max_juint),
|
||||
_unrolled_count_log2(0), _node_count_before_unroll(0),
|
||||
_slp_maximum_unroll_factor(0), _pre_loop_end(nullptr) {
|
||||
_slp_maximum_unroll_factor(0) {
|
||||
init_class_id(Class_CountedLoop);
|
||||
// Initialize _trip_count to the largest possible value.
|
||||
// Will be reset (lower) if the loop's trip count is known.
|
||||
@ -330,9 +328,6 @@ public:
|
||||
|
||||
Node* is_canonical_loop_entry();
|
||||
CountedLoopEndNode* find_pre_loop_end();
|
||||
CountedLoopNode* pre_loop_head() const;
|
||||
CountedLoopEndNode* pre_loop_end();
|
||||
void set_pre_loop_end(CountedLoopEndNode* pre_loop_end);
|
||||
|
||||
#ifndef PRODUCT
|
||||
virtual void dump_spec(outputStream *st) const;
|
||||
@ -1437,6 +1432,14 @@ public:
|
||||
bool partial_peel( IdealLoopTree *loop, Node_List &old_new );
|
||||
bool duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old_new);
|
||||
|
||||
// AutoVectorize the loop: replace scalar ops with vector ops.
|
||||
enum AutoVectorizeStatus {
|
||||
Impossible, // This loop has the wrong shape to even try vectorization.
|
||||
Success, // We just successfully vectorized the loop.
|
||||
TriedAndFailed, // We tried to vectorize, but failed.
|
||||
};
|
||||
AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared);
|
||||
|
||||
// Move UnorderedReduction out of loop if possible
|
||||
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
|
||||
|
||||
|
@ -41,6 +41,7 @@
|
||||
#include "opto/rootnode.hpp"
|
||||
#include "opto/subnode.hpp"
|
||||
#include "opto/subtypenode.hpp"
|
||||
#include "opto/superword.hpp"
|
||||
#include "opto/vectornode.hpp"
|
||||
#include "utilities/macros.hpp"
|
||||
|
||||
@ -4209,6 +4210,36 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old
|
||||
return true;
|
||||
}
|
||||
|
||||
// AutoVectorize the loop: replace scalar ops with vector ops.
|
||||
PhaseIdealLoop::AutoVectorizeStatus
|
||||
PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
|
||||
// Counted loop only
|
||||
if (!lpt->is_counted()) {
|
||||
return AutoVectorizeStatus::Impossible;
|
||||
}
|
||||
|
||||
// Main-loop only
|
||||
CountedLoopNode* cl = lpt->_head->as_CountedLoop();
|
||||
if (!cl->is_main_loop()) {
|
||||
return AutoVectorizeStatus::Impossible;
|
||||
}
|
||||
|
||||
VLoop vloop(lpt, false);
|
||||
if (!vloop.check_preconditions()) {
|
||||
return AutoVectorizeStatus::TriedAndFailed;
|
||||
}
|
||||
|
||||
// Ensure the shared data is cleared before each use
|
||||
vshared.clear();
|
||||
|
||||
SuperWord sw(vloop, vshared);
|
||||
if (!sw.transform_loop()) {
|
||||
return AutoVectorizeStatus::TriedAndFailed;
|
||||
}
|
||||
|
||||
return AutoVectorizeStatus::Success;
|
||||
}
|
||||
|
||||
// Having ReductionNodes in the loop is expensive. They need to recursively
|
||||
// fold together the vector values, for every vectorized loop iteration. If
|
||||
// we encounter the following pattern, we can vector accumulate the values
|
||||
|
@ -38,134 +38,40 @@
|
||||
#include "opto/movenode.hpp"
|
||||
#include "utilities/powerOfTwo.hpp"
|
||||
|
||||
//
|
||||
// S U P E R W O R D T R A N S F O R M
|
||||
//=============================================================================
|
||||
|
||||
//------------------------------SuperWord---------------------------
|
||||
SuperWord::SuperWord(PhaseIdealLoop* phase) :
|
||||
_phase(phase),
|
||||
_arena(phase->C->comp_arena()),
|
||||
_igvn(phase->_igvn),
|
||||
SuperWord::SuperWord(const VLoop &vloop, VSharedData &vshared) :
|
||||
_vloop(vloop),
|
||||
_arena(mtCompiler),
|
||||
_packset(arena(), 8, 0, nullptr), // packs for the current block
|
||||
_bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
|
||||
_block(arena(), 8, 0, nullptr), // nodes in current block
|
||||
_bb_idx(vshared.node_idx_to_loop_body_idx()), // node idx to index in bb
|
||||
_block(arena(), vloop.estimated_body_length(), 0, nullptr), // nodes in current block
|
||||
_mem_slice_head(arena(), 8, 0, nullptr), // memory slice heads
|
||||
_mem_slice_tail(arena(), 8, 0, nullptr), // memory slice tails
|
||||
_node_info(arena(), 8, 0, SWNodeInfo::initial), // info needed per node
|
||||
_clone_map(phase->C->clone_map()), // map of nodes created in cloning
|
||||
_node_info(arena(), vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
|
||||
_clone_map(phase()->C->clone_map()), // map of nodes created in cloning
|
||||
_align_to_ref(nullptr), // memory reference to align vectors to
|
||||
_dg(_arena), // dependence graph
|
||||
_nlist(arena(), 8, 0, nullptr), // scratch list of nodes
|
||||
_lpt(nullptr), // loop tree node
|
||||
_lp(nullptr), // CountedLoopNode
|
||||
_dg(arena()), // dependence graph
|
||||
_nlist(arena(), vloop.estimated_body_length(), 0, nullptr), // scratch list of nodes
|
||||
_loop_reductions(arena()), // reduction nodes in the current loop
|
||||
_bb(nullptr), // basic block
|
||||
_iv(nullptr), // induction var
|
||||
_race_possible(false), // cases where SDMU is true
|
||||
_early_return(true), // analysis evaluations routine
|
||||
_do_vector_loop(phase->C->do_vector_loop()), // whether to do vectorization/simd style
|
||||
_do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style
|
||||
_num_work_vecs(0), // amount of vector work we have
|
||||
_num_reductions(0) // amount of reduction work we have
|
||||
{
|
||||
}
|
||||
|
||||
//------------------------------transform_loop---------------------------
|
||||
bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
|
||||
assert(_phase->C->do_superword(), "SuperWord option should be enabled");
|
||||
// SuperWord only works with power of two vector sizes.
|
||||
int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
|
||||
if (vector_width < 2 || !is_power_of_2(vector_width)) {
|
||||
return false;
|
||||
}
|
||||
void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor) {
|
||||
IdealLoopTree* lpt = vloop.lpt();
|
||||
CountedLoopNode* cl = vloop.cl();
|
||||
Node* cl_exit = vloop.cl_exit();
|
||||
PhaseIdealLoop* phase = vloop.phase();
|
||||
|
||||
assert(lpt->_head->is_CountedLoop(), "must be");
|
||||
CountedLoopNode *cl = lpt->_head->as_CountedLoop();
|
||||
|
||||
if (!cl->is_valid_counted_loop(T_INT)) {
|
||||
return false; // skip malformed counted loop
|
||||
}
|
||||
|
||||
// Initialize simple data used by reduction marking early.
|
||||
set_lpt(lpt);
|
||||
set_lp(cl);
|
||||
// For now, define one block which is the entire loop body.
|
||||
set_bb(cl);
|
||||
|
||||
if (SuperWordReductions) {
|
||||
mark_reductions();
|
||||
}
|
||||
|
||||
// skip any loop that has not been assigned max unroll by analysis
|
||||
if (do_optimization) {
|
||||
if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for no control flow in body (other than exit)
|
||||
Node *cl_exit = cl->loopexit();
|
||||
if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_precondition()) {
|
||||
tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
|
||||
tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
|
||||
tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
|
||||
tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
|
||||
lpt->dump_head();
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
// Make sure there are no extra control users of the loop backedge
|
||||
if (cl->back_control()->outcnt() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Skip any loops already optimized by slp
|
||||
if (cl->is_vectorized_loop()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (cl->is_unroll_only()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (cl->is_main_loop()) {
|
||||
// Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
|
||||
CountedLoopEndNode* pre_end = cl->find_pre_loop_end();
|
||||
if (pre_end == nullptr) {
|
||||
return false;
|
||||
}
|
||||
Node* pre_opaq1 = pre_end->limit();
|
||||
if (pre_opaq1->Opcode() != Op_Opaque1) {
|
||||
return false;
|
||||
}
|
||||
cl->set_pre_loop_end(pre_end);
|
||||
}
|
||||
|
||||
init(); // initialize data structures
|
||||
|
||||
bool success = true;
|
||||
if (do_optimization) {
|
||||
assert(_packset.length() == 0, "packset must be empty");
|
||||
success = SLP_extract();
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
//------------------------------early unrolling analysis------------------------------
|
||||
void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
bool is_slp = true;
|
||||
size_t ignored_size = lpt()->_body.size();
|
||||
size_t ignored_size = lpt->_body.size();
|
||||
int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
|
||||
Node_Stack nstack((int)ignored_size);
|
||||
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
|
||||
Node *cl_exit = cl->loopexit_or_null();
|
||||
|
||||
// First clear the entries
|
||||
for (uint i = 0; i < lpt()->_body.size(); i++) {
|
||||
for (uint i = 0; i < lpt->_body.size(); i++) {
|
||||
ignored_loop_nodes[i] = -1;
|
||||
}
|
||||
|
||||
@ -173,8 +79,8 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
|
||||
// Process the loop, some/all of the stack entries will not be in order, ergo
|
||||
// need to preprocess the ignored initial state before we process the loop
|
||||
for (uint i = 0; i < lpt()->_body.size(); i++) {
|
||||
Node* n = lpt()->_body.at(i);
|
||||
for (uint i = 0; i < lpt->_body.size(); i++) {
|
||||
Node* n = lpt->_body.at(i);
|
||||
if (n == cl->incr() ||
|
||||
n->is_AddP() ||
|
||||
n->is_Cmp() ||
|
||||
@ -189,7 +95,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
if (n->is_If()) {
|
||||
IfNode *iff = n->as_If();
|
||||
if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
|
||||
if (lpt()->is_loop_exit(iff)) {
|
||||
if (lpt->is_loop_exit(iff)) {
|
||||
ignored_loop_nodes[i] = n->_idx;
|
||||
continue;
|
||||
}
|
||||
@ -233,10 +139,10 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
if (n->is_Mem()) {
|
||||
MemNode* current = n->as_Mem();
|
||||
Node* adr = n->in(MemNode::Address);
|
||||
Node* n_ctrl = _phase->get_ctrl(adr);
|
||||
Node* n_ctrl = phase->get_ctrl(adr);
|
||||
|
||||
// save a queue of post process nodes
|
||||
if (n_ctrl != nullptr && lpt()->is_member(_phase->get_loop(n_ctrl))) {
|
||||
if (n_ctrl != nullptr && lpt->is_member(phase->get_loop(n_ctrl))) {
|
||||
// Process the memory expression
|
||||
int stack_idx = 0;
|
||||
bool have_side_effects = true;
|
||||
@ -244,15 +150,15 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
nstack.push(adr, stack_idx++);
|
||||
} else {
|
||||
// Mark the components of the memory operation in nstack
|
||||
VPointer p1(current, phase(), lpt(), &nstack, true);
|
||||
VPointer p1(current, vloop, &nstack);
|
||||
have_side_effects = p1.node_stack()->is_nonempty();
|
||||
}
|
||||
|
||||
// Process the pointer stack
|
||||
while (have_side_effects) {
|
||||
Node* pointer_node = nstack.node();
|
||||
for (uint j = 0; j < lpt()->_body.size(); j++) {
|
||||
Node* cur_node = lpt()->_body.at(j);
|
||||
for (uint j = 0; j < lpt->_body.size(); j++) {
|
||||
Node* cur_node = lpt->_body.at(j);
|
||||
if (cur_node == pointer_node) {
|
||||
ignored_loop_nodes[j] = cur_node->_idx;
|
||||
break;
|
||||
@ -269,11 +175,11 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
// Now we try to find the maximum supported consistent vector which the machine
|
||||
// description can use
|
||||
bool flag_small_bt = false;
|
||||
for (uint i = 0; i < lpt()->_body.size(); i++) {
|
||||
for (uint i = 0; i < lpt->_body.size(); i++) {
|
||||
if (ignored_loop_nodes[i] != -1) continue;
|
||||
|
||||
BasicType bt;
|
||||
Node* n = lpt()->_body.at(i);
|
||||
Node* n = lpt->_body.at(i);
|
||||
if (n->is_Mem()) {
|
||||
bt = n->as_Mem()->memory_type();
|
||||
} else {
|
||||
@ -313,11 +219,11 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
for (uint j = start; j < end; j++) {
|
||||
Node* in = n->in(j);
|
||||
// Don't propagate through a memory
|
||||
if (!in->is_Mem() && in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
|
||||
if (!in->is_Mem() && vloop.in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
|
||||
bool same_type = true;
|
||||
for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
|
||||
Node *use = in->fast_out(k);
|
||||
if (!in_bb(use) && use->bottom_type()->basic_type() != bt) {
|
||||
if (!vloop.in_bb(use) && use->bottom_type()->basic_type() != bt) {
|
||||
same_type = false;
|
||||
break;
|
||||
}
|
||||
@ -403,8 +309,8 @@ void SuperWord::mark_reductions() {
|
||||
|
||||
// Iterate through all phi nodes associated to the loop and search for
|
||||
// reduction cycles in the basic block.
|
||||
for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
|
||||
const Node* phi = lp()->fast_out(i);
|
||||
for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) {
|
||||
const Node* phi = cl()->fast_out(i);
|
||||
if (!phi->is_Phi()) {
|
||||
continue;
|
||||
}
|
||||
@ -482,6 +388,44 @@ void SuperWord::mark_reductions() {
|
||||
}
|
||||
}
|
||||
|
||||
bool SuperWord::transform_loop() {
|
||||
assert(phase()->C->do_superword(), "SuperWord option should be enabled");
|
||||
assert(cl()->is_main_loop(), "SLP should only work on main loops");
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_any()) {
|
||||
tty->print_cr("\nSuperWord::transform_loop:");
|
||||
lpt()->dump_head();
|
||||
cl()->dump();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Skip any loop that has not been assigned max unroll by analysis
|
||||
if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_any()) {
|
||||
tty->print_cr("\nSuperWord::transform_loop failed: slp max unroll analysis was not already done");
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!SLP_extract()) {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_any()) {
|
||||
tty->print_cr("\nSuperWord::transform_loop failed: SuperWord::SLP_extract did not vectorize");
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_any()) {
|
||||
tty->print_cr("\nSuperWord::transform_loop: success");
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
//------------------------------SLP_extract---------------------------
|
||||
// Extract the superword level parallelism
|
||||
//
|
||||
@ -517,8 +461,11 @@ void SuperWord::mark_reductions() {
|
||||
// extraction of scalar values from vectors.
|
||||
//
|
||||
bool SuperWord::SLP_extract() {
|
||||
CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
|
||||
assert(cl->is_main_loop(), "SLP should only work on main loops");
|
||||
assert(cl()->is_main_loop(), "SLP should only work on main loops");
|
||||
|
||||
if (SuperWordReductions) {
|
||||
mark_reductions();
|
||||
}
|
||||
|
||||
// Find memory slices
|
||||
find_memory_slices();
|
||||
@ -629,13 +576,13 @@ void SuperWord::find_adjacent_refs() {
|
||||
set_align_to_ref(align_to_mem_ref);
|
||||
}
|
||||
|
||||
VPointer align_to_ref_p(mem_ref, phase(), lpt(), nullptr, false);
|
||||
VPointer align_to_ref_p(mem_ref, vloop());
|
||||
// Set alignment relative to "align_to_ref" for all related memory operations.
|
||||
for (int i = memops.size() - 1; i >= 0; i--) {
|
||||
MemNode* s = memops.at(i)->as_Mem();
|
||||
if (isomorphic(s, mem_ref) &&
|
||||
(!_do_vector_loop || same_origin_idx(s, mem_ref))) {
|
||||
VPointer p2(s, phase(), lpt(), nullptr, false);
|
||||
VPointer p2(s, vloop());
|
||||
if (p2.comparable(align_to_ref_p)) {
|
||||
int align = memory_alignment(s, iv_adjustment);
|
||||
set_alignment(s, align);
|
||||
@ -694,11 +641,11 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
|
||||
// Count number of comparable memory ops
|
||||
for (uint i = 0; i < memops.size(); i++) {
|
||||
MemNode* s1 = memops.at(i)->as_Mem();
|
||||
VPointer p1(s1, phase(), lpt(), nullptr, false);
|
||||
VPointer p1(s1, vloop());
|
||||
for (uint j = i+1; j < memops.size(); j++) {
|
||||
MemNode* s2 = memops.at(j)->as_Mem();
|
||||
if (isomorphic(s1, s2)) {
|
||||
VPointer p2(s2, phase(), lpt(), nullptr, false);
|
||||
VPointer p2(s2, vloop());
|
||||
if (p1.comparable(p2)) {
|
||||
(*cmp_ct.adr_at(i))++;
|
||||
(*cmp_ct.adr_at(j))++;
|
||||
@ -719,7 +666,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
|
||||
if (s->is_Store()) {
|
||||
int vw = vector_width_in_bytes(s);
|
||||
assert(vw > 1, "sanity");
|
||||
VPointer p(s, phase(), lpt(), nullptr, false);
|
||||
VPointer p(s, vloop());
|
||||
if ( cmp_ct.at(j) > max_ct ||
|
||||
(cmp_ct.at(j) == max_ct &&
|
||||
( vw > max_vw ||
|
||||
@ -742,7 +689,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
|
||||
if (s->is_Load()) {
|
||||
int vw = vector_width_in_bytes(s);
|
||||
assert(vw > 1, "sanity");
|
||||
VPointer p(s, phase(), lpt(), nullptr, false);
|
||||
VPointer p(s, vloop());
|
||||
if ( cmp_ct.at(j) > max_ct ||
|
||||
(cmp_ct.at(j) == max_ct &&
|
||||
( vw > max_vw ||
|
||||
@ -815,7 +762,7 @@ int SuperWord::get_vw_bytes_special(MemNode* s) {
|
||||
//---------------------------get_iv_adjustment---------------------------
|
||||
// Calculate loop's iv adjustment for this memory ops.
|
||||
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
|
||||
VPointer align_to_ref_p(mem_ref, phase(), lpt(), nullptr, false);
|
||||
VPointer align_to_ref_p(mem_ref, vloop());
|
||||
int offset = align_to_ref_p.offset_in_bytes();
|
||||
int scale = align_to_ref_p.scale_in_bytes();
|
||||
int elt_size = align_to_ref_p.memory_size();
|
||||
@ -884,13 +831,13 @@ void SuperWord::dependence_graph() {
|
||||
if (_dg.dep(s1)->in_cnt() == 0) {
|
||||
_dg.make_edge(slice, s1);
|
||||
}
|
||||
VPointer p1(s1->as_Mem(), phase(), lpt(), nullptr, false);
|
||||
VPointer p1(s1->as_Mem(), vloop());
|
||||
bool sink_dependent = true;
|
||||
for (int k = j - 1; k >= 0; k--) {
|
||||
Node* s2 = _nlist.at(k);
|
||||
if (s1->is_Load() && s2->is_Load())
|
||||
continue;
|
||||
VPointer p2(s2->as_Mem(), phase(), lpt(), nullptr, false);
|
||||
VPointer p2(s2->as_Mem(), vloop());
|
||||
|
||||
int cmp = p1.cmp(p2);
|
||||
if (!VPointer::not_equal(cmp)) {
|
||||
@ -923,8 +870,8 @@ void SuperWord::find_memory_slices() {
|
||||
assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");
|
||||
|
||||
// Iterate over all memory phis
|
||||
for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
|
||||
PhiNode* phi = lp()->fast_out(i)->isa_Phi();
|
||||
for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) {
|
||||
PhiNode* phi = cl()->fast_out(i)->isa_Phi();
|
||||
if (phi != nullptr && in_bb(phi) && phi->is_memory_phi()) {
|
||||
Node* phi_tail = phi->in(LoopNode::LoopBackControl);
|
||||
if (phi_tail != phi->in(LoopNode::EntryControl)) {
|
||||
@ -1060,8 +1007,8 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
|
||||
|
||||
// Adjacent memory references must have the same base, be comparable
|
||||
// and have the correct distance between them.
|
||||
VPointer p1(s1->as_Mem(), phase(), lpt(), nullptr, false);
|
||||
VPointer p2(s2->as_Mem(), phase(), lpt(), nullptr, false);
|
||||
VPointer p1(s1->as_Mem(), vloop());
|
||||
VPointer p2(s2->as_Mem(), vloop());
|
||||
if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
|
||||
int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
|
||||
return diff == data_size(s1);
|
||||
@ -1352,7 +1299,7 @@ bool SuperWord::follow_def_uses(Node_List* p) {
|
||||
// Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
|
||||
continue;
|
||||
}
|
||||
if (t2->Opcode() == Op_AddI && t2 == _lp->as_CountedLoop()->incr()) continue; // don't mess with the iv
|
||||
if (t2->Opcode() == Op_AddI && t2 == cl()->incr()) continue; // don't mess with the iv
|
||||
if (!opnd_positions_match(s1, t1, s2, t2))
|
||||
continue;
|
||||
int adjusted_align = alignment(s1);
|
||||
@ -1651,8 +1598,8 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(Node_List* pack) {
|
||||
assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs");
|
||||
|
||||
const MemNode* mem_ref = pack->at(0)->as_Mem();
|
||||
VPointer mem_ref_p(mem_ref, phase(), lpt(), nullptr, false);
|
||||
const CountedLoopEndNode* pre_end = lp()->pre_loop_end();
|
||||
VPointer mem_ref_p(mem_ref, vloop());
|
||||
const CountedLoopEndNode* pre_end = vloop().pre_loop_end();
|
||||
assert(pre_end->stride_is_con(), "pre loop stride is constant");
|
||||
|
||||
AlignmentSolver solver(pack->at(0)->as_Mem(),
|
||||
@ -1971,8 +1918,8 @@ bool SuperWord::profitable(Node_List* p) {
|
||||
// Reductions should only have a Phi use at the loop head or a non-phi use
|
||||
// outside of the loop if it is the last element of the pack (e.g. SafePoint).
|
||||
if (is_marked_reduction(def) &&
|
||||
((use->is_Phi() && use->in(0) == _lpt->_head) ||
|
||||
(!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) {
|
||||
((use->is_Phi() && use->in(0) == lpt()->_head) ||
|
||||
(!lpt()->is_member(phase()->get_loop(phase()->ctrl_or_self(use))) && i == p->size()-1))) {
|
||||
continue;
|
||||
}
|
||||
if (!is_vector_use(use, k)) {
|
||||
@ -2327,7 +2274,7 @@ void SuperWord::schedule() {
|
||||
#endif
|
||||
|
||||
CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
|
||||
_phase->C->print_method(PHASE_SUPERWORD1_BEFORE_SCHEDULE, 4, cl);
|
||||
phase()->C->print_method(PHASE_SUPERWORD1_BEFORE_SCHEDULE, 4, cl);
|
||||
|
||||
// (4) Use the memops_schedule to re-order the memops in all slices.
|
||||
schedule_reorder_memops(memops_schedule);
|
||||
@ -2337,7 +2284,7 @@ void SuperWord::schedule() {
|
||||
// Reorder the memory graph for all slices in parallel. We walk over the schedule once,
|
||||
// and track the current memory state of each slice.
|
||||
void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
|
||||
int max_slices = _phase->C->num_alias_types();
|
||||
int max_slices = phase()->C->num_alias_types();
|
||||
// When iterating over the memops_schedule, we keep track of the current memory state,
|
||||
// which is the Phi or a store in the loop.
|
||||
GrowableArray<Node*> current_state_in_slice(max_slices, max_slices, nullptr);
|
||||
@ -2349,7 +2296,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
|
||||
for (int i = 0; i < _mem_slice_head.length(); i++) {
|
||||
Node* phi = _mem_slice_head.at(i);
|
||||
assert(phi->is_Phi(), "must be phi");
|
||||
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
|
||||
int alias_idx = phase()->C->get_alias_index(phi->adr_type());
|
||||
current_state_in_slice.at_put(alias_idx, phi);
|
||||
|
||||
// If we have a memory phi, we have a last store in the loop, find it over backedge.
|
||||
@ -2362,7 +2309,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
|
||||
for (uint i = 0; i < memops_schedule.size(); i++) {
|
||||
MemNode* n = memops_schedule.at(i)->as_Mem();
|
||||
assert(n->is_Load() || n->is_Store(), "only loads or stores");
|
||||
int alias_idx = _phase->C->get_alias_index(n->adr_type());
|
||||
int alias_idx = phase()->C->get_alias_index(n->adr_type());
|
||||
Node* current_state = current_state_in_slice.at(alias_idx);
|
||||
if (current_state == nullptr) {
|
||||
// If there are only loads in a slice, we never update the memory
|
||||
@ -2371,7 +2318,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
|
||||
assert(n->is_Load() && !in_bb(n->in(MemNode::Memory)),
|
||||
"only loads can have memory state from outside loop");
|
||||
} else {
|
||||
_igvn.replace_input_of(n, MemNode::Memory, current_state);
|
||||
igvn().replace_input_of(n, MemNode::Memory, current_state);
|
||||
if (n->is_Store()) {
|
||||
current_state_in_slice.at_put(alias_idx, n);
|
||||
}
|
||||
@ -2384,12 +2331,12 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
|
||||
Node_List uses_after_loop;
|
||||
for (int i = 0; i < _mem_slice_head.length(); i++) {
|
||||
Node* phi = _mem_slice_head.at(i);
|
||||
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
|
||||
int alias_idx = phase()->C->get_alias_index(phi->adr_type());
|
||||
Node* current_state = current_state_in_slice.at(alias_idx);
|
||||
assert(current_state != nullptr, "slice is mapped");
|
||||
assert(current_state != phi, "did some work in between");
|
||||
assert(current_state->is_Store(), "sanity");
|
||||
_igvn.replace_input_of(phi, 2, current_state);
|
||||
igvn().replace_input_of(phi, 2, current_state);
|
||||
|
||||
// Replace uses of old last store with current_state (new last store)
|
||||
// Do it in two loops: first find all the uses, and change the graph
|
||||
@ -2408,7 +2355,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
|
||||
for (uint j = 0; j < use->req(); j++) {
|
||||
Node* def = use->in(j);
|
||||
if (def == last_store) {
|
||||
_igvn.replace_input_of(use, j, current_state);
|
||||
igvn().replace_input_of(use, j, current_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2425,7 +2372,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
|
||||
bool SuperWord::output() {
|
||||
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
|
||||
assert(cl->is_main_loop(), "SLP should only work on main loops");
|
||||
Compile* C = _phase->C;
|
||||
Compile* C = phase()->C;
|
||||
if (_packset.length() == 0) {
|
||||
return false;
|
||||
}
|
||||
@ -2436,7 +2383,7 @@ bool SuperWord::output() {
|
||||
lpt()->dump_head();
|
||||
}
|
||||
#endif
|
||||
_phase->C->print_method(PHASE_SUPERWORD2_BEFORE_OUTPUT, 4, cl);
|
||||
phase()->C->print_method(PHASE_SUPERWORD2_BEFORE_OUTPUT, 4, cl);
|
||||
|
||||
adjust_pre_loop_limit_to_align_main_loop_vectors();
|
||||
|
||||
@ -2464,7 +2411,7 @@ bool SuperWord::output() {
|
||||
// Walk up the memory chain, and ignore any StoreVector that provably
|
||||
// does not have any memory dependency.
|
||||
while (mem->is_StoreVector()) {
|
||||
VPointer p_store(mem->as_Mem(), phase(), lpt(), nullptr, false);
|
||||
VPointer p_store(mem->as_Mem(), vloop());
|
||||
if (p_store.overlap_possible_with_any_in(p)) {
|
||||
break;
|
||||
} else {
|
||||
@ -2598,13 +2545,13 @@ bool SuperWord::output() {
|
||||
}
|
||||
|
||||
// VectorMaskCmp
|
||||
ConINode* bol_test_node = _igvn.intcon((int)bol_test);
|
||||
ConINode* bol_test_node = igvn().intcon((int)bol_test);
|
||||
BasicType bt = velt_basic_type(cmp);
|
||||
const TypeVect* vt = TypeVect::make(bt, vlen);
|
||||
VectorNode* mask = new VectorMaskCmpNode(bol_test, cmp_in1, cmp_in2, bol_test_node, vt);
|
||||
_igvn.register_new_node_with_optimizer(mask);
|
||||
_phase->set_ctrl(mask, _phase->get_ctrl(p->at(0)));
|
||||
_igvn._worklist.push(mask);
|
||||
igvn().register_new_node_with_optimizer(mask);
|
||||
phase()->set_ctrl(mask, phase()->get_ctrl(p->at(0)));
|
||||
igvn()._worklist.push(mask);
|
||||
|
||||
// VectorBlend
|
||||
vn = new VectorBlendNode(blend_in1, blend_in2, mask);
|
||||
@ -2677,8 +2624,8 @@ bool SuperWord::output() {
|
||||
assert(n->req() == 2, "only one input expected");
|
||||
Node* in = vector_opd(p, 1);
|
||||
Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
|
||||
_igvn.register_new_node_with_optimizer(longval);
|
||||
_phase->set_ctrl(longval, _phase->get_ctrl(first));
|
||||
igvn().register_new_node_with_optimizer(longval);
|
||||
phase()->set_ctrl(longval, phase()->get_ctrl(first));
|
||||
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
} else if (VectorNode::is_convert_opcode(opc)) {
|
||||
@ -2719,13 +2666,13 @@ bool SuperWord::output() {
|
||||
#endif
|
||||
|
||||
_block.at_put(i, vn);
|
||||
_igvn.register_new_node_with_optimizer(vn);
|
||||
_phase->set_ctrl(vn, _phase->get_ctrl(first));
|
||||
igvn().register_new_node_with_optimizer(vn);
|
||||
phase()->set_ctrl(vn, phase()->get_ctrl(first));
|
||||
for (uint j = 0; j < p->size(); j++) {
|
||||
Node* pm = p->at(j);
|
||||
_igvn.replace_node(pm, vn);
|
||||
igvn().replace_node(pm, vn);
|
||||
}
|
||||
_igvn._worklist.push(vn);
|
||||
igvn()._worklist.push(vn);
|
||||
|
||||
if (vlen > max_vlen) {
|
||||
max_vlen = vlen;
|
||||
@ -2764,7 +2711,7 @@ bool SuperWord::output() {
|
||||
}
|
||||
}
|
||||
|
||||
_phase->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl);
|
||||
phase()->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -2787,10 +2734,10 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
|
||||
BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
|
||||
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
|
||||
const TypeVect* vt = TypeVect::make(iv_bt, vlen);
|
||||
Node* vn = new PopulateIndexNode(iv(), _igvn.intcon(1), vt);
|
||||
Node* vn = new PopulateIndexNode(iv(), igvn().intcon(1), vt);
|
||||
VectorNode::trace_new_vector(vn, "SuperWord");
|
||||
_igvn.register_new_node_with_optimizer(vn);
|
||||
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
|
||||
igvn().register_new_node_with_optimizer(vn);
|
||||
phase()->set_ctrl(vn, phase()->get_ctrl(opd));
|
||||
return vn;
|
||||
}
|
||||
|
||||
@ -2811,15 +2758,15 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
|
||||
juint shift = t->get_con();
|
||||
if (shift > mask) { // Unsigned cmp
|
||||
cnt = ConNode::make(TypeInt::make(shift & mask));
|
||||
_igvn.register_new_node_with_optimizer(cnt);
|
||||
igvn().register_new_node_with_optimizer(cnt);
|
||||
}
|
||||
} else {
|
||||
if (t == nullptr || t->_lo < 0 || t->_hi > (int)mask) {
|
||||
cnt = ConNode::make(TypeInt::make(mask));
|
||||
_igvn.register_new_node_with_optimizer(cnt);
|
||||
igvn().register_new_node_with_optimizer(cnt);
|
||||
cnt = new AndINode(opd, cnt);
|
||||
_igvn.register_new_node_with_optimizer(cnt);
|
||||
_phase->set_ctrl(cnt, _phase->get_ctrl(opd));
|
||||
igvn().register_new_node_with_optimizer(cnt);
|
||||
phase()->set_ctrl(cnt, phase()->get_ctrl(opd));
|
||||
}
|
||||
if (!opd->bottom_type()->isa_int()) {
|
||||
assert(false, "int type only");
|
||||
@ -2828,8 +2775,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
|
||||
}
|
||||
// Move shift count into vector register.
|
||||
cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0));
|
||||
_igvn.register_new_node_with_optimizer(cnt);
|
||||
_phase->set_ctrl(cnt, _phase->get_ctrl(opd));
|
||||
igvn().register_new_node_with_optimizer(cnt);
|
||||
phase()->set_ctrl(cnt, phase()->get_ctrl(opd));
|
||||
return cnt;
|
||||
}
|
||||
if (opd->is_StoreVector()) {
|
||||
@ -2847,8 +2794,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
|
||||
if (p0->bottom_type()->isa_long()) {
|
||||
p0_t = TypeLong::LONG;
|
||||
conv = new ConvI2LNode(opd);
|
||||
_igvn.register_new_node_with_optimizer(conv);
|
||||
_phase->set_ctrl(conv, _phase->get_ctrl(opd));
|
||||
igvn().register_new_node_with_optimizer(conv);
|
||||
phase()->set_ctrl(conv, phase()->get_ctrl(opd));
|
||||
}
|
||||
vn = VectorNode::scalar2vector(conv, vlen, p0_t);
|
||||
} else {
|
||||
@ -2856,8 +2803,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
|
||||
vn = VectorNode::scalar2vector(opd, vlen, p0_t);
|
||||
}
|
||||
|
||||
_igvn.register_new_node_with_optimizer(vn);
|
||||
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
|
||||
igvn().register_new_node_with_optimizer(vn);
|
||||
phase()->set_ctrl(vn, phase()->get_ctrl(opd));
|
||||
VectorNode::trace_new_vector(vn, "SuperWord");
|
||||
return vn;
|
||||
}
|
||||
@ -2886,8 +2833,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
|
||||
pk->add_opd(in2);
|
||||
}
|
||||
}
|
||||
_igvn.register_new_node_with_optimizer(pk);
|
||||
_phase->set_ctrl(pk, _phase->get_ctrl(opd));
|
||||
igvn().register_new_node_with_optimizer(pk);
|
||||
phase()->set_ctrl(pk, phase()->get_ctrl(opd));
|
||||
VectorNode::trace_new_vector(pk, "SuperWord");
|
||||
return pk;
|
||||
}
|
||||
@ -3050,8 +2997,8 @@ bool SuperWord::construct_bb() {
|
||||
VectorSet visited;
|
||||
VectorSet post_visited;
|
||||
|
||||
visited.set(bb_idx(bb()));
|
||||
stack.push(bb());
|
||||
visited.set(bb_idx(cl()));
|
||||
stack.push(cl());
|
||||
|
||||
// Do a depth first walk over out edges
|
||||
int rpo_idx = block_count - 1;
|
||||
@ -3066,7 +3013,7 @@ bool SuperWord::construct_bb() {
|
||||
Node* use = n->fast_out(i);
|
||||
if (in_bb(use) && !visited.test(bb_idx(use)) &&
|
||||
// Don't go around backedge
|
||||
(!use->is_Phi() || n == bb())) {
|
||||
(!use->is_Phi() || n == cl())) {
|
||||
stack.push(use);
|
||||
}
|
||||
}
|
||||
@ -3297,7 +3244,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
|
||||
tty->print("SuperWord::memory_alignment within a vector memory reference for %d: ", s->_idx); s->dump();
|
||||
}
|
||||
#endif
|
||||
VPointer p(s, phase(), lpt(), nullptr, false);
|
||||
VPointer p(s, vloop());
|
||||
if (!p.valid()) {
|
||||
NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: VPointer p invalid, return bottom_align");)
|
||||
return bottom_align;
|
||||
@ -3338,7 +3285,7 @@ const Type* SuperWord::container_type(Node* n) {
|
||||
}
|
||||
return Type::get_const_basic_type(bt);
|
||||
}
|
||||
const Type* t = _igvn.type(n);
|
||||
const Type* t = igvn().type(n);
|
||||
if (t->basic_type() == T_INT) {
|
||||
// A narrow type of arithmetic operations will be determined by
|
||||
// propagating the type of memory operations.
|
||||
@ -3358,7 +3305,7 @@ bool SuperWord::same_velt_type(Node* n1, Node* n2) {
|
||||
}
|
||||
|
||||
bool SuperWord::same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const {
|
||||
return _phase->C->get_alias_index(mem_ref->adr_type()) == _phase->C->get_alias_index(best_align_to_mem_ref->adr_type());
|
||||
return phase()->C->get_alias_index(mem_ref->adr_type()) == phase()->C->get_alias_index(best_align_to_mem_ref->adr_type());
|
||||
}
|
||||
|
||||
//------------------------------in_packset---------------------------
|
||||
@ -3438,22 +3385,22 @@ LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
|
||||
void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
const MemNode* align_to_ref = _align_to_ref;
|
||||
assert(align_to_ref != nullptr, "align_to_ref must be set");
|
||||
assert(lp()->is_main_loop(), "can only do alignment for main loop");
|
||||
assert(cl()->is_main_loop(), "can only do alignment for main loop");
|
||||
|
||||
// The opaque node for the limit, where we adjust the input
|
||||
Opaque1Node* pre_opaq = lp()->pre_loop_end()->limit()->as_Opaque1();
|
||||
Opaque1Node* pre_opaq = vloop().pre_loop_end()->limit()->as_Opaque1();
|
||||
|
||||
// Current pre-loop limit.
|
||||
Node* old_limit = pre_opaq->in(1);
|
||||
|
||||
// Where we put new limit calculations.
|
||||
Node* pre_ctrl = lp()->pre_loop_head()->in(LoopNode::EntryControl);
|
||||
Node* pre_ctrl = vloop().pre_loop_head()->in(LoopNode::EntryControl);
|
||||
|
||||
// Ensure the original loop limit is available from the pre-loop Opaque1 node.
|
||||
Node* orig_limit = pre_opaq->original_loop_limit();
|
||||
assert(orig_limit != nullptr && _igvn.type(orig_limit) != Type::TOP, "");
|
||||
assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, "");
|
||||
|
||||
VPointer align_to_ref_p(align_to_ref, phase(), lpt(), nullptr, false);
|
||||
VPointer align_to_ref_p(align_to_ref, vloop());
|
||||
assert(align_to_ref_p.valid(), "sanity");
|
||||
|
||||
// For the main-loop, we want the address of align_to_ref to be memory aligned
|
||||
@ -3647,17 +3594,17 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
const bool is_sub = scale * stride > 0;
|
||||
|
||||
// 1.1: offset
|
||||
Node* xboi = _igvn.intcon(is_sub ? -offset : offset);
|
||||
Node* xboi = igvn().intcon(is_sub ? -offset : offset);
|
||||
TRACE_ALIGN_VECTOR_NODE(xboi);
|
||||
|
||||
// 1.2: invar (if it exists)
|
||||
if (invar != nullptr) {
|
||||
if (_igvn.type(invar)->isa_long()) {
|
||||
if (igvn().type(invar)->isa_long()) {
|
||||
// Computations are done % (vector width/element size) so it's
|
||||
// safe to simply convert invar to an int and lose the upper 32
|
||||
// bit half.
|
||||
invar = new ConvL2INode(invar);
|
||||
_igvn.register_new_node_with_optimizer(invar);
|
||||
igvn().register_new_node_with_optimizer(invar);
|
||||
TRACE_ALIGN_VECTOR_NODE(invar);
|
||||
}
|
||||
if (is_sub) {
|
||||
@ -3665,8 +3612,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
} else {
|
||||
xboi = new AddINode(xboi, invar);
|
||||
}
|
||||
_igvn.register_new_node_with_optimizer(xboi);
|
||||
_phase->set_ctrl(xboi, pre_ctrl);
|
||||
igvn().register_new_node_with_optimizer(xboi);
|
||||
phase()->set_ctrl(xboi, pre_ctrl);
|
||||
TRACE_ALIGN_VECTOR_NODE(xboi);
|
||||
}
|
||||
|
||||
@ -3676,11 +3623,11 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
// When the base() is top, we have no alignment guarantee at all.
|
||||
// Hence, we must now take the base into account for the calculation.
|
||||
Node* xbase = new CastP2XNode(nullptr, base);
|
||||
_igvn.register_new_node_with_optimizer(xbase);
|
||||
igvn().register_new_node_with_optimizer(xbase);
|
||||
TRACE_ALIGN_VECTOR_NODE(xbase);
|
||||
#ifdef _LP64
|
||||
xbase = new ConvL2INode(xbase);
|
||||
_igvn.register_new_node_with_optimizer(xbase);
|
||||
igvn().register_new_node_with_optimizer(xbase);
|
||||
TRACE_ALIGN_VECTOR_NODE(xbase);
|
||||
#endif
|
||||
if (is_sub) {
|
||||
@ -3688,18 +3635,18 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
} else {
|
||||
xboi = new AddINode(xboi, xbase);
|
||||
}
|
||||
_igvn.register_new_node_with_optimizer(xboi);
|
||||
_phase->set_ctrl(xboi, pre_ctrl);
|
||||
igvn().register_new_node_with_optimizer(xboi);
|
||||
phase()->set_ctrl(xboi, pre_ctrl);
|
||||
TRACE_ALIGN_VECTOR_NODE(xboi);
|
||||
}
|
||||
|
||||
// 2: Compute (14):
|
||||
// XBOI = xboi / abs(scale)
|
||||
// The division is executed as shift
|
||||
Node* log2_abs_scale = _igvn.intcon(exact_log2(abs(scale)));
|
||||
Node* log2_abs_scale = igvn().intcon(exact_log2(abs(scale)));
|
||||
Node* XBOI = new URShiftINode(xboi, log2_abs_scale);
|
||||
_igvn.register_new_node_with_optimizer(XBOI);
|
||||
_phase->set_ctrl(XBOI, pre_ctrl);
|
||||
igvn().register_new_node_with_optimizer(XBOI);
|
||||
phase()->set_ctrl(XBOI, pre_ctrl);
|
||||
TRACE_ALIGN_VECTOR_NODE(log2_abs_scale);
|
||||
TRACE_ALIGN_VECTOR_NODE(XBOI);
|
||||
|
||||
@ -3713,8 +3660,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
} else {
|
||||
XBOI_OP_old_limit = new AddINode(XBOI, old_limit);
|
||||
}
|
||||
_igvn.register_new_node_with_optimizer(XBOI_OP_old_limit);
|
||||
_phase->set_ctrl(XBOI_OP_old_limit, pre_ctrl);
|
||||
igvn().register_new_node_with_optimizer(XBOI_OP_old_limit);
|
||||
phase()->set_ctrl(XBOI_OP_old_limit, pre_ctrl);
|
||||
TRACE_ALIGN_VECTOR_NODE(XBOI_OP_old_limit);
|
||||
|
||||
// 3.2: Compute:
|
||||
@ -3723,10 +3670,10 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
// = XBOI_OP_old_limit AND (AW - 1)
|
||||
// Since AW is a power of 2, the modulo operation can be replaced with
|
||||
// a bitmask operation.
|
||||
Node* mask_AW = _igvn.intcon(AW-1);
|
||||
Node* mask_AW = igvn().intcon(AW-1);
|
||||
Node* adjust_pre_iter = new AndINode(XBOI_OP_old_limit, mask_AW);
|
||||
_igvn.register_new_node_with_optimizer(adjust_pre_iter);
|
||||
_phase->set_ctrl(adjust_pre_iter, pre_ctrl);
|
||||
igvn().register_new_node_with_optimizer(adjust_pre_iter);
|
||||
phase()->set_ctrl(adjust_pre_iter, pre_ctrl);
|
||||
TRACE_ALIGN_VECTOR_NODE(mask_AW);
|
||||
TRACE_ALIGN_VECTOR_NODE(adjust_pre_iter);
|
||||
|
||||
@ -3739,8 +3686,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
} else {
|
||||
new_limit = new AddINode(old_limit, adjust_pre_iter);
|
||||
}
|
||||
_igvn.register_new_node_with_optimizer(new_limit);
|
||||
_phase->set_ctrl(new_limit, pre_ctrl);
|
||||
igvn().register_new_node_with_optimizer(new_limit);
|
||||
phase()->set_ctrl(new_limit, pre_ctrl);
|
||||
TRACE_ALIGN_VECTOR_NODE(new_limit);
|
||||
|
||||
// 5: Compute (15a, b):
|
||||
@ -3748,27 +3695,12 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
||||
Node* constrained_limit =
|
||||
(stride > 0) ? (Node*) new MinINode(new_limit, orig_limit)
|
||||
: (Node*) new MaxINode(new_limit, orig_limit);
|
||||
_igvn.register_new_node_with_optimizer(constrained_limit);
|
||||
_phase->set_ctrl(constrained_limit, pre_ctrl);
|
||||
igvn().register_new_node_with_optimizer(constrained_limit);
|
||||
phase()->set_ctrl(constrained_limit, pre_ctrl);
|
||||
TRACE_ALIGN_VECTOR_NODE(constrained_limit);
|
||||
|
||||
// 6: Hack the pre-loop limit
|
||||
_igvn.replace_input_of(pre_opaq, 1, constrained_limit);
|
||||
}
|
||||
|
||||
//------------------------------init---------------------------
|
||||
void SuperWord::init() {
|
||||
_dg.init();
|
||||
_packset.clear();
|
||||
_block.clear();
|
||||
_mem_slice_head.clear();
|
||||
_mem_slice_tail.clear();
|
||||
_node_info.clear();
|
||||
_align_to_ref = nullptr;
|
||||
_race_possible = 0;
|
||||
_early_return = false;
|
||||
_num_work_vecs = 0;
|
||||
_num_reductions = 0;
|
||||
igvn().replace_input_of(pre_opaq, 1, constrained_limit);
|
||||
}
|
||||
|
||||
//------------------------------print_packset---------------------------
|
||||
|
@ -139,8 +139,6 @@ class DepGraph {
|
||||
DepEdge* make_edge(DepMem* pred, Node* succ) { return make_edge(pred, dep(succ)); }
|
||||
DepEdge* make_edge(Node* pred, DepMem* succ) { return make_edge(dep(pred), succ); }
|
||||
|
||||
void init() { _map.clear(); } // initialize
|
||||
|
||||
void print(Node* n) { dep(n)->print(); }
|
||||
void print(DepMem* d) { d->print(); }
|
||||
};
|
||||
@ -200,18 +198,18 @@ class SWNodeInfo {
|
||||
// -----------------------------SuperWord---------------------------------
|
||||
// Transforms scalar operations into packed (superword) operations.
|
||||
class SuperWord : public ResourceObj {
|
||||
friend class VPointer;
|
||||
friend class CMoveKit;
|
||||
private:
|
||||
PhaseIdealLoop* _phase;
|
||||
Arena* _arena;
|
||||
PhaseIterGVN &_igvn;
|
||||
const VLoop& _vloop;
|
||||
|
||||
// Arena for small data structures. Large data structures are allocated in
|
||||
// VSharedData, and reused over many AutoVectorizations.
|
||||
Arena _arena;
|
||||
|
||||
enum consts { top_align = -1, bottom_align = -666 };
|
||||
|
||||
GrowableArray<Node_List*> _packset; // Packs for the current block
|
||||
|
||||
GrowableArray<int> _bb_idx; // Map from Node _idx to index within block
|
||||
GrowableArray<int> &_bb_idx; // Map from Node _idx to index within block
|
||||
|
||||
GrowableArray<Node*> _block; // Nodes in current block
|
||||
GrowableArray<PhiNode*> _mem_slice_head; // Memory slice head nodes
|
||||
@ -226,88 +224,87 @@ class SuperWord : public ResourceObj {
|
||||
GrowableArray<Node*> _nlist; // List of nodes
|
||||
|
||||
public:
|
||||
SuperWord(PhaseIdealLoop* phase);
|
||||
SuperWord(const VLoop &vloop, VSharedData &vshared);
|
||||
|
||||
bool transform_loop(IdealLoopTree* lpt, bool do_optimization);
|
||||
// Attempt to run the SuperWord algorithm on the loop. Return true if we succeed.
|
||||
bool transform_loop();
|
||||
|
||||
void unrolling_analysis(int &local_loop_unroll_factor);
|
||||
// Decide if loop can eventually be vectorized, and what unrolling factor is required.
|
||||
static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor);
|
||||
|
||||
// Accessors for VPointer
|
||||
PhaseIdealLoop* phase() const { return _phase; }
|
||||
IdealLoopTree* lpt() const { return _lpt; }
|
||||
PhiNode* iv() const { return _iv; }
|
||||
|
||||
bool early_return() const { return _early_return; }
|
||||
// VLoop Accessors
|
||||
const VLoop& vloop() const { return _vloop; }
|
||||
PhaseIdealLoop* phase() const { return vloop().phase(); }
|
||||
PhaseIterGVN& igvn() const { return vloop().phase()->igvn(); }
|
||||
IdealLoopTree* lpt() const { return vloop().lpt(); }
|
||||
CountedLoopNode* cl() const { return vloop().cl(); }
|
||||
PhiNode* iv() const { return vloop().iv(); }
|
||||
int iv_stride() const { return cl()->stride_con(); }
|
||||
bool in_bb(const Node* n) const { return vloop().in_bb(n); }
|
||||
|
||||
#ifndef PRODUCT
|
||||
// TraceAutoVectorization and TraceSuperWord
|
||||
bool is_trace_superword_precondition() const {
|
||||
return TraceSuperWord ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_PRECONDITION);
|
||||
}
|
||||
|
||||
bool is_trace_superword_vector_element_type() const {
|
||||
// Too verbose for TraceSuperWord
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::SW_TYPES);
|
||||
return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES);
|
||||
}
|
||||
|
||||
bool is_trace_superword_alignment() const {
|
||||
// Too verbose for TraceSuperWord
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
|
||||
return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
|
||||
}
|
||||
|
||||
bool is_trace_superword_memory_slices() const {
|
||||
return TraceSuperWord ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES);
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES);
|
||||
}
|
||||
|
||||
bool is_trace_superword_dependence_graph() const {
|
||||
return TraceSuperWord ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
|
||||
}
|
||||
|
||||
bool is_trace_superword_adjacent_memops() const {
|
||||
return TraceSuperWord ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
|
||||
}
|
||||
|
||||
bool is_trace_superword_rejections() const {
|
||||
return TraceSuperWord ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_REJECTIONS);
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS);
|
||||
}
|
||||
|
||||
bool is_trace_superword_packset() const {
|
||||
return TraceSuperWord ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_PACKSET);
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET);
|
||||
}
|
||||
|
||||
bool is_trace_superword_info() const {
|
||||
return TraceSuperWord ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_INFO);
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO);
|
||||
}
|
||||
|
||||
bool is_trace_superword_verbose() const {
|
||||
// Too verbose for TraceSuperWord
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
|
||||
return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
|
||||
}
|
||||
|
||||
bool is_trace_superword_any() const {
|
||||
return TraceSuperWord ||
|
||||
is_trace_align_vector() ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_PRECONDITION) ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_TYPES) ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES) ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_INFO) ||
|
||||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES) ||
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES) ||
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO) ||
|
||||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
|
||||
}
|
||||
|
||||
bool is_trace_align_vector() const {
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
|
||||
return vloop().vtrace().is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
|
||||
is_trace_superword_verbose();
|
||||
}
|
||||
#endif
|
||||
@ -318,30 +315,14 @@ class SuperWord : public ResourceObj {
|
||||
const GrowableArray<Node*>& block() const { return _block; }
|
||||
const DepGraph& dg() const { return _dg; }
|
||||
private:
|
||||
IdealLoopTree* _lpt; // Current loop tree node
|
||||
CountedLoopNode* _lp; // Current CountedLoopNode
|
||||
VectorSet _loop_reductions; // Reduction nodes in the current loop
|
||||
Node* _bb; // Current basic block
|
||||
PhiNode* _iv; // Induction var
|
||||
bool _race_possible; // In cases where SDMU is true
|
||||
bool _early_return; // True if we do not initialize
|
||||
bool _do_vector_loop; // whether to do vectorization/simd style
|
||||
int _num_work_vecs; // Number of non memory vector operations
|
||||
int _num_reductions; // Number of reduction expressions applied
|
||||
NOT_PRODUCT(VTrace _vtrace);
|
||||
|
||||
// Accessors
|
||||
Arena* arena() { return _arena; }
|
||||
|
||||
Node* bb() { return _bb; }
|
||||
void set_bb(Node* bb) { _bb = bb; }
|
||||
void set_lpt(IdealLoopTree* lpt) { _lpt = lpt; }
|
||||
CountedLoopNode* lp() const { return _lp; }
|
||||
void set_lp(CountedLoopNode* lp) {
|
||||
_lp = lp;
|
||||
_iv = lp->as_CountedLoop()->phi()->as_Phi();
|
||||
}
|
||||
int iv_stride() const { return lp()->stride_con(); }
|
||||
Arena* arena() { return &_arena; }
|
||||
|
||||
int vector_width(const Node* n) const {
|
||||
BasicType bt = velt_basic_type(n);
|
||||
@ -355,11 +336,8 @@ class SuperWord : public ResourceObj {
|
||||
const MemNode* align_to_ref() const { return _align_to_ref; }
|
||||
void set_align_to_ref(const MemNode* m) { _align_to_ref = m; }
|
||||
|
||||
const Node* ctrl(const Node* n) const { return _phase->has_ctrl(n) ? _phase->get_ctrl(n) : n; }
|
||||
|
||||
// block accessors
|
||||
public:
|
||||
bool in_bb(const Node* n) const { return n != nullptr && n->outcnt() > 0 && ctrl(n) == _bb; }
|
||||
int bb_idx(const Node* n) const { assert(in_bb(n), "must be"); return _bb_idx.at(n->_idx); }
|
||||
private:
|
||||
void set_bb_idx(Node* n, int i) { _bb_idx.at_put_grow(n->_idx, i); }
|
||||
@ -563,7 +541,6 @@ private:
|
||||
void adjust_pre_loop_limit_to_align_main_loop_vectors();
|
||||
// Is the use of d1 in u1 at the same operand position as d2 in u2?
|
||||
bool opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2);
|
||||
void init();
|
||||
|
||||
// print methods
|
||||
void print_packset();
|
||||
|
@ -30,7 +30,7 @@
|
||||
|
||||
#define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \
|
||||
flags(POINTER_ANALYSIS, "Trace VPointer") \
|
||||
flags(SW_PRECONDITION, "Trace SuperWord precondition") \
|
||||
flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \
|
||||
flags(SW_TYPES, "Trace SuperWord::compute_vector_element_type") \
|
||||
flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \
|
||||
flags(SW_MEMORY_SLICES, "Trace SuperWord memory slices") \
|
||||
@ -112,7 +112,6 @@ class TraceAutoVectorizationTagValidator {
|
||||
} else if (ALL == tag) {
|
||||
_tags.set_range(0, TRACE_AUTO_VECTORIZATION_TAG_NUM);
|
||||
} else if (SW_VERBOSE == tag) {
|
||||
_tags.at_put(SW_PRECONDITION, set_bit);
|
||||
_tags.at_put(SW_TYPES, set_bit);
|
||||
_tags.at_put(SW_ALIGNMENT, set_bit);
|
||||
_tags.at_put(SW_MEMORY_SLICES, set_bit);
|
||||
@ -123,7 +122,6 @@ class TraceAutoVectorizationTagValidator {
|
||||
_tags.at_put(SW_INFO, set_bit);
|
||||
_tags.at_put(SW_VERBOSE, set_bit);
|
||||
} else if (SW_INFO == tag) {
|
||||
_tags.at_put(SW_PRECONDITION, set_bit);
|
||||
_tags.at_put(SW_MEMORY_SLICES, set_bit);
|
||||
_tags.at_put(SW_DEPENDENCE_GRAPH, set_bit);
|
||||
_tags.at_put(SW_ADJACENT_MEMOPS, set_bit);
|
||||
|
@ -31,22 +31,103 @@
|
||||
#include "opto/rootnode.hpp"
|
||||
#include "opto/vectorization.hpp"
|
||||
|
||||
// Runs check_preconditions_helper() and reports whether it returned
// VLoop::SUCCESS.
//
// Returns true on success and false otherwise. In non-PRODUCT builds, when
// is_trace_preconditions() is enabled, the loop head is dumped before the
// check and the failure reason string is printed afterwards.
bool VLoop::check_preconditions() {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_preconditions()) {
|
||||
tty->print_cr("\nVLoop::check_preconditions");
|
||||
lpt()->dump_head();
|
||||
lpt()->head()->dump();
|
||||
}
|
||||
#endif
|
||||
|
||||
// The helper returns one of the VLoop::SUCCESS / VLoop::FAILURE_* string
// constants; the returned pointer is compared by identity, not by content.
const char* return_state = check_preconditions_helper();
|
||||
assert(return_state != nullptr, "must have return state");
|
||||
if (return_state == VLoop::SUCCESS) {
|
||||
return true; // success
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_preconditions()) {
|
||||
tty->print_cr("VLoop::check_preconditions: failed: %s", return_state);
|
||||
}
|
||||
#endif
|
||||
return false; // failure
|
||||
}
|
||||
|
||||
// Performs the individual precondition checks in a fixed order and returns
// VLoop::SUCCESS, or the VLoop::FAILURE_* constant of the first check that
// fails.
//
// Side effects: as the checks proceed, the cached fields _cl, _iv, _cl_exit
// and (for main loops only) _pre_loop_end are assigned. On an early return the
// fields that would have been assigned by later checks are left untouched.
const char* VLoop::check_preconditions_helper() {
|
||||
// Only accept vector width that is power of 2
|
||||
int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
|
||||
if (vector_width < 2 || !is_power_of_2(vector_width)) {
|
||||
return VLoop::FAILURE_VECTOR_WIDTH;
|
||||
}
|
||||
|
||||
// Only accept valid counted loops (int)
|
||||
if (!_lpt->_head->as_Loop()->is_valid_counted_loop(T_INT)) {
|
||||
return VLoop::FAILURE_VALID_COUNTED_LOOP;
|
||||
}
|
||||
// From here on the loop head is known to be a counted loop: cache the head
// and the Phi returned by its phi() accessor (used as the induction variable).
_cl = _lpt->_head->as_CountedLoop();
|
||||
_iv = _cl->phi()->as_Phi();
|
||||
|
||||
if (_cl->is_vectorized_loop()) {
|
||||
return VLoop::FAILURE_ALREADY_VECTORIZED;
|
||||
}
|
||||
|
||||
if (_cl->is_unroll_only()) {
|
||||
return VLoop::FAILURE_UNROLL_ONLY;
|
||||
}
|
||||
|
||||
// Check for control flow in the body
|
||||
// The body is treated as having control flow when the control input of the
// loop exit is not the loop head itself. This is only a failure when the
// VLoop was constructed with allow_cfg == false.
_cl_exit = _cl->loopexit();
|
||||
bool has_cfg = _cl_exit->in(0) != _cl;
|
||||
if (has_cfg && !is_allow_cfg()) {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_preconditions()) {
|
||||
tty->print_cr("VLoop::check_preconditions: fails because of control flow.");
|
||||
tty->print(" cl_exit %d", _cl_exit->_idx); _cl_exit->dump();
|
||||
tty->print(" cl_exit->in(0) %d", _cl_exit->in(0)->_idx); _cl_exit->in(0)->dump();
|
||||
tty->print(" lpt->_head %d", _cl->_idx); _cl->dump();
|
||||
_lpt->dump_head();
|
||||
}
|
||||
#endif
|
||||
return VLoop::FAILURE_CONTROL_FLOW;
|
||||
}
|
||||
|
||||
// Make sure there are no extra control users of the loop backedge
|
||||
if (_cl->back_control()->outcnt() != 1) {
|
||||
return VLoop::FAILURE_BACKEDGE;
|
||||
}
|
||||
|
||||
// To align vector memory accesses in the main-loop, we will have to adjust
|
||||
// the pre-loop limit.
|
||||
// Hence a main loop is only accepted if its pre-loop end can be found and the
// limit of that pre-loop end is an Opaque1 node. The pre-loop end is cached
// only after both checks have passed.
if (_cl->is_main_loop()) {
|
||||
CountedLoopEndNode* pre_end = _cl->find_pre_loop_end();
|
||||
if (pre_end == nullptr) {
|
||||
return VLoop::FAILURE_PRE_LOOP_LIMIT;
|
||||
}
|
||||
Node* pre_opaq1 = pre_end->limit();
|
||||
if (pre_opaq1->Opcode() != Op_Opaque1) {
|
||||
return VLoop::FAILURE_PRE_LOOP_LIMIT;
|
||||
}
|
||||
_pre_loop_end = pre_end;
|
||||
}
|
||||
|
||||
return VLoop::SUCCESS;
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
int VPointer::Tracer::_depth = 0;
|
||||
#endif
|
||||
|
||||
VPointer::VPointer(const MemNode* mem,
|
||||
PhaseIdealLoop* phase, IdealLoopTree* lpt,
|
||||
VPointer::VPointer(const MemNode* mem, const VLoop& vloop,
|
||||
Node_Stack* nstack, bool analyze_only) :
|
||||
_mem(mem), _phase(phase), _lpt(lpt),
|
||||
_iv(lpt->_head->as_CountedLoop()->phi()->as_Phi()),
|
||||
_mem(mem), _vloop(vloop),
|
||||
_base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
|
||||
#ifdef ASSERT
|
||||
_debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
|
||||
#endif
|
||||
_nstack(nstack), _analyze_only(analyze_only), _stack_idx(0)
|
||||
#ifndef PRODUCT
|
||||
, _tracer(phase->C->directive()->trace_auto_vectorization_tags().at(TraceAutoVectorizationTag::POINTER_ANALYSIS))
|
||||
, _tracer(vloop.is_trace_pointer_analysis())
|
||||
#endif
|
||||
{
|
||||
NOT_PRODUCT(_tracer.ctor_1(mem);)
|
||||
@ -109,7 +190,7 @@ VPointer::VPointer(const MemNode* mem,
|
||||
// Following is used to create a temporary object during
|
||||
// the pattern match of an address expression.
|
||||
VPointer::VPointer(VPointer* p) :
|
||||
_mem(p->_mem), _phase(p->_phase), _lpt(p->_lpt), _iv(p->_iv),
|
||||
_mem(p->_mem), _vloop(p->_vloop),
|
||||
_base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
|
||||
#ifdef ASSERT
|
||||
_debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
|
||||
@ -153,7 +234,7 @@ bool VPointer::invariant(Node* n) const {
|
||||
// main loop (Illegal invariant happens when n_c is a CastII node that
|
||||
// prevents data nodes to flow above the main loop).
|
||||
Node* n_c = phase()->get_ctrl(n);
|
||||
return phase()->is_dominator(n_c, cl->pre_loop_head());
|
||||
return phase()->is_dominator(n_c, vloop().pre_loop_head());
|
||||
}
|
||||
}
|
||||
return is_not_member;
|
||||
|
@ -48,15 +48,131 @@ public:
|
||||
};
|
||||
#endif
|
||||
|
||||
// Basic loop structure accessors and vectorization preconditions checking
|
||||
class VLoop : public StackObj {
|
||||
private:
|
||||
PhaseIdealLoop* const _phase;
|
||||
IdealLoopTree* const _lpt;
|
||||
const bool _allow_cfg;
|
||||
CountedLoopNode* _cl;
|
||||
Node* _cl_exit;
|
||||
PhiNode* _iv;
|
||||
CountedLoopEndNode* _pre_loop_end; // cache access to pre-loop for main loops only
|
||||
|
||||
NOT_PRODUCT(VTrace _vtrace;)
|
||||
|
||||
static constexpr char const* SUCCESS = "success";
|
||||
static constexpr char const* FAILURE_ALREADY_VECTORIZED = "loop already vectorized";
|
||||
static constexpr char const* FAILURE_UNROLL_ONLY = "loop only wants to be unrolled";
|
||||
static constexpr char const* FAILURE_VECTOR_WIDTH = "vector_width must be power of 2";
|
||||
static constexpr char const* FAILURE_VALID_COUNTED_LOOP = "must be valid counted loop (int)";
|
||||
static constexpr char const* FAILURE_CONTROL_FLOW = "control flow in loop not allowed";
|
||||
static constexpr char const* FAILURE_BACKEDGE = "nodes on backedge not allowed";
|
||||
static constexpr char const* FAILURE_PRE_LOOP_LIMIT = "main-loop must be able to adjust pre-loop-limit (not found)";
|
||||
|
||||
public:
|
||||
VLoop(IdealLoopTree* lpt, bool allow_cfg) :
|
||||
_phase (lpt->_phase),
|
||||
_lpt (lpt),
|
||||
_allow_cfg (allow_cfg),
|
||||
_cl (nullptr),
|
||||
_cl_exit (nullptr),
|
||||
_iv (nullptr) {}
|
||||
NONCOPYABLE(VLoop);
|
||||
|
||||
IdealLoopTree* lpt() const { return _lpt; };
|
||||
PhaseIdealLoop* phase() const { return _phase; }
|
||||
CountedLoopNode* cl() const { return _cl; };
|
||||
Node* cl_exit() const { return _cl_exit; };
|
||||
PhiNode* iv() const { return _iv; };
|
||||
int iv_stride() const { return cl()->stride_con(); };
|
||||
bool is_allow_cfg() const { return _allow_cfg; }
|
||||
|
||||
CountedLoopEndNode* pre_loop_end() const {
|
||||
assert(cl()->is_main_loop(), "only main loop can reference pre-loop");
|
||||
assert(_pre_loop_end != nullptr, "must have found it");
|
||||
return _pre_loop_end;
|
||||
};
|
||||
|
||||
CountedLoopNode* pre_loop_head() const {
|
||||
CountedLoopNode* head = pre_loop_end()->loopnode();
|
||||
assert(head != nullptr, "must find head");
|
||||
return head;
|
||||
};
|
||||
|
||||
// Estimate maximum size for data structures, to avoid repeated reallocation
|
||||
int estimated_body_length() const { return lpt()->_body.size(); };
|
||||
int estimated_node_count() const { return (int)(1.10 * phase()->C->unique()); };
|
||||
|
||||
#ifndef PRODUCT
|
||||
const VTrace& vtrace() const { return _vtrace; }
|
||||
|
||||
bool is_trace_preconditions() const {
|
||||
return vtrace().is_trace(TraceAutoVectorizationTag::PRECONDITIONS);
|
||||
}
|
||||
|
||||
bool is_trace_pointer_analysis() const {
|
||||
return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Is the node in the basic block of the loop?
|
||||
// We only accept any nodes which have the loop head as their ctrl.
|
||||
bool in_bb(const Node* n) const {
|
||||
const Node* ctrl = _phase->has_ctrl(n) ? _phase->get_ctrl(n) : n;
|
||||
return n != nullptr && n->outcnt() > 0 && ctrl == _cl;
|
||||
}
|
||||
|
||||
// Check if the loop passes some basic preconditions for vectorization.
|
||||
// Return indicates if analysis succeeded.
|
||||
bool check_preconditions();
|
||||
|
||||
private:
|
||||
const char* check_preconditions_helper();
|
||||
};
|
||||
|
||||
// Optimization to keep allocation of large arrays in AutoVectorization low.
|
||||
// We allocate the arrays once, and reuse them for multiple loops that we
|
||||
// AutoVectorize, clearing them before every new use.
|
||||
class VSharedData : public StackObj {
|
||||
private:
|
||||
// Arena, used to allocate all arrays from.
|
||||
Arena _arena;
|
||||
|
||||
// An array that maps node->_idx to a much smaller idx, which is at most the
|
||||
// size of a loop body. This allow us to have smaller arrays for other data
|
||||
// structures, since we are using smaller indices.
|
||||
GrowableArray<int> _node_idx_to_loop_body_idx;
|
||||
|
||||
public:
|
||||
VSharedData() :
|
||||
_arena(mtCompiler),
|
||||
_node_idx_to_loop_body_idx(&_arena, estimated_node_count(), 0, 0)
|
||||
{
|
||||
}
|
||||
|
||||
GrowableArray<int>& node_idx_to_loop_body_idx() {
|
||||
return _node_idx_to_loop_body_idx;
|
||||
}
|
||||
|
||||
// Must be cleared before each AutoVectorization use
|
||||
void clear() {
|
||||
_node_idx_to_loop_body_idx.clear();
|
||||
}
|
||||
|
||||
private:
|
||||
static int estimated_node_count() {
|
||||
return (int)(1.10 * Compile::current()->unique());
|
||||
}
|
||||
};
|
||||
|
||||
// A vectorization pointer (VPointer) has information about an address for
|
||||
// dependence checking and vector alignment. It's usually bound to a memory
|
||||
// operation in a counted loop for vectorizable analysis.
|
||||
class VPointer : public ArenaObj {
|
||||
protected:
|
||||
const MemNode* _mem; // My memory reference node
|
||||
PhaseIdealLoop* _phase; // PhaseIdealLoop handle
|
||||
IdealLoopTree* _lpt; // Current IdealLoopTree
|
||||
PhiNode* _iv; // The loop induction variable
|
||||
const VLoop& _vloop;
|
||||
|
||||
Node* _base; // null if unsafe nonheap reference
|
||||
Node* _adr; // address pointer
|
||||
@ -74,9 +190,10 @@ class VPointer : public ArenaObj {
|
||||
bool _analyze_only; // Used in loop unrolling only for vpointer trace
|
||||
uint _stack_idx; // Used in loop unrolling only for vpointer trace
|
||||
|
||||
PhaseIdealLoop* phase() const { return _phase; }
|
||||
IdealLoopTree* lpt() const { return _lpt; }
|
||||
PhiNode* iv() const { return _iv; }
|
||||
const VLoop& vloop() const { return _vloop; }
|
||||
PhaseIdealLoop* phase() const { return vloop().phase(); }
|
||||
IdealLoopTree* lpt() const { return vloop().lpt(); }
|
||||
PhiNode* iv() const { return vloop().iv(); }
|
||||
|
||||
bool is_loop_member(Node* n) const;
|
||||
bool invariant(Node* n) const;
|
||||
@ -97,13 +214,19 @@ class VPointer : public ArenaObj {
|
||||
NotComparable = (Less | Greater | Equal)
|
||||
};
|
||||
|
||||
VPointer(const MemNode* mem,
|
||||
PhaseIdealLoop* phase, IdealLoopTree* lpt,
|
||||
VPointer(const MemNode* mem, const VLoop& vloop) :
|
||||
VPointer(mem, vloop, nullptr, false) {}
|
||||
VPointer(const MemNode* mem, const VLoop& vloop, Node_Stack* nstack) :
|
||||
VPointer(mem, vloop, nstack, true) {}
|
||||
private:
|
||||
VPointer(const MemNode* mem, const VLoop& vloop,
|
||||
Node_Stack* nstack, bool analyze_only);
|
||||
// Following is used to create a temporary object during
|
||||
// the pattern match of an address expression.
|
||||
VPointer(VPointer* p);
|
||||
NONCOPYABLE(VPointer);
|
||||
|
||||
public:
|
||||
bool valid() const { return _adr != nullptr; }
|
||||
bool has_iv() const { return _scale != 0; }
|
||||
|
||||
@ -143,7 +266,7 @@ class VPointer : public ArenaObj {
|
||||
bool overlap_possible_with_any_in(Node_List* p) {
|
||||
for (uint k = 0; k < p->size(); k++) {
|
||||
MemNode* mem = p->at(k)->as_Mem();
|
||||
VPointer p_mem(mem, phase(), lpt(), nullptr, false);
|
||||
VPointer p_mem(mem, vloop());
|
||||
// Only if we know that we have Less or Greater can we
|
||||
// be sure that there can never be an overlap between
|
||||
// the two memory regions.
|
||||
|
Loading…
x
Reference in New Issue
Block a user