8324890: C2 SuperWord: refactor out VLoop, make unrolling_analysis static, remove init/reset mechanism

Reviewed-by: kvn, roland
This commit is contained in:
Emanuel Peter 2024-02-10 14:19:01 +00:00
parent 71d2dbd0b6
commit 232d136885
9 changed files with 483 additions and 365 deletions

@ -1104,12 +1104,9 @@ void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLo
if (!cl->was_slp_analyzed()) {
Compile::TracePhase tp("autoVectorize", &Phase::timers[Phase::_t_autoVectorize]);
SuperWord sw(phase);
sw.transform_loop(this, false);
// If the loop is slp canonical analyze it
if (sw.early_return() == false) {
sw.unrolling_analysis(_local_loop_unroll_factor);
VLoop vloop(this, true);
if (vloop.check_preconditions()) {
SuperWord::unrolling_analysis(vloop, _local_loop_unroll_factor);
}
}

@ -45,7 +45,7 @@
#include "opto/predicates.hpp"
#include "opto/rootnode.hpp"
#include "opto/runtime.hpp"
#include "opto/superword.hpp"
#include "opto/vectorization.hpp"
#include "runtime/sharedRuntime.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/powerOfTwo.hpp"
@ -4863,30 +4863,30 @@ void PhaseIdealLoop::build_and_optimize() {
C->set_major_progress();
}
// Convert scalar to superword operations at the end of all loop opts.
// Auto-vectorize main-loop
if (C->do_superword() && C->has_loops() && !C->major_progress()) {
Compile::TracePhase tp("autoVectorize", &timers[_t_autoVectorize]);
// SuperWord transform
SuperWord sw(this);
// Shared data structures for all AutoVectorizations, to reduce allocations
// of large arrays.
VSharedData vshared;
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
IdealLoopTree* lpt = iter.current();
if (lpt->is_counted()) {
CountedLoopNode *cl = lpt->_head->as_CountedLoop();
if (cl->is_main_loop()) {
if (!sw.transform_loop(lpt, true)) {
// Instigate more unrolling for optimization when vectorization fails.
if (cl->has_passed_slp()) {
C->set_major_progress();
cl->set_notpassed_slp();
cl->mark_do_unroll_only();
}
}
AutoVectorizeStatus status = auto_vectorize(lpt, vshared);
if (status == AutoVectorizeStatus::TriedAndFailed) {
// We tried vectorization, but failed. From now on only unroll the loop.
CountedLoopNode* cl = lpt->_head->as_CountedLoop();
if (cl->has_passed_slp()) {
C->set_major_progress();
cl->set_notpassed_slp();
cl->mark_do_unroll_only();
}
}
}
}
// Move UnorderedReduction out of counted loop. Can be introduced by SuperWord.
// Move UnorderedReduction out of counted loop. Can be introduced by AutoVectorization.
if (C->has_loops() && !C->major_progress()) {
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
IdealLoopTree* lpt = iter.current();
@ -5963,30 +5963,6 @@ CountedLoopEndNode* CountedLoopNode::find_pre_loop_end() {
return pre_end;
}
// Return the head of the pre loop that belongs to this main loop.
// The head is looked up through the cached pre loop end (_pre_loop_end), so
// this may only be called on a main loop and only after the cache was set;
// both conditions are asserted.
CountedLoopNode* CountedLoopNode::pre_loop_head() const {
assert(is_main_loop(), "Only main loop has pre loop");
assert(_pre_loop_end != nullptr && _pre_loop_end->loopnode() != nullptr,
"should find head from pre loop end");
return _pre_loop_end->loopnode();
}
// Return the cached CountedLoopEndNode of the pre loop of this main loop.
// In ASSERT builds the cached value is additionally verified: it must be
// non-null, equal to what find_pre_loop_end() finds now, and equal to the
// loop exit of pre_loop_head().
CountedLoopEndNode* CountedLoopNode::pre_loop_end() {
#ifdef ASSERT
assert(is_main_loop(), "Only main loop has pre loop");
assert(_pre_loop_end != nullptr, "should be set when fetched");
// Re-derive the pre loop end and compare it with the cached node.
Node* found_pre_end = find_pre_loop_end();
assert(_pre_loop_end == found_pre_end && _pre_loop_end == pre_loop_head()->loopexit(),
"should find the pre loop end and must be the same result");
#endif
return _pre_loop_end;
}
// Cache the CountedLoopEndNode of the pre loop in _pre_loop_end.
// Asserts that this node is a main loop and that pre_loop_end is non-null.
void CountedLoopNode::set_pre_loop_end(CountedLoopEndNode* pre_loop_end) {
assert(is_main_loop(), "Only main loop has pre loop");
assert(pre_loop_end, "must be valid");
_pre_loop_end = pre_loop_end;
}
//------------------------------get_late_ctrl----------------------------------
// Compute latest legal control.
Node *PhaseIdealLoop::get_late_ctrl( Node *n, Node *early ) {

@ -43,6 +43,7 @@ class PredicateBlock;
class PathFrequency;
class PhaseIdealLoop;
class VectorSet;
class VSharedData;
class Invariance;
struct small_cache;
@ -231,14 +232,11 @@ class CountedLoopNode : public BaseCountedLoopNode {
// vector mapped unroll factor here
int _slp_maximum_unroll_factor;
// Cached CountedLoopEndNode of pre loop for main loops
CountedLoopEndNode* _pre_loop_end;
public:
CountedLoopNode(Node *entry, Node *backedge)
: BaseCountedLoopNode(entry, backedge), _main_idx(0), _trip_count(max_juint),
_unrolled_count_log2(0), _node_count_before_unroll(0),
_slp_maximum_unroll_factor(0), _pre_loop_end(nullptr) {
_slp_maximum_unroll_factor(0) {
init_class_id(Class_CountedLoop);
// Initialize _trip_count to the largest possible value.
// Will be reset (lower) if the loop's trip count is known.
@ -330,9 +328,6 @@ public:
Node* is_canonical_loop_entry();
CountedLoopEndNode* find_pre_loop_end();
CountedLoopNode* pre_loop_head() const;
CountedLoopEndNode* pre_loop_end();
void set_pre_loop_end(CountedLoopEndNode* pre_loop_end);
#ifndef PRODUCT
virtual void dump_spec(outputStream *st) const;
@ -1437,6 +1432,14 @@ public:
bool partial_peel( IdealLoopTree *loop, Node_List &old_new );
bool duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old_new);
// AutoVectorize the loop: replace scalar ops with vector ops.
// AutoVectorizeStatus is the outcome that auto_vectorize() reports for one loop.
enum AutoVectorizeStatus {
Impossible, // This loop has the wrong shape to even try vectorization.
            // Returned by auto_vectorize() for loops that are not counted
            // or are not a main-loop.
Success, // We just successfully vectorized the loop.
TriedAndFailed, // We tried to vectorize, but failed.
                // Returned when the VLoop preconditions do not hold or when
                // SuperWord::transform_loop() returns false.
};
// lpt:     the loop to vectorize.
// vshared: data shared between all auto-vectorization attempts; it is
//          cleared by auto_vectorize() before it is handed to SuperWord.
AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared);
// Move UnorderedReduction out of loop if possible
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);

@ -41,6 +41,7 @@
#include "opto/rootnode.hpp"
#include "opto/subnode.hpp"
#include "opto/subtypenode.hpp"
#include "opto/superword.hpp"
#include "opto/vectornode.hpp"
#include "utilities/macros.hpp"
@ -4209,6 +4210,36 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old
return true;
}
// AutoVectorize the loop: replace scalar ops with vector ops.
//
// Returns:
//   Impossible     - lpt is not a counted loop, or its head is not a main-loop.
//   TriedAndFailed - the VLoop preconditions do not hold, or SuperWord could
//                    not transform the loop.
//   Success        - SuperWord transformed the loop.
PhaseIdealLoop::AutoVectorizeStatus
PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
  // Only counted main-loops are candidates. The head is only cast to a
  // CountedLoopNode once we know the loop is counted.
  const bool is_counted_main_loop =
      lpt->is_counted() && lpt->_head->as_CountedLoop()->is_main_loop();
  if (!is_counted_main_loop) {
    return AutoVectorizeStatus::Impossible;
  }

  VLoop vloop(lpt, false);
  if (vloop.check_preconditions()) {
    // Ensure the shared data is cleared before each use
    vshared.clear();

    SuperWord sw(vloop, vshared);
    if (sw.transform_loop()) {
      return AutoVectorizeStatus::Success;
    }
  }

  // Either the preconditions or the SuperWord transformation failed.
  return AutoVectorizeStatus::TriedAndFailed;
}
// Having ReductionNodes in the loop is expensive. They need to recursively
// fold together the vector values, for every vectorized loop iteration. If
// we encounter the following pattern, we can vector accumulate the values

@ -38,134 +38,40 @@
#include "opto/movenode.hpp"
#include "utilities/powerOfTwo.hpp"
//
// S U P E R W O R D T R A N S F O R M
//=============================================================================
//------------------------------SuperWord---------------------------
SuperWord::SuperWord(PhaseIdealLoop* phase) :
_phase(phase),
_arena(phase->C->comp_arena()),
_igvn(phase->_igvn),
SuperWord::SuperWord(const VLoop &vloop, VSharedData &vshared) :
_vloop(vloop),
_arena(mtCompiler),
_packset(arena(), 8, 0, nullptr), // packs for the current block
_bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
_block(arena(), 8, 0, nullptr), // nodes in current block
_bb_idx(vshared.node_idx_to_loop_body_idx()), // node idx to index in bb
_block(arena(), vloop.estimated_body_length(), 0, nullptr), // nodes in current block
_mem_slice_head(arena(), 8, 0, nullptr), // memory slice heads
_mem_slice_tail(arena(), 8, 0, nullptr), // memory slice tails
_node_info(arena(), 8, 0, SWNodeInfo::initial), // info needed per node
_clone_map(phase->C->clone_map()), // map of nodes created in cloning
_node_info(arena(), vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
_clone_map(phase()->C->clone_map()), // map of nodes created in cloning
_align_to_ref(nullptr), // memory reference to align vectors to
_dg(_arena), // dependence graph
_nlist(arena(), 8, 0, nullptr), // scratch list of nodes
_lpt(nullptr), // loop tree node
_lp(nullptr), // CountedLoopNode
_dg(arena()), // dependence graph
_nlist(arena(), vloop.estimated_body_length(), 0, nullptr), // scratch list of nodes
_loop_reductions(arena()), // reduction nodes in the current loop
_bb(nullptr), // basic block
_iv(nullptr), // induction var
_race_possible(false), // cases where SDMU is true
_early_return(true), // analysis evaluations routine
_do_vector_loop(phase->C->do_vector_loop()), // whether to do vectorization/simd style
_do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style
_num_work_vecs(0), // amount of vector work we have
_num_reductions(0) // amount of reduction work we have
{
}
//------------------------------transform_loop---------------------------
bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
assert(_phase->C->do_superword(), "SuperWord option should be enabled");
// SuperWord only works with power of two vector sizes.
int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
if (vector_width < 2 || !is_power_of_2(vector_width)) {
return false;
}
void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor) {
IdealLoopTree* lpt = vloop.lpt();
CountedLoopNode* cl = vloop.cl();
Node* cl_exit = vloop.cl_exit();
PhaseIdealLoop* phase = vloop.phase();
assert(lpt->_head->is_CountedLoop(), "must be");
CountedLoopNode *cl = lpt->_head->as_CountedLoop();
if (!cl->is_valid_counted_loop(T_INT)) {
return false; // skip malformed counted loop
}
// Initialize simple data used by reduction marking early.
set_lpt(lpt);
set_lp(cl);
// For now, define one block which is the entire loop body.
set_bb(cl);
if (SuperWordReductions) {
mark_reductions();
}
// skip any loop that has not been assigned max unroll by analysis
if (do_optimization) {
if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) {
return false;
}
}
// Check for no control flow in body (other than exit)
Node *cl_exit = cl->loopexit();
if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
#ifndef PRODUCT
if (is_trace_superword_precondition()) {
tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
lpt->dump_head();
}
#endif
return false;
}
// Make sure there are no extra control users of the loop backedge
if (cl->back_control()->outcnt() != 1) {
return false;
}
// Skip any loops already optimized by slp
if (cl->is_vectorized_loop()) {
return false;
}
if (cl->is_unroll_only()) {
return false;
}
if (cl->is_main_loop()) {
// Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
CountedLoopEndNode* pre_end = cl->find_pre_loop_end();
if (pre_end == nullptr) {
return false;
}
Node* pre_opaq1 = pre_end->limit();
if (pre_opaq1->Opcode() != Op_Opaque1) {
return false;
}
cl->set_pre_loop_end(pre_end);
}
init(); // initialize data structures
bool success = true;
if (do_optimization) {
assert(_packset.length() == 0, "packset must be empty");
success = SLP_extract();
}
return success;
}
//------------------------------early unrolling analysis------------------------------
void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
bool is_slp = true;
size_t ignored_size = lpt()->_body.size();
size_t ignored_size = lpt->_body.size();
int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
Node_Stack nstack((int)ignored_size);
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
Node *cl_exit = cl->loopexit_or_null();
// First clear the entries
for (uint i = 0; i < lpt()->_body.size(); i++) {
for (uint i = 0; i < lpt->_body.size(); i++) {
ignored_loop_nodes[i] = -1;
}
@ -173,8 +79,8 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
// Process the loop, some/all of the stack entries will not be in order, ergo
// need to preprocess the ignored initial state before we process the loop
for (uint i = 0; i < lpt()->_body.size(); i++) {
Node* n = lpt()->_body.at(i);
for (uint i = 0; i < lpt->_body.size(); i++) {
Node* n = lpt->_body.at(i);
if (n == cl->incr() ||
n->is_AddP() ||
n->is_Cmp() ||
@ -189,7 +95,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
if (n->is_If()) {
IfNode *iff = n->as_If();
if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
if (lpt()->is_loop_exit(iff)) {
if (lpt->is_loop_exit(iff)) {
ignored_loop_nodes[i] = n->_idx;
continue;
}
@ -233,10 +139,10 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
if (n->is_Mem()) {
MemNode* current = n->as_Mem();
Node* adr = n->in(MemNode::Address);
Node* n_ctrl = _phase->get_ctrl(adr);
Node* n_ctrl = phase->get_ctrl(adr);
// save a queue of post process nodes
if (n_ctrl != nullptr && lpt()->is_member(_phase->get_loop(n_ctrl))) {
if (n_ctrl != nullptr && lpt->is_member(phase->get_loop(n_ctrl))) {
// Process the memory expression
int stack_idx = 0;
bool have_side_effects = true;
@ -244,15 +150,15 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
nstack.push(adr, stack_idx++);
} else {
// Mark the components of the memory operation in nstack
VPointer p1(current, phase(), lpt(), &nstack, true);
VPointer p1(current, vloop, &nstack);
have_side_effects = p1.node_stack()->is_nonempty();
}
// Process the pointer stack
while (have_side_effects) {
Node* pointer_node = nstack.node();
for (uint j = 0; j < lpt()->_body.size(); j++) {
Node* cur_node = lpt()->_body.at(j);
for (uint j = 0; j < lpt->_body.size(); j++) {
Node* cur_node = lpt->_body.at(j);
if (cur_node == pointer_node) {
ignored_loop_nodes[j] = cur_node->_idx;
break;
@ -269,11 +175,11 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
// Now we try to find the maximum supported consistent vector which the machine
// description can use
bool flag_small_bt = false;
for (uint i = 0; i < lpt()->_body.size(); i++) {
for (uint i = 0; i < lpt->_body.size(); i++) {
if (ignored_loop_nodes[i] != -1) continue;
BasicType bt;
Node* n = lpt()->_body.at(i);
Node* n = lpt->_body.at(i);
if (n->is_Mem()) {
bt = n->as_Mem()->memory_type();
} else {
@ -313,11 +219,11 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
for (uint j = start; j < end; j++) {
Node* in = n->in(j);
// Don't propagate through a memory
if (!in->is_Mem() && in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
if (!in->is_Mem() && vloop.in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
bool same_type = true;
for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
Node *use = in->fast_out(k);
if (!in_bb(use) && use->bottom_type()->basic_type() != bt) {
if (!vloop.in_bb(use) && use->bottom_type()->basic_type() != bt) {
same_type = false;
break;
}
@ -403,8 +309,8 @@ void SuperWord::mark_reductions() {
// Iterate through all phi nodes associated to the loop and search for
// reduction cycles in the basic block.
for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
const Node* phi = lp()->fast_out(i);
for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) {
const Node* phi = cl()->fast_out(i);
if (!phi->is_Phi()) {
continue;
}
@ -482,6 +388,44 @@ void SuperWord::mark_reductions() {
}
}
// Run SuperWord on the loop this object was constructed for.
// Returns true iff SLP_extract() vectorized the loop. Returns false without
// calling SLP_extract() when SuperWordLoopUnrollAnalysis is enabled and the
// loop has slp_max_unroll() == 0.
// In non-PRODUCT builds, progress is traced if is_trace_superword_any().
bool SuperWord::transform_loop() {
  assert(phase()->C->do_superword(), "SuperWord option should be enabled");
  assert(cl()->is_main_loop(), "SLP should only work on main loops");

#ifndef PRODUCT
  if (is_trace_superword_any()) {
    tty->print_cr("\nSuperWord::transform_loop:");
    lpt()->dump_head();
    cl()->dump();
  }
#endif

  // Skip any loop that has not been assigned max unroll by analysis
  const bool unroll_analysis_missing =
      SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0;
  if (unroll_analysis_missing) {
#ifndef PRODUCT
    if (is_trace_superword_any()) {
      tty->print_cr("\nSuperWord::transform_loop failed: slp max unroll analysis was not already done");
    }
#endif
    return false;
  }

  // The actual vectorization attempt; its outcome is the result of this call.
  const bool vectorized = SLP_extract();

#ifndef PRODUCT
  if (is_trace_superword_any()) {
    if (vectorized) {
      tty->print_cr("\nSuperWord::transform_loop: success");
    } else {
      tty->print_cr("\nSuperWord::transform_loop failed: SuperWord::SLP_extract did not vectorize");
    }
  }
#endif

  return vectorized;
}
//------------------------------SLP_extract---------------------------
// Extract the superword level parallelism
//
@ -517,8 +461,11 @@ void SuperWord::mark_reductions() {
// extraction of scalar values from vectors.
//
bool SuperWord::SLP_extract() {
CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
assert(cl->is_main_loop(), "SLP should only work on main loops");
assert(cl()->is_main_loop(), "SLP should only work on main loops");
if (SuperWordReductions) {
mark_reductions();
}
// Find memory slices
find_memory_slices();
@ -629,13 +576,13 @@ void SuperWord::find_adjacent_refs() {
set_align_to_ref(align_to_mem_ref);
}
VPointer align_to_ref_p(mem_ref, phase(), lpt(), nullptr, false);
VPointer align_to_ref_p(mem_ref, vloop());
// Set alignment relative to "align_to_ref" for all related memory operations.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* s = memops.at(i)->as_Mem();
if (isomorphic(s, mem_ref) &&
(!_do_vector_loop || same_origin_idx(s, mem_ref))) {
VPointer p2(s, phase(), lpt(), nullptr, false);
VPointer p2(s, vloop());
if (p2.comparable(align_to_ref_p)) {
int align = memory_alignment(s, iv_adjustment);
set_alignment(s, align);
@ -694,11 +641,11 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
// Count number of comparable memory ops
for (uint i = 0; i < memops.size(); i++) {
MemNode* s1 = memops.at(i)->as_Mem();
VPointer p1(s1, phase(), lpt(), nullptr, false);
VPointer p1(s1, vloop());
for (uint j = i+1; j < memops.size(); j++) {
MemNode* s2 = memops.at(j)->as_Mem();
if (isomorphic(s1, s2)) {
VPointer p2(s2, phase(), lpt(), nullptr, false);
VPointer p2(s2, vloop());
if (p1.comparable(p2)) {
(*cmp_ct.adr_at(i))++;
(*cmp_ct.adr_at(j))++;
@ -719,7 +666,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
if (s->is_Store()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
VPointer p(s, phase(), lpt(), nullptr, false);
VPointer p(s, vloop());
if ( cmp_ct.at(j) > max_ct ||
(cmp_ct.at(j) == max_ct &&
( vw > max_vw ||
@ -742,7 +689,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
if (s->is_Load()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
VPointer p(s, phase(), lpt(), nullptr, false);
VPointer p(s, vloop());
if ( cmp_ct.at(j) > max_ct ||
(cmp_ct.at(j) == max_ct &&
( vw > max_vw ||
@ -815,7 +762,7 @@ int SuperWord::get_vw_bytes_special(MemNode* s) {
//---------------------------get_iv_adjustment---------------------------
// Calculate loop's iv adjustment for this memory ops.
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
VPointer align_to_ref_p(mem_ref, phase(), lpt(), nullptr, false);
VPointer align_to_ref_p(mem_ref, vloop());
int offset = align_to_ref_p.offset_in_bytes();
int scale = align_to_ref_p.scale_in_bytes();
int elt_size = align_to_ref_p.memory_size();
@ -884,13 +831,13 @@ void SuperWord::dependence_graph() {
if (_dg.dep(s1)->in_cnt() == 0) {
_dg.make_edge(slice, s1);
}
VPointer p1(s1->as_Mem(), phase(), lpt(), nullptr, false);
VPointer p1(s1->as_Mem(), vloop());
bool sink_dependent = true;
for (int k = j - 1; k >= 0; k--) {
Node* s2 = _nlist.at(k);
if (s1->is_Load() && s2->is_Load())
continue;
VPointer p2(s2->as_Mem(), phase(), lpt(), nullptr, false);
VPointer p2(s2->as_Mem(), vloop());
int cmp = p1.cmp(p2);
if (!VPointer::not_equal(cmp)) {
@ -923,8 +870,8 @@ void SuperWord::find_memory_slices() {
assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");
// Iterate over all memory phis
for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
PhiNode* phi = lp()->fast_out(i)->isa_Phi();
for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) {
PhiNode* phi = cl()->fast_out(i)->isa_Phi();
if (phi != nullptr && in_bb(phi) && phi->is_memory_phi()) {
Node* phi_tail = phi->in(LoopNode::LoopBackControl);
if (phi_tail != phi->in(LoopNode::EntryControl)) {
@ -1060,8 +1007,8 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
// Adjacent memory references must have the same base, be comparable
// and have the correct distance between them.
VPointer p1(s1->as_Mem(), phase(), lpt(), nullptr, false);
VPointer p2(s2->as_Mem(), phase(), lpt(), nullptr, false);
VPointer p1(s1->as_Mem(), vloop());
VPointer p2(s2->as_Mem(), vloop());
if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
return diff == data_size(s1);
@ -1352,7 +1299,7 @@ bool SuperWord::follow_def_uses(Node_List* p) {
// Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
continue;
}
if (t2->Opcode() == Op_AddI && t2 == _lp->as_CountedLoop()->incr()) continue; // don't mess with the iv
if (t2->Opcode() == Op_AddI && t2 == cl()->incr()) continue; // don't mess with the iv
if (!opnd_positions_match(s1, t1, s2, t2))
continue;
int adjusted_align = alignment(s1);
@ -1651,8 +1598,8 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(Node_List* pack) {
assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs");
const MemNode* mem_ref = pack->at(0)->as_Mem();
VPointer mem_ref_p(mem_ref, phase(), lpt(), nullptr, false);
const CountedLoopEndNode* pre_end = lp()->pre_loop_end();
VPointer mem_ref_p(mem_ref, vloop());
const CountedLoopEndNode* pre_end = vloop().pre_loop_end();
assert(pre_end->stride_is_con(), "pre loop stride is constant");
AlignmentSolver solver(pack->at(0)->as_Mem(),
@ -1971,8 +1918,8 @@ bool SuperWord::profitable(Node_List* p) {
// Reductions should only have a Phi use at the loop head or a non-phi use
// outside of the loop if it is the last element of the pack (e.g. SafePoint).
if (is_marked_reduction(def) &&
((use->is_Phi() && use->in(0) == _lpt->_head) ||
(!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) {
((use->is_Phi() && use->in(0) == lpt()->_head) ||
(!lpt()->is_member(phase()->get_loop(phase()->ctrl_or_self(use))) && i == p->size()-1))) {
continue;
}
if (!is_vector_use(use, k)) {
@ -2327,7 +2274,7 @@ void SuperWord::schedule() {
#endif
CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
_phase->C->print_method(PHASE_SUPERWORD1_BEFORE_SCHEDULE, 4, cl);
phase()->C->print_method(PHASE_SUPERWORD1_BEFORE_SCHEDULE, 4, cl);
// (4) Use the memops_schedule to re-order the memops in all slices.
schedule_reorder_memops(memops_schedule);
@ -2337,7 +2284,7 @@ void SuperWord::schedule() {
// Reorder the memory graph for all slices in parallel. We walk over the schedule once,
// and track the current memory state of each slice.
void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
int max_slices = _phase->C->num_alias_types();
int max_slices = phase()->C->num_alias_types();
// When iterating over the memops_schedule, we keep track of the current memory state,
// which is the Phi or a store in the loop.
GrowableArray<Node*> current_state_in_slice(max_slices, max_slices, nullptr);
@ -2349,7 +2296,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* phi = _mem_slice_head.at(i);
assert(phi->is_Phi(), "must be phi");
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
int alias_idx = phase()->C->get_alias_index(phi->adr_type());
current_state_in_slice.at_put(alias_idx, phi);
// If we have a memory phi, we have a last store in the loop, find it over backedge.
@ -2362,7 +2309,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
for (uint i = 0; i < memops_schedule.size(); i++) {
MemNode* n = memops_schedule.at(i)->as_Mem();
assert(n->is_Load() || n->is_Store(), "only loads or stores");
int alias_idx = _phase->C->get_alias_index(n->adr_type());
int alias_idx = phase()->C->get_alias_index(n->adr_type());
Node* current_state = current_state_in_slice.at(alias_idx);
if (current_state == nullptr) {
// If there are only loads in a slice, we never update the memory
@ -2371,7 +2318,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
assert(n->is_Load() && !in_bb(n->in(MemNode::Memory)),
"only loads can have memory state from outside loop");
} else {
_igvn.replace_input_of(n, MemNode::Memory, current_state);
igvn().replace_input_of(n, MemNode::Memory, current_state);
if (n->is_Store()) {
current_state_in_slice.at_put(alias_idx, n);
}
@ -2384,12 +2331,12 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
Node_List uses_after_loop;
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* phi = _mem_slice_head.at(i);
int alias_idx = _phase->C->get_alias_index(phi->adr_type());
int alias_idx = phase()->C->get_alias_index(phi->adr_type());
Node* current_state = current_state_in_slice.at(alias_idx);
assert(current_state != nullptr, "slice is mapped");
assert(current_state != phi, "did some work in between");
assert(current_state->is_Store(), "sanity");
_igvn.replace_input_of(phi, 2, current_state);
igvn().replace_input_of(phi, 2, current_state);
// Replace uses of old last store with current_state (new last store)
// Do it in two loops: first find all the uses, and change the graph
@ -2408,7 +2355,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
for (uint j = 0; j < use->req(); j++) {
Node* def = use->in(j);
if (def == last_store) {
_igvn.replace_input_of(use, j, current_state);
igvn().replace_input_of(use, j, current_state);
}
}
}
@ -2425,7 +2372,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
bool SuperWord::output() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
assert(cl->is_main_loop(), "SLP should only work on main loops");
Compile* C = _phase->C;
Compile* C = phase()->C;
if (_packset.length() == 0) {
return false;
}
@ -2436,7 +2383,7 @@ bool SuperWord::output() {
lpt()->dump_head();
}
#endif
_phase->C->print_method(PHASE_SUPERWORD2_BEFORE_OUTPUT, 4, cl);
phase()->C->print_method(PHASE_SUPERWORD2_BEFORE_OUTPUT, 4, cl);
adjust_pre_loop_limit_to_align_main_loop_vectors();
@ -2464,7 +2411,7 @@ bool SuperWord::output() {
// Walk up the memory chain, and ignore any StoreVector that provably
// does not have any memory dependency.
while (mem->is_StoreVector()) {
VPointer p_store(mem->as_Mem(), phase(), lpt(), nullptr, false);
VPointer p_store(mem->as_Mem(), vloop());
if (p_store.overlap_possible_with_any_in(p)) {
break;
} else {
@ -2598,13 +2545,13 @@ bool SuperWord::output() {
}
// VectorMaskCmp
ConINode* bol_test_node = _igvn.intcon((int)bol_test);
ConINode* bol_test_node = igvn().intcon((int)bol_test);
BasicType bt = velt_basic_type(cmp);
const TypeVect* vt = TypeVect::make(bt, vlen);
VectorNode* mask = new VectorMaskCmpNode(bol_test, cmp_in1, cmp_in2, bol_test_node, vt);
_igvn.register_new_node_with_optimizer(mask);
_phase->set_ctrl(mask, _phase->get_ctrl(p->at(0)));
_igvn._worklist.push(mask);
igvn().register_new_node_with_optimizer(mask);
phase()->set_ctrl(mask, phase()->get_ctrl(p->at(0)));
igvn()._worklist.push(mask);
// VectorBlend
vn = new VectorBlendNode(blend_in1, blend_in2, mask);
@ -2677,8 +2624,8 @@ bool SuperWord::output() {
assert(n->req() == 2, "only one input expected");
Node* in = vector_opd(p, 1);
Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
_igvn.register_new_node_with_optimizer(longval);
_phase->set_ctrl(longval, _phase->get_ctrl(first));
igvn().register_new_node_with_optimizer(longval);
phase()->set_ctrl(longval, phase()->get_ctrl(first));
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_convert_opcode(opc)) {
@ -2719,13 +2666,13 @@ bool SuperWord::output() {
#endif
_block.at_put(i, vn);
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(first));
igvn().register_new_node_with_optimizer(vn);
phase()->set_ctrl(vn, phase()->get_ctrl(first));
for (uint j = 0; j < p->size(); j++) {
Node* pm = p->at(j);
_igvn.replace_node(pm, vn);
igvn().replace_node(pm, vn);
}
_igvn._worklist.push(vn);
igvn()._worklist.push(vn);
if (vlen > max_vlen) {
max_vlen = vlen;
@ -2764,7 +2711,7 @@ bool SuperWord::output() {
}
}
_phase->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl);
phase()->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl);
return true;
}
@ -2787,10 +2734,10 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
const TypeVect* vt = TypeVect::make(iv_bt, vlen);
Node* vn = new PopulateIndexNode(iv(), _igvn.intcon(1), vt);
Node* vn = new PopulateIndexNode(iv(), igvn().intcon(1), vt);
VectorNode::trace_new_vector(vn, "SuperWord");
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
igvn().register_new_node_with_optimizer(vn);
phase()->set_ctrl(vn, phase()->get_ctrl(opd));
return vn;
}
@ -2811,15 +2758,15 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
juint shift = t->get_con();
if (shift > mask) { // Unsigned cmp
cnt = ConNode::make(TypeInt::make(shift & mask));
_igvn.register_new_node_with_optimizer(cnt);
igvn().register_new_node_with_optimizer(cnt);
}
} else {
if (t == nullptr || t->_lo < 0 || t->_hi > (int)mask) {
cnt = ConNode::make(TypeInt::make(mask));
_igvn.register_new_node_with_optimizer(cnt);
igvn().register_new_node_with_optimizer(cnt);
cnt = new AndINode(opd, cnt);
_igvn.register_new_node_with_optimizer(cnt);
_phase->set_ctrl(cnt, _phase->get_ctrl(opd));
igvn().register_new_node_with_optimizer(cnt);
phase()->set_ctrl(cnt, phase()->get_ctrl(opd));
}
if (!opd->bottom_type()->isa_int()) {
assert(false, "int type only");
@ -2828,8 +2775,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
}
// Move shift count into vector register.
cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0));
_igvn.register_new_node_with_optimizer(cnt);
_phase->set_ctrl(cnt, _phase->get_ctrl(opd));
igvn().register_new_node_with_optimizer(cnt);
phase()->set_ctrl(cnt, phase()->get_ctrl(opd));
return cnt;
}
if (opd->is_StoreVector()) {
@ -2847,8 +2794,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
if (p0->bottom_type()->isa_long()) {
p0_t = TypeLong::LONG;
conv = new ConvI2LNode(opd);
_igvn.register_new_node_with_optimizer(conv);
_phase->set_ctrl(conv, _phase->get_ctrl(opd));
igvn().register_new_node_with_optimizer(conv);
phase()->set_ctrl(conv, phase()->get_ctrl(opd));
}
vn = VectorNode::scalar2vector(conv, vlen, p0_t);
} else {
@ -2856,8 +2803,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
vn = VectorNode::scalar2vector(opd, vlen, p0_t);
}
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
igvn().register_new_node_with_optimizer(vn);
phase()->set_ctrl(vn, phase()->get_ctrl(opd));
VectorNode::trace_new_vector(vn, "SuperWord");
return vn;
}
@ -2886,8 +2833,8 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
pk->add_opd(in2);
}
}
_igvn.register_new_node_with_optimizer(pk);
_phase->set_ctrl(pk, _phase->get_ctrl(opd));
igvn().register_new_node_with_optimizer(pk);
phase()->set_ctrl(pk, phase()->get_ctrl(opd));
VectorNode::trace_new_vector(pk, "SuperWord");
return pk;
}
@ -3050,8 +2997,8 @@ bool SuperWord::construct_bb() {
VectorSet visited;
VectorSet post_visited;
visited.set(bb_idx(bb()));
stack.push(bb());
visited.set(bb_idx(cl()));
stack.push(cl());
// Do a depth first walk over out edges
int rpo_idx = block_count - 1;
@ -3066,7 +3013,7 @@ bool SuperWord::construct_bb() {
Node* use = n->fast_out(i);
if (in_bb(use) && !visited.test(bb_idx(use)) &&
// Don't go around backedge
(!use->is_Phi() || n == bb())) {
(!use->is_Phi() || n == cl())) {
stack.push(use);
}
}
@ -3297,7 +3244,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
tty->print("SuperWord::memory_alignment within a vector memory reference for %d: ", s->_idx); s->dump();
}
#endif
VPointer p(s, phase(), lpt(), nullptr, false);
VPointer p(s, vloop());
if (!p.valid()) {
NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: VPointer p invalid, return bottom_align");)
return bottom_align;
@ -3338,7 +3285,7 @@ const Type* SuperWord::container_type(Node* n) {
}
return Type::get_const_basic_type(bt);
}
const Type* t = _igvn.type(n);
const Type* t = igvn().type(n);
if (t->basic_type() == T_INT) {
// A narrow type of arithmetic operations will be determined by
// propagating the type of memory operations.
@ -3358,7 +3305,7 @@ bool SuperWord::same_velt_type(Node* n1, Node* n2) {
}
// Do the two memory references belong to the same memory slice?
// The answer is obtained by comparing the alias index that the compilation (C)
// reports for the adr_type() of each reference.
// NOTE(review): the two return statements below compute the same value (phase()
// returns the same PhaseIdealLoop as _phase); the second one is unreachable and
// looks like the before/after pair of the accessor refactoring -- confirm which
// one is meant to stay.
bool SuperWord::same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const {
return _phase->C->get_alias_index(mem_ref->adr_type()) == _phase->C->get_alias_index(best_align_to_mem_ref->adr_type());
return phase()->C->get_alias_index(mem_ref->adr_type()) == phase()->C->get_alias_index(best_align_to_mem_ref->adr_type());
}
//------------------------------in_packset---------------------------
@ -3438,22 +3385,22 @@ LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
const MemNode* align_to_ref = _align_to_ref;
assert(align_to_ref != nullptr, "align_to_ref must be set");
assert(lp()->is_main_loop(), "can only do alignment for main loop");
assert(cl()->is_main_loop(), "can only do alignment for main loop");
// The opaque node for the limit, where we adjust the input
Opaque1Node* pre_opaq = lp()->pre_loop_end()->limit()->as_Opaque1();
Opaque1Node* pre_opaq = vloop().pre_loop_end()->limit()->as_Opaque1();
// Current pre-loop limit.
Node* old_limit = pre_opaq->in(1);
// Where we put new limit calculations.
Node* pre_ctrl = lp()->pre_loop_head()->in(LoopNode::EntryControl);
Node* pre_ctrl = vloop().pre_loop_head()->in(LoopNode::EntryControl);
// Ensure the original loop limit is available from the pre-loop Opaque1 node.
Node* orig_limit = pre_opaq->original_loop_limit();
assert(orig_limit != nullptr && _igvn.type(orig_limit) != Type::TOP, "");
assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, "");
VPointer align_to_ref_p(align_to_ref, phase(), lpt(), nullptr, false);
VPointer align_to_ref_p(align_to_ref, vloop());
assert(align_to_ref_p.valid(), "sanity");
// For the main-loop, we want the address of align_to_ref to be memory aligned
@ -3647,17 +3594,17 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
const bool is_sub = scale * stride > 0;
// 1.1: offset
Node* xboi = _igvn.intcon(is_sub ? -offset : offset);
Node* xboi = igvn().intcon(is_sub ? -offset : offset);
TRACE_ALIGN_VECTOR_NODE(xboi);
// 1.2: invar (if it exists)
if (invar != nullptr) {
if (_igvn.type(invar)->isa_long()) {
if (igvn().type(invar)->isa_long()) {
// Computations are done % (vector width/element size) so it's
// safe to simply convert invar to an int and lose the upper 32
// bit half.
invar = new ConvL2INode(invar);
_igvn.register_new_node_with_optimizer(invar);
igvn().register_new_node_with_optimizer(invar);
TRACE_ALIGN_VECTOR_NODE(invar);
}
if (is_sub) {
@ -3665,8 +3612,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
} else {
xboi = new AddINode(xboi, invar);
}
_igvn.register_new_node_with_optimizer(xboi);
_phase->set_ctrl(xboi, pre_ctrl);
igvn().register_new_node_with_optimizer(xboi);
phase()->set_ctrl(xboi, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(xboi);
}
@ -3676,11 +3623,11 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
// When the base() is top, we have no alignment guarantee at all.
// Hence, we must now take the base into account for the calculation.
Node* xbase = new CastP2XNode(nullptr, base);
_igvn.register_new_node_with_optimizer(xbase);
igvn().register_new_node_with_optimizer(xbase);
TRACE_ALIGN_VECTOR_NODE(xbase);
#ifdef _LP64
xbase = new ConvL2INode(xbase);
_igvn.register_new_node_with_optimizer(xbase);
igvn().register_new_node_with_optimizer(xbase);
TRACE_ALIGN_VECTOR_NODE(xbase);
#endif
if (is_sub) {
@ -3688,18 +3635,18 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
} else {
xboi = new AddINode(xboi, xbase);
}
_igvn.register_new_node_with_optimizer(xboi);
_phase->set_ctrl(xboi, pre_ctrl);
igvn().register_new_node_with_optimizer(xboi);
phase()->set_ctrl(xboi, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(xboi);
}
// 2: Compute (14):
// XBOI = xboi / abs(scale)
// The division is executed as shift
Node* log2_abs_scale = _igvn.intcon(exact_log2(abs(scale)));
Node* log2_abs_scale = igvn().intcon(exact_log2(abs(scale)));
Node* XBOI = new URShiftINode(xboi, log2_abs_scale);
_igvn.register_new_node_with_optimizer(XBOI);
_phase->set_ctrl(XBOI, pre_ctrl);
igvn().register_new_node_with_optimizer(XBOI);
phase()->set_ctrl(XBOI, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(log2_abs_scale);
TRACE_ALIGN_VECTOR_NODE(XBOI);
@ -3713,8 +3660,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
} else {
XBOI_OP_old_limit = new AddINode(XBOI, old_limit);
}
_igvn.register_new_node_with_optimizer(XBOI_OP_old_limit);
_phase->set_ctrl(XBOI_OP_old_limit, pre_ctrl);
igvn().register_new_node_with_optimizer(XBOI_OP_old_limit);
phase()->set_ctrl(XBOI_OP_old_limit, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(XBOI_OP_old_limit);
// 3.2: Compute:
@ -3723,10 +3670,10 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
// = XBOI_OP_old_limit AND (AW - 1)
// Since AW is a power of 2, the modulo operation can be replaced with
// a bitmask operation.
Node* mask_AW = _igvn.intcon(AW-1);
Node* mask_AW = igvn().intcon(AW-1);
Node* adjust_pre_iter = new AndINode(XBOI_OP_old_limit, mask_AW);
_igvn.register_new_node_with_optimizer(adjust_pre_iter);
_phase->set_ctrl(adjust_pre_iter, pre_ctrl);
igvn().register_new_node_with_optimizer(adjust_pre_iter);
phase()->set_ctrl(adjust_pre_iter, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(mask_AW);
TRACE_ALIGN_VECTOR_NODE(adjust_pre_iter);
@ -3739,8 +3686,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
} else {
new_limit = new AddINode(old_limit, adjust_pre_iter);
}
_igvn.register_new_node_with_optimizer(new_limit);
_phase->set_ctrl(new_limit, pre_ctrl);
igvn().register_new_node_with_optimizer(new_limit);
phase()->set_ctrl(new_limit, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(new_limit);
// 5: Compute (15a, b):
@ -3748,27 +3695,12 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
Node* constrained_limit =
(stride > 0) ? (Node*) new MinINode(new_limit, orig_limit)
: (Node*) new MaxINode(new_limit, orig_limit);
_igvn.register_new_node_with_optimizer(constrained_limit);
_phase->set_ctrl(constrained_limit, pre_ctrl);
igvn().register_new_node_with_optimizer(constrained_limit);
phase()->set_ctrl(constrained_limit, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(constrained_limit);
// 6: Hack the pre-loop limit
_igvn.replace_input_of(pre_opaq, 1, constrained_limit);
}
//------------------------------init---------------------------
void SuperWord::init() {
_dg.init();
_packset.clear();
_block.clear();
_mem_slice_head.clear();
_mem_slice_tail.clear();
_node_info.clear();
_align_to_ref = nullptr;
_race_possible = 0;
_early_return = false;
_num_work_vecs = 0;
_num_reductions = 0;
igvn().replace_input_of(pre_opaq, 1, constrained_limit);
}
//------------------------------print_packset---------------------------

@ -139,8 +139,6 @@ class DepGraph {
DepEdge* make_edge(DepMem* pred, Node* succ) { return make_edge(pred, dep(succ)); }
DepEdge* make_edge(Node* pred, DepMem* succ) { return make_edge(dep(pred), succ); }
void init() { _map.clear(); } // initialize
void print(Node* n) { dep(n)->print(); }
void print(DepMem* d) { d->print(); }
};
@ -200,18 +198,18 @@ class SWNodeInfo {
// -----------------------------SuperWord---------------------------------
// Transforms scalar operations into packed (superword) operations.
class SuperWord : public ResourceObj {
friend class VPointer;
friend class CMoveKit;
private:
PhaseIdealLoop* _phase;
Arena* _arena;
PhaseIterGVN &_igvn;
const VLoop& _vloop;
// Arena for small data structures. Large data structures are allocated in
// VSharedData, and reused over many AutoVectorizations.
Arena _arena;
enum consts { top_align = -1, bottom_align = -666 };
GrowableArray<Node_List*> _packset; // Packs for the current block
GrowableArray<int> _bb_idx; // Map from Node _idx to index within block
GrowableArray<int> &_bb_idx; // Map from Node _idx to index within block
GrowableArray<Node*> _block; // Nodes in current block
GrowableArray<PhiNode*> _mem_slice_head; // Memory slice head nodes
@ -226,88 +224,87 @@ class SuperWord : public ResourceObj {
GrowableArray<Node*> _nlist; // List of nodes
public:
SuperWord(PhaseIdealLoop* phase);
SuperWord(const VLoop &vloop, VSharedData &vshared);
bool transform_loop(IdealLoopTree* lpt, bool do_optimization);
// Attempt to run the SuperWord algorithm on the loop. Return true if we succeed.
bool transform_loop();
void unrolling_analysis(int &local_loop_unroll_factor);
// Decide if loop can eventually be vectorized, and what unrolling factor is required.
static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor);
// Accessors for VPointer
PhaseIdealLoop* phase() const { return _phase; }
IdealLoopTree* lpt() const { return _lpt; }
PhiNode* iv() const { return _iv; }
bool early_return() const { return _early_return; }
// VLoop Accessors
const VLoop& vloop() const { return _vloop; }
PhaseIdealLoop* phase() const { return vloop().phase(); }
PhaseIterGVN& igvn() const { return vloop().phase()->igvn(); }
IdealLoopTree* lpt() const { return vloop().lpt(); }
CountedLoopNode* cl() const { return vloop().cl(); }
PhiNode* iv() const { return vloop().iv(); }
int iv_stride() const { return cl()->stride_con(); }
bool in_bb(const Node* n) const { return vloop().in_bb(n); }
#ifndef PRODUCT
// TraceAutoVectorization and TraceSuperWord
bool is_trace_superword_precondition() const {
return TraceSuperWord ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_PRECONDITION);
}
bool is_trace_superword_vector_element_type() const {
// Too verbose for TraceSuperWord
return _vtrace.is_trace(TraceAutoVectorizationTag::SW_TYPES);
return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES);
}
bool is_trace_superword_alignment() const {
// Too verbose for TraceSuperWord
return _vtrace.is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
}
bool is_trace_superword_memory_slices() const {
return TraceSuperWord ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES);
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES);
}
bool is_trace_superword_dependence_graph() const {
return TraceSuperWord ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
}
bool is_trace_superword_adjacent_memops() const {
return TraceSuperWord ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
}
bool is_trace_superword_rejections() const {
return TraceSuperWord ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_REJECTIONS);
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS);
}
bool is_trace_superword_packset() const {
return TraceSuperWord ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_PACKSET);
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET);
}
bool is_trace_superword_info() const {
return TraceSuperWord ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_INFO);
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO);
}
bool is_trace_superword_verbose() const {
// Too verbose for TraceSuperWord
return _vtrace.is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
}
bool is_trace_superword_any() const {
return TraceSuperWord ||
is_trace_align_vector() ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_PRECONDITION) ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_TYPES) ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES) ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_INFO) ||
_vtrace.is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES) ||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES) ||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO) ||
vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
}
bool is_trace_align_vector() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
return vloop().vtrace().is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
is_trace_superword_verbose();
}
#endif
@ -318,30 +315,14 @@ class SuperWord : public ResourceObj {
const GrowableArray<Node*>& block() const { return _block; }
const DepGraph& dg() const { return _dg; }
private:
IdealLoopTree* _lpt; // Current loop tree node
CountedLoopNode* _lp; // Current CountedLoopNode
VectorSet _loop_reductions; // Reduction nodes in the current loop
Node* _bb; // Current basic block
PhiNode* _iv; // Induction var
bool _race_possible; // In cases where SDMU is true
bool _early_return; // True if we do not initialize
bool _do_vector_loop; // whether to do vectorization/simd style
int _num_work_vecs; // Number of non memory vector operations
int _num_reductions; // Number of reduction expressions applied
NOT_PRODUCT(VTrace _vtrace);
// Accessors
Arena* arena() { return _arena; }
Node* bb() { return _bb; }
void set_bb(Node* bb) { _bb = bb; }
void set_lpt(IdealLoopTree* lpt) { _lpt = lpt; }
CountedLoopNode* lp() const { return _lp; }
void set_lp(CountedLoopNode* lp) {
_lp = lp;
_iv = lp->as_CountedLoop()->phi()->as_Phi();
}
int iv_stride() const { return lp()->stride_con(); }
Arena* arena() { return &_arena; }
int vector_width(const Node* n) const {
BasicType bt = velt_basic_type(n);
@ -355,11 +336,8 @@ class SuperWord : public ResourceObj {
const MemNode* align_to_ref() const { return _align_to_ref; }
void set_align_to_ref(const MemNode* m) { _align_to_ref = m; }
const Node* ctrl(const Node* n) const { return _phase->has_ctrl(n) ? _phase->get_ctrl(n) : n; }
// block accessors
public:
bool in_bb(const Node* n) const { return n != nullptr && n->outcnt() > 0 && ctrl(n) == _bb; }
int bb_idx(const Node* n) const { assert(in_bb(n), "must be"); return _bb_idx.at(n->_idx); }
private:
void set_bb_idx(Node* n, int i) { _bb_idx.at_put_grow(n->_idx, i); }
@ -563,7 +541,6 @@ private:
void adjust_pre_loop_limit_to_align_main_loop_vectors();
// Is the use of d1 in u1 at the same operand position as d2 in u2?
bool opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2);
void init();
// print methods
void print_packset();

@ -30,7 +30,7 @@
#define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \
flags(POINTER_ANALYSIS, "Trace VPointer") \
flags(SW_PRECONDITION, "Trace SuperWord precondition") \
flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \
flags(SW_TYPES, "Trace SuperWord::compute_vector_element_type") \
flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \
flags(SW_MEMORY_SLICES, "Trace SuperWord memory slices") \
@ -112,7 +112,6 @@ class TraceAutoVectorizationTagValidator {
} else if (ALL == tag) {
_tags.set_range(0, TRACE_AUTO_VECTORIZATION_TAG_NUM);
} else if (SW_VERBOSE == tag) {
_tags.at_put(SW_PRECONDITION, set_bit);
_tags.at_put(SW_TYPES, set_bit);
_tags.at_put(SW_ALIGNMENT, set_bit);
_tags.at_put(SW_MEMORY_SLICES, set_bit);
@ -123,7 +122,6 @@ class TraceAutoVectorizationTagValidator {
_tags.at_put(SW_INFO, set_bit);
_tags.at_put(SW_VERBOSE, set_bit);
} else if (SW_INFO == tag) {
_tags.at_put(SW_PRECONDITION, set_bit);
_tags.at_put(SW_MEMORY_SLICES, set_bit);
_tags.at_put(SW_DEPENDENCE_GRAPH, set_bit);
_tags.at_put(SW_ADJACENT_MEMOPS, set_bit);

@ -31,22 +31,103 @@
#include "opto/rootnode.hpp"
#include "opto/vectorization.hpp"
// Run the precondition checks for this loop and report the outcome.
// Returns true iff check_preconditions_helper() reports VLoop::SUCCESS.
// In non-product builds, and only if precondition tracing is enabled, the loop
// is dumped before the checks and the failure reason is printed afterwards.
bool VLoop::check_preconditions() {
#ifndef PRODUCT
  if (is_trace_preconditions()) {
    tty->print_cr("\nVLoop::check_preconditions");
    lpt()->dump_head();
    lpt()->head()->dump();
  }
#endif

  const char* const outcome = check_preconditions_helper();
  assert(outcome != nullptr, "must have return state");

  // The helper returns one of the static message constants, so comparing the
  // pointers (not the string contents) is sufficient.
  const bool success = (outcome == VLoop::SUCCESS);

#ifndef PRODUCT
  if (!success && is_trace_preconditions()) {
    tty->print_cr("VLoop::check_preconditions: failed: %s", outcome);
  }
#endif

  return success;
}
const char* VLoop::check_preconditions_helper() {
// Only accept vector width that is power of 2
int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
if (vector_width < 2 || !is_power_of_2(vector_width)) {
return VLoop::FAILURE_VECTOR_WIDTH;
}
// Only accept valid counted loops (int)
if (!_lpt->_head->as_Loop()->is_valid_counted_loop(T_INT)) {
return VLoop::FAILURE_VALID_COUNTED_LOOP;
}
_cl = _lpt->_head->as_CountedLoop();
_iv = _cl->phi()->as_Phi();
if (_cl->is_vectorized_loop()) {
return VLoop::FAILURE_ALREADY_VECTORIZED;
}
if (_cl->is_unroll_only()) {
return VLoop::FAILURE_UNROLL_ONLY;
}
// Check for control flow in the body
_cl_exit = _cl->loopexit();
bool has_cfg = _cl_exit->in(0) != _cl;
if (has_cfg && !is_allow_cfg()) {
#ifndef PRODUCT
if (is_trace_preconditions()) {
tty->print_cr("VLoop::check_preconditions: fails because of control flow.");
tty->print(" cl_exit %d", _cl_exit->_idx); _cl_exit->dump();
tty->print(" cl_exit->in(0) %d", _cl_exit->in(0)->_idx); _cl_exit->in(0)->dump();
tty->print(" lpt->_head %d", _cl->_idx); _cl->dump();
_lpt->dump_head();
}
#endif
return VLoop::FAILURE_CONTROL_FLOW;
}
// Make sure the are no extra control users of the loop backedge
if (_cl->back_control()->outcnt() != 1) {
return VLoop::FAILURE_BACKEDGE;
}
// To align vector memory accesses in the main-loop, we will have to adjust
// the pre-loop limit.
if (_cl->is_main_loop()) {
CountedLoopEndNode* pre_end = _cl->find_pre_loop_end();
if (pre_end == nullptr) {
return VLoop::FAILURE_PRE_LOOP_LIMIT;
}
Node* pre_opaq1 = pre_end->limit();
if (pre_opaq1->Opcode() != Op_Opaque1) {
return VLoop::FAILURE_PRE_LOOP_LIMIT;
}
_pre_loop_end = pre_end;
}
return VLoop::SUCCESS;
}
#ifndef PRODUCT
int VPointer::Tracer::_depth = 0;
#endif
VPointer::VPointer(const MemNode* mem,
PhaseIdealLoop* phase, IdealLoopTree* lpt,
VPointer::VPointer(const MemNode* mem, const VLoop& vloop,
Node_Stack* nstack, bool analyze_only) :
_mem(mem), _phase(phase), _lpt(lpt),
_iv(lpt->_head->as_CountedLoop()->phi()->as_Phi()),
_mem(mem), _vloop(vloop),
_base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
#ifdef ASSERT
_debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
#endif
_nstack(nstack), _analyze_only(analyze_only), _stack_idx(0)
#ifndef PRODUCT
, _tracer(phase->C->directive()->trace_auto_vectorization_tags().at(TraceAutoVectorizationTag::POINTER_ANALYSIS))
, _tracer(vloop.is_trace_pointer_analysis())
#endif
{
NOT_PRODUCT(_tracer.ctor_1(mem);)
@ -109,7 +190,7 @@ VPointer::VPointer(const MemNode* mem,
// Following is used to create a temporary object during
// the pattern match of an address expression.
VPointer::VPointer(VPointer* p) :
_mem(p->_mem), _phase(p->_phase), _lpt(p->_lpt), _iv(p->_iv),
_mem(p->_mem), _vloop(p->_vloop),
_base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
#ifdef ASSERT
_debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
@ -153,7 +234,7 @@ bool VPointer::invariant(Node* n) const {
// main loop (Illegal invariant happens when n_c is a CastII node that
// prevents data nodes to flow above the main loop).
Node* n_c = phase()->get_ctrl(n);
return phase()->is_dominator(n_c, cl->pre_loop_head());
return phase()->is_dominator(n_c, vloop().pre_loop_head());
}
}
return is_not_member;

@ -48,15 +48,131 @@ public:
};
#endif
// Basic loop structure accessors and vectorization preconditions checking.
//
// Only the phase, the loop tree node and the allow_cfg flag are known at
// construction. The cached loop structure (_cl, _cl_exit, _iv, and for main
// loops _pre_loop_end) is filled in by check_preconditions(), so the
// corresponding accessors are only guaranteed to return non-null values once
// check_preconditions() has returned true.
class VLoop : public StackObj {
private:
  PhaseIdealLoop* const _phase;
  IdealLoopTree* const _lpt;
  const bool _allow_cfg;
  CountedLoopNode* _cl;
  Node* _cl_exit;
  PhiNode* _iv;
  CountedLoopEndNode* _pre_loop_end; // cache access to pre-loop for main loops only

  NOT_PRODUCT(VTrace _vtrace;)

  // Return states of check_preconditions_helper(). They are compared by
  // pointer identity, and the failure messages are printed when tracing.
  static constexpr char const* SUCCESS                    = "success";
  static constexpr char const* FAILURE_ALREADY_VECTORIZED = "loop already vectorized";
  static constexpr char const* FAILURE_UNROLL_ONLY        = "loop only wants to be unrolled";
  static constexpr char const* FAILURE_VECTOR_WIDTH       = "vector_width must be power of 2";
  static constexpr char const* FAILURE_VALID_COUNTED_LOOP = "must be valid counted loop (int)";
  static constexpr char const* FAILURE_CONTROL_FLOW       = "control flow in loop not allowed";
  static constexpr char const* FAILURE_BACKEDGE           = "nodes on backedge not allowed";
  static constexpr char const* FAILURE_PRE_LOOP_LIMIT     = "main-loop must be able to adjust pre-loop-limit (not found)";

public:
  // lpt:       the loop to analyze; its _phase is used as the PhaseIdealLoop.
  // allow_cfg: if true, control flow in the loop body does not fail the
  //            preconditions.
  VLoop(IdealLoopTree* lpt, bool allow_cfg) :
    _phase        (lpt->_phase),
    _lpt          (lpt),
    _allow_cfg    (allow_cfg),
    _cl           (nullptr),
    _cl_exit      (nullptr),
    _iv           (nullptr),
    // Must start out as nullptr: pre_loop_end() asserts on it, and it is only
    // assigned by the precondition checks for main loops.
    _pre_loop_end (nullptr) {}
  NONCOPYABLE(VLoop);

  IdealLoopTree* lpt()        const { return _lpt; }
  PhaseIdealLoop* phase()     const { return _phase; }
  CountedLoopNode* cl()       const { return _cl; }
  Node* cl_exit()             const { return _cl_exit; }
  PhiNode* iv()               const { return _iv; }
  int iv_stride()             const { return cl()->stride_con(); }
  bool is_allow_cfg()         const { return _allow_cfg; }

  // The loop end of the pre-loop. Only valid for main loops.
  CountedLoopEndNode* pre_loop_end() const {
    assert(cl()->is_main_loop(), "only main loop can reference pre-loop");
    assert(_pre_loop_end != nullptr, "must have found it");
    return _pre_loop_end;
  }

  // The loop head of the pre-loop. Only valid for main loops.
  CountedLoopNode* pre_loop_head() const {
    CountedLoopNode* head = pre_loop_end()->loopnode();
    assert(head != nullptr, "must find head");
    return head;
  }

  // Estimate maximum size for data structures, to avoid repeated reallocation
  int estimated_body_length() const { return lpt()->_body.size(); }
  int estimated_node_count()  const { return (int)(1.10 * phase()->C->unique()); }

#ifndef PRODUCT
  const VTrace& vtrace()      const { return _vtrace; }

  bool is_trace_preconditions() const {
    return vtrace().is_trace(TraceAutoVectorizationTag::PRECONDITIONS);
  }

  bool is_trace_pointer_analysis() const {
    return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
  }
#endif

  // Is the node in the basic block of the loop?
  // We only accept any nodes which have the loop head as their ctrl.
  bool in_bb(const Node* n) const {
    // Reject null nodes and nodes without any use first, so that a null n is
    // never handed to has_ctrl() / get_ctrl().
    if (n == nullptr || n->outcnt() == 0) {
      return false;
    }
    // A node without a ctrl entry is compared itself against the loop head.
    const Node* ctrl = _phase->has_ctrl(n) ? _phase->get_ctrl(n) : n;
    return ctrl == _cl;
  }

  // Check if the loop passes some basic preconditions for vectorization.
  // Return indicates if analysis succeeded.
  bool check_preconditions();

private:
  const char* check_preconditions_helper();
};
// Optimization to keep allocation of large arrays in AutoVectorization low.
// We allocate the arrays once, and reuse them for multiple loops that we
// AutoVectorize, clearing them before every new use.
class VSharedData : public StackObj {
private:
// Arena, used to allocate all arrays from.
Arena _arena;
// An array that maps node->_idx to a much smaller idx, which is at most the
// size of a loop body. This allow us to have smaller arrays for other data
// structures, since we are using smaller indices.
GrowableArray<int> _node_idx_to_loop_body_idx;
public:
VSharedData() :
_arena(mtCompiler),
_node_idx_to_loop_body_idx(&_arena, estimated_node_count(), 0, 0)
{
}
GrowableArray<int>& node_idx_to_loop_body_idx() {
return _node_idx_to_loop_body_idx;
}
// Must be cleared before each AutoVectorization use
void clear() {
_node_idx_to_loop_body_idx.clear();
}
private:
static int estimated_node_count() {
return (int)(1.10 * Compile::current()->unique());
}
};
// A vectorization pointer (VPointer) has information about an address for
// dependence checking and vector alignment. It's usually bound to a memory
// operation in a counted loop for vectorizable analysis.
class VPointer : public ArenaObj {
protected:
const MemNode* _mem; // My memory reference node
PhaseIdealLoop* _phase; // PhaseIdealLoop handle
IdealLoopTree* _lpt; // Current IdealLoopTree
PhiNode* _iv; // The loop induction variable
const VLoop& _vloop;
Node* _base; // null if unsafe nonheap reference
Node* _adr; // address pointer
@ -74,9 +190,10 @@ class VPointer : public ArenaObj {
bool _analyze_only; // Used in loop unrolling only for vpointer trace
uint _stack_idx; // Used in loop unrolling only for vpointer trace
PhaseIdealLoop* phase() const { return _phase; }
IdealLoopTree* lpt() const { return _lpt; }
PhiNode* iv() const { return _iv; }
const VLoop& vloop() const { return _vloop; }
PhaseIdealLoop* phase() const { return vloop().phase(); }
IdealLoopTree* lpt() const { return vloop().lpt(); }
PhiNode* iv() const { return vloop().iv(); }
bool is_loop_member(Node* n) const;
bool invariant(Node* n) const;
@ -97,13 +214,19 @@ class VPointer : public ArenaObj {
NotComparable = (Less | Greater | Equal)
};
VPointer(const MemNode* mem,
PhaseIdealLoop* phase, IdealLoopTree* lpt,
VPointer(const MemNode* mem, const VLoop& vloop) :
VPointer(mem, vloop, nullptr, false) {}
VPointer(const MemNode* mem, const VLoop& vloop, Node_Stack* nstack) :
VPointer(mem, vloop, nstack, true) {}
private:
VPointer(const MemNode* mem, const VLoop& vloop,
Node_Stack* nstack, bool analyze_only);
// Following is used to create a temporary object during
// the pattern match of an address expression.
VPointer(VPointer* p);
NONCOPYABLE(VPointer);
public:
bool valid() const { return _adr != nullptr; }
bool has_iv() const { return _scale != 0; }
@ -143,7 +266,7 @@ class VPointer : public ArenaObj {
bool overlap_possible_with_any_in(Node_List* p) {
for (uint k = 0; k < p->size(); k++) {
MemNode* mem = p->at(k)->as_Mem();
VPointer p_mem(mem, phase(), lpt(), nullptr, false);
VPointer p_mem(mem, vloop());
// Only if we know that we have Less or Greater can we
// be sure that there can never be an overlap between
// the two memory regions.