8311691: C2: Remove legacy code related to PostLoopMultiversioning

Reviewed-by: kvn, sviswanathan
Pengfei Li 2023-07-13 01:45:52 +00:00
parent 38f74125d1
commit a38582e941
9 changed files with 49 additions and 606 deletions

@@ -512,15 +512,6 @@ bool CompilerConfig::check_args_consistency(bool status) {
FLAG_SET_CMDLINE(BackgroundCompilation, false);
}
#ifdef COMPILER2
if (PostLoopMultiversioning && !RangeCheckElimination) {
if (!FLAG_IS_DEFAULT(PostLoopMultiversioning)) {
warning("PostLoopMultiversioning disabled because RangeCheckElimination is disabled.");
}
FLAG_SET_CMDLINE(PostLoopMultiversioning, false);
}
#endif // COMPILER2
if (CompilerConfig::is_interpreter_only()) {
if (UseCompiler) {
if (!FLAG_IS_DEFAULT(UseCompiler)) {
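For context, the combination this removed guard policed is the one exercised by the regression test deleted at the bottom of this commit. Before this change, a command line such as (flags taken verbatim from that test)

java -XX:+UnlockExperimentalVMOptions -XX:+PostLoopMultiversioning -XX:-RangeCheckElimination -version

started the VM, printed the warning above, and forced PostLoopMultiversioning back off. After this change the flag no longer exists, so the same command only starts under -XX:+IgnoreUnrecognizedVMOptions.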

@@ -182,9 +182,6 @@
"Map number of unrolls for main loop via " \
"Superword Level Parallelism analysis") \
\
product(bool, PostLoopMultiversioning, false, EXPERIMENTAL, \
"Multi versioned post loops to eliminate range checks") \
\
notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false, \
"Trace what Superword Level Parallelism analysis applies") \
\

@@ -1888,55 +1888,6 @@ void PhaseIdealLoop::insert_vector_post_loop(IdealLoopTree *loop, Node_List &old
loop->record_for_igvn();
}
//-------------------------insert_scalar_rced_post_loop------------------------
// Insert a copy of the rce'd main loop as a post loop.
// We have not unrolled the main loop, so this is the right time to inject this.
// Later we will examine the partner of this post loop pair, which still has range checks,
// to see if we can inject code which tests at runtime whether the range checks are applicable.
void PhaseIdealLoop::insert_scalar_rced_post_loop(IdealLoopTree *loop, Node_List &old_new) {
if (!loop->_head->is_CountedLoop()) return;
CountedLoopNode *cl = loop->_head->as_CountedLoop();
// only process RCE'd main loops
if (!cl->is_main_loop() || loop->range_checks_present()) return;
#ifndef PRODUCT
if (TraceLoopOpts) {
tty->print("PostScalarRce ");
loop->dump_head();
}
#endif
C->set_major_progress();
// Find common pieces of the loop being guarded with pre & post loops
CountedLoopNode *main_head = loop->_head->as_CountedLoop();
CountedLoopEndNode *main_end = main_head->loopexit();
// diagnostic to show loop end is not properly formed
assert(main_end->outcnt() == 2, "1 true, 1 false path only");
Node *incr = main_end->incr();
Node *limit = main_end->limit();
// In this case we throw away the result as we are not using it to connect anything else.
CountedLoopNode *post_head = nullptr;
insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
copy_assertion_predicates_to_post_loop(main_head->skip_strip_mined(), post_head, incr, main_head->stride());
// It's difficult to be precise about the trip-counts
// for post loops. They are usually very short,
// so guess that unit vector trips is a reasonable value.
post_head->set_profile_trip_cnt(4.0);
post_head->set_is_rce_post_loop();
// Now force out all loop-invariant dominating tests. The optimizer
// finds some, but we _know_ they are all useless.
peeled_dom_test_elim(loop, old_new);
loop->record_for_igvn();
}
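To picture the transformation, here is a hypothetical source-level sketch (illustrative Java, not JDK code; names are invented): after iteration splitting and RCE, the main loop runs without per-iteration bounds checks and a short post loop finishes the remaining trips. The pass above cloned that post loop into an additional unchecked copy, to be guarded at run time by the multiversioning code further below.

class PostLoopShape {
    static void scale(int[] a, int k, int limit) {
        int i = 0;
        int safe = Math.min(limit, a.length); // the bound RCE proves for the main loop
        for (; i < safe; i++) {
            a[i] *= k;   // main loop: bounds check eliminated
        }
        for (; i < limit; i++) {
            a[i] *= k;   // post loop: range checks remain; the pass added an
        }                // RCE'd clone of this loop as a runtime-guarded fast path
    }
}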
//------------------------------insert_post_loop-------------------------------
// Insert post loops. Add a post loop to the given loop passed.
Node *PhaseIdealLoop::insert_post_loop(IdealLoopTree* loop, Node_List& old_new,
@@ -3198,143 +3149,6 @@ bool IdealLoopTree::compute_has_range_checks() const {
return false;
}
//-------------------------multi_version_post_loops----------------------------
// Check the range checks that remain; if simple, use the bounds to guard
// which version of the post loop we execute, one with range checks or one without
bool PhaseIdealLoop::multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop) {
bool multi_version_succeeded = false;
assert(RangeCheckElimination, "");
CountedLoopNode *legacy_cl = legacy_loop->_head->as_CountedLoop();
assert(legacy_cl->is_post_loop(), "");
// Check for existence of range checks using the unique instance to make a guard with
Unique_Node_List worklist;
for (uint i = 0; i < legacy_loop->_body.size(); i++) {
Node *iff = legacy_loop->_body[i];
int iff_opc = iff->Opcode();
if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
worklist.push(iff);
}
}
// Find RCE'd post loop so that we can stage its guard.
if (legacy_cl->is_canonical_loop_entry() == nullptr) {
return multi_version_succeeded;
}
Node* ctrl = legacy_cl->in(LoopNode::EntryControl);
Node* iffm = ctrl->in(0);
// Now we test that both the post loops are connected
Node* post_loop_region = iffm->in(0);
if (post_loop_region == nullptr) return multi_version_succeeded;
if (!post_loop_region->is_Region()) return multi_version_succeeded;
Node* covering_region = post_loop_region->in(RegionNode::Control+1);
if (covering_region == nullptr) return multi_version_succeeded;
if (!covering_region->is_Region()) return multi_version_succeeded;
Node* p_f = covering_region->in(RegionNode::Control);
if (p_f == nullptr) return multi_version_succeeded;
if (!p_f->is_IfFalse()) return multi_version_succeeded;
if (!p_f->in(0)->is_CountedLoopEnd()) return multi_version_succeeded;
CountedLoopEndNode* rce_loop_end = p_f->in(0)->as_CountedLoopEnd();
if (rce_loop_end == nullptr) return multi_version_succeeded;
CountedLoopNode* rce_cl = rce_loop_end->loopnode();
if (rce_cl == nullptr || !rce_cl->is_post_loop()) return multi_version_succeeded;
CountedLoopNode *known_rce_cl = rce_loop->_head->as_CountedLoop();
if (rce_cl != known_rce_cl) return multi_version_succeeded;
// Then we fetch the cover entry test
ctrl = rce_cl->in(LoopNode::EntryControl);
if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return multi_version_succeeded;
#ifndef PRODUCT
if (TraceLoopOpts) {
tty->print("PostMultiVersion\n");
rce_loop->dump_head();
legacy_loop->dump_head();
}
#endif
// Now fetch the limit we want to compare against
Node *limit = rce_cl->limit();
bool first_time = true;
// If we got this far, we identified the post loop which has been RCE'd and
// we have a work list. Now we will try to transform the if guard so that the
// loop pair is executed as multiple versions, with the determination left to
// runtime, or to the optimizer if full information about the given arrays is
// known at compile time.
Node *last_min = nullptr;
multi_version_succeeded = true;
while (worklist.size()) {
Node* rc_iffm = worklist.pop();
if (rc_iffm->is_If()) {
Node *rc_bolzm = rc_iffm->in(1);
if (rc_bolzm->is_Bool()) {
Node *rc_cmpzm = rc_bolzm->in(1);
if (rc_cmpzm->is_Cmp()) {
Node *rc_left = rc_cmpzm->in(2);
if (rc_left->Opcode() != Op_LoadRange) {
multi_version_succeeded = false;
break;
}
if (first_time) {
last_min = rc_left;
first_time = false;
} else {
Node *cur_min = new MinINode(last_min, rc_left);
last_min = cur_min;
_igvn.register_new_node_with_optimizer(last_min);
}
}
}
}
}
// All we have to do is update the limit of the rce loop
// with the min of our expression and the current limit.
// We will use this expression to replace the current limit.
if (last_min && multi_version_succeeded) {
Node *cur_min = new MinINode(last_min, limit);
_igvn.register_new_node_with_optimizer(cur_min);
Node *cmp_node = rce_loop_end->cmp_node();
_igvn.replace_input_of(cmp_node, 2, cur_min);
set_ctrl(cur_min, ctrl);
set_loop(cur_min, rce_loop->_parent);
legacy_cl->mark_is_multiversioned();
rce_cl->mark_is_multiversioned();
multi_version_succeeded = true;
C->set_major_progress();
}
return multi_version_succeeded;
}
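A scalar analogue of the guard built here (hypothetical Java, not JDK code): the LoadRange nodes are array-length loads, so the MinI chain computes the minimum of all involved array lengths, and that minimum combined with the old limit becomes the new limit of the RCE'd post loop.

class MultiVersionGuard {
    static void copy(int[] src, int[] dst, int i, int limit) {
        // min over every range-checked length, then min with the original limit
        int guarded = Math.min(limit, Math.min(src.length, dst.length));
        for (; i < guarded; i++) {
            dst[i] = src[i];   // RCE'd post loop: provably in bounds, no checks
        }
        for (; i < limit; i++) {
            dst[i] = src[i];   // legacy post loop: keeps its range checks
        }
    }
}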
//-------------------------poison_rce_post_loop--------------------------------
// Causes the rce'd post loop to be optimized away if multiversioning fails
void PhaseIdealLoop::poison_rce_post_loop(IdealLoopTree *rce_loop) {
CountedLoopNode *rce_cl = rce_loop->_head->as_CountedLoop();
Node* ctrl = rce_cl->in(LoopNode::EntryControl);
if (ctrl->is_IfTrue() || ctrl->is_IfFalse()) {
Node* iffm = ctrl->in(0);
if (iffm->is_If()) {
Node* cur_bool = iffm->in(1);
if (cur_bool->is_Bool()) {
Node* cur_cmp = cur_bool->in(1);
if (cur_cmp->is_Cmp()) {
BoolTest::mask new_test = BoolTest::gt;
BoolNode *new_bool = new BoolNode(cur_cmp, new_test);
_igvn.replace_node(cur_bool, new_bool);
_igvn._worklist.push(new_bool);
Node* left_op = cur_cmp->in(1);
_igvn.replace_input_of(cur_cmp, 2, left_op);
C->set_major_progress();
}
}
}
}
}
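The poisoning rewrites the guard's Bool to a BoolTest::gt test whose compare has the same node on both inputs, which is constant-false; IGVN then folds the branch and the RCE'd copy dies. In scalar terms (illustrative only):

static boolean poisonedGuard(int x) {
    return x > x;   // gt with identical operands: never true, so the guarded loop is dead
}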
//------------------------------DCE_loop_body----------------------------------
// Remove simplistic dead code from loop body
void IdealLoopTree::DCE_loop_body() {
@@ -3864,14 +3678,6 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
phase->do_range_check(this, old_new);
}
if (should_unroll && !should_peel && PostLoopMultiversioning &&
Matcher::has_predicated_vectors()) {
// Try to setup multiversioning on main loops before they are unrolled
if (cl->is_main_loop() && (cl->unrolled_count() == 1)) {
phase->insert_scalar_rced_post_loop(this, old_new);
}
}
// Double loop body for unrolling. Adjust the minimum-trip test (will do
// twice as many iterations as before) and the main body limit (only do
// an even number of trips). If we are peeling, we might enable some RCE

@@ -4013,7 +4013,6 @@ void IdealLoopTree::dump_head() {
if (cl->is_post_loop()) tty->print(" post");
if (cl->is_vectorized_loop()) tty->print(" vector");
if (range_checks_present()) tty->print(" rc ");
if (cl->is_multiversioned()) tty->print(" multi ");
}
if (_has_call) tty->print(" has_call");
if (_has_sfpt) tty->print(" has_sfpt");
@@ -4653,29 +4652,7 @@ void PhaseIdealLoop::build_and_optimize() {
IdealLoopTree* lpt = iter.current();
if (lpt->is_counted()) {
CountedLoopNode *cl = lpt->_head->as_CountedLoop();
- if (cl->is_rce_post_loop() && !cl->is_vectorized_loop()) {
-   assert(PostLoopMultiversioning, "multiversioning must be enabled");
-   // Check that the rce'd post loop is encountered first; multiversion after all
-   // major main loop optimizations are concluded
-   if (!C->major_progress()) {
-     IdealLoopTree *lpt_next = lpt->_next;
-     if (lpt_next && lpt_next->is_counted()) {
-       CountedLoopNode *cl = lpt_next->_head->as_CountedLoop();
-       if (cl->is_post_loop() && lpt_next->range_checks_present()) {
-         if (!cl->is_multiversioned()) {
-           if (multi_version_post_loops(lpt, lpt_next) == false) {
-             // Cause the rce loop to be optimized away if we fail
-             cl->mark_is_multiversioned();
-             cl->set_slp_max_unroll(0);
-             poison_rce_post_loop(lpt);
-           }
-         }
-       }
-     }
-     sw.transform_loop(lpt, true);
-   }
- } else if (cl->is_main_loop()) {
+ if (cl->is_main_loop()) {
if (!sw.transform_loop(lpt, true)) {
// Instigate more unrolling for optimization when vectorization fails.
if (cl->has_passed_slp()) {

@@ -72,16 +72,13 @@ protected:
DoUnrollOnly = 1<<9,
VectorizedLoop = 1<<10,
HasAtomicPostLoop = 1<<11,
- IsMultiversioned = 1<<12,
- StripMined = 1<<13,
- SubwordLoop = 1<<14,
- ProfileTripFailed = 1<<15,
- LoopNestInnerLoop = 1<<16,
- LoopNestLongOuterLoop = 1<<17};
+ StripMined = 1<<12,
+ SubwordLoop = 1<<13,
+ ProfileTripFailed = 1<<14,
+ LoopNestInnerLoop = 1<<15,
+ LoopNestLongOuterLoop = 1<<16 };
char _unswitch_count;
enum { _unswitch_max=3 };
- char _postloop_flags;
- enum { RCEPostLoop = 1 };
// Expected trip count from profile data
float _profile_trip_cnt;
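These flags are a plain bit set, which is why deleting IsMultiversioned (bit 12) renumbers everything above it. A minimal sketch of the same idiom, in illustrative Java mirroring the accessors below (not JDK code):

class LoopFlagBits {
    static final int STRIP_MINED = 1 << 12;   // was 1 << 13 before this commit
    static boolean isStripMined(int flags) { return (flags & STRIP_MINED) != 0; }
    static int markStripMined(int flags)  { return flags | STRIP_MINED; }
    static int clearStripMined(int flags) { return flags & ~STRIP_MINED; }
}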
@@ -93,7 +90,6 @@ public:
bool is_inner_loop() const { return _loop_flags & InnerLoop; }
void set_inner_loop() { _loop_flags |= InnerLoop; }
bool is_multiversioned() const { return _loop_flags & IsMultiversioned; }
bool is_vectorized_loop() const { return _loop_flags & VectorizedLoop; }
bool is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; }
void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
@@ -110,7 +106,6 @@ public:
void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
void mark_strip_mined() { _loop_flags |= StripMined; }
void clear_strip_mined() { _loop_flags &= ~StripMined; }
void mark_profile_trip_failed() { _loop_flags |= ProfileTripFailed; }
@@ -121,9 +116,6 @@ public:
int unswitch_max() { return _unswitch_max; }
int unswitch_count() { return _unswitch_count; }
int is_rce_post_loop() const { return _postloop_flags & RCEPostLoop; }
void set_is_rce_post_loop() { _postloop_flags |= RCEPostLoop; }
void set_unswitch_count(int val) {
assert (val <= unswitch_max(), "too many unswitches");
_unswitch_count = val;
@@ -134,7 +126,7 @@ public:
LoopNode(Node *entry, Node *backedge)
: RegionNode(3), _loop_flags(0), _unswitch_count(0),
- _postloop_flags(0), _profile_trip_cnt(COUNT_UNKNOWN) {
+ _profile_trip_cnt(COUNT_UNKNOWN) {
init_class_id(Class_Loop);
init_req(EntryControl, entry);
init_req(LoopBackControl, backedge);
@@ -322,8 +314,6 @@ public:
int node_count_before_unroll() { return _node_count_before_unroll; }
void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; }
int slp_max_unroll() const { return _slp_maximum_unroll_factor; }
void set_slp_pack_count(int pack_count) { _slp_vector_pack_count = pack_count; }
int slp_pack_count() const { return _slp_vector_pack_count; }
virtual LoopNode* skip_strip_mined(int expect_skeleton = 1);
OuterStripMinedLoopNode* outer_loop() const;
@@ -1305,9 +1295,6 @@ public:
CountedLoopNode* main_head, CountedLoopEndNode* main_end,
Node*& incr, Node* limit, CountedLoopNode*& post_head);
// Add an RCE'd post loop which we will multi-version and adapt for run-time test path usage
void insert_scalar_rced_post_loop( IdealLoopTree *loop, Node_List &old_new );
// Add a vector post loop between a vector main loop and the current post loop
void insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new);
// If Node n lives in the back_ctrl block, we clone a private version of n
@@ -1402,13 +1389,6 @@ public:
// Eliminate range-checks and other trip-counter vs loop-invariant tests.
void do_range_check(IdealLoopTree *loop, Node_List &old_new);
// Process post loops which have range checks and try to build a multi-version
// guard to safely determine if we can execute the post loop which was RCE'd.
bool multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop);
// Cause the rce'd post loop to be optimized away; this happens if we cannot complete multiversioning
void poison_rce_post_loop(IdealLoopTree *rce_loop);
// Create a slow version of the loop by cloning the loop
// and inserting an if to select fast-slow versions.
// Return the inserted if.

@@ -54,7 +54,6 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
_packset(arena(), 8, 0, nullptr), // packs for the current block
_bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
_block(arena(), 8, 0, nullptr), // nodes in current block
_post_block(arena(), 8, 0, nullptr), // nodes common to current block which are marked as post loop vectorizable
_data_entry(arena(), 8, 0, nullptr), // nodes with all inputs from outside
_mem_slice_head(arena(), 8, 0, nullptr), // memory slice heads
_mem_slice_tail(arena(), 8, 0, nullptr), // memory slice tails
@@ -116,11 +115,6 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
mark_reductions();
}
if (cl->is_rce_post_loop() && is_marked_reduction_loop()) {
// Post loop vectorization doesn't support reductions
return false;
}
// skip any loop that has not been assigned max unroll by analysis
if (do_optimization) {
if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) {
@@ -176,24 +170,6 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
if (do_optimization) {
assert(_packset.length() == 0, "packset must be empty");
success = SLP_extract();
if (PostLoopMultiversioning) {
if (cl->is_vectorized_loop() && cl->is_main_loop() && !is_marked_reduction_loop()) {
IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next;
CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
// Main loop SLP works well for manually unrolled loops. But post loop
// vectorization doesn't work for these. To bail out the optimization
// earlier, we have range check and loop stride conditions below.
if (cl_next->is_post_loop() && !lpt_next->range_checks_present() &&
cl_next->stride_is_con() && abs(cl_next->stride_con()) == 1) {
if (!cl_next->is_vectorized_loop()) {
// Propagate some main loop attributes to its corresponding scalar
// rce'd post loop for vectorization with vector masks
cl_next->set_slp_max_unroll(cl->slp_max_unroll());
cl_next->set_slp_pack_count(cl->slp_pack_count());
}
}
}
}
}
return success;
}
@@ -206,9 +182,6 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
Node_Stack nstack((int)ignored_size);
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
Node *cl_exit = cl->loopexit_or_null();
int rpo_idx = _post_block.length();
assert(rpo_idx == 0, "post loop block is empty");
// First clear the entries
for (uint i = 0; i < lpt()->_body.size(); i++) {
@@ -313,27 +286,6 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
}
if (is_slp) {
// In the main loop, SLP works well even if parts of the operations in the loop
// body are not vectorizable, because those non-vectorizable parts will simply
// be unrolled. But in post loops with vector masks, we create singleton packs
// directly from scalars, so all operations should be vectorized together. This
// compares the number of packs in the post loop with the main loop and bails
// out if the post loop potentially has more packs.
if (cl->is_rce_post_loop()) {
for (uint i = 0; i < lpt()->_body.size(); i++) {
if (ignored_loop_nodes[i] == -1) {
_post_block.at_put_grow(rpo_idx++, lpt()->_body.at(i));
}
}
if (_post_block.length() > cl->slp_pack_count()) {
// Clear local_loop_unroll_factor and bail out directly from here
local_loop_unroll_factor = 0;
cl->mark_was_slp();
cl->set_slp_max_unroll(0);
return;
}
}
// Now we try to find the maximum supported consistent vector which the machine
// description can use
bool flag_small_bt = false;
@@ -404,7 +356,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
cl->mark_passed_slp();
}
cl->mark_was_slp();
- if (cl->is_main_loop() || cl->is_rce_post_loop()) {
+ if (cl->is_main_loop()) {
cl->set_slp_max_unroll(local_loop_unroll_factor);
}
}
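The removed bail-out can be restated in scalar form (hypothetical names, not JDK code): every post-loop node the analysis did not ignore would become a singleton pack, so if that count exceeds the pack count recorded for the main loop, masked post loop vectorization gives up.

static boolean postLoopHasMorePacks(boolean[] ignored, int mainLoopPackCount) {
    int packs = 0;
    for (boolean ign : ignored) {
        if (!ign) packs++;   // each non-ignored node forms one singleton pack
    }
    return packs > mainLoopPackCount;   // true means: bail out of vectorization
}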
@@ -590,80 +542,43 @@ bool SuperWord::SLP_extract() {
}
}
#endif
+ CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
+ assert(cl->is_main_loop(), "SLP should only work on main loops");
// Ready the block
if (!construct_bb()) {
return false; // Exit if no interesting nodes or complex graph.
}
- // build _dg, _disjoint_ptrs
+ // build _dg, _disjoint_ptrs
dependence_graph();
// compute function depth(Node*)
compute_max_depth();
- CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
- if (cl->is_main_loop()) {
-   compute_vector_element_type();
+ // Compute vector element types
+ compute_vector_element_type();
- // Attempt vectorization
-   find_adjacent_refs();
+ // Attempt vectorization
+ find_adjacent_refs();
-   if (align_to_ref() == nullptr) {
-     return false; // Did not find memory reference to align vectors
-   }
-   extend_packlist();
-   combine_packs();
-   construct_my_pack_map();
-   filter_packs();
-   DEBUG_ONLY(verify_packs();)
-   schedule();
-   // Record eventual count of vector packs for checks in post loop vectorization
-   if (PostLoopMultiversioning) {
-     cl->set_slp_pack_count(_packset.length());
-   }
- } else {
-   assert(cl->is_rce_post_loop(), "Must be an rce'd post loop");
-   int saved_mapped_unroll_factor = cl->slp_max_unroll();
-   if (saved_mapped_unroll_factor) {
-     int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
-     // now reset the slp_unroll_factor so that we can check the analysis mapped to
-     // what the vector loop was mapped to
-     cl->set_slp_max_unroll(0);
-     // do the analysis on the post loop
-     unrolling_analysis(vector_mapped_unroll_factor);
-     // if our analyzed loop is a canonical fit, start processing it
-     if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
-       // now add the vector nodes to packsets
-       for (int i = 0; i < _post_block.length(); i++) {
-         Node* n = _post_block.at(i);
-         Node_List* singleton = new Node_List();
-         singleton->push(n);
-         _packset.append(singleton);
-         set_my_pack(n, singleton);
-       }
-       // map base types for vector usage
-       compute_vector_element_type();
-     } else {
-       return false;
-     }
-   } else {
-     // for some reason we could not map the slp analysis state of the vectorized loop
-     return false;
-   }
- }
+ if (align_to_ref() == nullptr) {
+   return false; // Did not find memory reference to align vectors
+ }
+ extend_packlist();
+ combine_packs();
+ construct_my_pack_map();
+ filter_packs();
+ DEBUG_ONLY(verify_packs();)
+ schedule();
return output();
}
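With the post-loop branch gone, SLP_extract handles only main loops, as the new assert states. For reference, the kind of loop it targets (plain Java, illustrative only; SuperWord packs the unrolled iterations into vector operations):

class SlpTarget {
    static void axpy(float[] x, float[] y, float a) {
        for (int i = 0; i < x.length; i++) {
            y[i] = a * x[i] + y[i];   // independent, same-typed lanes: packable
        }
    }
}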
@@ -1143,6 +1058,8 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
// A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
void SuperWord::dependence_graph() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+ assert(cl->is_main_loop(), "SLP should only work on main loops");
// First, assign a dependence node to each memory node
for (int i = 0; i < _block.length(); i++ ) {
Node *n = _block.at(i);
@@ -1157,9 +1074,7 @@
Node* n_tail = _mem_slice_tail.at(i);
// Get slice in predecessor order (last is first)
- if (cl->is_main_loop()) {
-   mem_slice_preds(n_tail, n, _nlist);
- }
+ mem_slice_preds(n_tail, n, _nlist);
#ifndef PRODUCT
if(TraceSuperWord && Verbose) {
@@ -2591,6 +2506,7 @@ void SuperWord::print_loop(bool whole) {
// Convert packs into vector node operations
bool SuperWord::output() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+ assert(cl->is_main_loop(), "SLP should only work on main loops");
Compile* C = _phase->C;
if (_packset.length() == 0) {
return false;
@@ -2603,16 +2519,13 @@ bool SuperWord::output() {
}
#endif
- if (cl->is_main_loop()) {
-   // MUST ENSURE main loop's initial value is properly aligned:
-   //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
-   align_initial_loop_index(align_to_ref());
-   // Insert extract (unpack) operations for scalar uses
-   for (int i = 0; i < _packset.length(); i++) {
-     insert_extracts(_packset.at(i));
-   }
- }
+ // Ensure main loop's initial value is properly aligned
+ //   (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
+ align_initial_loop_index(align_to_ref());
+ // Insert extract (unpack) operations for scalar uses
+ for (int i = 0; i < _packset.length(); i++) {
+   insert_extracts(_packset.at(i));
+ }
uint max_vlen_in_bytes = 0;
@@ -2629,16 +2542,6 @@ bool SuperWord::output() {
return false;
}
Node* vmask = nullptr;
if (cl->is_rce_post_loop() && do_reserve_copy()) {
// Create a vector mask node for post loop, bail out if not created
vmask = create_post_loop_vmask();
if (vmask == nullptr) {
// create_post_loop_vmask checks many conditions, any of them could fail
return false; // and reverse to backup IG
}
}
for (int i = 0; i < _block.length(); i++) {
Node* n = _block.at(i);
Node_List* p = my_pack(n);
@@ -2650,10 +2553,6 @@ bool SuperWord::output() {
uint vlen = p->size();
uint vlen_in_bytes = 0;
Node* vn = nullptr;
if (cl->is_rce_post_loop()) {
// override vlen with the main loop's vector length
vlen = cl->slp_max_unroll();
}
NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
int opc = n->Opcode();
if (n->is_Load()) {
@@ -2675,13 +2574,7 @@ bool SuperWord::output() {
}
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
- if (cl->is_rce_post_loop()) {
-   assert(vmask != nullptr, "vector mask should be generated");
-   const TypeVect* vt = TypeVect::make(velt_basic_type(n), vlen);
-   vn = new LoadVectorMaskedNode(ctl, mem, adr, atyp, vt, vmask);
- } else {
-   vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
- }
+ vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
vlen_in_bytes = vn->as_LoadVector()->memory_size();
} else if (n->is_Store()) {
// Promote value to be stored to vector
@@ -2699,13 +2592,7 @@ bool SuperWord::output() {
Node* mem = first->in(MemNode::Memory);
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
- if (cl->is_rce_post_loop()) {
-   assert(vmask != nullptr, "vector mask should be generated");
-   const TypeVect* vt = TypeVect::make(velt_basic_type(n), vlen);
-   vn = new StoreVectorMaskedNode(ctl, mem, adr, val, atyp, vmask);
- } else {
-   vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
- }
+ vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
vlen_in_bytes = vn->as_StoreVector()->memory_size();
} else if (VectorNode::is_scalar_rotate(n)) {
Node* in1 = first->in(1);
@@ -2961,20 +2848,12 @@ bool SuperWord::output() {
if (TraceSuperWordLoopUnrollAnalysis) {
tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
}
- // For atomic unrolled loops which are vector mapped, instigate more unrolling
cl->set_notpassed_slp();
- if (cl->is_main_loop()) {
-   // if vector resources are limited, do not allow additional unrolling, also
-   // do not unroll more on pure vector loops which were not reduced so that we can
-   // program the post loop to single iteration execution.
-   if (Matcher::float_pressure_limit() > 8) {
-     C->set_major_progress();
-     cl->mark_do_unroll_only();
-   }
- }
- if (cl->is_rce_post_loop() && do_reserve_copy()) {
-   cl->mark_is_multiversioned();
- }
+ // if vector resources are limited, do not allow additional unrolling
+ if (Matcher::float_pressure_limit() > 8) {
+   C->set_major_progress();
+   cl->mark_do_unroll_only();
+ }
}
@@ -2988,107 +2867,6 @@ bool SuperWord::output() {
return true;
}
//-------------------------create_post_loop_vmask-------------------------
// Check the post loop vectorizability and create a vector mask if yes.
// Return null to bail out if post loop is not vectorizable.
Node* SuperWord::create_post_loop_vmask() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
assert(cl->is_rce_post_loop(), "Must be an rce post loop");
assert(!is_marked_reduction_loop(), "no vector reduction in post loop");
assert(abs(cl->stride_con()) == 1, "post loop stride can only be +/-1");
// Collect vector element types of all post loop packs. Also collect
// superword pointers of each memory access operation if the address
// expression is supported. (Note that vectorizable post loop should
// only have positive scale in counting-up loop and negative scale in
// counting-down loop.) Collected SWPointer(s) are also used for data
// dependence check next.
VectorElementSizeStats stats(_arena);
GrowableArray<SWPointer*> swptrs(_arena, _packset.length(), 0, nullptr);
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
assert(p->size() == 1, "all post loop packs should be singleton");
Node* n = p->at(0);
BasicType bt = velt_basic_type(n);
if (!is_java_primitive(bt)) {
return nullptr;
}
if (n->is_Mem()) {
SWPointer* mem_p = new (_arena) SWPointer(n->as_Mem(), this, nullptr, false);
// For each memory access, we check if the scale (in bytes) in its
// address expression is equal to the data size times loop stride.
// With this, only positive scales exist in counting-up loops and
// negative scales exist in counting-down loops.
if (mem_p->scale_in_bytes() != type2aelembytes(bt) * cl->stride_con()) {
return nullptr;
}
swptrs.append(mem_p);
}
stats.record_size(type2aelembytes(bt));
}
// Find the vector data type for generating vector masks. Currently we
// don't support post loops with mixed vector data sizes
int unique_size = stats.unique_size();
BasicType vmask_bt;
switch (unique_size) {
case 1: vmask_bt = T_BYTE; break;
case 2: vmask_bt = T_SHORT; break;
case 4: vmask_bt = T_INT; break;
case 8: vmask_bt = T_LONG; break;
default: return nullptr;
}
// Currently we can't remove this MaxVectorSize constraint. Without it,
// it's not guaranteed that the RCE'd post loop runs at most "vlen - 1"
// iterations, because the vector drain loop may not be cloned from the
// vectorized main loop. We should re-engineer PostLoopMultiversioning
// to fix this problem.
int vlen = cl->slp_max_unroll();
if (unique_size * vlen != MaxVectorSize) {
return nullptr;
}
// Bail out if target doesn't support mask generator or masked load/store
if (!Matcher::match_rule_supported_vector(Op_LoadVectorMasked, vlen, vmask_bt) ||
!Matcher::match_rule_supported_vector(Op_StoreVectorMasked, vlen, vmask_bt) ||
!Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, vmask_bt)) {
return nullptr;
}
// Bail out if potential data dependence exists between memory accesses
if (SWPointer::has_potential_dependence(swptrs)) {
return nullptr;
}
// Create vector mask with the post loop trip count. Note there's another
// vector drain loop which is cloned from main loop before super-unrolling
// so the scalar post loop runs at most vlen-1 trips. Hence, this version
// only runs at most 1 iteration after vector mask transformation.
Node* trip_cnt;
Node* new_incr;
if (cl->stride_con() > 0) {
trip_cnt = new SubINode(cl->limit(), cl->init_trip());
new_incr = new AddINode(cl->phi(), trip_cnt);
} else {
trip_cnt = new SubINode(cl->init_trip(), cl->limit());
new_incr = new SubINode(cl->phi(), trip_cnt);
}
_igvn.register_new_node_with_optimizer(trip_cnt);
_igvn.register_new_node_with_optimizer(new_incr);
_igvn.replace_node(cl->incr(), new_incr);
Node* length = new ConvI2LNode(trip_cnt);
_igvn.register_new_node_with_optimizer(length);
Node* vmask = VectorMaskGenNode::make(length, vmask_bt);
_igvn.register_new_node_with_optimizer(vmask);
// Remove exit test to transform 1-iteration loop to straight-line code.
// This results in redundant cmp+branch instructions being eliminated.
Node *cl_exit = cl->loopexit();
_igvn.replace_input_of(cl_exit, 1, _igvn.intcon(0));
return vmask;
}
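What this removed function synthesized at the IR level can be written by hand with the Vector API (real jdk.incubator.vector classes; run with --add-modules jdk.incubator.vector): a tail of fewer than one vector's worth of elements handled by a single masked load/store pair, matching the LoadVectorMasked/StoreVectorMasked + VectorMaskGen pattern above. A minimal sketch:

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorSpecies;

class MaskedTail {
    static final VectorSpecies<Integer> S = IntVector.SPECIES_PREFERRED;

    // Assumes limit - i < S.length(), as the transformed post loop guaranteed.
    static void copyTail(int[] src, int[] dst, int i, int limit) {
        VectorMask<Integer> m = S.indexInRange(i, limit); // VectorMaskGen analogue
        IntVector v = IntVector.fromArray(S, src, i, m);  // masked vector load
        v.intoArray(dst, i, m);                           // masked vector store
    }
}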
//------------------------------vector_opd---------------------------
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
@@ -3098,19 +2876,11 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
bool have_same_inputs = same_inputs(p, opd_idx);
- if (cl->is_rce_post_loop()) {
-   // override vlen with the main loop's vector length
-   assert(p->size() == 1, "Packs in post loop should have only one node");
-   vlen = cl->slp_max_unroll();
- }
// Insert index population operation to create a vector of increasing
// indices starting from the iv value. In some special unrolled loops
// (see JDK-8286125), we need scalar replications of the iv value if
- // all inputs are the same iv, so we do a same inputs check here. But
- // in post loops, "have_same_inputs" is always true because all packs
- // are singleton. That's why a pack size check is also required.
- if (opd == iv() && (!have_same_inputs || p->size() == 1)) {
+ // all inputs are the same iv, so we do a same inputs check here.
+ if (opd == iv() && !have_same_inputs) {
BasicType p0_bt = velt_basic_type(p0);
BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
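The index population operation mentioned here also has a direct Vector API counterpart (real API, illustrative use only): broadcast the iv value into all lanes, then add each lane's index to get <iv, iv+1, iv+2, ...>.

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorSpecies;

class PopulateIndex {
    static final VectorSpecies<Integer> S = IntVector.SPECIES_PREFERRED;

    static IntVector fromIv(int iv) {
        return IntVector.broadcast(S, iv).addIndex(1); // lane value: iv + laneIndex
    }
}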
@@ -4026,7 +3796,6 @@ void SuperWord::init() {
_packset.clear();
_disjoint_ptrs.clear();
_block.clear();
_post_block.clear();
_data_entry.clear();
_mem_slice_head.clear();
_mem_slice_tail.clear();
@@ -4447,34 +4216,6 @@ void SWPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
_invar = register_if_new(add);
}
//-----------------has_potential_dependence-----------------
// Check potential data dependence among all memory accesses.
// We require every two accesses (with at least one store) of
// the same element type to have the same address expression.
bool SWPointer::has_potential_dependence(GrowableArray<SWPointer*> swptrs) {
for (int i1 = 0; i1 < swptrs.length(); i1++) {
SWPointer* p1 = swptrs.at(i1);
MemNode* n1 = p1->mem();
BasicType bt1 = n1->memory_type();
// Iterate over remaining SWPointers
for (int i2 = i1 + 1; i2 < swptrs.length(); i2++) {
SWPointer* p2 = swptrs.at(i2);
MemNode* n2 = p2->mem();
BasicType bt2 = n2->memory_type();
// Data dependence exists between load-store, store-load
// or store-store with the same element type or subword
// size (subword load/store may have inaccurate type)
if ((n1->is_Store() || n2->is_Store()) &&
same_type_or_subword_size(bt1, bt2) && !p1->equal(*p2)) {
return true;
}
}
}
return false;
}
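A scalar restatement of the removed check (hypothetical Access type, not JDK code): among all memory accesses, any pair involving at least one store and the same element size must share one address expression, otherwise a dependence is conservatively assumed and masked vectorization is abandoned.

import java.util.List;

class DependenceCheck {
    record Access(boolean isStore, int elemSize, String addrExpr) {}

    static boolean hasPotentialDependence(List<Access> accs) {
        for (int i1 = 0; i1 < accs.size(); i1++) {
            for (int i2 = i1 + 1; i2 < accs.size(); i2++) {
                Access p1 = accs.get(i1), p2 = accs.get(i2);
                if ((p1.isStore() || p2.isStore())
                        && p1.elemSize() == p2.elemSize()
                        && !p1.addrExpr().equals(p2.addrExpr())) {
                    return true;   // possible load-store, store-load or store-store overlap
                }
            }
        }
        return false;
    }
}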
//----------------------------print------------------------
void SWPointer::print() {
#ifndef PRODUCT

@@ -285,7 +285,6 @@ class SuperWord : public ResourceObj {
GrowableArray<int> _bb_idx; // Map from Node _idx to index within block
GrowableArray<Node*> _block; // Nodes in current block
GrowableArray<Node*> _post_block; // Nodes in post loop block
GrowableArray<Node*> _data_entry; // Nodes with all inputs from outside
GrowableArray<Node*> _mem_slice_head; // Memory slice head nodes
GrowableArray<Node*> _mem_slice_tail; // Memory slice tail nodes
@@ -579,8 +578,6 @@ private:
// Convert packs into vector node operations
bool output();
// Create vector mask for post loop vectorization
Node* create_post_loop_vmask();
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
Node* vector_opd(Node_List* p, int opd_idx);
// Can code be generated for pack p?
@@ -725,8 +722,6 @@ class SWPointer : public ArenaObj {
static bool equal(int cmp) { return cmp == Equal; }
static bool comparable(int cmp) { return cmp < NotComparable; }
static bool has_potential_dependence(GrowableArray<SWPointer*> swptrs);
void print();
#ifndef PRODUCT

@@ -1,41 +0,0 @@
/*
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test TestRangeCheckEliminationDisabled
* @bug 8154763
* @summary Tests PostLoopMultiversioning with RangeCheckElimination disabled.
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions
* -XX:+UnlockExperimentalVMOptions -XX:+PostLoopMultiversioning -XX:-RangeCheckElimination
* compiler.rangechecks.TestRangeCheckEliminationDisabled
*/
package compiler.rangechecks;
public class TestRangeCheckEliminationDisabled {
public static void main(String[] args) {
System.out.println("Passed");
}
}

@@ -61,9 +61,6 @@ public class VectorizationTestRunner {
// each test method returning a primitive value or an array of primitive type.
// And each test method should not throw any exceptions.
Class klass = getClass();
// Add extra VM options to verify experimental auto-vectorization
WB.setBooleanVMFlag("UnlockExperimentalVMOptions", true);
WB.setBooleanVMFlag("PostLoopMultiversioning", true);
for (Method method : klass.getDeclaredMethods()) {
try {
if (method.isAnnotationPresent(Test.class)) {