8287087: C2: perform SLP reduction analysis on-demand

Reviewed-by: epeter, jbhateja, thartmann
This commit is contained in:
Roberto Castañeda Lozano 2023-04-27 09:39:53 +00:00
parent ba43649530
commit 1be80a4445
17 changed files with 715 additions and 167 deletions

View File

@ -5374,7 +5374,7 @@ instruct loadD(regD dst, memory mem)
// max = java.lang.Math.max(float a, float b) // max = java.lang.Math.max(float a, float b)
instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{ instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
predicate(UseAVX > 0 && !n->is_reduction()); predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
match(Set dst (MaxF a b)); match(Set dst (MaxF a b));
effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
format %{ format %{
@ -5396,7 +5396,7 @@ instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp,
%} %}
instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{ instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{
predicate(UseAVX > 0 && n->is_reduction()); predicate(UseAVX > 0 && SuperWord::is_reduction(n));
match(Set dst (MaxF a b)); match(Set dst (MaxF a b));
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
@ -5410,7 +5410,7 @@ instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe
// max = java.lang.Math.max(double a, double b) // max = java.lang.Math.max(double a, double b)
instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{ instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
predicate(UseAVX > 0 && !n->is_reduction()); predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
match(Set dst (MaxD a b)); match(Set dst (MaxD a b));
effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp); effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
format %{ format %{
@ -5432,7 +5432,7 @@ instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
%} %}
instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{ instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{
predicate(UseAVX > 0 && n->is_reduction()); predicate(UseAVX > 0 && SuperWord::is_reduction(n));
match(Set dst (MaxD a b)); match(Set dst (MaxD a b));
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
@ -5446,7 +5446,7 @@ instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRe
// min = java.lang.Math.min(float a, float b) // min = java.lang.Math.min(float a, float b)
instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{ instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
predicate(UseAVX > 0 && !n->is_reduction()); predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
match(Set dst (MinF a b)); match(Set dst (MinF a b));
effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
format %{ format %{
@ -5468,7 +5468,7 @@ instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp,
%} %}
instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{ instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{
predicate(UseAVX > 0 && n->is_reduction()); predicate(UseAVX > 0 && SuperWord::is_reduction(n));
match(Set dst (MinF a b)); match(Set dst (MinF a b));
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
@ -5482,7 +5482,7 @@ instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe
// min = java.lang.Math.min(double a, double b) // min = java.lang.Math.min(double a, double b)
instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{ instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
predicate(UseAVX > 0 && !n->is_reduction()); predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
match(Set dst (MinD a b)); match(Set dst (MinD a b));
effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
format %{ format %{
@ -5504,7 +5504,7 @@ instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
%} %}
instruct minD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{ instruct minD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{
predicate(UseAVX > 0 && n->is_reduction()); predicate(UseAVX > 0 && SuperWord::is_reduction(n));
match(Set dst (MinD a b)); match(Set dst (MinD a b));
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -271,6 +271,7 @@ int main(int argc, char *argv[])
AD.addInclude(AD._DFA_file, "opto/narrowptrnode.hpp"); AD.addInclude(AD._DFA_file, "opto/narrowptrnode.hpp");
AD.addInclude(AD._DFA_file, "opto/opcodes.hpp"); AD.addInclude(AD._DFA_file, "opto/opcodes.hpp");
AD.addInclude(AD._DFA_file, "opto/convertnode.hpp"); AD.addInclude(AD._DFA_file, "opto/convertnode.hpp");
AD.addInclude(AD._DFA_file, "opto/superword.hpp");
AD.addInclude(AD._DFA_file, "utilities/powerOfTwo.hpp"); AD.addInclude(AD._DFA_file, "utilities/powerOfTwo.hpp");
// Make sure each .cpp file starts with include lines: // Make sure each .cpp file starts with include lines:

View File

@ -462,8 +462,8 @@ void IdealGraphPrinter::visit_node(Node *n, bool edges, VectorSet* temp_set) {
if (flags & Node::Flag_has_call) { if (flags & Node::Flag_has_call) {
print_prop("has_call", "true"); print_prop("has_call", "true");
} }
if (flags & Node::Flag_is_reduction) { if (flags & Node::Flag_has_swapped_edges) {
print_prop("is_reduction", "true"); print_prop("has_swapped_edges", "true");
} }
if (C->matcher() != nullptr) { if (C->matcher() != nullptr) {

View File

@ -1037,10 +1037,6 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
} }
if (UseSuperWord) { if (UseSuperWord) {
if (!cl->is_reduction_loop()) {
phase->mark_reductions(this);
}
// Only attempt slp analysis when user controls do not prohibit it // Only attempt slp analysis when user controls do not prohibit it
if (!range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) { if (!range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) {
// Once policy_slp_analysis succeeds, mark the loop with the // Once policy_slp_analysis succeeds, mark the loop with the
@ -1694,15 +1690,6 @@ void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_n
set_idom(new_pre_exit, pre_end, dd_main_head); set_idom(new_pre_exit, pre_end, dd_main_head);
set_loop(new_pre_exit, outer_loop->_parent); set_loop(new_pre_exit, outer_loop->_parent);
if (peel_only) {
// Nodes in the peeled iteration that were marked as reductions within the
// original loop might not be reductions within their new outer loop.
for (uint i = 0; i < loop->_body.size(); i++) {
Node* n = old_new[loop->_body[i]->_idx];
n->remove_flag(Node::Flag_is_reduction);
}
}
// Step B2: Build a zero-trip guard for the main-loop. After leaving the // Step B2: Build a zero-trip guard for the main-loop. After leaving the
// pre-loop, the main-loop may not execute at all. Later in life this // pre-loop, the main-loop may not execute at all. Later in life this
// zero-trip guard will become the minimum-trip guard when we unroll // zero-trip guard will become the minimum-trip guard when we unroll
@ -2456,69 +2443,6 @@ void PhaseIdealLoop::do_maximally_unroll(IdealLoopTree *loop, Node_List &old_new
} }
} }
// Flags loop-carried arithmetic definitions of 'loop' as reduction candidates.
//
// Does nothing when SuperWordReductions is off or when the counted loop head
// reports an unrolled count greater than one. Otherwise, for every live phi on
// the loop head other than the trip-count phi, the node feeding the phi's
// back edge is marked with Node::Flag_is_reduction (and the loop head is
// marked as having reductions) when all of the following hold:
//  - its control is known and belongs to 'loop',
//  - its opcode is accepted as a reduction opcode (see below),
//  - it is not flagged yet,
//  - the phi is one of its inputs, and
//  - none of its users inside the loop is anything other than the phi.
void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) {
  if (!SuperWordReductions) {
    return;
  }

  CountedLoopNode* head = loop->_head->as_CountedLoop();
  if (head->unrolled_count() > 1) {
    return;
  }

  Node* trip_phi = head->phi();
  for (DUIterator_Fast imax, i = head->fast_outs(imax); i < imax; i++) {
    Node* phi = head->fast_out(i);
    // Only live phis other than the trip counter are of interest.
    if (!phi->is_Phi() || phi->outcnt() == 0 || phi == trip_phi) {
      continue;
    }

    // The definition reaching the phi through the loop back edge.
    Node* def_node = phi->in(LoopNode::LoopBackControl);
    if (def_node == nullptr) {
      continue;
    }

    // The definition has to be computed inside this loop.
    Node* def_ctrl = get_ctrl(def_node);
    if (def_ctrl == nullptr || !loop->is_member(get_loop(def_ctrl))) {
      continue;
    }

    // Accept opcodes that ReductionNode::opcode maps to a different opcode,
    // plus the floating-point min/max opcodes, which are accepted explicitly.
    const int opc = def_node->Opcode();
    const bool reducible =
        opc != ReductionNode::opcode(opc, def_node->bottom_type()->basic_type()) ||
        opc == Op_MinD || opc == Op_MinF || opc == Op_MaxD || opc == Op_MaxF;
    if (!reducible || def_node->is_reduction()) {
      continue; // Not a reduction opcode, or already marked.
    }

    // The arithmetic node must consume the phi it defines.
    bool feeds_from_phi = false;
    for (unsigned j = 1; j < def_node->req() && !feeds_from_phi; j++) {
      feeds_from_phi = (def_node->in(j) == phi);
    }
    if (!feeds_from_phi) {
      continue;
    }

    // Inside the loop, the result may only flow back into the phi.
    bool conforming = true;
    for (DUIterator_Fast kmax, k = def_node->fast_outs(kmax); k < kmax && conforming; k++) {
      Node* user = def_node->fast_out(k);
      if (loop->is_member(get_loop(ctrl_or_self(user))) && user != phi) {
        conforming = false;
      }
    }

    if (conforming) {
      def_node->add_flag(Node::Flag_is_reduction);
      head->mark_has_reductions();
    }
  }
}
//------------------------------adjust_limit----------------------------------- //------------------------------adjust_limit-----------------------------------
// Helper function that computes new loop limit as (rc_limit-offset)/scale // Helper function that computes new loop limit as (rc_limit-offset)/scale
Node* PhaseIdealLoop::adjust_limit(bool is_positive_stride, Node* scale, Node* offset, Node* rc_limit, Node* old_limit, Node* pre_ctrl, bool round) { Node* PhaseIdealLoop::adjust_limit(bool is_positive_stride, Node* scale, Node* offset, Node* rc_limit, Node* old_limit, Node* pre_ctrl, bool round) {

View File

@ -2249,7 +2249,6 @@ void CountedLoopNode::dump_spec(outputStream *st) const {
if (is_pre_loop ()) st->print("pre of N%d" , _main_idx); if (is_pre_loop ()) st->print("pre of N%d" , _main_idx);
if (is_main_loop()) st->print("main of N%d", _idx); if (is_main_loop()) st->print("main of N%d", _idx);
if (is_post_loop()) st->print("post of N%d", _main_idx); if (is_post_loop()) st->print("post of N%d", _main_idx);
if (is_reduction_loop()) st->print(" reduction");
if (is_strip_mined()) st->print(" strip mined"); if (is_strip_mined()) st->print(" strip mined");
} }
#endif #endif
@ -3991,7 +3990,6 @@ void IdealLoopTree::dump_head() {
if (cl->is_pre_loop ()) tty->print(" pre" ); if (cl->is_pre_loop ()) tty->print(" pre" );
if (cl->is_main_loop()) tty->print(" main"); if (cl->is_main_loop()) tty->print(" main");
if (cl->is_post_loop()) tty->print(" post"); if (cl->is_post_loop()) tty->print(" post");
if (cl->is_reduction_loop()) tty->print(" reduction");
if (cl->is_vectorized_loop()) tty->print(" vector"); if (cl->is_vectorized_loop()) tty->print(" vector");
if (range_checks_present()) tty->print(" rc "); if (range_checks_present()) tty->print(" rc ");
if (cl->is_multiversioned()) tty->print(" multi "); if (cl->is_multiversioned()) tty->print(" multi ");

View File

@ -61,23 +61,22 @@ protected:
uint _loop_flags; uint _loop_flags;
// Names for flag bitfields // Names for flag bitfields
enum { Normal=0, Pre=1, Main=2, Post=3, PreMainPostFlagsMask=3, enum { Normal=0, Pre=1, Main=2, Post=3, PreMainPostFlagsMask=3,
MainHasNoPreLoop = 1<<2, MainHasNoPreLoop = 1<<2,
HasExactTripCount = 1<<3, HasExactTripCount = 1<<3,
InnerLoop = 1<<4, InnerLoop = 1<<4,
PartialPeelLoop = 1<<5, PartialPeelLoop = 1<<5,
PartialPeelFailed = 1<<6, PartialPeelFailed = 1<<6,
HasReductions = 1<<7, WasSlpAnalyzed = 1<<7,
WasSlpAnalyzed = 1<<8, PassedSlpAnalysis = 1<<8,
PassedSlpAnalysis = 1<<9, DoUnrollOnly = 1<<9,
DoUnrollOnly = 1<<10, VectorizedLoop = 1<<10,
VectorizedLoop = 1<<11, HasAtomicPostLoop = 1<<11,
HasAtomicPostLoop = 1<<12, IsMultiversioned = 1<<12,
IsMultiversioned = 1<<13, StripMined = 1<<13,
StripMined = 1<<14, SubwordLoop = 1<<14,
SubwordLoop = 1<<15, ProfileTripFailed = 1<<15,
ProfileTripFailed = 1<<16, LoopNestInnerLoop = 1<<16,
LoopNestInnerLoop = 1 << 17, LoopNestLongOuterLoop = 1<<17};
LoopNestLongOuterLoop = 1 << 18};
char _unswitch_count; char _unswitch_count;
enum { _unswitch_max=3 }; enum { _unswitch_max=3 };
char _postloop_flags; char _postloop_flags;
@ -105,7 +104,6 @@ public:
bool is_loop_nest_outer_loop() const { return _loop_flags & LoopNestLongOuterLoop; } bool is_loop_nest_outer_loop() const { return _loop_flags & LoopNestLongOuterLoop; }
void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; } void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
void mark_has_reductions() { _loop_flags |= HasReductions; }
void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; } void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; }
void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; } void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; } void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
@ -286,7 +284,6 @@ public:
bool is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; } bool is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
bool is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; } bool is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
bool is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; } bool is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
bool is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
bool was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; } bool was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; }
bool has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; } bool has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
bool is_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; } bool is_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
@ -1313,9 +1310,6 @@ public:
// Unroll the loop body one step - make each trip do 2 iterations. // Unroll the loop body one step - make each trip do 2 iterations.
void do_unroll( IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip ); void do_unroll( IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip );
// Mark vector reduction candidates before loop unrolling
void mark_reductions( IdealLoopTree *loop );
// Return true if exp is a constant times an induction var // Return true if exp is a constant times an induction var
bool is_scaled_iv(Node* exp, Node* iv, BasicType bt, jlong* p_scale, bool* p_short_scale, int depth = 0); bool is_scaled_iv(Node* exp, Node* iv, BasicType bt, jlong* p_scale, bool* p_short_scale, int depth = 0);

View File

@ -2622,10 +2622,6 @@ void PhaseIdealLoop::clone_loop_body(const Node_List& body, Node_List &old_new,
Node* old = body.at(i); Node* old = body.at(i);
Node* nnn = old->clone(); Node* nnn = old->clone();
old_new.map(old->_idx, nnn); old_new.map(old->_idx, nnn);
if (old->is_reduction()) {
// Reduction flag is not copied by default. Copy it here when cloning the entire loop body.
nnn->add_flag(Node::Flag_is_reduction);
}
if (C->do_vector_loop() && cm != nullptr) { if (C->do_vector_loop() && cm != nullptr) {
cm->verify_insert_and_clone(old, nnn, cm->clone_idx()); cm->verify_insert_and_clone(old, nnn, cm->clone_idx());
} }

View File

@ -521,10 +521,6 @@ Node *Node::clone() const {
// If it is applicable, it will happen anyway when the cloned node is registered with IGVN. // If it is applicable, it will happen anyway when the cloned node is registered with IGVN.
n->remove_flag(Node::NodeFlags::Flag_for_post_loop_opts_igvn); n->remove_flag(Node::NodeFlags::Flag_for_post_loop_opts_igvn);
} }
if (n->is_reduction()) {
// Do not copy reduction information. This must be explicitly set by the calling code.
n->remove_flag(Node::Flag_is_reduction);
}
BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2(); BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2();
bs->register_potential_barrier_node(n); bs->register_potential_barrier_node(n);

View File

@ -578,6 +578,12 @@ public:
_in[i2] = n1; _in[i2] = n1;
// If this node is in the hash table, make sure it doesn't need a rehash. // If this node is in the hash table, make sure it doesn't need a rehash.
assert(check_hash == NO_HASH || check_hash == hash(), "edge swap must preserve hash code"); assert(check_hash == NO_HASH || check_hash == hash(), "edge swap must preserve hash code");
// Flip swapped edges flag.
if (has_swapped_edges()) {
remove_flag(Node::Flag_has_swapped_edges);
} else {
add_flag(Node::Flag_has_swapped_edges);
}
} }
// Iterators over input Nodes for a Node X are written as: // Iterators over input Nodes for a Node X are written as:
@ -784,7 +790,7 @@ public:
Flag_avoid_back_to_back_before = 1 << 8, Flag_avoid_back_to_back_before = 1 << 8,
Flag_avoid_back_to_back_after = 1 << 9, Flag_avoid_back_to_back_after = 1 << 9,
Flag_has_call = 1 << 10, Flag_has_call = 1 << 10,
Flag_is_reduction = 1 << 11, Flag_has_swapped_edges = 1 << 11,
Flag_is_scheduled = 1 << 12, Flag_is_scheduled = 1 << 12,
Flag_is_expensive = 1 << 13, Flag_is_expensive = 1 << 13,
Flag_is_predicated_vector = 1 << 14, Flag_is_predicated_vector = 1 << 14,
@ -1001,10 +1007,8 @@ public:
bool is_macro() const { return (_flags & Flag_is_macro) != 0; } bool is_macro() const { return (_flags & Flag_is_macro) != 0; }
// The node is expensive: the best control is set during loop opts // The node is expensive: the best control is set during loop opts
bool is_expensive() const { return (_flags & Flag_is_expensive) != 0 && in(0) != nullptr; } bool is_expensive() const { return (_flags & Flag_is_expensive) != 0 && in(0) != nullptr; }
// The node's input edges are swapped relative to their original positions.
// An arithmetic node which accumulates a data in a loop. bool has_swapped_edges() const { return (_flags & Flag_has_swapped_edges) != 0; }
// It must have the loop's phi as input and provide a def to the phi.
bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
bool is_predicated_vector() const { return (_flags & Flag_is_predicated_vector) != 0; } bool is_predicated_vector() const { return (_flags & Flag_is_predicated_vector) != 0; }

View File

@ -72,6 +72,7 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
_lpt(nullptr), // loop tree node _lpt(nullptr), // loop tree node
_lp(nullptr), // CountedLoopNode _lp(nullptr), // CountedLoopNode
_pre_loop_end(nullptr), // Pre loop CountedLoopEndNode _pre_loop_end(nullptr), // Pre loop CountedLoopEndNode
_loop_reductions(arena()), // reduction nodes in the current loop
_bb(nullptr), // basic block _bb(nullptr), // basic block
_iv(nullptr), // induction var _iv(nullptr), // induction var
_race_possible(false), // cases where SDMU is true _race_possible(false), // cases where SDMU is true
@ -111,7 +112,17 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
return false; // skip malformed counted loop return false; // skip malformed counted loop
} }
if (cl->is_rce_post_loop() && cl->is_reduction_loop()) { // Initialize simple data used by reduction marking early.
set_lpt(lpt);
set_lp(cl);
// For now, define one block which is the entire loop body.
set_bb(cl);
if (SuperWordReductions) {
mark_reductions();
}
if (cl->is_rce_post_loop() && is_marked_reduction_loop()) {
// Post loop vectorization doesn't support reductions // Post loop vectorization doesn't support reductions
return false; return false;
} }
@ -167,18 +178,12 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
init(); // initialize data structures init(); // initialize data structures
set_lpt(lpt);
set_lp(cl);
// For now, define one block which is the entire loop body
set_bb(cl);
bool success = true; bool success = true;
if (do_optimization) { if (do_optimization) {
assert(_packset.length() == 0, "packset must be empty"); assert(_packset.length() == 0, "packset must be empty");
success = SLP_extract(); success = SLP_extract();
if (PostLoopMultiversioning) { if (PostLoopMultiversioning) {
if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) { if (cl->is_vectorized_loop() && cl->is_main_loop() && !is_marked_reduction_loop()) {
IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next; IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next;
CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop(); CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
// Main loop SLP works well for manually unrolled loops. But post loop // Main loop SLP works well for manually unrolled loops. But post loop
@ -223,7 +228,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
for (uint i = 0; i < lpt()->_body.size(); i++) { for (uint i = 0; i < lpt()->_body.size(); i++) {
Node* n = lpt()->_body.at(i); Node* n = lpt()->_body.at(i);
if (n == cl->incr() || if (n == cl->incr() ||
n->is_reduction() || is_marked_reduction(n) ||
n->is_AddP() || n->is_AddP() ||
n->is_Cmp() || n->is_Cmp() ||
n->is_Bool() || n->is_Bool() ||
@ -411,6 +416,139 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
} }
} }
bool SuperWord::is_reduction(const Node* n) {
if (!is_reduction_operator(n)) {
return false;
}
// Test whether there is a reduction cycle via every edge index
// (typically indices 1 and 2).
for (uint input = 1; input < n->req(); input++) {
if (in_reduction_cycle(n, input)) {
return true;
}
}
return false;
}
// Returns true if ReductionNode::opcode() maps the opcode of 'n', for the
// basic type of n's bottom type, to an opcode different from n's own.
// NOTE(review): presumably ReductionNode::opcode() returns its input opcode
// unchanged when no reduction counterpart exists - confirm against its
// definition.
bool SuperWord::is_reduction_operator(const Node* n) {
  const int scalar_opc = n->Opcode();
  const int reduction_opc = ReductionNode::opcode(scalar_opc, n->bottom_type()->basic_type());
  return reduction_opc != scalar_opc;
}
bool SuperWord::in_reduction_cycle(const Node* n, uint input) {
// First find input reduction path to phi node.
auto has_my_opcode = [&](const Node* m){ return m->Opcode() == n->Opcode(); };
PathEnd path_to_phi = find_in_path(n, input, LoopMaxUnroll, has_my_opcode,
[&](const Node* m) { return m->is_Phi(); });
const Node* phi = path_to_phi.first;
if (phi == nullptr) {
return false;
}
// If there is an input reduction path from the phi's loop-back to n, then n
// is part of a reduction cycle.
const Node* first = phi->in(LoopNode::LoopBackControl);
PathEnd path_from_phi = find_in_path(first, input, LoopMaxUnroll, has_my_opcode,
[&](const Node* m) { return m == n; });
return path_from_phi.first != nullptr;
}
// Returns the input of 'n' at index 'i', undoing an edge swap if one is
// recorded on the node: when n->has_swapped_edges() is set, indices 1 and 2
// are exchanged; every other index is returned as is.
Node* SuperWord::original_input(const Node* n, uint i) {
  uint idx = i;
  if (n->has_swapped_edges()) {
    assert(n->is_Add() || n->is_Mul(), "n should be commutative");
    switch (i) {
      case 1:
        idx = 2;
        break;
      case 2:
        idx = 1;
        break;
      default:
        break;
    }
  }
  return n->in(idx);
}
// Recomputes the set of reduction nodes (_loop_reductions) of the current
// loop. The set is cleared first. Then, for every live phi on the loop head
// other than the induction variable, the reduction cycle that starts at the
// phi's loop-back input and ends in the phi is searched for; if one is found
// and none of its nodes has another user in the basic block, the indices of
// all nodes on the cycle path are added to the set.
void SuperWord::mark_reductions() {
  _loop_reductions.clear();

  // Iterate through all phi nodes associated to the loop and search for
  // reduction cycles in the basic block.
  for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
    const Node* phi = lp()->fast_out(i);
    // Skip non-phis, dead phis, and the induction variable.
    if (!phi->is_Phi() || phi->outcnt() == 0 || phi == iv()) {
      continue;
    }

    // The phi's loop-back is considered the first node in the reduction
    // cycle; it has to fit the standard pattern for a reduction operator.
    const Node* first = phi->in(LoopNode::LoopBackControl);
    if (first == nullptr || !is_reduction_operator(first)) {
      continue;
    }

    // Test that 'first' is the beginning of a reduction cycle ending in 'phi'.
    // To contain the number of searched paths, assume that all nodes in a
    // reduction cycle are connected via the same edge index, modulo swapped
    // inputs. This assumption is realistic because reduction cycles usually
    // consist of nodes cloned by loop unrolling.
    int reduction_input = -1;
    int path_nodes = -1;
    for (uint input = 1; reduction_input == -1 && input < first->req(); input++) {
      // Look for a path inside the basic block from 'first' to the phi that
      // follows edge index 'input' through nodes with the opcode of 'first'.
      PathEnd path =
        find_in_path(
          first, input, lpt()->_body.size(),
          [&](const Node* n) { return n->Opcode() == first->Opcode() && in_bb(n); },
          [&](const Node* n) { return n == phi; });
      if (path.first != nullptr) {
        // The first edge index that yields a path wins; this ends the search.
        reduction_input = input;
        path_nodes = path.second;
      }
    }
    if (reduction_input == -1) {
      continue;
    }

    // Test that reduction nodes do not have any users in the loop besides
    // their reduction cycle successors.
    const Node* current = first;
    const Node* succ = phi; // current's successor in the reduction cycle.
    bool used_in_loop = false;
    for (int step = 0; step < path_nodes && !used_in_loop; step++) {
      for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) {
        Node* u = current->fast_out(j);
        if (in_bb(u) && u != succ) {
          used_in_loop = true;
          break;
        }
      }
      if (!used_in_loop) {
        // Move one node further along the cycle, undoing any edge swap.
        succ = current;
        current = original_input(current, reduction_input);
      }
    }
    if (used_in_loop) {
      continue;
    }

    // Reduction cycle found. Mark all nodes in the found path as reductions.
    current = first;
    for (int step = 0; step < path_nodes; step++) {
      _loop_reductions.set(current->_idx);
      current = original_input(current, reduction_input);
    }
  }
}
//------------------------------SLP_extract--------------------------- //------------------------------SLP_extract---------------------------
// Extract the superword level parallelism // Extract the superword level parallelism
// //
@ -1378,7 +1516,7 @@ bool SuperWord::independent(Node* s1, Node* s2) {
// those nodes, and have not found another node from the pack, we know // those nodes, and have not found another node from the pack, we know
// that all nodes in the pack are independent. // that all nodes in the pack are independent.
Node* SuperWord::find_dependence(Node_List* p) { Node* SuperWord::find_dependence(Node_List* p) {
if (p->at(0)->is_reduction()) { if (is_marked_reduction(p->at(0))) {
return nullptr; // ignore reductions return nullptr; // ignore reductions
} }
ResourceMark rm; ResourceMark rm;
@ -1436,7 +1574,7 @@ bool SuperWord::reduction(Node* s1, Node* s2) {
int d1 = depth(s1); int d1 = depth(s1);
int d2 = depth(s2); int d2 = depth(s2);
if (d2 > d1) { if (d2 > d1) {
if (s1->is_reduction() && s2->is_reduction()) { if (is_marked_reduction(s1) && is_marked_reduction(s2)) {
// This is an ordered set, so s1 should define s2 // This is an ordered set, so s1 should define s2
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
Node* t1 = s1->fast_out(i); Node* t1 = s1->fast_out(i);
@ -1653,7 +1791,7 @@ void SuperWord::order_def_uses(Node_List* p) {
if (s1->is_Store()) return; if (s1->is_Store()) return;
// reductions are always managed beforehand // reductions are always managed beforehand
if (s1->is_reduction()) return; if (is_marked_reduction(s1)) return;
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
Node* t1 = s1->fast_out(i); Node* t1 = s1->fast_out(i);
@ -1689,15 +1827,15 @@ void SuperWord::order_def_uses(Node_List* p) {
bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) { bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
// check reductions to see if they are marshalled to represent the reduction // check reductions to see if they are marshalled to represent the reduction
// operator in a specified opnd // operator in a specified opnd
if (u1->is_reduction() && u2->is_reduction()) { if (is_marked_reduction(u1) && is_marked_reduction(u2)) {
// ensure reductions have phis and reduction definitions feeding the 1st operand // ensure reductions have phis and reduction definitions feeding the 1st operand
Node* first = u1->in(2); Node* first = u1->in(2);
if (first->is_Phi() || first->is_reduction()) { if (first->is_Phi() || is_marked_reduction(first)) {
u1->swap_edges(1, 2); u1->swap_edges(1, 2);
} }
// ensure reductions have phis and reduction definitions feeding the 1st operand // ensure reductions have phis and reduction definitions feeding the 1st operand
first = u2->in(2); first = u2->in(2);
if (first->is_Phi() || first->is_reduction()) { if (first->is_Phi() || is_marked_reduction(first)) {
u2->swap_edges(1, 2); u2->swap_edges(1, 2);
} }
return true; return true;
@ -1920,7 +2058,7 @@ void SuperWord::filter_packs() {
remove_pack_at(i); remove_pack_at(i);
} }
Node *n = pk->at(0); Node *n = pk->at(0);
if (n->is_reduction()) { if (is_marked_reduction(n)) {
_num_reductions++; _num_reductions++;
} else { } else {
_num_work_vecs++; _num_work_vecs++;
@ -2171,7 +2309,7 @@ bool SuperWord::implemented(Node_List* p) {
if (p0 != nullptr) { if (p0 != nullptr) {
int opc = p0->Opcode(); int opc = p0->Opcode();
uint size = p->size(); uint size = p->size();
if (p0->is_reduction()) { if (is_marked_reduction(p0)) {
const Type *arith_type = p0->bottom_type(); const Type *arith_type = p0->bottom_type();
// Length 2 reductions of INT/LONG do not offer performance benefits // Length 2 reductions of INT/LONG do not offer performance benefits
if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) { if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
@ -2261,13 +2399,13 @@ bool SuperWord::profitable(Node_List* p) {
} }
} }
// Check if reductions are connected // Check if reductions are connected
if (p0->is_reduction()) { if (is_marked_reduction(p0)) {
Node* second_in = p0->in(2); Node* second_in = p0->in(2);
Node_List* second_pk = my_pack(second_in); Node_List* second_pk = my_pack(second_in);
if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) { if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) {
// Remove reduction flag if no parent pack or if not enough work // Unmark reduction if no parent pack or if not enough work
// to cover reduction expansion overhead // to cover reduction expansion overhead
p0->remove_flag(Node::Flag_is_reduction); _loop_reductions.remove(p0->_idx);
return false; return false;
} else if (second_pk->size() != p->size()) { } else if (second_pk->size() != p->size()) {
return false; return false;
@ -2299,7 +2437,7 @@ bool SuperWord::profitable(Node_List* p) {
if (def == n) { if (def == n) {
// Reductions should only have a Phi use at the loop head or a non-phi use // Reductions should only have a Phi use at the loop head or a non-phi use
// outside of the loop if it is the last element of the pack (e.g. SafePoint). // outside of the loop if it is the last element of the pack (e.g. SafePoint).
if (def->is_reduction() && if (is_marked_reduction(def) &&
((use->is_Phi() && use->in(0) == _lpt->_head) || ((use->is_Phi() && use->in(0) == _lpt->_head) ||
(!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) { (!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) {
continue; continue;
@ -2442,7 +2580,7 @@ public:
for (DepPreds preds(n, dg); !preds.done(); preds.next()) { for (DepPreds preds(n, dg); !preds.done(); preds.next()) {
Node* pred = preds.current(); Node* pred = preds.current();
int pred_pid = get_pid_or_zero(pred); int pred_pid = get_pid_or_zero(pred);
if (pred_pid == pid && n->is_reduction()) { if (pred_pid == pid && _slp->is_marked_reduction(n)) {
continue; // reduction -> self-cycle is not a cyclic dependency continue; // reduction -> self-cycle is not a cyclic dependency
} }
// Only add edges once, and only for mapped nodes (in block) // Only add edges once, and only for mapped nodes (in block)
@ -2992,7 +3130,7 @@ bool SuperWord::output() {
} else if (n->req() == 3 && !is_cmov_pack(p)) { } else if (n->req() == 3 && !is_cmov_pack(p)) {
// Promote operands to vector // Promote operands to vector
Node* in1 = nullptr; Node* in1 = nullptr;
bool node_isa_reduction = n->is_reduction(); bool node_isa_reduction = is_marked_reduction(n);
if (node_isa_reduction) { if (node_isa_reduction) {
// the input to the first reduction operation is retained // the input to the first reduction operation is retained
in1 = low_adr->in(1); in1 = low_adr->in(1);
@ -3246,7 +3384,7 @@ bool SuperWord::output() {
Node* SuperWord::create_post_loop_vmask() { Node* SuperWord::create_post_loop_vmask() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
assert(cl->is_rce_post_loop(), "Must be an rce post loop"); assert(cl->is_rce_post_loop(), "Must be an rce post loop");
assert(!cl->is_reduction_loop(), "no vector reduction in post loop"); assert(!is_marked_reduction_loop(), "no vector reduction in post loop");
assert(abs(cl->stride_con()) == 1, "post loop stride can only be +/-1"); assert(abs(cl->stride_con()) == 1, "post loop stride can only be +/-1");
// Collect vector element types of all post loop packs. Also collect // Collect vector element types of all post loop packs. Also collect
@ -3524,7 +3662,7 @@ void SuperWord::insert_extracts(Node_List* p) {
_n_idx_list.pop(); _n_idx_list.pop();
Node* def = use->in(idx); Node* def = use->in(idx);
if (def->is_reduction()) continue; if (is_marked_reduction(def)) continue;
// Insert extract operation // Insert extract operation
_igvn.hash_delete(def); _igvn.hash_delete(def);
@ -3547,7 +3685,7 @@ void SuperWord::insert_extracts(Node_List* p) {
bool SuperWord::is_vector_use(Node* use, int u_idx) { bool SuperWord::is_vector_use(Node* use, int u_idx) {
Node_List* u_pk = my_pack(use); Node_List* u_pk = my_pack(use);
if (u_pk == nullptr) return false; if (u_pk == nullptr) return false;
if (use->is_reduction()) return true; if (is_marked_reduction(use)) return true;
Node* def = use->in(u_idx); Node* def = use->in(u_idx);
Node_List* d_pk = my_pack(def); Node_List* d_pk = my_pack(def);
if (d_pk == nullptr) { if (d_pk == nullptr) {
@ -3708,7 +3846,7 @@ bool SuperWord::construct_bb() {
if (in_bb(use) && !visited_test(use) && if (in_bb(use) && !visited_test(use) &&
// Don't go around backedge // Don't go around backedge
(!use->is_Phi() || n == entry)) { (!use->is_Phi() || n == entry)) {
if (use->is_reduction()) { if (is_marked_reduction(use)) {
// First see if we can map the reduction on the given system we are on, then // First see if we can map the reduction on the given system we are on, then
// make a data entry operation for each reduction we see. // make a data entry operation for each reduction we see.
BasicType bt = use->bottom_type()->basic_type(); BasicType bt = use->bottom_type()->basic_type();
@ -4345,10 +4483,6 @@ void SuperWord::init() {
_iteration_last.clear(); _iteration_last.clear();
_node_info.clear(); _node_info.clear();
_align_to_ref = nullptr; _align_to_ref = nullptr;
_lpt = nullptr;
_lp = nullptr;
_bb = nullptr;
_iv = nullptr;
_race_possible = 0; _race_possible = 0;
_early_return = false; _early_return = false;
_num_work_vecs = 0; _num_work_vecs = 0;

View File

@ -29,6 +29,7 @@
#include "opto/phaseX.hpp" #include "opto/phaseX.hpp"
#include "opto/vectornode.hpp" #include "opto/vectornode.hpp"
#include "utilities/growableArray.hpp" #include "utilities/growableArray.hpp"
#include "utilities/pair.hpp"
#include "libadt/dict.hpp" #include "libadt/dict.hpp"
// //
@ -357,6 +358,7 @@ class SuperWord : public ResourceObj {
IdealLoopTree* _lpt; // Current loop tree node IdealLoopTree* _lpt; // Current loop tree node
CountedLoopNode* _lp; // Current CountedLoopNode CountedLoopNode* _lp; // Current CountedLoopNode
CountedLoopEndNode* _pre_loop_end; // Current CountedLoopEndNode of pre loop CountedLoopEndNode* _pre_loop_end; // Current CountedLoopEndNode of pre loop
VectorSet _loop_reductions; // Reduction nodes in the current loop
Node* _bb; // Current basic block Node* _bb; // Current basic block
PhiNode* _iv; // Induction var PhiNode* _iv; // Induction var
bool _race_possible; // In cases where SDMU is true bool _race_possible; // In cases where SDMU is true
@ -471,6 +473,62 @@ class SuperWord : public ResourceObj {
// methods // methods
typedef const Pair<const Node*, int> PathEnd;
// Search for a path P = (n_1, n_2, ..., n_k) such that:
// - original_input(n_i, input) = n_i+1 for all 1 <= i < k,
// - path(n) for all n in P,
// - k <= max, and
// - there exists a node e such that original_input(n_k, input) = e and end(e).
// Return <e, k>, if P is found, or <nullptr, -1> otherwise.
// Note that original_input(n, i) has the same behavior as n->in(i) except
// that it commutes the inputs of binary nodes whose edges have been swapped.
// Follows the 'input' edge (as seen through original_input()) starting at n1.
// At each visited node, in this order:
//   - a null node aborts the search,
//   - a node accepted by 'end' terminates it successfully,
//   - a node rejected by 'path' aborts it.
// At most max + 1 nodes are inspected. On success the result pairs the
// accepted end node with the number of path nodes walked before reaching it;
// on failure the result is <nullptr, -1>.
template <typename NodePredicate1, typename NodePredicate2>
static PathEnd find_in_path(const Node *n1, uint input, int max,
                            NodePredicate1 path, NodePredicate2 end) {
  const Node* cursor = n1;
  // 'steps' is both the loop bound counter and the reported path length.
  for (int steps = 0; steps <= max; steps++) {
    if (cursor == nullptr) {
      return PathEnd(nullptr, -1);
    }
    if (end(cursor)) {
      return PathEnd(cursor, steps);
    }
    if (!path(cursor)) {
      return PathEnd(nullptr, -1);
    }
    cursor = original_input(cursor, input);
  }
  // Search bound exhausted without reaching an end node.
  return PathEnd(nullptr, -1);
}
public:
// Whether n is a reduction operator and part of a reduction cycle.
// This function can be used for individual queries outside the SLP analysis,
// e.g. to inform matching in target-specific code. Otherwise, the
// almost-equivalent but faster SuperWord::mark_reductions() is preferable.
static bool is_reduction(const Node* n);
// Whether n is marked as a reduction node, i.e. whether n's index is
// currently a member of _loop_reductions.
bool is_marked_reduction(Node* n) {
  return _loop_reductions.test(n->_idx);
}
// Whether the current loop has any reduction node, i.e. whether at least one
// node index is currently recorded in _loop_reductions.
bool is_marked_reduction_loop() {
  if (_loop_reductions.is_empty()) {
    return false;
  }
  return true;
}
private:
// Whether n is a standard reduction operator.
static bool is_reduction_operator(const Node* n);
// Whether n is part of a reduction cycle via the 'input' edge index. To bound
// the search, constrain the size of reduction cycles to LoopMaxUnroll.
static bool in_reduction_cycle(const Node* n, uint input);
// Reference to the i'th input node of n, commuting the inputs of binary nodes
// whose edges have been swapped. Assumes n is a commutative operation.
static Node* original_input(const Node* n, uint i);
// Find and mark reductions in a loop. Running mark_reductions() is similar to
// querying is_reduction(n) for every n in the SuperWord loop, but stricter in
// that it assumes counted loops and requires that reduction nodes are not
// used within the loop except by their reduction cycle predecessors.
void mark_reductions();
// Extract the superword level parallelism // Extract the superword level parallelism
bool SLP_extract(); bool SLP_extract();
// Find the adjacent memory references and create pack pairs for them. // Find the adjacent memory references and create pack pairs for them.

View File

@ -1,5 +1,6 @@
/* /*
* Copyright (c) 2022, Red Hat, Inc. All rights reserved. * Copyright (c) 2022, Red Hat, Inc. All rights reserved.
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -44,9 +45,9 @@ public class TestSuperwordFailsUnrolling {
public static void main(String[] args) { public static void main(String[] args) {
Object avx = wb.getVMFlag("UseAVX"); Object avx = wb.getVMFlag("UseAVX");
if (avx != null && ((Long)avx) > 2) { if (avx != null && ((Long)avx) > 2) {
TestFramework.runWithFlags("-XX:UseAVX=2", "-XX:LoopMaxUnroll=8"); TestFramework.runWithFlags("-XX:UseAVX=2", "-XX:LoopMaxUnroll=8", "-XX:-SuperWordReductions");
} }
TestFramework.runWithFlags("-XX:LoopMaxUnroll=8"); TestFramework.runWithFlags("-XX:LoopMaxUnroll=8", "-XX:-SuperWordReductions");
} }
@Test @Test

View File

@ -0,0 +1,170 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8287087
* @summary Test that floating-point min/max x64 operations are implemented
* differently depending on whether they are part of a reduction. These
* tests complement those in TestFpMinMaxIntrinsics, which focus more
* on correctness aspects.
* @library /test/lib /
* @requires os.simpleArch == "x64"& (vm.opt.UseAVX == "null" | vm.opt.UseAVX > 0)
* @run driver compiler.intrinsics.math.TestFpMinMaxReductions
*/
package compiler.intrinsics.math;
import compiler.lib.ir_framework.*;
// IR tests checking which machine instruction is selected for Math.min/Math.max
// on float and double: the plain "*_REG" form for stand-alone calls and the
// "*_REDUCTION_REG" form when the call accumulates into a loop-carried value.
// Only the @IR rules are checked; no test asserts on the computed values.
public class TestFpMinMaxReductions {
// Accumulator kept in a static field (used by the "GlobalAccumulator" test).
private static float acc;
// Scalar operands for the non-reduction tests; never assigned in this class.
private static float floatInput1;
private static float floatInput2;
// Input data for the float reduction loops; never written in this class.
private static float[] floatArray = new float[1000];
private static double doubleInput1;
private static double doubleInput2;
// Input data for the double reduction loops; never written in this class.
private static double[] doubleArray = new double[1000];
// Loop increment read from a mutable static field instead of a literal.
private static int stride = 1;
public static void main(String[] args) throws Exception {
TestFramework.run();
}
// Stand-alone float min: exactly one MIN_F_REG and no MIN_F_REDUCTION_REG.
@Test
@IR(counts = {IRNode.MIN_F_REG, "1"},
failOn = {IRNode.MIN_F_REDUCTION_REG})
private static float testFloatMin() {
return Math.min(floatInput1, floatInput2);
}
// Stand-alone float max: exactly one MAX_F_REG and no MAX_F_REDUCTION_REG.
@Test
@IR(counts = {IRNode.MAX_F_REG, "1"},
failOn = {IRNode.MAX_F_REDUCTION_REG})
private static float testFloatMax() {
return Math.max(floatInput1, floatInput2);
}
// Basic min reduction over the whole array: at least one MIN_F_REDUCTION_REG.
@Test
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
private static float testFloatMinReduction() {
float fmin = Float.POSITIVE_INFINITY;
for (int i = 0; i < floatArray.length; i++) {
fmin = Math.min(fmin, floatArray[i]);
}
return fmin;
}
// Min reduction whose loop body is already unrolled by hand (two chained
// Math.min calls per iteration).
@Test
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
private static float testFloatMinReductionPartiallyUnrolled() {
float fmin = Float.POSITIVE_INFINITY;
for (int i = 0; i < floatArray.length / 2; i++) {
fmin = Math.min(fmin, floatArray[2*i]);
fmin = Math.min(fmin, floatArray[2*i + 1]);
}
return fmin;
}
// Min reduction in a loop that advances by the static field 'stride' rather
// than by a constant (the "non-counted" case named by the method).
@Test
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
private static float testFloatMinReductionNonCounted() {
float fmin = Float.POSITIVE_INFINITY;
for (int i = 0; i < floatArray.length; i += stride) {
fmin = Math.min(fmin, floatArray[i]);
}
return fmin;
}
// Min reduction that accumulates into the static field 'acc' instead of a
// local variable.
@Test
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
private static float testFloatMinReductionGlobalAccumulator() {
acc = Float.POSITIVE_INFINITY;
for (int i = 0; i < floatArray.length; i++) {
acc = Math.min(acc, floatArray[i]);
}
return acc;
}
// Min reduction carried by an outer loop that also contains an inner loop
// stepping by 'stride'; 'count' is returned as well so the inner loop has an
// observable effect.
@Test
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
private static float testFloatMinReductionInOuterLoop() {
float fmin = Float.POSITIVE_INFINITY;
int count = 0;
for (int i = 0; i < floatArray.length; i++) {
fmin = Math.min(fmin, floatArray[i]);
for (int j = 0; j < 10; j += stride) {
count++;
}
}
return fmin + count;
}
// Basic max reduction over the whole array: at least one MAX_F_REDUCTION_REG.
@Test
@IR(counts = {IRNode.MAX_F_REDUCTION_REG, ">= 1"})
private static float testFloatMaxReduction() {
float fmax = Float.NEGATIVE_INFINITY;
for (int i = 0; i < floatArray.length; i++) {
fmax = Math.max(fmax, floatArray[i]);
}
return fmax;
}
// Stand-alone double min: exactly one MIN_D_REG and no MIN_D_REDUCTION_REG.
@Test
@IR(counts = {IRNode.MIN_D_REG, "1"},
failOn = {IRNode.MIN_D_REDUCTION_REG})
private static double testDoubleMin() {
return Math.min(doubleInput1, doubleInput2);
}
// Stand-alone double max: exactly one MAX_D_REG and no MAX_D_REDUCTION_REG.
@Test
@IR(counts = {IRNode.MAX_D_REG, "1"},
failOn = {IRNode.MAX_D_REDUCTION_REG})
private static double testDoubleMax() {
return Math.max(doubleInput1, doubleInput2);
}
// Basic double min reduction: at least one MIN_D_REDUCTION_REG.
@Test
@IR(counts = {IRNode.MIN_D_REDUCTION_REG, ">= 1"})
private static double testDoubleMinReduction() {
double fmin = Double.POSITIVE_INFINITY;
for (int i = 0; i < doubleArray.length; i++) {
fmin = Math.min(fmin, doubleArray[i]);
}
return fmin;
}
// Basic double max reduction: at least one MAX_D_REDUCTION_REG.
@Test
@IR(counts = {IRNode.MAX_D_REDUCTION_REG, ">= 1"})
private static double testDoubleMaxReduction() {
double fmax = Double.NEGATIVE_INFINITY;
for (int i = 0; i < doubleArray.length; i++) {
fmax = Math.max(fmax, doubleArray[i]);
}
return fmax;
}
}

View File

@ -649,6 +649,26 @@ public class IRNode {
beforeMatchingNameRegex(MAX, "Max(I|L)"); beforeMatchingNameRegex(MAX, "Max(I|L)");
} }
// IR placeholders for the floating-point max machine nodes. Each one is
// registered through machOnlyNameRegex with the node name it should match
// (presumably only in the post-matching graph, per the helper's name — confirm
// against machOnlyNameRegex).

// Double max, reduction variant ("maxD_reduction_reg").
public static final String MAX_D_REDUCTION_REG = PREFIX + "MAX_D_REDUCTION_REG" + POSTFIX;
static {
machOnlyNameRegex(MAX_D_REDUCTION_REG, "maxD_reduction_reg");
}
// Double max, non-reduction variant ("maxD_reg").
public static final String MAX_D_REG = PREFIX + "MAX_D_REG" + POSTFIX;
static {
machOnlyNameRegex(MAX_D_REG, "maxD_reg");
}
// Float max, reduction variant ("maxF_reduction_reg").
public static final String MAX_F_REDUCTION_REG = PREFIX + "MAX_F_REDUCTION_REG" + POSTFIX;
static {
machOnlyNameRegex(MAX_F_REDUCTION_REG, "maxF_reduction_reg");
}
// Float max, non-reduction variant ("maxF_reg").
public static final String MAX_F_REG = PREFIX + "MAX_F_REG" + POSTFIX;
static {
machOnlyNameRegex(MAX_F_REG, "maxF_reg");
}
public static final String MAX_I = PREFIX + "MAX_I" + POSTFIX; public static final String MAX_I = PREFIX + "MAX_I" + POSTFIX;
static { static {
beforeMatchingNameRegex(MAX_I, "MaxI"); beforeMatchingNameRegex(MAX_I, "MaxI");
@ -679,6 +699,26 @@ public class IRNode {
beforeMatchingNameRegex(MIN, "Min(I|L)"); beforeMatchingNameRegex(MIN, "Min(I|L)");
} }
// IR placeholders for the floating-point min machine nodes. Each one is
// registered through machOnlyNameRegex with the node name it should match
// (presumably only in the post-matching graph, per the helper's name — confirm
// against machOnlyNameRegex).

// Double min, reduction variant ("minD_reduction_reg").
public static final String MIN_D_REDUCTION_REG = PREFIX + "MIN_D_REDUCTION_REG" + POSTFIX;
static {
machOnlyNameRegex(MIN_D_REDUCTION_REG, "minD_reduction_reg");
}
// Double min, non-reduction variant ("minD_reg").
public static final String MIN_D_REG = PREFIX + "MIN_D_REG" + POSTFIX;
static {
machOnlyNameRegex(MIN_D_REG, "minD_reg");
}
// Float min, reduction variant ("minF_reduction_reg").
public static final String MIN_F_REDUCTION_REG = PREFIX + "MIN_F_REDUCTION_REG" + POSTFIX;
static {
machOnlyNameRegex(MIN_F_REDUCTION_REG, "minF_reduction_reg");
}
// Float min, non-reduction variant ("minF_reg").
public static final String MIN_F_REG = PREFIX + "MIN_F_REG" + POSTFIX;
static {
machOnlyNameRegex(MIN_F_REG, "minF_reg");
}
public static final String MIN_I = PREFIX + "MIN_I" + POSTFIX; public static final String MIN_I = PREFIX + "MIN_I" + POSTFIX;
static { static {
beforeMatchingNameRegex(MIN_I, "MinI"); beforeMatchingNameRegex(MIN_I, "MinI");

View File

@ -0,0 +1,154 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8287087
* @summary Test reduction vectorizations that are enabled by performing SLP
* reduction analysis on unrolled loops.
* @library /test/lib /
* @requires vm.bits == 64
* @run driver compiler.loopopts.superword.TestGeneralizedReductions
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
import jdk.test.lib.Asserts;
// IR and result tests for add-reductions written in shapes other than the
// plain "local accumulator, one update per iteration" loop: a static-field
// accumulator, hand-unrolled loop bodies, and a map-then-reduce (bitCount).
public class TestGeneralizedReductions {
// Static int accumulator used by the "GlobalAccumulator" tests.
private static int acc = 0;
public static void main(String[] args) throws Exception {
// Fix maximum number of unrolls for test stability.
TestFramework.runWithFlags("-XX:LoopMaxUnroll=16");
}
// Custom runner for all tests: re-initializes the input to 0..127 before
// each call and checks the returned value.
//   - 8128 is the sum of 0..127 (127 * 128 / 2).
//   - 448 is the total number of set bits in 0..127 (7 bit positions, each
//     set in 64 of the 128 values).
@Run(test = {"testReductionOnGlobalAccumulator",
"testReductionOnPartiallyUnrolledLoop",
"testReductionOnLargePartiallyUnrolledLoop",
"testReductionOnPartiallyUnrolledLoopWithSwappedInputs",
"testMapReductionOnGlobalAccumulator"})
void run() {
long[] array = new long[128];
long result;
initArray(array);
result = testReductionOnGlobalAccumulator(array);
Asserts.assertEQ(result, 8128L, "unexpected result");
initArray(array);
result = testReductionOnPartiallyUnrolledLoop(array);
Asserts.assertEQ(result, 8128L, "unexpected result");
initArray(array);
result = testReductionOnLargePartiallyUnrolledLoop(array);
Asserts.assertEQ(result, 8128L, "unexpected result");
initArray(array);
result = testReductionOnPartiallyUnrolledLoopWithSwappedInputs(array);
Asserts.assertEQ(result, 8128L, "unexpected result");
initArray(array);
result = testMapReductionOnGlobalAccumulator(array);
Asserts.assertEQ(result, 448L, "unexpected result");
}
// Fills array with its own indices: array[i] == i.
private static void initArray(long[] array) {
for (int i = 0; i < array.length; i++) {
array[i] = i;
}
}
// Sum into the static int field 'acc'. The compound assignment narrows the
// long sum back to int on every iteration. Expects at least one
// ADD_REDUCTION_VI when avx2 and SuperWordReductions are enabled.
@Test
@IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"},
counts = {IRNode.ADD_REDUCTION_VI, ">= 1"})
private static long testReductionOnGlobalAccumulator(long[] array) {
acc = 0;
for (int i = 0; i < array.length; i++) {
acc += array[i];
}
return acc;
}
// Sum with a loop body unrolled by hand twice (two chained updates of 'sum'
// per iteration).
@Test
@IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"},
counts = {IRNode.ADD_REDUCTION_VI, ">= 1"})
private static long testReductionOnPartiallyUnrolledLoop(long[] array) {
int sum = 0;
for (int i = 0; i < array.length / 2; i++) {
sum += array[2*i];
sum += array[2*i + 1];
}
return sum;
}
// Same as above with a loop body unrolled by hand eight times.
@Test
@IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"},
counts = {IRNode.ADD_REDUCTION_VI, ">= 1"})
private static long testReductionOnLargePartiallyUnrolledLoop(long[] array) {
int sum = 0;
for (int i = 0; i < array.length / 8; i++) {
sum += array[8*i];
sum += array[8*i + 1];
sum += array[8*i + 2];
sum += array[8*i + 3];
sum += array[8*i + 4];
sum += array[8*i + 5];
sum += array[8*i + 6];
sum += array[8*i + 7];
}
return sum;
}
// This test illustrates a limitation of the current reduction analysis: it
// fails to detect reduction cycles where nodes are connected via different
// input indices (except if the differences result from C2 edge swapping).
// If this limitation is overcome in the future, the test case should be
// turned into a positive one.
// Here 'sum' is the left operand of the first addition and the right operand
// of the second one; the rule therefore requires that no ADD_REDUCTION_VI is
// generated (the numeric result is still checked by run()).
@Test
@IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"},
failOn = {IRNode.ADD_REDUCTION_VI})
private static long testReductionOnPartiallyUnrolledLoopWithSwappedInputs(long[] array) {
int sum = 0;
for (int i = 0; i < array.length / 2; i++) {
sum = sum + (int)array[2*i];
sum = (int)array[2*i + 1] + sum;
}
return sum;
}
// Map-reduce: each element is mapped through Long.bitCount before being
// added to the static accumulator. Additionally requires
// UsePopCountInstruction and expects at least one POPCOUNT_VL alongside the
// ADD_REDUCTION_VI.
@Test
@IR(applyIfCPUFeature = {"avx2", "true"},
applyIfAnd = {"SuperWordReductions", "true","UsePopCountInstruction", "true"},
counts = {IRNode.ADD_REDUCTION_VI, ">= 1",
IRNode.POPCOUNT_VL, ">= 1"})
private static long testMapReductionOnGlobalAccumulator(long[] array) {
acc = 0;
for (int i = 0; i < array.length; i++) {
acc += Long.bitCount(array[i]);
}
return acc;
}
}

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -44,6 +44,9 @@ public class FpMinMaxIntrinsics {
private Random r = new Random(); private Random r = new Random();
private static int stride = 1;
private static float acc;
@Setup @Setup
public void init() { public void init() {
c1 = s1 = step(); c1 = s1 = step();
@ -127,4 +130,44 @@ public class FpMinMaxIntrinsics {
return result; return result;
} }
// Float min reduction with a loop body unrolled by hand twice: two chained
// Math.min updates of 'result' per iteration, over COUNT / 2 iterations.
@Benchmark
public float fMinReducePartiallyUnrolled() {
float result = Float.MAX_VALUE;
for (int i = 0; i < COUNT / 2; i++) {
result = Math.min(result, floats[2*i]);
result = Math.min(result, floats[2*i + 1]);
}
return result;
}
// Float min reduction in a loop that advances by the static field 'stride'
// instead of a constant increment (the "non-counted" case named by the method).
@Benchmark
public float fMinReduceNonCounted() {
float result = Float.MAX_VALUE;
for (int i = 0; i < COUNT; i += stride)
result = Math.min(result, floats[i]);
return result;
}
// Float min reduction that accumulates into the static field 'acc' rather
// than a local variable. Like fMinReduceNonCounted, the loop advances by the
// static field 'stride'.
@Benchmark
public float fMinReduceGlobalAccumulator() {
acc = Float.MAX_VALUE;
for (int i = 0; i < COUNT; i += stride)
acc = Math.min(acc, floats[i]);
return acc;
}
// Float min reduction carried by an outer loop whose body also contains an
// inner loop stepping by 'stride'. 'count' is added to the returned value so
// the inner loop's work is part of the benchmark result.
@Benchmark
public float fMinReduceInOuterLoop() {
float result = Float.MAX_VALUE;
int count = 0;
for (int i = 0; i < COUNT; i++) {
result = Math.min(result, floats[i]);
for (int j = 0; j < 10; j += stride) {
count++;
}
}
return result + count;
}
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -42,17 +42,17 @@ public abstract class VectorReduction {
private int[] intsB; private int[] intsB;
private int[] intsC; private int[] intsC;
private int[] intsD; private int[] intsD;
private int resI;
private long[] longsA; private long[] longsA;
private long[] longsB; private long[] longsB;
private long[] longsC; private long[] longsC;
private long[] longsD; private long[] longsD;
private long resL;
@Param("0") @Param("0")
private int seed; private int seed;
private Random r = new Random(seed); private Random r = new Random(seed);
private static int globalResI;
@Setup @Setup
public void init() { public void init() {
intsA = new int[COUNT]; intsA = new int[COUNT];
@ -75,51 +75,86 @@ public abstract class VectorReduction {
} }
@Benchmark @Benchmark
public void andRedI() { public void andRedI(Blackhole bh) {
int resI = 0xFFFF;
for (int i = 0; i < COUNT; i++) { for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
resI &= intsD[i]; resI &= intsD[i];
} }
bh.consume(resI);
} }
@Benchmark @Benchmark
public void orRedI() { public void orRedI(Blackhole bh) {
int resI = 0x0000;
for (int i = 0; i < COUNT; i++) { for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
resI |= intsD[i]; resI |= intsD[i];
} }
bh.consume(resI);
} }
@Benchmark @Benchmark
public void xorRedI() { public void xorRedI(Blackhole bh) {
int resI = 0x0000;
for (int i = 0; i < COUNT; i++) { for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
resI ^= intsD[i]; resI ^= intsD[i];
} }
bh.consume(resI);
} }
@Benchmark @Benchmark
public void andRedL() { public void andRedL(Blackhole bh) {
long resL = 0xFFFFFFFF;
for (int i = 0; i < COUNT; i++) { for (int i = 0; i < COUNT; i++) {
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]); longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
resL &= longsD[i]; resL &= longsD[i];
} }
bh.consume(resL);
} }
@Benchmark @Benchmark
public void orRedL() { public void orRedL(Blackhole bh) {
long resL = 0x00000000;
for (int i = 0; i < COUNT; i++) { for (int i = 0; i < COUNT; i++) {
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]); longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
resL |= longsD[i]; resL |= longsD[i];
} }
bh.consume(resL);
} }
@Benchmark @Benchmark
public void xorRedL() { public void xorRedL(Blackhole bh) {
long resL = 0x00000000;
for (int i = 0; i < COUNT; i++) { for (int i = 0; i < COUNT; i++) {
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]); longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
resL ^= longsD[i]; resL ^= longsD[i];
} }
bh.consume(resL);
}
// AND-reduction over intsD with a loop body unrolled by hand twice: each
// iteration computes and stores two elements of intsD (indices 2*i and
// 2*i + 1) and folds both into the local accumulator, which is handed to the
// Blackhole at the end.
// NOTE(review): the accumulator starts at 0xFFFF, so only the low 16 bits can
// survive the AND; this looks intentional/harmless for a benchmark — confirm.
@Benchmark
public void andRedIPartiallyUnrolled(Blackhole bh) {
int resI = 0xFFFF;
for (int i = 0; i < COUNT / 2; i++) {
int j = 2*i;
intsD[j] = (intsA[j] * intsB[j]) + (intsA[j] * intsC[j]) + (intsB[j] * intsC[j]);
resI &= intsD[j];
j = 2*i + 1;
intsD[j] = (intsA[j] * intsB[j]) + (intsA[j] * intsC[j]) + (intsB[j] * intsC[j]);
resI &= intsD[j];
}
bh.consume(resI);
}
@Benchmark
public void andRedIOnGlobalAccumulator() {
globalResI = 0xFFFF;
for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
globalResI &= intsD[i];
}
} }
@Fork(value = 2, jvmArgsPrepend = { @Fork(value = 2, jvmArgsPrepend = {