8287087: C2: perform SLP reduction analysis on-demand
Reviewed-by: epeter, jbhateja, thartmann
This commit is contained in:
parent
ba43649530
commit
1be80a4445
@ -5374,7 +5374,7 @@ instruct loadD(regD dst, memory mem)
|
||||
|
||||
// max = java.lang.Math.max(float a, float b)
|
||||
instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
|
||||
predicate(UseAVX > 0 && !n->is_reduction());
|
||||
predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
|
||||
match(Set dst (MaxF a b));
|
||||
effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
|
||||
format %{
|
||||
@ -5396,7 +5396,7 @@ instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp,
|
||||
%}
|
||||
|
||||
instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{
|
||||
predicate(UseAVX > 0 && n->is_reduction());
|
||||
predicate(UseAVX > 0 && SuperWord::is_reduction(n));
|
||||
match(Set dst (MaxF a b));
|
||||
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
|
||||
|
||||
@ -5410,7 +5410,7 @@ instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe
|
||||
|
||||
// max = java.lang.Math.max(double a, double b)
|
||||
instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
|
||||
predicate(UseAVX > 0 && !n->is_reduction());
|
||||
predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
|
||||
match(Set dst (MaxD a b));
|
||||
effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
|
||||
format %{
|
||||
@ -5432,7 +5432,7 @@ instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
|
||||
%}
|
||||
|
||||
instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{
|
||||
predicate(UseAVX > 0 && n->is_reduction());
|
||||
predicate(UseAVX > 0 && SuperWord::is_reduction(n));
|
||||
match(Set dst (MaxD a b));
|
||||
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
|
||||
|
||||
@ -5446,7 +5446,7 @@ instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRe
|
||||
|
||||
// min = java.lang.Math.min(float a, float b)
|
||||
instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
|
||||
predicate(UseAVX > 0 && !n->is_reduction());
|
||||
predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
|
||||
match(Set dst (MinF a b));
|
||||
effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
|
||||
format %{
|
||||
@ -5468,7 +5468,7 @@ instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp,
|
||||
%}
|
||||
|
||||
instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{
|
||||
predicate(UseAVX > 0 && n->is_reduction());
|
||||
predicate(UseAVX > 0 && SuperWord::is_reduction(n));
|
||||
match(Set dst (MinF a b));
|
||||
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
|
||||
|
||||
@ -5482,7 +5482,7 @@ instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe
|
||||
|
||||
// min = java.lang.Math.min(double a, double b)
|
||||
instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
|
||||
predicate(UseAVX > 0 && !n->is_reduction());
|
||||
predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
|
||||
match(Set dst (MinD a b));
|
||||
effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
|
||||
format %{
|
||||
@ -5504,7 +5504,7 @@ instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
|
||||
%}
|
||||
|
||||
instruct minD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{
|
||||
predicate(UseAVX > 0 && n->is_reduction());
|
||||
predicate(UseAVX > 0 && SuperWord::is_reduction(n));
|
||||
match(Set dst (MinD a b));
|
||||
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -271,6 +271,7 @@ int main(int argc, char *argv[])
|
||||
AD.addInclude(AD._DFA_file, "opto/narrowptrnode.hpp");
|
||||
AD.addInclude(AD._DFA_file, "opto/opcodes.hpp");
|
||||
AD.addInclude(AD._DFA_file, "opto/convertnode.hpp");
|
||||
AD.addInclude(AD._DFA_file, "opto/superword.hpp");
|
||||
AD.addInclude(AD._DFA_file, "utilities/powerOfTwo.hpp");
|
||||
|
||||
// Make sure each .cpp file starts with include lines:
|
||||
|
@ -462,8 +462,8 @@ void IdealGraphPrinter::visit_node(Node *n, bool edges, VectorSet* temp_set) {
|
||||
if (flags & Node::Flag_has_call) {
|
||||
print_prop("has_call", "true");
|
||||
}
|
||||
if (flags & Node::Flag_is_reduction) {
|
||||
print_prop("is_reduction", "true");
|
||||
if (flags & Node::Flag_has_swapped_edges) {
|
||||
print_prop("has_swapped_edges", "true");
|
||||
}
|
||||
|
||||
if (C->matcher() != nullptr) {
|
||||
|
@ -1037,10 +1037,6 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
}
|
||||
|
||||
if (UseSuperWord) {
|
||||
if (!cl->is_reduction_loop()) {
|
||||
phase->mark_reductions(this);
|
||||
}
|
||||
|
||||
// Only attempt slp analysis when user controls do not prohibit it
|
||||
if (!range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) {
|
||||
// Once policy_slp_analysis succeeds, mark the loop with the
|
||||
@ -1694,15 +1690,6 @@ void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_n
|
||||
set_idom(new_pre_exit, pre_end, dd_main_head);
|
||||
set_loop(new_pre_exit, outer_loop->_parent);
|
||||
|
||||
if (peel_only) {
|
||||
// Nodes in the peeled iteration that were marked as reductions within the
|
||||
// original loop might not be reductions within their new outer loop.
|
||||
for (uint i = 0; i < loop->_body.size(); i++) {
|
||||
Node* n = old_new[loop->_body[i]->_idx];
|
||||
n->remove_flag(Node::Flag_is_reduction);
|
||||
}
|
||||
}
|
||||
|
||||
// Step B2: Build a zero-trip guard for the main-loop. After leaving the
|
||||
// pre-loop, the main-loop may not execute at all. Later in life this
|
||||
// zero-trip guard will become the minimum-trip guard when we unroll
|
||||
@ -2456,69 +2443,6 @@ void PhaseIdealLoop::do_maximally_unroll(IdealLoopTree *loop, Node_List &old_new
|
||||
}
|
||||
}
|
||||
|
||||
// Flag scalar reduction candidates in a counted loop before it gets unrolled.
// A node is flagged (Node::Flag_is_reduction) when it is the loop-back input
// of a non-trip-count phi of the loop head, lives in the loop, has a reduction
// opcode (or is a Min/Max on float/double), takes that phi as a direct input,
// and has no user inside the loop other than the phi. The loop head is marked
// as having reductions whenever at least one node is flagged.
// Does nothing if SuperWordReductions is off or the loop was already unrolled.
void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) {
  if (!SuperWordReductions) {
    return;
  }

  CountedLoopNode* loop_head = loop->_head->as_CountedLoop();
  if (loop_head->unrolled_count() > 1) {
    return;
  }

  Node* trip_phi = loop_head->phi();
  for (DUIterator_Fast imax, i = loop_head->fast_outs(imax); i < imax; i++) {
    Node* phi = loop_head->fast_out(i);
    // Only phis that have users and are not the trip counter are of interest.
    if (!phi->is_Phi() || phi->outcnt() == 0 || phi == trip_phi) {
      continue;
    }

    // The candidate is the value flowing into the phi along the back edge.
    Node* def_node = phi->in(LoopNode::LoopBackControl);
    if (def_node == nullptr) {
      continue;
    }

    // The candidate must be controlled from within this loop.
    Node* n_ctrl = get_ctrl(def_node);
    if (n_ctrl == nullptr || !loop->is_member(get_loop(n_ctrl))) {
      continue;
    }

    // Check that the candidate fits the standard pattern for a reduction
    // operator; floating-point Min/Max are accepted explicitly.
    const int opc = def_node->Opcode();
    const bool maps_to_reduction =
        opc != ReductionNode::opcode(opc, def_node->bottom_type()->basic_type());
    const bool is_fp_min_max =
        opc == Op_MinD || opc == Op_MinF || opc == Op_MaxD || opc == Op_MaxF;
    if (!maps_to_reduction && !is_fp_min_max) {
      continue;
    }

    // Nothing to do for nodes that are marked already.
    if (def_node->is_reduction()) {
      continue;
    }

    // The arithmetic node must consume the phi it defines.
    bool reads_phi = false;
    for (unsigned j = 1; j < def_node->req() && !reads_phi; j++) {
      reads_phi = (def_node->in(j) == phi);
    }
    if (!reads_phi) {
      continue;
    }

    // Inside the loop, the phi must be the only user of the reduction result.
    bool ok = true;
    for (DUIterator_Fast umax, ui = def_node->fast_outs(umax); ui < umax && ok; ui++) {
      Node* u = def_node->fast_out(ui);
      const bool used_in_loop = loop->is_member(get_loop(ctrl_or_self(u)));
      ok = !used_in_loop || u == phi;
    }

    if (ok) {
      def_node->add_flag(Node::Flag_is_reduction);
      loop_head->mark_has_reductions();
    }
  }
}
|
||||
|
||||
//------------------------------adjust_limit-----------------------------------
|
||||
// Helper function that computes new loop limit as (rc_limit-offset)/scale
|
||||
Node* PhaseIdealLoop::adjust_limit(bool is_positive_stride, Node* scale, Node* offset, Node* rc_limit, Node* old_limit, Node* pre_ctrl, bool round) {
|
||||
|
@ -2249,7 +2249,6 @@ void CountedLoopNode::dump_spec(outputStream *st) const {
|
||||
if (is_pre_loop ()) st->print("pre of N%d" , _main_idx);
|
||||
if (is_main_loop()) st->print("main of N%d", _idx);
|
||||
if (is_post_loop()) st->print("post of N%d", _main_idx);
|
||||
if (is_reduction_loop()) st->print(" reduction");
|
||||
if (is_strip_mined()) st->print(" strip mined");
|
||||
}
|
||||
#endif
|
||||
@ -3991,7 +3990,6 @@ void IdealLoopTree::dump_head() {
|
||||
if (cl->is_pre_loop ()) tty->print(" pre" );
|
||||
if (cl->is_main_loop()) tty->print(" main");
|
||||
if (cl->is_post_loop()) tty->print(" post");
|
||||
if (cl->is_reduction_loop()) tty->print(" reduction");
|
||||
if (cl->is_vectorized_loop()) tty->print(" vector");
|
||||
if (range_checks_present()) tty->print(" rc ");
|
||||
if (cl->is_multiversioned()) tty->print(" multi ");
|
||||
|
@ -61,23 +61,22 @@ protected:
|
||||
uint _loop_flags;
|
||||
// Names for flag bitfields
|
||||
enum { Normal=0, Pre=1, Main=2, Post=3, PreMainPostFlagsMask=3,
|
||||
MainHasNoPreLoop = 1<<2,
|
||||
HasExactTripCount = 1<<3,
|
||||
InnerLoop = 1<<4,
|
||||
PartialPeelLoop = 1<<5,
|
||||
PartialPeelFailed = 1<<6,
|
||||
HasReductions = 1<<7,
|
||||
WasSlpAnalyzed = 1<<8,
|
||||
PassedSlpAnalysis = 1<<9,
|
||||
DoUnrollOnly = 1<<10,
|
||||
VectorizedLoop = 1<<11,
|
||||
HasAtomicPostLoop = 1<<12,
|
||||
IsMultiversioned = 1<<13,
|
||||
StripMined = 1<<14,
|
||||
SubwordLoop = 1<<15,
|
||||
ProfileTripFailed = 1<<16,
|
||||
LoopNestInnerLoop = 1 << 17,
|
||||
LoopNestLongOuterLoop = 1 << 18};
|
||||
MainHasNoPreLoop = 1<<2,
|
||||
HasExactTripCount = 1<<3,
|
||||
InnerLoop = 1<<4,
|
||||
PartialPeelLoop = 1<<5,
|
||||
PartialPeelFailed = 1<<6,
|
||||
WasSlpAnalyzed = 1<<7,
|
||||
PassedSlpAnalysis = 1<<8,
|
||||
DoUnrollOnly = 1<<9,
|
||||
VectorizedLoop = 1<<10,
|
||||
HasAtomicPostLoop = 1<<11,
|
||||
IsMultiversioned = 1<<12,
|
||||
StripMined = 1<<13,
|
||||
SubwordLoop = 1<<14,
|
||||
ProfileTripFailed = 1<<15,
|
||||
LoopNestInnerLoop = 1<<16,
|
||||
LoopNestLongOuterLoop = 1<<17};
|
||||
char _unswitch_count;
|
||||
enum { _unswitch_max=3 };
|
||||
char _postloop_flags;
|
||||
@ -105,7 +104,6 @@ public:
|
||||
bool is_loop_nest_outer_loop() const { return _loop_flags & LoopNestLongOuterLoop; }
|
||||
|
||||
void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
|
||||
void mark_has_reductions() { _loop_flags |= HasReductions; }
|
||||
void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; }
|
||||
void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
|
||||
void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
|
||||
@ -286,7 +284,6 @@ public:
|
||||
bool is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
|
||||
bool is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
|
||||
bool is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
|
||||
bool is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
|
||||
bool was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; }
|
||||
bool has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
|
||||
bool is_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
|
||||
@ -1313,9 +1310,6 @@ public:
|
||||
// Unroll the loop body one step - make each trip do 2 iterations.
|
||||
void do_unroll( IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip );
|
||||
|
||||
// Mark vector reduction candidates before loop unrolling
|
||||
void mark_reductions( IdealLoopTree *loop );
|
||||
|
||||
// Return true if exp is a constant times an induction var
|
||||
bool is_scaled_iv(Node* exp, Node* iv, BasicType bt, jlong* p_scale, bool* p_short_scale, int depth = 0);
|
||||
|
||||
|
@ -2622,10 +2622,6 @@ void PhaseIdealLoop::clone_loop_body(const Node_List& body, Node_List &old_new,
|
||||
Node* old = body.at(i);
|
||||
Node* nnn = old->clone();
|
||||
old_new.map(old->_idx, nnn);
|
||||
if (old->is_reduction()) {
|
||||
// Reduction flag is not copied by default. Copy it here when cloning the entire loop body.
|
||||
nnn->add_flag(Node::Flag_is_reduction);
|
||||
}
|
||||
if (C->do_vector_loop() && cm != nullptr) {
|
||||
cm->verify_insert_and_clone(old, nnn, cm->clone_idx());
|
||||
}
|
||||
|
@ -521,10 +521,6 @@ Node *Node::clone() const {
|
||||
// If it is applicable, it will happen anyway when the cloned node is registered with IGVN.
|
||||
n->remove_flag(Node::NodeFlags::Flag_for_post_loop_opts_igvn);
|
||||
}
|
||||
if (n->is_reduction()) {
|
||||
// Do not copy reduction information. This must be explicitly set by the calling code.
|
||||
n->remove_flag(Node::Flag_is_reduction);
|
||||
}
|
||||
BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2();
|
||||
bs->register_potential_barrier_node(n);
|
||||
|
||||
|
@ -578,6 +578,12 @@ public:
|
||||
_in[i2] = n1;
|
||||
// If this node is in the hash table, make sure it doesn't need a rehash.
|
||||
assert(check_hash == NO_HASH || check_hash == hash(), "edge swap must preserve hash code");
|
||||
// Flip swapped edges flag.
|
||||
if (has_swapped_edges()) {
|
||||
remove_flag(Node::Flag_has_swapped_edges);
|
||||
} else {
|
||||
add_flag(Node::Flag_has_swapped_edges);
|
||||
}
|
||||
}
|
||||
|
||||
// Iterators over input Nodes for a Node X are written as:
|
||||
@ -784,7 +790,7 @@ public:
|
||||
Flag_avoid_back_to_back_before = 1 << 8,
|
||||
Flag_avoid_back_to_back_after = 1 << 9,
|
||||
Flag_has_call = 1 << 10,
|
||||
Flag_is_reduction = 1 << 11,
|
||||
Flag_has_swapped_edges = 1 << 11,
|
||||
Flag_is_scheduled = 1 << 12,
|
||||
Flag_is_expensive = 1 << 13,
|
||||
Flag_is_predicated_vector = 1 << 14,
|
||||
@ -1001,10 +1007,8 @@ public:
|
||||
bool is_macro() const { return (_flags & Flag_is_macro) != 0; }
|
||||
// The node is expensive: the best control is set during loop opts
|
||||
bool is_expensive() const { return (_flags & Flag_is_expensive) != 0 && in(0) != nullptr; }
|
||||
|
||||
// An arithmetic node which accumulates a data in a loop.
|
||||
// It must have the loop's phi as input and provide a def to the phi.
|
||||
bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
|
||||
// The node's original edge position is swapped.
|
||||
bool has_swapped_edges() const { return (_flags & Flag_has_swapped_edges) != 0; }
|
||||
|
||||
bool is_predicated_vector() const { return (_flags & Flag_is_predicated_vector) != 0; }
|
||||
|
||||
|
@ -72,6 +72,7 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
|
||||
_lpt(nullptr), // loop tree node
|
||||
_lp(nullptr), // CountedLoopNode
|
||||
_pre_loop_end(nullptr), // Pre loop CountedLoopEndNode
|
||||
_loop_reductions(arena()), // reduction nodes in the current loop
|
||||
_bb(nullptr), // basic block
|
||||
_iv(nullptr), // induction var
|
||||
_race_possible(false), // cases where SDMU is true
|
||||
@ -111,7 +112,17 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
|
||||
return false; // skip malformed counted loop
|
||||
}
|
||||
|
||||
if (cl->is_rce_post_loop() && cl->is_reduction_loop()) {
|
||||
// Initialize simple data used by reduction marking early.
|
||||
set_lpt(lpt);
|
||||
set_lp(cl);
|
||||
// For now, define one block which is the entire loop body.
|
||||
set_bb(cl);
|
||||
|
||||
if (SuperWordReductions) {
|
||||
mark_reductions();
|
||||
}
|
||||
|
||||
if (cl->is_rce_post_loop() && is_marked_reduction_loop()) {
|
||||
// Post loop vectorization doesn't support reductions
|
||||
return false;
|
||||
}
|
||||
@ -167,18 +178,12 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
|
||||
|
||||
init(); // initialize data structures
|
||||
|
||||
set_lpt(lpt);
|
||||
set_lp(cl);
|
||||
|
||||
// For now, define one block which is the entire loop body
|
||||
set_bb(cl);
|
||||
|
||||
bool success = true;
|
||||
if (do_optimization) {
|
||||
assert(_packset.length() == 0, "packset must be empty");
|
||||
success = SLP_extract();
|
||||
if (PostLoopMultiversioning) {
|
||||
if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
|
||||
if (cl->is_vectorized_loop() && cl->is_main_loop() && !is_marked_reduction_loop()) {
|
||||
IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next;
|
||||
CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
|
||||
// Main loop SLP works well for manually unrolled loops. But post loop
|
||||
@ -223,7 +228,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
for (uint i = 0; i < lpt()->_body.size(); i++) {
|
||||
Node* n = lpt()->_body.at(i);
|
||||
if (n == cl->incr() ||
|
||||
n->is_reduction() ||
|
||||
is_marked_reduction(n) ||
|
||||
n->is_AddP() ||
|
||||
n->is_Cmp() ||
|
||||
n->is_Bool() ||
|
||||
@ -411,6 +416,139 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
}
|
||||
}
|
||||
|
||||
bool SuperWord::is_reduction(const Node* n) {
|
||||
if (!is_reduction_operator(n)) {
|
||||
return false;
|
||||
}
|
||||
// Test whether there is a reduction cycle via every edge index
|
||||
// (typically indices 1 and 2).
|
||||
for (uint input = 1; input < n->req(); input++) {
|
||||
if (in_reduction_cycle(n, input)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool SuperWord::is_reduction_operator(const Node* n) {
|
||||
int opc = n->Opcode();
|
||||
return (opc != ReductionNode::opcode(opc, n->bottom_type()->basic_type()));
|
||||
}
|
||||
|
||||
bool SuperWord::in_reduction_cycle(const Node* n, uint input) {
|
||||
// First find input reduction path to phi node.
|
||||
auto has_my_opcode = [&](const Node* m){ return m->Opcode() == n->Opcode(); };
|
||||
PathEnd path_to_phi = find_in_path(n, input, LoopMaxUnroll, has_my_opcode,
|
||||
[&](const Node* m) { return m->is_Phi(); });
|
||||
const Node* phi = path_to_phi.first;
|
||||
if (phi == nullptr) {
|
||||
return false;
|
||||
}
|
||||
// If there is an input reduction path from the phi's loop-back to n, then n
|
||||
// is part of a reduction cycle.
|
||||
const Node* first = phi->in(LoopNode::LoopBackControl);
|
||||
PathEnd path_from_phi = find_in_path(first, input, LoopMaxUnroll, has_my_opcode,
|
||||
[&](const Node* m) { return m == n; });
|
||||
return path_from_phi.first != nullptr;
|
||||
}
|
||||
|
||||
// Returns the i'th input of n as it was before any edge swap: if n carries the
// swapped-edges flag, indices 1 and 2 are exchanged; every other index, and
// every node without the flag, yields n->in(i) unchanged.
Node* SuperWord::original_input(const Node* n, uint i) {
  if (!n->has_swapped_edges()) {
    return n->in(i);
  }
  assert(n->is_Add() || n->is_Mul(), "n should be commutative");
  switch (i) {
    case 1:  return n->in(2);
    case 2:  return n->in(1);
    default: return n->in(i);
  }
}
|
||||
|
||||
void SuperWord::mark_reductions() {
|
||||
|
||||
_loop_reductions.clear();
|
||||
|
||||
// Iterate through all phi nodes associated to the loop and search for
|
||||
// reduction cycles in the basic block.
|
||||
for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
|
||||
const Node* phi = lp()->fast_out(i);
|
||||
if (!phi->is_Phi()) {
|
||||
continue;
|
||||
}
|
||||
if (phi->outcnt() == 0) {
|
||||
continue;
|
||||
}
|
||||
if (phi == iv()) {
|
||||
continue;
|
||||
}
|
||||
// The phi's loop-back is considered the first node in the reduction cycle.
|
||||
const Node* first = phi->in(LoopNode::LoopBackControl);
|
||||
if (first == nullptr) {
|
||||
continue;
|
||||
}
|
||||
// Test that the node fits the standard pattern for a reduction operator.
|
||||
if (!is_reduction_operator(first)) {
|
||||
continue;
|
||||
}
|
||||
// Test that 'first' is the beginning of a reduction cycle ending in 'phi'.
|
||||
// To contain the number of searched paths, assume that all nodes in a
|
||||
// reduction cycle are connected via the same edge index, modulo swapped
|
||||
// inputs. This assumption is realistic because reduction cycles usually
|
||||
// consist of nodes cloned by loop unrolling.
|
||||
int reduction_input = -1;
|
||||
int path_nodes = -1;
|
||||
for (uint input = 1; input < first->req(); input++) {
|
||||
// Test whether there is a reduction path in the basic block from 'first'
|
||||
// to the phi node following edge index 'input'.
|
||||
PathEnd path =
|
||||
find_in_path(
|
||||
first, input, lpt()->_body.size(),
|
||||
[&](const Node* n) { return n->Opcode() == first->Opcode() && in_bb(n); },
|
||||
[&](const Node* n) { return n == phi; });
|
||||
if (path.first != nullptr) {
|
||||
reduction_input = input;
|
||||
path_nodes = path.second;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (reduction_input == -1) {
|
||||
continue;
|
||||
}
|
||||
// Test that reduction nodes do not have any users in the loop besides their
|
||||
// reduction cycle successors.
|
||||
const Node* current = first;
|
||||
const Node* succ = phi; // current's successor in the reduction cycle.
|
||||
bool used_in_loop = false;
|
||||
for (int i = 0; i < path_nodes; i++) {
|
||||
for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) {
|
||||
Node* u = current->fast_out(j);
|
||||
if (!in_bb(u)) {
|
||||
continue;
|
||||
}
|
||||
if (u == succ) {
|
||||
continue;
|
||||
}
|
||||
used_in_loop = true;
|
||||
break;
|
||||
}
|
||||
if (used_in_loop) {
|
||||
break;
|
||||
}
|
||||
succ = current;
|
||||
current = original_input(current, reduction_input);
|
||||
}
|
||||
if (used_in_loop) {
|
||||
continue;
|
||||
}
|
||||
// Reduction cycle found. Mark all nodes in the found path as reductions.
|
||||
current = first;
|
||||
for (int i = 0; i < path_nodes; i++) {
|
||||
_loop_reductions.set(current->_idx);
|
||||
current = original_input(current, reduction_input);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------SLP_extract---------------------------
|
||||
// Extract the superword level parallelism
|
||||
//
|
||||
@ -1378,7 +1516,7 @@ bool SuperWord::independent(Node* s1, Node* s2) {
|
||||
// those nodes, and have not found another node from the pack, we know
|
||||
// that all nodes in the pack are independent.
|
||||
Node* SuperWord::find_dependence(Node_List* p) {
|
||||
if (p->at(0)->is_reduction()) {
|
||||
if (is_marked_reduction(p->at(0))) {
|
||||
return nullptr; // ignore reductions
|
||||
}
|
||||
ResourceMark rm;
|
||||
@ -1436,7 +1574,7 @@ bool SuperWord::reduction(Node* s1, Node* s2) {
|
||||
int d1 = depth(s1);
|
||||
int d2 = depth(s2);
|
||||
if (d2 > d1) {
|
||||
if (s1->is_reduction() && s2->is_reduction()) {
|
||||
if (is_marked_reduction(s1) && is_marked_reduction(s2)) {
|
||||
// This is an ordered set, so s1 should define s2
|
||||
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
|
||||
Node* t1 = s1->fast_out(i);
|
||||
@ -1653,7 +1791,7 @@ void SuperWord::order_def_uses(Node_List* p) {
|
||||
if (s1->is_Store()) return;
|
||||
|
||||
// reductions are always managed beforehand
|
||||
if (s1->is_reduction()) return;
|
||||
if (is_marked_reduction(s1)) return;
|
||||
|
||||
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
|
||||
Node* t1 = s1->fast_out(i);
|
||||
@ -1689,15 +1827,15 @@ void SuperWord::order_def_uses(Node_List* p) {
|
||||
bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
|
||||
// check reductions to see if they are marshalled to represent the reduction
|
||||
// operator in a specified opnd
|
||||
if (u1->is_reduction() && u2->is_reduction()) {
|
||||
if (is_marked_reduction(u1) && is_marked_reduction(u2)) {
|
||||
// ensure reductions have phis and reduction definitions feeding the 1st operand
|
||||
Node* first = u1->in(2);
|
||||
if (first->is_Phi() || first->is_reduction()) {
|
||||
if (first->is_Phi() || is_marked_reduction(first)) {
|
||||
u1->swap_edges(1, 2);
|
||||
}
|
||||
// ensure reductions have phis and reduction definitions feeding the 1st operand
|
||||
first = u2->in(2);
|
||||
if (first->is_Phi() || first->is_reduction()) {
|
||||
if (first->is_Phi() || is_marked_reduction(first)) {
|
||||
u2->swap_edges(1, 2);
|
||||
}
|
||||
return true;
|
||||
@ -1920,7 +2058,7 @@ void SuperWord::filter_packs() {
|
||||
remove_pack_at(i);
|
||||
}
|
||||
Node *n = pk->at(0);
|
||||
if (n->is_reduction()) {
|
||||
if (is_marked_reduction(n)) {
|
||||
_num_reductions++;
|
||||
} else {
|
||||
_num_work_vecs++;
|
||||
@ -2171,7 +2309,7 @@ bool SuperWord::implemented(Node_List* p) {
|
||||
if (p0 != nullptr) {
|
||||
int opc = p0->Opcode();
|
||||
uint size = p->size();
|
||||
if (p0->is_reduction()) {
|
||||
if (is_marked_reduction(p0)) {
|
||||
const Type *arith_type = p0->bottom_type();
|
||||
// Length 2 reductions of INT/LONG do not offer performance benefits
|
||||
if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
|
||||
@ -2261,13 +2399,13 @@ bool SuperWord::profitable(Node_List* p) {
|
||||
}
|
||||
}
|
||||
// Check if reductions are connected
|
||||
if (p0->is_reduction()) {
|
||||
if (is_marked_reduction(p0)) {
|
||||
Node* second_in = p0->in(2);
|
||||
Node_List* second_pk = my_pack(second_in);
|
||||
if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) {
|
||||
// Remove reduction flag if no parent pack or if not enough work
|
||||
// Unmark reduction if no parent pack or if not enough work
|
||||
// to cover reduction expansion overhead
|
||||
p0->remove_flag(Node::Flag_is_reduction);
|
||||
_loop_reductions.remove(p0->_idx);
|
||||
return false;
|
||||
} else if (second_pk->size() != p->size()) {
|
||||
return false;
|
||||
@ -2299,7 +2437,7 @@ bool SuperWord::profitable(Node_List* p) {
|
||||
if (def == n) {
|
||||
// Reductions should only have a Phi use at the loop head or a non-phi use
|
||||
// outside of the loop if it is the last element of the pack (e.g. SafePoint).
|
||||
if (def->is_reduction() &&
|
||||
if (is_marked_reduction(def) &&
|
||||
((use->is_Phi() && use->in(0) == _lpt->_head) ||
|
||||
(!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) {
|
||||
continue;
|
||||
@ -2442,7 +2580,7 @@ public:
|
||||
for (DepPreds preds(n, dg); !preds.done(); preds.next()) {
|
||||
Node* pred = preds.current();
|
||||
int pred_pid = get_pid_or_zero(pred);
|
||||
if (pred_pid == pid && n->is_reduction()) {
|
||||
if (pred_pid == pid && _slp->is_marked_reduction(n)) {
|
||||
continue; // reduction -> self-cycle is not a cyclic dependency
|
||||
}
|
||||
// Only add edges once, and only for mapped nodes (in block)
|
||||
@ -2992,7 +3130,7 @@ bool SuperWord::output() {
|
||||
} else if (n->req() == 3 && !is_cmov_pack(p)) {
|
||||
// Promote operands to vector
|
||||
Node* in1 = nullptr;
|
||||
bool node_isa_reduction = n->is_reduction();
|
||||
bool node_isa_reduction = is_marked_reduction(n);
|
||||
if (node_isa_reduction) {
|
||||
// the input to the first reduction operation is retained
|
||||
in1 = low_adr->in(1);
|
||||
@ -3246,7 +3384,7 @@ bool SuperWord::output() {
|
||||
Node* SuperWord::create_post_loop_vmask() {
|
||||
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
|
||||
assert(cl->is_rce_post_loop(), "Must be an rce post loop");
|
||||
assert(!cl->is_reduction_loop(), "no vector reduction in post loop");
|
||||
assert(!is_marked_reduction_loop(), "no vector reduction in post loop");
|
||||
assert(abs(cl->stride_con()) == 1, "post loop stride can only be +/-1");
|
||||
|
||||
// Collect vector element types of all post loop packs. Also collect
|
||||
@ -3524,7 +3662,7 @@ void SuperWord::insert_extracts(Node_List* p) {
|
||||
_n_idx_list.pop();
|
||||
Node* def = use->in(idx);
|
||||
|
||||
if (def->is_reduction()) continue;
|
||||
if (is_marked_reduction(def)) continue;
|
||||
|
||||
// Insert extract operation
|
||||
_igvn.hash_delete(def);
|
||||
@ -3547,7 +3685,7 @@ void SuperWord::insert_extracts(Node_List* p) {
|
||||
bool SuperWord::is_vector_use(Node* use, int u_idx) {
|
||||
Node_List* u_pk = my_pack(use);
|
||||
if (u_pk == nullptr) return false;
|
||||
if (use->is_reduction()) return true;
|
||||
if (is_marked_reduction(use)) return true;
|
||||
Node* def = use->in(u_idx);
|
||||
Node_List* d_pk = my_pack(def);
|
||||
if (d_pk == nullptr) {
|
||||
@ -3708,7 +3846,7 @@ bool SuperWord::construct_bb() {
|
||||
if (in_bb(use) && !visited_test(use) &&
|
||||
// Don't go around backedge
|
||||
(!use->is_Phi() || n == entry)) {
|
||||
if (use->is_reduction()) {
|
||||
if (is_marked_reduction(use)) {
|
||||
// First see if we can map the reduction on the given system we are on, then
|
||||
// make a data entry operation for each reduction we see.
|
||||
BasicType bt = use->bottom_type()->basic_type();
|
||||
@ -4345,10 +4483,6 @@ void SuperWord::init() {
|
||||
_iteration_last.clear();
|
||||
_node_info.clear();
|
||||
_align_to_ref = nullptr;
|
||||
_lpt = nullptr;
|
||||
_lp = nullptr;
|
||||
_bb = nullptr;
|
||||
_iv = nullptr;
|
||||
_race_possible = 0;
|
||||
_early_return = false;
|
||||
_num_work_vecs = 0;
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "opto/phaseX.hpp"
|
||||
#include "opto/vectornode.hpp"
|
||||
#include "utilities/growableArray.hpp"
|
||||
#include "utilities/pair.hpp"
|
||||
#include "libadt/dict.hpp"
|
||||
|
||||
//
|
||||
@ -357,6 +358,7 @@ class SuperWord : public ResourceObj {
|
||||
IdealLoopTree* _lpt; // Current loop tree node
|
||||
CountedLoopNode* _lp; // Current CountedLoopNode
|
||||
CountedLoopEndNode* _pre_loop_end; // Current CountedLoopEndNode of pre loop
|
||||
VectorSet _loop_reductions; // Reduction nodes in the current loop
|
||||
Node* _bb; // Current basic block
|
||||
PhiNode* _iv; // Induction var
|
||||
bool _race_possible; // In cases where SDMU is true
|
||||
@ -471,6 +473,62 @@ class SuperWord : public ResourceObj {
|
||||
|
||||
// methods
|
||||
|
||||
typedef const Pair<const Node*, int> PathEnd;
|
||||
|
||||
// Search for a path P = (n_1, n_2, ..., n_k) such that:
|
||||
// - original_input(n_i, input) = n_i+1 for all 1 <= i < k,
|
||||
// - path(n) for all n in P,
|
||||
// - k <= max, and
|
||||
// - there exists a node e such that original_input(n_k, input) = e and end(e).
|
||||
// Return <e, k>, if P is found, or <nullptr, -1> otherwise.
|
||||
// Note that original_input(n, i) has the same behavior as n->in(i) except
|
||||
// that it commutes the inputs of binary nodes whose edges have been swapped.
|
||||
template <typename NodePredicate1, typename NodePredicate2>
|
||||
static PathEnd find_in_path(const Node *n1, uint input, int max,
|
||||
NodePredicate1 path, NodePredicate2 end) {
|
||||
const PathEnd no_path(nullptr, -1);
|
||||
const Node* current = n1;
|
||||
int k = 0;
|
||||
for (int i = 0; i <= max; i++) {
|
||||
if (current == nullptr) {
|
||||
return no_path;
|
||||
}
|
||||
if (end(current)) {
|
||||
return PathEnd(current, k);
|
||||
}
|
||||
if (!path(current)) {
|
||||
return no_path;
|
||||
}
|
||||
current = original_input(current, input);
|
||||
k++;
|
||||
}
|
||||
return no_path;
|
||||
}
|
||||
|
||||
public:
|
||||
// Whether n is a reduction operator and part of a reduction cycle.
|
||||
// This function can be used for individual queries outside the SLP analysis,
|
||||
// e.g. to inform matching in target-specific code. Otherwise, the
|
||||
// almost-equivalent but faster SuperWord::mark_reductions() is preferable.
|
||||
static bool is_reduction(const Node* n);
|
||||
// Whether n is marked as a reduction node.
|
||||
bool is_marked_reduction(Node* n) { return _loop_reductions.test(n->_idx); }
|
||||
// Whether the current loop has any reduction node.
|
||||
bool is_marked_reduction_loop() { return !_loop_reductions.is_empty(); }
|
||||
private:
|
||||
// Whether n is a standard reduction operator.
|
||||
static bool is_reduction_operator(const Node* n);
|
||||
// Whether n is part of a reduction cycle via the 'input' edge index. To bound
|
||||
// the search, constrain the size of reduction cycles to LoopMaxUnroll.
|
||||
static bool in_reduction_cycle(const Node* n, uint input);
|
||||
// Reference to the i'th input node of n, commuting the inputs of binary nodes
|
||||
// whose edges have been swapped. Assumes n is a commutative operation.
|
||||
static Node* original_input(const Node* n, uint i);
|
||||
// Find and mark reductions in a loop. Running mark_reductions() is similar to
|
||||
// querying is_reduction(n) for every n in the SuperWord loop, but stricter in
|
||||
// that it assumes counted loops and requires that reduction nodes are not
|
||||
// used within the loop except by their reduction cycle predecessors.
|
||||
void mark_reductions();
|
||||
// Extract the superword level parallelism
|
||||
bool SLP_extract();
|
||||
// Find the adjacent memory references and create pack pairs for them.
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2022, Red Hat, Inc. All rights reserved.
|
||||
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -44,9 +45,9 @@ public class TestSuperwordFailsUnrolling {
|
||||
public static void main(String[] args) {
|
||||
Object avx = wb.getVMFlag("UseAVX");
|
||||
if (avx != null && ((Long)avx) > 2) {
|
||||
TestFramework.runWithFlags("-XX:UseAVX=2", "-XX:LoopMaxUnroll=8");
|
||||
TestFramework.runWithFlags("-XX:UseAVX=2", "-XX:LoopMaxUnroll=8", "-XX:-SuperWordReductions");
|
||||
}
|
||||
TestFramework.runWithFlags("-XX:LoopMaxUnroll=8");
|
||||
TestFramework.runWithFlags("-XX:LoopMaxUnroll=8", "-XX:-SuperWordReductions");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -0,0 +1,170 @@
|
||||
/*
|
||||
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8287087
|
||||
* @summary Test that floating-point min/max x64 operations are implemented
|
||||
* differently depending on whether they are part of a reduction. These
|
||||
* tests complement those in TestFpMinMaxIntrinsics, which focus more
|
||||
* on correctness aspects.
|
||||
* @library /test/lib /
|
||||
* @requires os.simpleArch == "x64"& (vm.opt.UseAVX == "null" | vm.opt.UseAVX > 0)
|
||||
* @run driver compiler.intrinsics.math.TestFpMinMaxReductions
|
||||
*/
|
||||
|
||||
package compiler.intrinsics.math;
|
||||
|
||||
import compiler.lib.ir_framework.*;
|
||||
|
||||
public class TestFpMinMaxReductions {
|
||||
|
||||
private static float acc;
|
||||
private static float floatInput1;
|
||||
private static float floatInput2;
|
||||
private static float[] floatArray = new float[1000];
|
||||
|
||||
private static double doubleInput1;
|
||||
private static double doubleInput2;
|
||||
private static double[] doubleArray = new double[1000];
|
||||
|
||||
private static int stride = 1;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
TestFramework.run();
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MIN_F_REG, "1"},
|
||||
failOn = {IRNode.MIN_F_REDUCTION_REG})
|
||||
private static float testFloatMin() {
|
||||
return Math.min(floatInput1, floatInput2);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MAX_F_REG, "1"},
|
||||
failOn = {IRNode.MAX_F_REDUCTION_REG})
|
||||
private static float testFloatMax() {
|
||||
return Math.max(floatInput1, floatInput2);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
|
||||
private static float testFloatMinReduction() {
|
||||
float fmin = Float.POSITIVE_INFINITY;
|
||||
for (int i = 0; i < floatArray.length; i++) {
|
||||
fmin = Math.min(fmin, floatArray[i]);
|
||||
}
|
||||
return fmin;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
|
||||
private static float testFloatMinReductionPartiallyUnrolled() {
|
||||
float fmin = Float.POSITIVE_INFINITY;
|
||||
for (int i = 0; i < floatArray.length / 2; i++) {
|
||||
fmin = Math.min(fmin, floatArray[2*i]);
|
||||
fmin = Math.min(fmin, floatArray[2*i + 1]);
|
||||
}
|
||||
return fmin;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
|
||||
private static float testFloatMinReductionNonCounted() {
|
||||
float fmin = Float.POSITIVE_INFINITY;
|
||||
for (int i = 0; i < floatArray.length; i += stride) {
|
||||
fmin = Math.min(fmin, floatArray[i]);
|
||||
}
|
||||
return fmin;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
|
||||
private static float testFloatMinReductionGlobalAccumulator() {
|
||||
acc = Float.POSITIVE_INFINITY;
|
||||
for (int i = 0; i < floatArray.length; i++) {
|
||||
acc = Math.min(acc, floatArray[i]);
|
||||
}
|
||||
return acc;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"})
|
||||
private static float testFloatMinReductionInOuterLoop() {
|
||||
float fmin = Float.POSITIVE_INFINITY;
|
||||
int count = 0;
|
||||
for (int i = 0; i < floatArray.length; i++) {
|
||||
fmin = Math.min(fmin, floatArray[i]);
|
||||
for (int j = 0; j < 10; j += stride) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return fmin + count;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MAX_F_REDUCTION_REG, ">= 1"})
|
||||
private static float testFloatMaxReduction() {
|
||||
float fmax = Float.NEGATIVE_INFINITY;
|
||||
for (int i = 0; i < floatArray.length; i++) {
|
||||
fmax = Math.max(fmax, floatArray[i]);
|
||||
}
|
||||
return fmax;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MIN_D_REG, "1"},
|
||||
failOn = {IRNode.MIN_D_REDUCTION_REG})
|
||||
private static double testDoubleMin() {
|
||||
return Math.min(doubleInput1, doubleInput2);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MAX_D_REG, "1"},
|
||||
failOn = {IRNode.MAX_D_REDUCTION_REG})
|
||||
private static double testDoubleMax() {
|
||||
return Math.max(doubleInput1, doubleInput2);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MIN_D_REDUCTION_REG, ">= 1"})
|
||||
private static double testDoubleMinReduction() {
|
||||
double fmin = Double.POSITIVE_INFINITY;
|
||||
for (int i = 0; i < doubleArray.length; i++) {
|
||||
fmin = Math.min(fmin, doubleArray[i]);
|
||||
}
|
||||
return fmin;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MAX_D_REDUCTION_REG, ">= 1"})
|
||||
private static double testDoubleMaxReduction() {
|
||||
double fmax = Double.NEGATIVE_INFINITY;
|
||||
for (int i = 0; i < doubleArray.length; i++) {
|
||||
fmax = Math.max(fmax, doubleArray[i]);
|
||||
}
|
||||
return fmax;
|
||||
}
|
||||
|
||||
}
|
@ -649,6 +649,26 @@ public class IRNode {
|
||||
beforeMatchingNameRegex(MAX, "Max(I|L)");
|
||||
}
|
||||
|
||||
// Placeholder registered against the machine instruction name "maxD_reduction_reg".
// NOTE(review): machOnlyNameRegex presumably limits matching to machine-level (post-matching) output — confirm.
public static final String MAX_D_REDUCTION_REG = PREFIX + "MAX_D_REDUCTION_REG" + POSTFIX;
|
||||
static {
|
||||
machOnlyNameRegex(MAX_D_REDUCTION_REG, "maxD_reduction_reg");
|
||||
}
|
||||
|
||||
// Placeholder registered against the machine instruction name "maxD_reg".
public static final String MAX_D_REG = PREFIX + "MAX_D_REG" + POSTFIX;
|
||||
static {
|
||||
machOnlyNameRegex(MAX_D_REG, "maxD_reg");
|
||||
}
|
||||
|
||||
// Placeholder registered against the machine instruction name "maxF_reduction_reg".
public static final String MAX_F_REDUCTION_REG = PREFIX + "MAX_F_REDUCTION_REG" + POSTFIX;
|
||||
static {
|
||||
machOnlyNameRegex(MAX_F_REDUCTION_REG, "maxF_reduction_reg");
|
||||
}
|
||||
|
||||
// Placeholder registered against the machine instruction name "maxF_reg".
public static final String MAX_F_REG = PREFIX + "MAX_F_REG" + POSTFIX;
|
||||
static {
|
||||
machOnlyNameRegex(MAX_F_REG, "maxF_reg");
|
||||
}
|
||||
|
||||
public static final String MAX_I = PREFIX + "MAX_I" + POSTFIX;
|
||||
static {
|
||||
beforeMatchingNameRegex(MAX_I, "MaxI");
|
||||
@ -679,6 +699,26 @@ public class IRNode {
|
||||
beforeMatchingNameRegex(MIN, "Min(I|L)");
|
||||
}
|
||||
|
||||
// Placeholder registered against the machine instruction name "minD_reduction_reg".
// NOTE(review): machOnlyNameRegex presumably limits matching to machine-level (post-matching) output — confirm.
public static final String MIN_D_REDUCTION_REG = PREFIX + "MIN_D_REDUCTION_REG" + POSTFIX;
|
||||
static {
|
||||
machOnlyNameRegex(MIN_D_REDUCTION_REG, "minD_reduction_reg");
|
||||
}
|
||||
|
||||
// Placeholder registered against the machine instruction name "minD_reg".
public static final String MIN_D_REG = PREFIX + "MIN_D_REG" + POSTFIX;
|
||||
static {
|
||||
machOnlyNameRegex(MIN_D_REG, "minD_reg");
|
||||
}
|
||||
|
||||
// Placeholder registered against the machine instruction name "minF_reduction_reg".
public static final String MIN_F_REDUCTION_REG = PREFIX + "MIN_F_REDUCTION_REG" + POSTFIX;
|
||||
static {
|
||||
machOnlyNameRegex(MIN_F_REDUCTION_REG, "minF_reduction_reg");
|
||||
}
|
||||
|
||||
// Placeholder registered against the machine instruction name "minF_reg".
public static final String MIN_F_REG = PREFIX + "MIN_F_REG" + POSTFIX;
|
||||
static {
|
||||
machOnlyNameRegex(MIN_F_REG, "minF_reg");
|
||||
}
|
||||
|
||||
public static final String MIN_I = PREFIX + "MIN_I" + POSTFIX;
|
||||
static {
|
||||
beforeMatchingNameRegex(MIN_I, "MinI");
|
||||
|
@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8287087
|
||||
* @summary Test reduction vectorizations that are enabled by performing SLP
|
||||
* reduction analysis on unrolled loops.
|
||||
* @library /test/lib /
|
||||
* @requires vm.bits == 64
|
||||
* @run driver compiler.loopopts.superword.TestGeneralizedReductions
|
||||
*/
|
||||
|
||||
package compiler.loopopts.superword;
|
||||
|
||||
import compiler.lib.ir_framework.*;
|
||||
import jdk.test.lib.Asserts;
|
||||
|
||||
public class TestGeneralizedReductions {
|
||||
|
||||
private static int acc = 0;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Fix maximum number of unrolls for test stability.
|
||||
TestFramework.runWithFlags("-XX:LoopMaxUnroll=16");
|
||||
}
|
||||
|
||||
@Run(test = {"testReductionOnGlobalAccumulator",
|
||||
"testReductionOnPartiallyUnrolledLoop",
|
||||
"testReductionOnLargePartiallyUnrolledLoop",
|
||||
"testReductionOnPartiallyUnrolledLoopWithSwappedInputs",
|
||||
"testMapReductionOnGlobalAccumulator"})
|
||||
void run() {
|
||||
long[] array = new long[128];
|
||||
long result;
|
||||
|
||||
initArray(array);
|
||||
result = testReductionOnGlobalAccumulator(array);
|
||||
Asserts.assertEQ(result, 8128L, "unexpected result");
|
||||
|
||||
initArray(array);
|
||||
result = testReductionOnPartiallyUnrolledLoop(array);
|
||||
Asserts.assertEQ(result, 8128L, "unexpected result");
|
||||
|
||||
initArray(array);
|
||||
result = testReductionOnLargePartiallyUnrolledLoop(array);
|
||||
Asserts.assertEQ(result, 8128L, "unexpected result");
|
||||
|
||||
initArray(array);
|
||||
result = testReductionOnPartiallyUnrolledLoopWithSwappedInputs(array);
|
||||
Asserts.assertEQ(result, 8128L, "unexpected result");
|
||||
|
||||
initArray(array);
|
||||
result = testMapReductionOnGlobalAccumulator(array);
|
||||
Asserts.assertEQ(result, 448L, "unexpected result");
|
||||
}
|
||||
|
||||
private static void initArray(long[] array) {
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
array[i] = i;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"},
|
||||
counts = {IRNode.ADD_REDUCTION_VI, ">= 1"})
|
||||
private static long testReductionOnGlobalAccumulator(long[] array) {
|
||||
acc = 0;
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
acc += array[i];
|
||||
}
|
||||
return acc;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"},
|
||||
counts = {IRNode.ADD_REDUCTION_VI, ">= 1"})
|
||||
private static long testReductionOnPartiallyUnrolledLoop(long[] array) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < array.length / 2; i++) {
|
||||
sum += array[2*i];
|
||||
sum += array[2*i + 1];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"},
|
||||
counts = {IRNode.ADD_REDUCTION_VI, ">= 1"})
|
||||
private static long testReductionOnLargePartiallyUnrolledLoop(long[] array) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < array.length / 8; i++) {
|
||||
sum += array[8*i];
|
||||
sum += array[8*i + 1];
|
||||
sum += array[8*i + 2];
|
||||
sum += array[8*i + 3];
|
||||
sum += array[8*i + 4];
|
||||
sum += array[8*i + 5];
|
||||
sum += array[8*i + 6];
|
||||
sum += array[8*i + 7];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// This test illustrates a limitation of the current reduction analysis: it
|
||||
// fails to detect reduction cycles where nodes are connected via different
|
||||
// input indices (except if the differences result from C2 edge swapping).
|
||||
// If this limitation is overcome in the future, the test case should be
|
||||
// turned into a positive one.
|
||||
@Test
|
||||
@IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"},
|
||||
failOn = {IRNode.ADD_REDUCTION_VI})
|
||||
private static long testReductionOnPartiallyUnrolledLoopWithSwappedInputs(long[] array) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < array.length / 2; i++) {
|
||||
sum = sum + (int)array[2*i];
|
||||
sum = (int)array[2*i + 1] + sum;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(applyIfCPUFeature = {"avx2", "true"},
|
||||
applyIfAnd = {"SuperWordReductions", "true","UsePopCountInstruction", "true"},
|
||||
counts = {IRNode.ADD_REDUCTION_VI, ">= 1",
|
||||
IRNode.POPCOUNT_VL, ">= 1"})
|
||||
private static long testMapReductionOnGlobalAccumulator(long[] array) {
|
||||
acc = 0;
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
acc += Long.bitCount(array[i]);
|
||||
}
|
||||
return acc;
|
||||
}
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -44,6 +44,9 @@ public class FpMinMaxIntrinsics {
|
||||
|
||||
private Random r = new Random();
|
||||
|
||||
private static int stride = 1;
|
||||
private static float acc;
|
||||
|
||||
@Setup
|
||||
public void init() {
|
||||
c1 = s1 = step();
|
||||
@ -127,4 +130,44 @@ public class FpMinMaxIntrinsics {
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public float fMinReducePartiallyUnrolled() {
|
||||
float result = Float.MAX_VALUE;
|
||||
for (int i = 0; i < COUNT / 2; i++) {
|
||||
result = Math.min(result, floats[2*i]);
|
||||
result = Math.min(result, floats[2*i + 1]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public float fMinReduceNonCounted() {
|
||||
float result = Float.MAX_VALUE;
|
||||
for (int i = 0; i < COUNT; i += stride)
|
||||
result = Math.min(result, floats[i]);
|
||||
return result;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public float fMinReduceGlobalAccumulator() {
|
||||
acc = Float.MAX_VALUE;
|
||||
for (int i = 0; i < COUNT; i += stride)
|
||||
acc = Math.min(acc, floats[i]);
|
||||
return acc;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public float fMinReduceInOuterLoop() {
|
||||
float result = Float.MAX_VALUE;
|
||||
int count = 0;
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
result = Math.min(result, floats[i]);
|
||||
for (int j = 0; j < 10; j += stride) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return result + count;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -42,17 +42,17 @@ public abstract class VectorReduction {
|
||||
private int[] intsB;
|
||||
private int[] intsC;
|
||||
private int[] intsD;
|
||||
private int resI;
|
||||
private long[] longsA;
|
||||
private long[] longsB;
|
||||
private long[] longsC;
|
||||
private long[] longsD;
|
||||
private long resL;
|
||||
|
||||
@Param("0")
|
||||
private int seed;
|
||||
private Random r = new Random(seed);
|
||||
|
||||
private static int globalResI;
|
||||
|
||||
@Setup
|
||||
public void init() {
|
||||
intsA = new int[COUNT];
|
||||
@ -75,51 +75,86 @@ public abstract class VectorReduction {
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void andRedI() {
|
||||
public void andRedI(Blackhole bh) {
|
||||
int resI = 0xFFFF;
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
|
||||
resI &= intsD[i];
|
||||
}
|
||||
bh.consume(resI);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void orRedI() {
|
||||
public void orRedI(Blackhole bh) {
|
||||
int resI = 0x0000;
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
|
||||
resI |= intsD[i];
|
||||
}
|
||||
bh.consume(resI);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void xorRedI() {
|
||||
public void xorRedI(Blackhole bh) {
|
||||
int resI = 0x0000;
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
|
||||
resI ^= intsD[i];
|
||||
}
|
||||
bh.consume(resI);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void andRedL() {
|
||||
public void andRedL(Blackhole bh) {
|
||||
long resL = 0xFFFFFFFF;
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
|
||||
resL &= longsD[i];
|
||||
}
|
||||
bh.consume(resL);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void orRedL() {
|
||||
public void orRedL(Blackhole bh) {
|
||||
long resL = 0x00000000;
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
|
||||
resL |= longsD[i];
|
||||
}
|
||||
bh.consume(resL);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void xorRedL() {
|
||||
public void xorRedL(Blackhole bh) {
|
||||
long resL = 0x00000000;
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
|
||||
resL ^= longsD[i];
|
||||
}
|
||||
bh.consume(resL);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void andRedIPartiallyUnrolled(Blackhole bh) {
|
||||
int resI = 0xFFFF;
|
||||
for (int i = 0; i < COUNT / 2; i++) {
|
||||
int j = 2*i;
|
||||
intsD[j] = (intsA[j] * intsB[j]) + (intsA[j] * intsC[j]) + (intsB[j] * intsC[j]);
|
||||
resI &= intsD[j];
|
||||
j = 2*i + 1;
|
||||
intsD[j] = (intsA[j] * intsB[j]) + (intsA[j] * intsC[j]) + (intsB[j] * intsC[j]);
|
||||
resI &= intsD[j];
|
||||
}
|
||||
bh.consume(resI);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void andRedIOnGlobalAccumulator() {
|
||||
globalResI = 0xFFFF;
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
|
||||
globalResI &= intsD[i];
|
||||
}
|
||||
}
|
||||
|
||||
@Fork(value = 2, jvmArgsPrepend = {
|
||||
|
Loading…
Reference in New Issue
Block a user