diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad index fd64a684674..ff381e5e7a2 100644 --- a/src/hotspot/cpu/x86/x86_64.ad +++ b/src/hotspot/cpu/x86/x86_64.ad @@ -5374,7 +5374,7 @@ instruct loadD(regD dst, memory mem) // max = java.lang.Math.max(float a, float b) instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{ - predicate(UseAVX > 0 && !n->is_reduction()); + predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); match(Set dst (MaxF a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ @@ -5396,7 +5396,7 @@ instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, %} instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && n->is_reduction()); + predicate(UseAVX > 0 && SuperWord::is_reduction(n)); match(Set dst (MaxF a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); @@ -5410,7 +5410,7 @@ instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe // max = java.lang.Math.max(double a, double b) instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{ - predicate(UseAVX > 0 && !n->is_reduction()); + predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); match(Set dst (MaxD a b)); effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp); format %{ @@ -5432,7 +5432,7 @@ instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, %} instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && n->is_reduction()); + predicate(UseAVX > 0 && SuperWord::is_reduction(n)); match(Set dst (MaxD a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); @@ -5446,7 +5446,7 @@ instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRe // min = java.lang.Math.min(float a, float b) instruct minF_reg(legRegF dst, legRegF a, 
legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{ - predicate(UseAVX > 0 && !n->is_reduction()); + predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); match(Set dst (MinF a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ @@ -5468,7 +5468,7 @@ instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, %} instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && n->is_reduction()); + predicate(UseAVX > 0 && SuperWord::is_reduction(n)); match(Set dst (MinF a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); @@ -5482,7 +5482,7 @@ instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe // min = java.lang.Math.min(double a, double b) instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{ - predicate(UseAVX > 0 && !n->is_reduction()); + predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); match(Set dst (MinD a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ @@ -5504,7 +5504,7 @@ instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, %} instruct minD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && n->is_reduction()); + predicate(UseAVX > 0 && SuperWord::is_reduction(n)); match(Set dst (MinD a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); diff --git a/src/hotspot/share/adlc/main.cpp b/src/hotspot/share/adlc/main.cpp index dce3f2309f8..ff379809b0c 100644 --- a/src/hotspot/share/adlc/main.cpp +++ b/src/hotspot/share/adlc/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -271,6 +271,7 @@ int main(int argc, char *argv[]) AD.addInclude(AD._DFA_file, "opto/narrowptrnode.hpp"); AD.addInclude(AD._DFA_file, "opto/opcodes.hpp"); AD.addInclude(AD._DFA_file, "opto/convertnode.hpp"); + AD.addInclude(AD._DFA_file, "opto/superword.hpp"); AD.addInclude(AD._DFA_file, "utilities/powerOfTwo.hpp"); // Make sure each .cpp file starts with include lines: diff --git a/src/hotspot/share/opto/idealGraphPrinter.cpp b/src/hotspot/share/opto/idealGraphPrinter.cpp index 45eaecda441..1aac87bb2a0 100644 --- a/src/hotspot/share/opto/idealGraphPrinter.cpp +++ b/src/hotspot/share/opto/idealGraphPrinter.cpp @@ -462,8 +462,8 @@ void IdealGraphPrinter::visit_node(Node *n, bool edges, VectorSet* temp_set) { if (flags & Node::Flag_has_call) { print_prop("has_call", "true"); } - if (flags & Node::Flag_is_reduction) { - print_prop("is_reduction", "true"); + if (flags & Node::Flag_has_swapped_edges) { + print_prop("has_swapped_edges", "true"); } if (C->matcher() != nullptr) { diff --git a/src/hotspot/share/opto/loopTransform.cpp b/src/hotspot/share/opto/loopTransform.cpp index 9070a1080f4..2899e7c18a8 100644 --- a/src/hotspot/share/opto/loopTransform.cpp +++ b/src/hotspot/share/opto/loopTransform.cpp @@ -1037,10 +1037,6 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) { } if (UseSuperWord) { - if (!cl->is_reduction_loop()) { - phase->mark_reductions(this); - } - // Only attempt slp analysis when user controls do not prohibit it if (!range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) { // Once policy_slp_analysis succeeds, mark the loop with the @@ -1694,15 +1690,6 @@ void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_n set_idom(new_pre_exit, pre_end, dd_main_head); set_loop(new_pre_exit, outer_loop->_parent); - if (peel_only) { - // Nodes in the peeled iteration that were marked as reductions within the - // original loop might not be 
reductions within their new outer loop. - for (uint i = 0; i < loop->_body.size(); i++) { - Node* n = old_new[loop->_body[i]->_idx]; - n->remove_flag(Node::Flag_is_reduction); - } - } - // Step B2: Build a zero-trip guard for the main-loop. After leaving the // pre-loop, the main-loop may not execute at all. Later in life this // zero-trip guard will become the minimum-trip guard when we unroll @@ -2456,69 +2443,6 @@ void PhaseIdealLoop::do_maximally_unroll(IdealLoopTree *loop, Node_List &old_new } } -void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) { - if (SuperWordReductions == false) return; - - CountedLoopNode* loop_head = loop->_head->as_CountedLoop(); - if (loop_head->unrolled_count() > 1) { - return; - } - - Node* trip_phi = loop_head->phi(); - for (DUIterator_Fast imax, i = loop_head->fast_outs(imax); i < imax; i++) { - Node* phi = loop_head->fast_out(i); - if (phi->is_Phi() && phi->outcnt() > 0 && phi != trip_phi) { - // For definitions which are loop inclusive and not tripcounts. - Node* def_node = phi->in(LoopNode::LoopBackControl); - - if (def_node != nullptr) { - Node* n_ctrl = get_ctrl(def_node); - if (n_ctrl != nullptr && loop->is_member(get_loop(n_ctrl))) { - // Now test it to see if it fits the standard pattern for a reduction operator. 
- int opc = def_node->Opcode(); - if (opc != ReductionNode::opcode(opc, def_node->bottom_type()->basic_type()) - || opc == Op_MinD || opc == Op_MinF || opc == Op_MaxD || opc == Op_MaxF) { - if (!def_node->is_reduction()) { // Not marked yet - // To be a reduction, the arithmetic node must have the phi as input and provide a def to it - bool ok = false; - for (unsigned j = 1; j < def_node->req(); j++) { - Node* in = def_node->in(j); - if (in == phi) { - ok = true; - break; - } - } - - // do nothing if we did not match the initial criteria - if (ok == false) { - continue; - } - - // The result of the reduction must not be used in the loop - for (DUIterator_Fast imax, i = def_node->fast_outs(imax); i < imax && ok; i++) { - Node* u = def_node->fast_out(i); - if (!loop->is_member(get_loop(ctrl_or_self(u)))) { - continue; - } - if (u == phi) { - continue; - } - ok = false; - } - - // iff the uses conform - if (ok) { - def_node->add_flag(Node::Flag_is_reduction); - loop_head->mark_has_reductions(); - } - } - } - } - } - } - } -} - //------------------------------adjust_limit----------------------------------- // Helper function that computes new loop limit as (rc_limit-offset)/scale Node* PhaseIdealLoop::adjust_limit(bool is_positive_stride, Node* scale, Node* offset, Node* rc_limit, Node* old_limit, Node* pre_ctrl, bool round) { diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index 035c19419e3..73cb22c6db8 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -2249,7 +2249,6 @@ void CountedLoopNode::dump_spec(outputStream *st) const { if (is_pre_loop ()) st->print("pre of N%d" , _main_idx); if (is_main_loop()) st->print("main of N%d", _idx); if (is_post_loop()) st->print("post of N%d", _main_idx); - if (is_reduction_loop()) st->print(" reduction"); if (is_strip_mined()) st->print(" strip mined"); } #endif @@ -3991,7 +3990,6 @@ void IdealLoopTree::dump_head() { if (cl->is_pre_loop ()) 
tty->print(" pre" ); if (cl->is_main_loop()) tty->print(" main"); if (cl->is_post_loop()) tty->print(" post"); - if (cl->is_reduction_loop()) tty->print(" reduction"); if (cl->is_vectorized_loop()) tty->print(" vector"); if (range_checks_present()) tty->print(" rc "); if (cl->is_multiversioned()) tty->print(" multi "); diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index c781cc4651f..459021120ed 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -61,23 +61,22 @@ protected: uint _loop_flags; // Names for flag bitfields enum { Normal=0, Pre=1, Main=2, Post=3, PreMainPostFlagsMask=3, - MainHasNoPreLoop = 1<<2, - HasExactTripCount = 1<<3, - InnerLoop = 1<<4, - PartialPeelLoop = 1<<5, - PartialPeelFailed = 1<<6, - HasReductions = 1<<7, - WasSlpAnalyzed = 1<<8, - PassedSlpAnalysis = 1<<9, - DoUnrollOnly = 1<<10, - VectorizedLoop = 1<<11, - HasAtomicPostLoop = 1<<12, - IsMultiversioned = 1<<13, - StripMined = 1<<14, - SubwordLoop = 1<<15, - ProfileTripFailed = 1<<16, - LoopNestInnerLoop = 1 << 17, - LoopNestLongOuterLoop = 1 << 18}; + MainHasNoPreLoop = 1<<2, + HasExactTripCount = 1<<3, + InnerLoop = 1<<4, + PartialPeelLoop = 1<<5, + PartialPeelFailed = 1<<6, + WasSlpAnalyzed = 1<<7, + PassedSlpAnalysis = 1<<8, + DoUnrollOnly = 1<<9, + VectorizedLoop = 1<<10, + HasAtomicPostLoop = 1<<11, + IsMultiversioned = 1<<12, + StripMined = 1<<13, + SubwordLoop = 1<<14, + ProfileTripFailed = 1<<15, + LoopNestInnerLoop = 1<<16, + LoopNestLongOuterLoop = 1<<17}; char _unswitch_count; enum { _unswitch_max=3 }; char _postloop_flags; @@ -105,7 +104,6 @@ public: bool is_loop_nest_outer_loop() const { return _loop_flags & LoopNestLongOuterLoop; } void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; } - void mark_has_reductions() { _loop_flags |= HasReductions; } void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; } void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; } void 
mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; } @@ -286,7 +284,6 @@ public: bool is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; } bool is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; } bool is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; } - bool is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; } bool was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; } bool has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; } bool is_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; } @@ -1313,9 +1310,6 @@ public: // Unroll the loop body one step - make each trip do 2 iterations. void do_unroll( IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip ); - // Mark vector reduction candidates before loop unrolling - void mark_reductions( IdealLoopTree *loop ); - // Return true if exp is a constant times an induction var bool is_scaled_iv(Node* exp, Node* iv, BasicType bt, jlong* p_scale, bool* p_short_scale, int depth = 0); diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 1e485cc73dc..53fce009040 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -2622,10 +2622,6 @@ void PhaseIdealLoop::clone_loop_body(const Node_List& body, Node_List &old_new, Node* old = body.at(i); Node* nnn = old->clone(); old_new.map(old->_idx, nnn); - if (old->is_reduction()) { - // Reduction flag is not copied by default. Copy it here when cloning the entire loop body. 
- nnn->add_flag(Node::Flag_is_reduction); - } if (C->do_vector_loop() && cm != nullptr) { cm->verify_insert_and_clone(old, nnn, cm->clone_idx()); } diff --git a/src/hotspot/share/opto/node.cpp b/src/hotspot/share/opto/node.cpp index dde202023fd..f781fa28785 100644 --- a/src/hotspot/share/opto/node.cpp +++ b/src/hotspot/share/opto/node.cpp @@ -521,10 +521,6 @@ Node *Node::clone() const { // If it is applicable, it will happen anyway when the cloned node is registered with IGVN. n->remove_flag(Node::NodeFlags::Flag_for_post_loop_opts_igvn); } - if (n->is_reduction()) { - // Do not copy reduction information. This must be explicitly set by the calling code. - n->remove_flag(Node::Flag_is_reduction); - } BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2(); bs->register_potential_barrier_node(n); diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index e3c10822429..a81359e8459 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -578,6 +578,12 @@ public: _in[i2] = n1; // If this node is in the hash table, make sure it doesn't need a rehash. assert(check_hash == NO_HASH || check_hash == hash(), "edge swap must preserve hash code"); + // Flip swapped edges flag. 
+ if (has_swapped_edges()) { + remove_flag(Node::Flag_has_swapped_edges); + } else { + add_flag(Node::Flag_has_swapped_edges); + } } // Iterators over input Nodes for a Node X are written as: @@ -784,7 +790,7 @@ public: Flag_avoid_back_to_back_before = 1 << 8, Flag_avoid_back_to_back_after = 1 << 9, Flag_has_call = 1 << 10, - Flag_is_reduction = 1 << 11, + Flag_has_swapped_edges = 1 << 11, Flag_is_scheduled = 1 << 12, Flag_is_expensive = 1 << 13, Flag_is_predicated_vector = 1 << 14, @@ -1001,10 +1007,8 @@ public: bool is_macro() const { return (_flags & Flag_is_macro) != 0; } // The node is expensive: the best control is set during loop opts bool is_expensive() const { return (_flags & Flag_is_expensive) != 0 && in(0) != nullptr; } - - // An arithmetic node which accumulates a data in a loop. - // It must have the loop's phi as input and provide a def to the phi. - bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; } + // The node's original edge position is swapped. + bool has_swapped_edges() const { return (_flags & Flag_has_swapped_edges) != 0; } bool is_predicated_vector() const { return (_flags & Flag_is_predicated_vector) != 0; } diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index a782fedfec5..af8f08f722b 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -72,6 +72,7 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) : _lpt(nullptr), // loop tree node _lp(nullptr), // CountedLoopNode _pre_loop_end(nullptr), // Pre loop CountedLoopEndNode + _loop_reductions(arena()), // reduction nodes in the current loop _bb(nullptr), // basic block _iv(nullptr), // induction var _race_possible(false), // cases where SDMU is true @@ -111,7 +112,17 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { return false; // skip malformed counted loop } - if (cl->is_rce_post_loop() && cl->is_reduction_loop()) { + // Initialize simple data used by reduction 
marking early. + set_lpt(lpt); + set_lp(cl); + // For now, define one block which is the entire loop body. + set_bb(cl); + + if (SuperWordReductions) { + mark_reductions(); + } + + if (cl->is_rce_post_loop() && is_marked_reduction_loop()) { // Post loop vectorization doesn't support reductions return false; } @@ -167,18 +178,12 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { init(); // initialize data structures - set_lpt(lpt); - set_lp(cl); - - // For now, define one block which is the entire loop body - set_bb(cl); - bool success = true; if (do_optimization) { assert(_packset.length() == 0, "packset must be empty"); success = SLP_extract(); if (PostLoopMultiversioning) { - if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) { + if (cl->is_vectorized_loop() && cl->is_main_loop() && !is_marked_reduction_loop()) { IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next; CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop(); // Main loop SLP works well for manually unrolled loops. But post loop @@ -223,7 +228,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) { for (uint i = 0; i < lpt()->_body.size(); i++) { Node* n = lpt()->_body.at(i); if (n == cl->incr() || - n->is_reduction() || + is_marked_reduction(n) || n->is_AddP() || n->is_Cmp() || n->is_Bool() || @@ -411,6 +416,139 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) { } } +bool SuperWord::is_reduction(const Node* n) { + if (!is_reduction_operator(n)) { + return false; + } + // Test whether there is a reduction cycle via every edge index + // (typically indices 1 and 2). 
+ for (uint input = 1; input < n->req(); input++) { + if (in_reduction_cycle(n, input)) { + return true; + } + } + return false; +} + +bool SuperWord::is_reduction_operator(const Node* n) { + int opc = n->Opcode(); + return (opc != ReductionNode::opcode(opc, n->bottom_type()->basic_type())); +} + +bool SuperWord::in_reduction_cycle(const Node* n, uint input) { + // First find input reduction path to phi node. + auto has_my_opcode = [&](const Node* m){ return m->Opcode() == n->Opcode(); }; + PathEnd path_to_phi = find_in_path(n, input, LoopMaxUnroll, has_my_opcode, + [&](const Node* m) { return m->is_Phi(); }); + const Node* phi = path_to_phi.first; + if (phi == nullptr) { + return false; + } + // If there is an input reduction path from the phi's loop-back to n, then n + // is part of a reduction cycle. + const Node* first = phi->in(LoopNode::LoopBackControl); + PathEnd path_from_phi = find_in_path(first, input, LoopMaxUnroll, has_my_opcode, + [&](const Node* m) { return m == n; }); + return path_from_phi.first != nullptr; +} + +Node* SuperWord::original_input(const Node* n, uint i) { + if (n->has_swapped_edges()) { + assert(n->is_Add() || n->is_Mul(), "n should be commutative"); + if (i == 1) { + return n->in(2); + } else if (i == 2) { + return n->in(1); + } + } + return n->in(i); +} + +void SuperWord::mark_reductions() { + + _loop_reductions.clear(); + + // Iterate through all phi nodes associated to the loop and search for + // reduction cycles in the basic block. + for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) { + const Node* phi = lp()->fast_out(i); + if (!phi->is_Phi()) { + continue; + } + if (phi->outcnt() == 0) { + continue; + } + if (phi == iv()) { + continue; + } + // The phi's loop-back is considered the first node in the reduction cycle. + const Node* first = phi->in(LoopNode::LoopBackControl); + if (first == nullptr) { + continue; + } + // Test that the node fits the standard pattern for a reduction operator. 
+ if (!is_reduction_operator(first)) { + continue; + } + // Test that 'first' is the beginning of a reduction cycle ending in 'phi'. + // To contain the number of searched paths, assume that all nodes in a + // reduction cycle are connected via the same edge index, modulo swapped + // inputs. This assumption is realistic because reduction cycles usually + // consist of nodes cloned by loop unrolling. + int reduction_input = -1; + int path_nodes = -1; + for (uint input = 1; input < first->req(); input++) { + // Test whether there is a reduction path in the basic block from 'first' + // to the phi node following edge index 'input'. + PathEnd path = + find_in_path( + first, input, lpt()->_body.size(), + [&](const Node* n) { return n->Opcode() == first->Opcode() && in_bb(n); }, + [&](const Node* n) { return n == phi; }); + if (path.first != nullptr) { + reduction_input = input; + path_nodes = path.second; + break; + } + } + if (reduction_input == -1) { + continue; + } + // Test that reduction nodes do not have any users in the loop besides their + // reduction cycle successors. + const Node* current = first; + const Node* succ = phi; // current's successor in the reduction cycle. + bool used_in_loop = false; + for (int i = 0; i < path_nodes; i++) { + for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) { + Node* u = current->fast_out(j); + if (!in_bb(u)) { + continue; + } + if (u == succ) { + continue; + } + used_in_loop = true; + break; + } + if (used_in_loop) { + break; + } + succ = current; + current = original_input(current, reduction_input); + } + if (used_in_loop) { + continue; + } + // Reduction cycle found. Mark all nodes in the found path as reductions. 
+ current = first; + for (int i = 0; i < path_nodes; i++) { + _loop_reductions.set(current->_idx); + current = original_input(current, reduction_input); + } + } +} + //------------------------------SLP_extract--------------------------- // Extract the superword level parallelism // @@ -1378,7 +1516,7 @@ bool SuperWord::independent(Node* s1, Node* s2) { // those nodes, and have not found another node from the pack, we know // that all nodes in the pack are independent. Node* SuperWord::find_dependence(Node_List* p) { - if (p->at(0)->is_reduction()) { + if (is_marked_reduction(p->at(0))) { return nullptr; // ignore reductions } ResourceMark rm; @@ -1436,7 +1574,7 @@ bool SuperWord::reduction(Node* s1, Node* s2) { int d1 = depth(s1); int d2 = depth(s2); if (d2 > d1) { - if (s1->is_reduction() && s2->is_reduction()) { + if (is_marked_reduction(s1) && is_marked_reduction(s2)) { // This is an ordered set, so s1 should define s2 for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { Node* t1 = s1->fast_out(i); @@ -1653,7 +1791,7 @@ void SuperWord::order_def_uses(Node_List* p) { if (s1->is_Store()) return; // reductions are always managed beforehand - if (s1->is_reduction()) return; + if (is_marked_reduction(s1)) return; for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { Node* t1 = s1->fast_out(i); @@ -1689,15 +1827,15 @@ void SuperWord::order_def_uses(Node_List* p) { bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) { // check reductions to see if they are marshalled to represent the reduction // operator in a specified opnd - if (u1->is_reduction() && u2->is_reduction()) { + if (is_marked_reduction(u1) && is_marked_reduction(u2)) { // ensure reductions have phis and reduction definitions feeding the 1st operand Node* first = u1->in(2); - if (first->is_Phi() || first->is_reduction()) { + if (first->is_Phi() || is_marked_reduction(first)) { u1->swap_edges(1, 2); } // ensure reductions have phis and reduction 
definitions feeding the 1st operand first = u2->in(2); - if (first->is_Phi() || first->is_reduction()) { + if (first->is_Phi() || is_marked_reduction(first)) { u2->swap_edges(1, 2); } return true; @@ -1920,7 +2058,7 @@ void SuperWord::filter_packs() { remove_pack_at(i); } Node *n = pk->at(0); - if (n->is_reduction()) { + if (is_marked_reduction(n)) { _num_reductions++; } else { _num_work_vecs++; @@ -2171,7 +2309,7 @@ bool SuperWord::implemented(Node_List* p) { if (p0 != nullptr) { int opc = p0->Opcode(); uint size = p->size(); - if (p0->is_reduction()) { + if (is_marked_reduction(p0)) { const Type *arith_type = p0->bottom_type(); // Length 2 reductions of INT/LONG do not offer performance benefits if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) { @@ -2261,13 +2399,13 @@ bool SuperWord::profitable(Node_List* p) { } } // Check if reductions are connected - if (p0->is_reduction()) { + if (is_marked_reduction(p0)) { Node* second_in = p0->in(2); Node_List* second_pk = my_pack(second_in); if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) { - // Remove reduction flag if no parent pack or if not enough work + // Unmark reduction if no parent pack or if not enough work // to cover reduction expansion overhead - p0->remove_flag(Node::Flag_is_reduction); + _loop_reductions.remove(p0->_idx); return false; } else if (second_pk->size() != p->size()) { return false; @@ -2299,7 +2437,7 @@ bool SuperWord::profitable(Node_List* p) { if (def == n) { // Reductions should only have a Phi use at the loop head or a non-phi use // outside of the loop if it is the last element of the pack (e.g. SafePoint). 
- if (def->is_reduction() && + if (is_marked_reduction(def) && ((use->is_Phi() && use->in(0) == _lpt->_head) || (!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) { continue; @@ -2442,7 +2580,7 @@ public: for (DepPreds preds(n, dg); !preds.done(); preds.next()) { Node* pred = preds.current(); int pred_pid = get_pid_or_zero(pred); - if (pred_pid == pid && n->is_reduction()) { + if (pred_pid == pid && _slp->is_marked_reduction(n)) { continue; // reduction -> self-cycle is not a cyclic dependency } // Only add edges once, and only for mapped nodes (in block) @@ -2992,7 +3130,7 @@ bool SuperWord::output() { } else if (n->req() == 3 && !is_cmov_pack(p)) { // Promote operands to vector Node* in1 = nullptr; - bool node_isa_reduction = n->is_reduction(); + bool node_isa_reduction = is_marked_reduction(n); if (node_isa_reduction) { // the input to the first reduction operation is retained in1 = low_adr->in(1); @@ -3246,7 +3384,7 @@ bool SuperWord::output() { Node* SuperWord::create_post_loop_vmask() { CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); assert(cl->is_rce_post_loop(), "Must be an rce post loop"); - assert(!cl->is_reduction_loop(), "no vector reduction in post loop"); + assert(!is_marked_reduction_loop(), "no vector reduction in post loop"); assert(abs(cl->stride_con()) == 1, "post loop stride can only be +/-1"); // Collect vector element types of all post loop packs. 
Also collect @@ -3524,7 +3662,7 @@ void SuperWord::insert_extracts(Node_List* p) { _n_idx_list.pop(); Node* def = use->in(idx); - if (def->is_reduction()) continue; + if (is_marked_reduction(def)) continue; // Insert extract operation _igvn.hash_delete(def); @@ -3547,7 +3685,7 @@ void SuperWord::insert_extracts(Node_List* p) { bool SuperWord::is_vector_use(Node* use, int u_idx) { Node_List* u_pk = my_pack(use); if (u_pk == nullptr) return false; - if (use->is_reduction()) return true; + if (is_marked_reduction(use)) return true; Node* def = use->in(u_idx); Node_List* d_pk = my_pack(def); if (d_pk == nullptr) { @@ -3708,7 +3846,7 @@ bool SuperWord::construct_bb() { if (in_bb(use) && !visited_test(use) && // Don't go around backedge (!use->is_Phi() || n == entry)) { - if (use->is_reduction()) { + if (is_marked_reduction(use)) { // First see if we can map the reduction on the given system we are on, then // make a data entry operation for each reduction we see. BasicType bt = use->bottom_type()->basic_type(); @@ -4345,10 +4483,6 @@ void SuperWord::init() { _iteration_last.clear(); _node_info.clear(); _align_to_ref = nullptr; - _lpt = nullptr; - _lp = nullptr; - _bb = nullptr; - _iv = nullptr; _race_possible = 0; _early_return = false; _num_work_vecs = 0; diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 1317ac9bb81..70e97e9444c 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -29,6 +29,7 @@ #include "opto/phaseX.hpp" #include "opto/vectornode.hpp" #include "utilities/growableArray.hpp" +#include "utilities/pair.hpp" #include "libadt/dict.hpp" // @@ -357,6 +358,7 @@ class SuperWord : public ResourceObj { IdealLoopTree* _lpt; // Current loop tree node CountedLoopNode* _lp; // Current CountedLoopNode CountedLoopEndNode* _pre_loop_end; // Current CountedLoopEndNode of pre loop + VectorSet _loop_reductions; // Reduction nodes in the current loop Node* _bb; // Current basic block 
PhiNode* _iv; // Induction var bool _race_possible; // In cases where SDMU is true @@ -471,6 +473,62 @@ class SuperWord : public ResourceObj { // methods + typedef const Pair<const Node*, int> PathEnd; + + // Search for a path P = (n_1, n_2, ..., n_k) such that: + // - original_input(n_i, input) = n_i+1 for all 1 <= i < k, + // - path(n) for all n in P, + // - k <= max, and + // - there exists a node e such that original_input(n_k, input) = e and end(e). + // Return <e, k>, if P is found, or <nullptr, -1> otherwise. + // Note that original_input(n, i) has the same behavior as n->in(i) except + // that it commutes the inputs of binary nodes whose edges have been swapped. + template <typename NodePredicate1, typename NodePredicate2> + static PathEnd find_in_path(const Node *n1, uint input, int max, + NodePredicate1 path, NodePredicate2 end) { + const PathEnd no_path(nullptr, -1); + const Node* current = n1; + int k = 0; + for (int i = 0; i <= max; i++) { + if (current == nullptr) { + return no_path; + } + if (end(current)) { + return PathEnd(current, k); + } + if (!path(current)) { + return no_path; + } + current = original_input(current, input); + k++; + } + return no_path; + } + +public: + // Whether n is a reduction operator and part of a reduction cycle. + // This function can be used for individual queries outside the SLP analysis, + // e.g. to inform matching in target-specific code. Otherwise, the + // almost-equivalent but faster SuperWord::mark_reductions() is preferable. + static bool is_reduction(const Node* n); + // Whether n is marked as a reduction node. + bool is_marked_reduction(Node* n) { return _loop_reductions.test(n->_idx); } + // Whether the current loop has any reduction node. + bool is_marked_reduction_loop() { return !_loop_reductions.is_empty(); } +private: + // Whether n is a standard reduction operator. + static bool is_reduction_operator(const Node* n); + // Whether n is part of a reduction cycle via the 'input' edge index. To bound + // the search, constrain the size of reduction cycles to LoopMaxUnroll. 
+ static bool in_reduction_cycle(const Node* n, uint input); + // Reference to the i'th input node of n, commuting the inputs of binary nodes + // whose edges have been swapped. Assumes n is a commutative operation. + static Node* original_input(const Node* n, uint i); + // Find and mark reductions in a loop. Running mark_reductions() is similar to + // querying is_reduction(n) for every n in the SuperWord loop, but stricter in + // that it assumes counted loops and requires that reduction nodes are not + // used within the loop except by their reduction cycle predecessors. + void mark_reductions(); // Extract the superword level parallelism bool SLP_extract(); // Find the adjacent memory references and create pack pairs for them. diff --git a/test/hotspot/jtreg/compiler/c2/irTests/TestSuperwordFailsUnrolling.java b/test/hotspot/jtreg/compiler/c2/irTests/TestSuperwordFailsUnrolling.java index 25845144826..32fd3978a60 100644 --- a/test/hotspot/jtreg/compiler/c2/irTests/TestSuperwordFailsUnrolling.java +++ b/test/hotspot/jtreg/compiler/c2/irTests/TestSuperwordFailsUnrolling.java @@ -1,5 +1,6 @@ /* * Copyright (c) 2022, Red Hat, Inc. All rights reserved. + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -44,9 +45,9 @@ public class TestSuperwordFailsUnrolling { public static void main(String[] args) { Object avx = wb.getVMFlag("UseAVX"); if (avx != null && ((Long)avx) > 2) { - TestFramework.runWithFlags("-XX:UseAVX=2", "-XX:LoopMaxUnroll=8"); + TestFramework.runWithFlags("-XX:UseAVX=2", "-XX:LoopMaxUnroll=8", "-XX:-SuperWordReductions"); } - TestFramework.runWithFlags("-XX:LoopMaxUnroll=8"); + TestFramework.runWithFlags("-XX:LoopMaxUnroll=8", "-XX:-SuperWordReductions"); } @Test diff --git a/test/hotspot/jtreg/compiler/intrinsics/math/TestFpMinMaxReductions.java b/test/hotspot/jtreg/compiler/intrinsics/math/TestFpMinMaxReductions.java new file mode 100644 index 00000000000..47e842f6137 --- /dev/null +++ b/test/hotspot/jtreg/compiler/intrinsics/math/TestFpMinMaxReductions.java @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
 * @requires os.simpleArch == "x64" & (vm.opt.UseAVX == "null" | vm.opt.UseAVX > 0)
{IRNode.MIN_F_REDUCTION_REG, ">= 1"}) + private static float testFloatMinReductionNonCounted() { + float fmin = Float.POSITIVE_INFINITY; + for (int i = 0; i < floatArray.length; i += stride) { + fmin = Math.min(fmin, floatArray[i]); + } + return fmin; + } + + @Test + @IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"}) + private static float testFloatMinReductionGlobalAccumulator() { + acc = Float.POSITIVE_INFINITY; + for (int i = 0; i < floatArray.length; i++) { + acc = Math.min(acc, floatArray[i]); + } + return acc; + } + + @Test + @IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"}) + private static float testFloatMinReductionInOuterLoop() { + float fmin = Float.POSITIVE_INFINITY; + int count = 0; + for (int i = 0; i < floatArray.length; i++) { + fmin = Math.min(fmin, floatArray[i]); + for (int j = 0; j < 10; j += stride) { + count++; + } + } + return fmin + count; + } + + @Test + @IR(counts = {IRNode.MAX_F_REDUCTION_REG, ">= 1"}) + private static float testFloatMaxReduction() { + float fmax = Float.NEGATIVE_INFINITY; + for (int i = 0; i < floatArray.length; i++) { + fmax = Math.max(fmax, floatArray[i]); + } + return fmax; + } + + @Test + @IR(counts = {IRNode.MIN_D_REG, "1"}, + failOn = {IRNode.MIN_D_REDUCTION_REG}) + private static double testDoubleMin() { + return Math.min(doubleInput1, doubleInput2); + } + + @Test + @IR(counts = {IRNode.MAX_D_REG, "1"}, + failOn = {IRNode.MAX_D_REDUCTION_REG}) + private static double testDoubleMax() { + return Math.max(doubleInput1, doubleInput2); + } + + @Test + @IR(counts = {IRNode.MIN_D_REDUCTION_REG, ">= 1"}) + private static double testDoubleMinReduction() { + double fmin = Double.POSITIVE_INFINITY; + for (int i = 0; i < doubleArray.length; i++) { + fmin = Math.min(fmin, doubleArray[i]); + } + return fmin; + } + + @Test + @IR(counts = {IRNode.MAX_D_REDUCTION_REG, ">= 1"}) + private static double testDoubleMaxReduction() { + double fmax = Double.NEGATIVE_INFINITY; + for (int i = 0; i < doubleArray.length; i++) { + fmax = 
Math.max(fmax, doubleArray[i]); + } + return fmax; + } + +} diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 8c1dbaef277..e4c9a8c11f1 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -649,6 +649,26 @@ public class IRNode { beforeMatchingNameRegex(MAX, "Max(I|L)"); } + public static final String MAX_D_REDUCTION_REG = PREFIX + "MAX_D_REDUCTION_REG" + POSTFIX; + static { + machOnlyNameRegex(MAX_D_REDUCTION_REG, "maxD_reduction_reg"); + } + + public static final String MAX_D_REG = PREFIX + "MAX_D_REG" + POSTFIX; + static { + machOnlyNameRegex(MAX_D_REG, "maxD_reg"); + } + + public static final String MAX_F_REDUCTION_REG = PREFIX + "MAX_F_REDUCTION_REG" + POSTFIX; + static { + machOnlyNameRegex(MAX_F_REDUCTION_REG, "maxF_reduction_reg"); + } + + public static final String MAX_F_REG = PREFIX + "MAX_F_REG" + POSTFIX; + static { + machOnlyNameRegex(MAX_F_REG, "maxF_reg"); + } + public static final String MAX_I = PREFIX + "MAX_I" + POSTFIX; static { beforeMatchingNameRegex(MAX_I, "MaxI"); @@ -679,6 +699,26 @@ public class IRNode { beforeMatchingNameRegex(MIN, "Min(I|L)"); } + public static final String MIN_D_REDUCTION_REG = PREFIX + "MIN_D_REDUCTION_REG" + POSTFIX; + static { + machOnlyNameRegex(MIN_D_REDUCTION_REG, "minD_reduction_reg"); + } + + public static final String MIN_D_REG = PREFIX + "MIN_D_REG" + POSTFIX; + static { + machOnlyNameRegex(MIN_D_REG, "minD_reg"); + } + + public static final String MIN_F_REDUCTION_REG = PREFIX + "MIN_F_REDUCTION_REG" + POSTFIX; + static { + machOnlyNameRegex(MIN_F_REDUCTION_REG, "minF_reduction_reg"); + } + + public static final String MIN_F_REG = PREFIX + "MIN_F_REG" + POSTFIX; + static { + machOnlyNameRegex(MIN_F_REG, "minF_reg"); + } + public static final String MIN_I = PREFIX + "MIN_I" + POSTFIX; static { beforeMatchingNameRegex(MIN_I, "MinI"); diff --git 
a/test/hotspot/jtreg/compiler/loopopts/superword/TestGeneralizedReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestGeneralizedReductions.java new file mode 100644 index 00000000000..60ecaf0e4c8 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestGeneralizedReductions.java @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @bug 8287087 + * @summary Test reduction vectorizations that are enabled by performing SLP + * reduction analysis on unrolled loops. + * @library /test/lib / + * @requires vm.bits == 64 + * @run driver compiler.loopopts.superword.TestGeneralizedReductions + */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; +import jdk.test.lib.Asserts; + +public class TestGeneralizedReductions { + + private static int acc = 0; + + public static void main(String[] args) throws Exception { + // Fix maximum number of unrolls for test stability. 
+ TestFramework.runWithFlags("-XX:LoopMaxUnroll=16"); + } + + @Run(test = {"testReductionOnGlobalAccumulator", + "testReductionOnPartiallyUnrolledLoop", + "testReductionOnLargePartiallyUnrolledLoop", + "testReductionOnPartiallyUnrolledLoopWithSwappedInputs", + "testMapReductionOnGlobalAccumulator"}) + void run() { + long[] array = new long[128]; + long result; + + initArray(array); + result = testReductionOnGlobalAccumulator(array); + Asserts.assertEQ(result, 8128L, "unexpected result"); + + initArray(array); + result = testReductionOnPartiallyUnrolledLoop(array); + Asserts.assertEQ(result, 8128L, "unexpected result"); + + initArray(array); + result = testReductionOnLargePartiallyUnrolledLoop(array); + Asserts.assertEQ(result, 8128L, "unexpected result"); + + initArray(array); + result = testReductionOnPartiallyUnrolledLoopWithSwappedInputs(array); + Asserts.assertEQ(result, 8128L, "unexpected result"); + + initArray(array); + result = testMapReductionOnGlobalAccumulator(array); + Asserts.assertEQ(result, 448L, "unexpected result"); + } + + private static void initArray(long[] array) { + for (int i = 0; i < array.length; i++) { + array[i] = i; + } + } + + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"}, + counts = {IRNode.ADD_REDUCTION_VI, ">= 1"}) + private static long testReductionOnGlobalAccumulator(long[] array) { + acc = 0; + for (int i = 0; i < array.length; i++) { + acc += array[i]; + } + return acc; + } + + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"}, + counts = {IRNode.ADD_REDUCTION_VI, ">= 1"}) + private static long testReductionOnPartiallyUnrolledLoop(long[] array) { + int sum = 0; + for (int i = 0; i < array.length / 2; i++) { + sum += array[2*i]; + sum += array[2*i + 1]; + } + return sum; + } + + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"}, + counts = {IRNode.ADD_REDUCTION_VI, ">= 1"}) + private static long 
testReductionOnLargePartiallyUnrolledLoop(long[] array) { + int sum = 0; + for (int i = 0; i < array.length / 8; i++) { + sum += array[8*i]; + sum += array[8*i + 1]; + sum += array[8*i + 2]; + sum += array[8*i + 3]; + sum += array[8*i + 4]; + sum += array[8*i + 5]; + sum += array[8*i + 6]; + sum += array[8*i + 7]; + } + return sum; + } + + // This test illustrates a limitation of the current reduction analysis: it + // fails to detect reduction cycles where nodes are connected via different + // input indices (except if the differences result from C2 edge swapping). + // If this limitation is overcome in the future, the test case should be + // turned into a positive one. + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"}, + failOn = {IRNode.ADD_REDUCTION_VI}) + private static long testReductionOnPartiallyUnrolledLoopWithSwappedInputs(long[] array) { + int sum = 0; + for (int i = 0; i < array.length / 2; i++) { + sum = sum + (int)array[2*i]; + sum = (int)array[2*i + 1] + sum; + } + return sum; + } + + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, + applyIfAnd = {"SuperWordReductions", "true","UsePopCountInstruction", "true"}, + counts = {IRNode.ADD_REDUCTION_VI, ">= 1", + IRNode.POPCOUNT_VL, ">= 1"}) + private static long testMapReductionOnGlobalAccumulator(long[] array) { + acc = 0; + for (int i = 0; i < array.length; i++) { + acc += Long.bitCount(array[i]); + } + return acc; + } +} diff --git a/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java b/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java index e4c3330dc9e..27ae2214157 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java +++ b/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -44,6 +44,9 @@ public class FpMinMaxIntrinsics { private Random r = new Random(); + private static int stride = 1; + private static float acc; + @Setup public void init() { c1 = s1 = step(); @@ -127,4 +130,44 @@ public class FpMinMaxIntrinsics { return result; } + + @Benchmark + public float fMinReducePartiallyUnrolled() { + float result = Float.MAX_VALUE; + for (int i = 0; i < COUNT / 2; i++) { + result = Math.min(result, floats[2*i]); + result = Math.min(result, floats[2*i + 1]); + } + return result; + } + + @Benchmark + public float fMinReduceNonCounted() { + float result = Float.MAX_VALUE; + for (int i = 0; i < COUNT; i += stride) + result = Math.min(result, floats[i]); + return result; + } + + @Benchmark + public float fMinReduceGlobalAccumulator() { + acc = Float.MAX_VALUE; + for (int i = 0; i < COUNT; i += stride) + acc = Math.min(acc, floats[i]); + return acc; + } + + @Benchmark + public float fMinReduceInOuterLoop() { + float result = Float.MAX_VALUE; + int count = 0; + for (int i = 0; i < COUNT; i++) { + result = Math.min(result, floats[i]); + for (int j = 0; j < 10; j += stride) { + count++; + } + } + return result + count; + } + } diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java index cc853ae471b..b38330f2b83 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java +++ b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -42,17 +42,17 @@ public abstract class VectorReduction { private int[] intsB; private int[] intsC; private int[] intsD; - private int resI; private long[] longsA; private long[] longsB; private long[] longsC; private long[] longsD; - private long resL; @Param("0") private int seed; private Random r = new Random(seed); + private static int globalResI; + @Setup public void init() { intsA = new int[COUNT]; @@ -75,51 +75,86 @@ public abstract class VectorReduction { } @Benchmark - public void andRedI() { + public void andRedI(Blackhole bh) { + int resI = 0xFFFF; for (int i = 0; i < COUNT; i++) { intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); resI &= intsD[i]; } + bh.consume(resI); } @Benchmark - public void orRedI() { + public void orRedI(Blackhole bh) { + int resI = 0x0000; for (int i = 0; i < COUNT; i++) { intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); resI |= intsD[i]; } + bh.consume(resI); } @Benchmark - public void xorRedI() { + public void xorRedI(Blackhole bh) { + int resI = 0x0000; for (int i = 0; i < COUNT; i++) { intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); resI ^= intsD[i]; } + bh.consume(resI); } @Benchmark - public void andRedL() { + public void andRedL(Blackhole bh) { + long resL = 0xFFFFFFFF; for (int i = 0; i < COUNT; i++) { longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]); resL &= longsD[i]; } + bh.consume(resL); } @Benchmark - public void orRedL() { + public void orRedL(Blackhole bh) { + long resL = 0x00000000; for (int i = 0; i < COUNT; i++) { longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]); resL |= longsD[i]; } + bh.consume(resL); } @Benchmark - public void xorRedL() { + public void xorRedL(Blackhole bh) { + long resL = 0x00000000; for (int i = 0; i < COUNT; i++) { longsD[i] = (longsA[i] + 
longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]); resL ^= longsD[i]; } + bh.consume(resL); + } + + @Benchmark + public void andRedIPartiallyUnrolled(Blackhole bh) { + int resI = 0xFFFF; + for (int i = 0; i < COUNT / 2; i++) { + int j = 2*i; + intsD[j] = (intsA[j] * intsB[j]) + (intsA[j] * intsC[j]) + (intsB[j] * intsC[j]); + resI &= intsD[j]; + j = 2*i + 1; + intsD[j] = (intsA[j] * intsB[j]) + (intsA[j] * intsC[j]) + (intsB[j] * intsC[j]); + resI &= intsD[j]; + } + bh.consume(resI); + } + + @Benchmark + public void andRedIOnGlobalAccumulator() { + globalResI = 0xFFFF; + for (int i = 0; i < COUNT; i++) { + intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); + globalResI &= intsD[i]; + } } @Fork(value = 2, jvmArgsPrepend = {