diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad index fd64a684674..ff381e5e7a2 100644 --- a/src/hotspot/cpu/x86/x86_64.ad +++ b/src/hotspot/cpu/x86/x86_64.ad @@ -5374,7 +5374,7 @@ instruct loadD(regD dst, memory mem) // max = java.lang.Math.max(float a, float b) instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{ - predicate(UseAVX > 0 && !n->is_reduction()); + predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); match(Set dst (MaxF a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ @@ -5396,7 +5396,7 @@ instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, %} instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && n->is_reduction()); + predicate(UseAVX > 0 && SuperWord::is_reduction(n)); match(Set dst (MaxF a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); @@ -5410,7 +5410,7 @@ instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe // max = java.lang.Math.max(double a, double b) instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{ - predicate(UseAVX > 0 && !n->is_reduction()); + predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); match(Set dst (MaxD a b)); effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp); format %{ @@ -5432,7 +5432,7 @@ instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, %} instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && n->is_reduction()); + predicate(UseAVX > 0 && SuperWord::is_reduction(n)); match(Set dst (MaxD a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); @@ -5446,7 +5446,7 @@ instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRe // min = java.lang.Math.min(float a, float b) instruct minF_reg(legRegF dst, legRegF a, 
legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{ - predicate(UseAVX > 0 && !n->is_reduction()); + predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); match(Set dst (MinF a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ @@ -5468,7 +5468,7 @@ instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, %} instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && n->is_reduction()); + predicate(UseAVX > 0 && SuperWord::is_reduction(n)); match(Set dst (MinF a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); @@ -5482,7 +5482,7 @@ instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe // min = java.lang.Math.min(double a, double b) instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{ - predicate(UseAVX > 0 && !n->is_reduction()); + predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); match(Set dst (MinD a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ @@ -5504,7 +5504,7 @@ instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, %} instruct minD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && n->is_reduction()); + predicate(UseAVX > 0 && SuperWord::is_reduction(n)); match(Set dst (MinD a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); diff --git a/src/hotspot/share/adlc/main.cpp b/src/hotspot/share/adlc/main.cpp index dce3f2309f8..ff379809b0c 100644 --- a/src/hotspot/share/adlc/main.cpp +++ b/src/hotspot/share/adlc/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -271,6 +271,7 @@ int main(int argc, char *argv[]) AD.addInclude(AD._DFA_file, "opto/narrowptrnode.hpp"); AD.addInclude(AD._DFA_file, "opto/opcodes.hpp"); AD.addInclude(AD._DFA_file, "opto/convertnode.hpp"); + AD.addInclude(AD._DFA_file, "opto/superword.hpp"); AD.addInclude(AD._DFA_file, "utilities/powerOfTwo.hpp"); // Make sure each .cpp file starts with include lines: diff --git a/src/hotspot/share/opto/idealGraphPrinter.cpp b/src/hotspot/share/opto/idealGraphPrinter.cpp index 45eaecda441..1aac87bb2a0 100644 --- a/src/hotspot/share/opto/idealGraphPrinter.cpp +++ b/src/hotspot/share/opto/idealGraphPrinter.cpp @@ -462,8 +462,8 @@ void IdealGraphPrinter::visit_node(Node *n, bool edges, VectorSet* temp_set) { if (flags & Node::Flag_has_call) { print_prop("has_call", "true"); } - if (flags & Node::Flag_is_reduction) { - print_prop("is_reduction", "true"); + if (flags & Node::Flag_has_swapped_edges) { + print_prop("has_swapped_edges", "true"); } if (C->matcher() != nullptr) { diff --git a/src/hotspot/share/opto/loopTransform.cpp b/src/hotspot/share/opto/loopTransform.cpp index 9070a1080f4..2899e7c18a8 100644 --- a/src/hotspot/share/opto/loopTransform.cpp +++ b/src/hotspot/share/opto/loopTransform.cpp @@ -1037,10 +1037,6 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) { } if (UseSuperWord) { - if (!cl->is_reduction_loop()) { - phase->mark_reductions(this); - } - // Only attempt slp analysis when user controls do not prohibit it if (!range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) { // Once policy_slp_analysis succeeds, mark the loop with the @@ -1694,15 +1690,6 @@ void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_n set_idom(new_pre_exit, pre_end, dd_main_head); set_loop(new_pre_exit, outer_loop->_parent); - if (peel_only) { - // Nodes in the peeled iteration that were marked as reductions within the - // original loop might not be 
reductions within their new outer loop. - for (uint i = 0; i < loop->_body.size(); i++) { - Node* n = old_new[loop->_body[i]->_idx]; - n->remove_flag(Node::Flag_is_reduction); - } - } - // Step B2: Build a zero-trip guard for the main-loop. After leaving the // pre-loop, the main-loop may not execute at all. Later in life this // zero-trip guard will become the minimum-trip guard when we unroll @@ -2456,69 +2443,6 @@ void PhaseIdealLoop::do_maximally_unroll(IdealLoopTree *loop, Node_List &old_new } } -void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) { - if (SuperWordReductions == false) return; - - CountedLoopNode* loop_head = loop->_head->as_CountedLoop(); - if (loop_head->unrolled_count() > 1) { - return; - } - - Node* trip_phi = loop_head->phi(); - for (DUIterator_Fast imax, i = loop_head->fast_outs(imax); i < imax; i++) { - Node* phi = loop_head->fast_out(i); - if (phi->is_Phi() && phi->outcnt() > 0 && phi != trip_phi) { - // For definitions which are loop inclusive and not tripcounts. - Node* def_node = phi->in(LoopNode::LoopBackControl); - - if (def_node != nullptr) { - Node* n_ctrl = get_ctrl(def_node); - if (n_ctrl != nullptr && loop->is_member(get_loop(n_ctrl))) { - // Now test it to see if it fits the standard pattern for a reduction operator. 
- int opc = def_node->Opcode(); - if (opc != ReductionNode::opcode(opc, def_node->bottom_type()->basic_type()) - || opc == Op_MinD || opc == Op_MinF || opc == Op_MaxD || opc == Op_MaxF) { - if (!def_node->is_reduction()) { // Not marked yet - // To be a reduction, the arithmetic node must have the phi as input and provide a def to it - bool ok = false; - for (unsigned j = 1; j < def_node->req(); j++) { - Node* in = def_node->in(j); - if (in == phi) { - ok = true; - break; - } - } - - // do nothing if we did not match the initial criteria - if (ok == false) { - continue; - } - - // The result of the reduction must not be used in the loop - for (DUIterator_Fast imax, i = def_node->fast_outs(imax); i < imax && ok; i++) { - Node* u = def_node->fast_out(i); - if (!loop->is_member(get_loop(ctrl_or_self(u)))) { - continue; - } - if (u == phi) { - continue; - } - ok = false; - } - - // iff the uses conform - if (ok) { - def_node->add_flag(Node::Flag_is_reduction); - loop_head->mark_has_reductions(); - } - } - } - } - } - } - } -} - //------------------------------adjust_limit----------------------------------- // Helper function that computes new loop limit as (rc_limit-offset)/scale Node* PhaseIdealLoop::adjust_limit(bool is_positive_stride, Node* scale, Node* offset, Node* rc_limit, Node* old_limit, Node* pre_ctrl, bool round) { diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index 035c19419e3..73cb22c6db8 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -2249,7 +2249,6 @@ void CountedLoopNode::dump_spec(outputStream *st) const { if (is_pre_loop ()) st->print("pre of N%d" , _main_idx); if (is_main_loop()) st->print("main of N%d", _idx); if (is_post_loop()) st->print("post of N%d", _main_idx); - if (is_reduction_loop()) st->print(" reduction"); if (is_strip_mined()) st->print(" strip mined"); } #endif @@ -3991,7 +3990,6 @@ void IdealLoopTree::dump_head() { if (cl->is_pre_loop ()) 
tty->print(" pre" ); if (cl->is_main_loop()) tty->print(" main"); if (cl->is_post_loop()) tty->print(" post"); - if (cl->is_reduction_loop()) tty->print(" reduction"); if (cl->is_vectorized_loop()) tty->print(" vector"); if (range_checks_present()) tty->print(" rc "); if (cl->is_multiversioned()) tty->print(" multi "); diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index c781cc4651f..459021120ed 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -61,23 +61,22 @@ protected: uint _loop_flags; // Names for flag bitfields enum { Normal=0, Pre=1, Main=2, Post=3, PreMainPostFlagsMask=3, - MainHasNoPreLoop = 1<<2, - HasExactTripCount = 1<<3, - InnerLoop = 1<<4, - PartialPeelLoop = 1<<5, - PartialPeelFailed = 1<<6, - HasReductions = 1<<7, - WasSlpAnalyzed = 1<<8, - PassedSlpAnalysis = 1<<9, - DoUnrollOnly = 1<<10, - VectorizedLoop = 1<<11, - HasAtomicPostLoop = 1<<12, - IsMultiversioned = 1<<13, - StripMined = 1<<14, - SubwordLoop = 1<<15, - ProfileTripFailed = 1<<16, - LoopNestInnerLoop = 1 << 17, - LoopNestLongOuterLoop = 1 << 18}; + MainHasNoPreLoop = 1<<2, + HasExactTripCount = 1<<3, + InnerLoop = 1<<4, + PartialPeelLoop = 1<<5, + PartialPeelFailed = 1<<6, + WasSlpAnalyzed = 1<<7, + PassedSlpAnalysis = 1<<8, + DoUnrollOnly = 1<<9, + VectorizedLoop = 1<<10, + HasAtomicPostLoop = 1<<11, + IsMultiversioned = 1<<12, + StripMined = 1<<13, + SubwordLoop = 1<<14, + ProfileTripFailed = 1<<15, + LoopNestInnerLoop = 1<<16, + LoopNestLongOuterLoop = 1<<17}; char _unswitch_count; enum { _unswitch_max=3 }; char _postloop_flags; @@ -105,7 +104,6 @@ public: bool is_loop_nest_outer_loop() const { return _loop_flags & LoopNestLongOuterLoop; } void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; } - void mark_has_reductions() { _loop_flags |= HasReductions; } void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; } void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; } void 
mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; } @@ -286,7 +284,6 @@ public: bool is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; } bool is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; } bool is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; } - bool is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; } bool was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; } bool has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; } bool is_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; } @@ -1313,9 +1310,6 @@ public: // Unroll the loop body one step - make each trip do 2 iterations. void do_unroll( IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip ); - // Mark vector reduction candidates before loop unrolling - void mark_reductions( IdealLoopTree *loop ); - // Return true if exp is a constant times an induction var bool is_scaled_iv(Node* exp, Node* iv, BasicType bt, jlong* p_scale, bool* p_short_scale, int depth = 0); diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 1e485cc73dc..53fce009040 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -2622,10 +2622,6 @@ void PhaseIdealLoop::clone_loop_body(const Node_List& body, Node_List &old_new, Node* old = body.at(i); Node* nnn = old->clone(); old_new.map(old->_idx, nnn); - if (old->is_reduction()) { - // Reduction flag is not copied by default. Copy it here when cloning the entire loop body. 
- nnn->add_flag(Node::Flag_is_reduction); - } if (C->do_vector_loop() && cm != nullptr) { cm->verify_insert_and_clone(old, nnn, cm->clone_idx()); } diff --git a/src/hotspot/share/opto/node.cpp b/src/hotspot/share/opto/node.cpp index dde202023fd..f781fa28785 100644 --- a/src/hotspot/share/opto/node.cpp +++ b/src/hotspot/share/opto/node.cpp @@ -521,10 +521,6 @@ Node *Node::clone() const { // If it is applicable, it will happen anyway when the cloned node is registered with IGVN. n->remove_flag(Node::NodeFlags::Flag_for_post_loop_opts_igvn); } - if (n->is_reduction()) { - // Do not copy reduction information. This must be explicitly set by the calling code. - n->remove_flag(Node::Flag_is_reduction); - } BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2(); bs->register_potential_barrier_node(n); diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index e3c10822429..a81359e8459 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -578,6 +578,12 @@ public: _in[i2] = n1; // If this node is in the hash table, make sure it doesn't need a rehash. assert(check_hash == NO_HASH || check_hash == hash(), "edge swap must preserve hash code"); + // Flip swapped edges flag. 
+ if (has_swapped_edges()) { + remove_flag(Node::Flag_has_swapped_edges); + } else { + add_flag(Node::Flag_has_swapped_edges); + } } // Iterators over input Nodes for a Node X are written as: @@ -784,7 +790,7 @@ public: Flag_avoid_back_to_back_before = 1 << 8, Flag_avoid_back_to_back_after = 1 << 9, Flag_has_call = 1 << 10, - Flag_is_reduction = 1 << 11, + Flag_has_swapped_edges = 1 << 11, Flag_is_scheduled = 1 << 12, Flag_is_expensive = 1 << 13, Flag_is_predicated_vector = 1 << 14, @@ -1001,10 +1007,8 @@ public: bool is_macro() const { return (_flags & Flag_is_macro) != 0; } // The node is expensive: the best control is set during loop opts bool is_expensive() const { return (_flags & Flag_is_expensive) != 0 && in(0) != nullptr; } - - // An arithmetic node which accumulates a data in a loop. - // It must have the loop's phi as input and provide a def to the phi. - bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; } + // The node's original edge position is swapped. + bool has_swapped_edges() const { return (_flags & Flag_has_swapped_edges) != 0; } bool is_predicated_vector() const { return (_flags & Flag_is_predicated_vector) != 0; } diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index a782fedfec5..af8f08f722b 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -72,6 +72,7 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) : _lpt(nullptr), // loop tree node _lp(nullptr), // CountedLoopNode _pre_loop_end(nullptr), // Pre loop CountedLoopEndNode + _loop_reductions(arena()), // reduction nodes in the current loop _bb(nullptr), // basic block _iv(nullptr), // induction var _race_possible(false), // cases where SDMU is true @@ -111,7 +112,17 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { return false; // skip malformed counted loop } - if (cl->is_rce_post_loop() && cl->is_reduction_loop()) { + // Initialize simple data used by reduction 
marking early. + set_lpt(lpt); + set_lp(cl); + // For now, define one block which is the entire loop body. + set_bb(cl); + + if (SuperWordReductions) { + mark_reductions(); + } + + if (cl->is_rce_post_loop() && is_marked_reduction_loop()) { // Post loop vectorization doesn't support reductions return false; } @@ -167,18 +178,12 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { init(); // initialize data structures - set_lpt(lpt); - set_lp(cl); - - // For now, define one block which is the entire loop body - set_bb(cl); - bool success = true; if (do_optimization) { assert(_packset.length() == 0, "packset must be empty"); success = SLP_extract(); if (PostLoopMultiversioning) { - if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) { + if (cl->is_vectorized_loop() && cl->is_main_loop() && !is_marked_reduction_loop()) { IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next; CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop(); // Main loop SLP works well for manually unrolled loops. But post loop @@ -223,7 +228,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) { for (uint i = 0; i < lpt()->_body.size(); i++) { Node* n = lpt()->_body.at(i); if (n == cl->incr() || - n->is_reduction() || + is_marked_reduction(n) || n->is_AddP() || n->is_Cmp() || n->is_Bool() || @@ -411,6 +416,139 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) { } } +bool SuperWord::is_reduction(const Node* n) { + if (!is_reduction_operator(n)) { + return false; + } + // Test whether there is a reduction cycle via every edge index + // (typically indices 1 and 2). 
+ for (uint input = 1; input < n->req(); input++) { + if (in_reduction_cycle(n, input)) { + return true; + } + } + return false; +} + +bool SuperWord::is_reduction_operator(const Node* n) { + int opc = n->Opcode(); + return (opc != ReductionNode::opcode(opc, n->bottom_type()->basic_type())); +} + +bool SuperWord::in_reduction_cycle(const Node* n, uint input) { + // First find input reduction path to phi node. + auto has_my_opcode = [&](const Node* m){ return m->Opcode() == n->Opcode(); }; + PathEnd path_to_phi = find_in_path(n, input, LoopMaxUnroll, has_my_opcode, + [&](const Node* m) { return m->is_Phi(); }); + const Node* phi = path_to_phi.first; + if (phi == nullptr) { + return false; + } + // If there is an input reduction path from the phi's loop-back to n, then n + // is part of a reduction cycle. + const Node* first = phi->in(LoopNode::LoopBackControl); + PathEnd path_from_phi = find_in_path(first, input, LoopMaxUnroll, has_my_opcode, + [&](const Node* m) { return m == n; }); + return path_from_phi.first != nullptr; +} + +Node* SuperWord::original_input(const Node* n, uint i) { + if (n->has_swapped_edges()) { + assert(n->is_Add() || n->is_Mul(), "n should be commutative"); + if (i == 1) { + return n->in(2); + } else if (i == 2) { + return n->in(1); + } + } + return n->in(i); +} + +void SuperWord::mark_reductions() { + + _loop_reductions.clear(); + + // Iterate through all phi nodes associated to the loop and search for + // reduction cycles in the basic block. + for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) { + const Node* phi = lp()->fast_out(i); + if (!phi->is_Phi()) { + continue; + } + if (phi->outcnt() == 0) { + continue; + } + if (phi == iv()) { + continue; + } + // The phi's loop-back is considered the first node in the reduction cycle. + const Node* first = phi->in(LoopNode::LoopBackControl); + if (first == nullptr) { + continue; + } + // Test that the node fits the standard pattern for a reduction operator. 
+ if (!is_reduction_operator(first)) { + continue; + } + // Test that 'first' is the beginning of a reduction cycle ending in 'phi'. + // To contain the number of searched paths, assume that all nodes in a + // reduction cycle are connected via the same edge index, modulo swapped + // inputs. This assumption is realistic because reduction cycles usually + // consist of nodes cloned by loop unrolling. + int reduction_input = -1; + int path_nodes = -1; + for (uint input = 1; input < first->req(); input++) { + // Test whether there is a reduction path in the basic block from 'first' + // to the phi node following edge index 'input'. + PathEnd path = + find_in_path( + first, input, lpt()->_body.size(), + [&](const Node* n) { return n->Opcode() == first->Opcode() && in_bb(n); }, + [&](const Node* n) { return n == phi; }); + if (path.first != nullptr) { + reduction_input = input; + path_nodes = path.second; + break; + } + } + if (reduction_input == -1) { + continue; + } + // Test that reduction nodes do not have any users in the loop besides their + // reduction cycle successors. + const Node* current = first; + const Node* succ = phi; // current's successor in the reduction cycle. + bool used_in_loop = false; + for (int i = 0; i < path_nodes; i++) { + for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) { + Node* u = current->fast_out(j); + if (!in_bb(u)) { + continue; + } + if (u == succ) { + continue; + } + used_in_loop = true; + break; + } + if (used_in_loop) { + break; + } + succ = current; + current = original_input(current, reduction_input); + } + if (used_in_loop) { + continue; + } + // Reduction cycle found. Mark all nodes in the found path as reductions. 
+ current = first; + for (int i = 0; i < path_nodes; i++) { + _loop_reductions.set(current->_idx); + current = original_input(current, reduction_input); + } + } +} + //------------------------------SLP_extract--------------------------- // Extract the superword level parallelism // @@ -1378,7 +1516,7 @@ bool SuperWord::independent(Node* s1, Node* s2) { // those nodes, and have not found another node from the pack, we know // that all nodes in the pack are independent. Node* SuperWord::find_dependence(Node_List* p) { - if (p->at(0)->is_reduction()) { + if (is_marked_reduction(p->at(0))) { return nullptr; // ignore reductions } ResourceMark rm; @@ -1436,7 +1574,7 @@ bool SuperWord::reduction(Node* s1, Node* s2) { int d1 = depth(s1); int d2 = depth(s2); if (d2 > d1) { - if (s1->is_reduction() && s2->is_reduction()) { + if (is_marked_reduction(s1) && is_marked_reduction(s2)) { // This is an ordered set, so s1 should define s2 for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { Node* t1 = s1->fast_out(i); @@ -1653,7 +1791,7 @@ void SuperWord::order_def_uses(Node_List* p) { if (s1->is_Store()) return; // reductions are always managed beforehand - if (s1->is_reduction()) return; + if (is_marked_reduction(s1)) return; for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { Node* t1 = s1->fast_out(i); @@ -1689,15 +1827,15 @@ void SuperWord::order_def_uses(Node_List* p) { bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) { // check reductions to see if they are marshalled to represent the reduction // operator in a specified opnd - if (u1->is_reduction() && u2->is_reduction()) { + if (is_marked_reduction(u1) && is_marked_reduction(u2)) { // ensure reductions have phis and reduction definitions feeding the 1st operand Node* first = u1->in(2); - if (first->is_Phi() || first->is_reduction()) { + if (first->is_Phi() || is_marked_reduction(first)) { u1->swap_edges(1, 2); } // ensure reductions have phis and reduction 
definitions feeding the 1st operand first = u2->in(2); - if (first->is_Phi() || first->is_reduction()) { + if (first->is_Phi() || is_marked_reduction(first)) { u2->swap_edges(1, 2); } return true; @@ -1920,7 +2058,7 @@ void SuperWord::filter_packs() { remove_pack_at(i); } Node *n = pk->at(0); - if (n->is_reduction()) { + if (is_marked_reduction(n)) { _num_reductions++; } else { _num_work_vecs++; @@ -2171,7 +2309,7 @@ bool SuperWord::implemented(Node_List* p) { if (p0 != nullptr) { int opc = p0->Opcode(); uint size = p->size(); - if (p0->is_reduction()) { + if (is_marked_reduction(p0)) { const Type *arith_type = p0->bottom_type(); // Length 2 reductions of INT/LONG do not offer performance benefits if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) { @@ -2261,13 +2399,13 @@ bool SuperWord::profitable(Node_List* p) { } } // Check if reductions are connected - if (p0->is_reduction()) { + if (is_marked_reduction(p0)) { Node* second_in = p0->in(2); Node_List* second_pk = my_pack(second_in); if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) { - // Remove reduction flag if no parent pack or if not enough work + // Unmark reduction if no parent pack or if not enough work // to cover reduction expansion overhead - p0->remove_flag(Node::Flag_is_reduction); + _loop_reductions.remove(p0->_idx); return false; } else if (second_pk->size() != p->size()) { return false; @@ -2299,7 +2437,7 @@ bool SuperWord::profitable(Node_List* p) { if (def == n) { // Reductions should only have a Phi use at the loop head or a non-phi use // outside of the loop if it is the last element of the pack (e.g. SafePoint). 
- if (def->is_reduction() && + if (is_marked_reduction(def) && ((use->is_Phi() && use->in(0) == _lpt->_head) || (!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) { continue; @@ -2442,7 +2580,7 @@ public: for (DepPreds preds(n, dg); !preds.done(); preds.next()) { Node* pred = preds.current(); int pred_pid = get_pid_or_zero(pred); - if (pred_pid == pid && n->is_reduction()) { + if (pred_pid == pid && _slp->is_marked_reduction(n)) { continue; // reduction -> self-cycle is not a cyclic dependency } // Only add edges once, and only for mapped nodes (in block) @@ -2992,7 +3130,7 @@ bool SuperWord::output() { } else if (n->req() == 3 && !is_cmov_pack(p)) { // Promote operands to vector Node* in1 = nullptr; - bool node_isa_reduction = n->is_reduction(); + bool node_isa_reduction = is_marked_reduction(n); if (node_isa_reduction) { // the input to the first reduction operation is retained in1 = low_adr->in(1); @@ -3246,7 +3384,7 @@ bool SuperWord::output() { Node* SuperWord::create_post_loop_vmask() { CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); assert(cl->is_rce_post_loop(), "Must be an rce post loop"); - assert(!cl->is_reduction_loop(), "no vector reduction in post loop"); + assert(!is_marked_reduction_loop(), "no vector reduction in post loop"); assert(abs(cl->stride_con()) == 1, "post loop stride can only be +/-1"); // Collect vector element types of all post loop packs. 
Also collect @@ -3524,7 +3662,7 @@ void SuperWord::insert_extracts(Node_List* p) { _n_idx_list.pop(); Node* def = use->in(idx); - if (def->is_reduction()) continue; + if (is_marked_reduction(def)) continue; // Insert extract operation _igvn.hash_delete(def); @@ -3547,7 +3685,7 @@ void SuperWord::insert_extracts(Node_List* p) { bool SuperWord::is_vector_use(Node* use, int u_idx) { Node_List* u_pk = my_pack(use); if (u_pk == nullptr) return false; - if (use->is_reduction()) return true; + if (is_marked_reduction(use)) return true; Node* def = use->in(u_idx); Node_List* d_pk = my_pack(def); if (d_pk == nullptr) { @@ -3708,7 +3846,7 @@ bool SuperWord::construct_bb() { if (in_bb(use) && !visited_test(use) && // Don't go around backedge (!use->is_Phi() || n == entry)) { - if (use->is_reduction()) { + if (is_marked_reduction(use)) { // First see if we can map the reduction on the given system we are on, then // make a data entry operation for each reduction we see. BasicType bt = use->bottom_type()->basic_type(); @@ -4345,10 +4483,6 @@ void SuperWord::init() { _iteration_last.clear(); _node_info.clear(); _align_to_ref = nullptr; - _lpt = nullptr; - _lp = nullptr; - _bb = nullptr; - _iv = nullptr; _race_possible = 0; _early_return = false; _num_work_vecs = 0; diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 1317ac9bb81..70e97e9444c 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -29,6 +29,7 @@ #include "opto/phaseX.hpp" #include "opto/vectornode.hpp" #include "utilities/growableArray.hpp" +#include "utilities/pair.hpp" #include "libadt/dict.hpp" // @@ -357,6 +358,7 @@ class SuperWord : public ResourceObj { IdealLoopTree* _lpt; // Current loop tree node CountedLoopNode* _lp; // Current CountedLoopNode CountedLoopEndNode* _pre_loop_end; // Current CountedLoopEndNode of pre loop + VectorSet _loop_reductions; // Reduction nodes in the current loop Node* _bb; // Current basic block 
PhiNode* _iv; // Induction var bool _race_possible; // In cases where SDMU is true @@ -471,6 +473,62 @@ class SuperWord : public ResourceObj { // methods + typedef const Pair<const Node*, int> PathEnd; + + // Search for a path P = (n_1, n_2, ..., n_k) such that: + // - original_input(n_i, input) = n_i+1 for all 1 <= i < k, + // - path(n) for all n in P, + // - k <= max, and + // - there exists a node e such that original_input(n_k, input) = e and end(e). + // Return <e, k>, if P is found, or <nullptr, -1> otherwise. + // Note that original_input(n, i) has the same behavior as n->in(i) except + // that it commutes the inputs of binary nodes whose edges have been swapped. + template <typename NodePredicate1, typename NodePredicate2> + static PathEnd find_in_path(const Node *n1, uint input, int max, + NodePredicate1 path, NodePredicate2 end) { + const PathEnd no_path(nullptr, -1); + const Node* current = n1; + int k = 0; + for (int i = 0; i <= max; i++) { + if (current == nullptr) { + return no_path; + } + if (end(current)) { + return PathEnd(current, k); + } + if (!path(current)) { + return no_path; + } + current = original_input(current, input); + k++; + } + return no_path; + } + +public: + // Whether n is a reduction operator and part of a reduction cycle. + // This function can be used for individual queries outside the SLP analysis, + // e.g. to inform matching in target-specific code. Otherwise, the + // almost-equivalent but faster SuperWord::mark_reductions() is preferable. + static bool is_reduction(const Node* n); + // Whether n is marked as a reduction node. + bool is_marked_reduction(Node* n) { return _loop_reductions.test(n->_idx); } + // Whether the current loop has any reduction node. + bool is_marked_reduction_loop() { return !_loop_reductions.is_empty(); } +private: + // Whether n is a standard reduction operator. + static bool is_reduction_operator(const Node* n); + // Whether n is part of a reduction cycle via the 'input' edge index. To bound + // the search, constrain the size of reduction cycles to LoopMaxUnroll. 
+ static bool in_reduction_cycle(const Node* n, uint input); + // Reference to the i'th input node of n, commuting the inputs of binary nodes + // whose edges have been swapped. Assumes n is a commutative operation. + static Node* original_input(const Node* n, uint i); + // Find and mark reductions in a loop. Running mark_reductions() is similar to + // querying is_reduction(n) for every n in the SuperWord loop, but stricter in + // that it assumes counted loops and requires that reduction nodes are not + // used within the loop except by their reduction cycle predecessors. + void mark_reductions(); // Extract the superword level parallelism bool SLP_extract(); // Find the adjacent memory references and create pack pairs for them. diff --git a/test/hotspot/jtreg/compiler/c2/irTests/TestSuperwordFailsUnrolling.java b/test/hotspot/jtreg/compiler/c2/irTests/TestSuperwordFailsUnrolling.java index 25845144826..32fd3978a60 100644 --- a/test/hotspot/jtreg/compiler/c2/irTests/TestSuperwordFailsUnrolling.java +++ b/test/hotspot/jtreg/compiler/c2/irTests/TestSuperwordFailsUnrolling.java @@ -1,5 +1,6 @@ /* * Copyright (c) 2022, Red Hat, Inc. All rights reserved. + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -44,9 +45,9 @@ public class TestSuperwordFailsUnrolling { public static void main(String[] args) { Object avx = wb.getVMFlag("UseAVX"); if (avx != null && ((Long)avx) > 2) { - TestFramework.runWithFlags("-XX:UseAVX=2", "-XX:LoopMaxUnroll=8"); + TestFramework.runWithFlags("-XX:UseAVX=2", "-XX:LoopMaxUnroll=8", "-XX:-SuperWordReductions"); } - TestFramework.runWithFlags("-XX:LoopMaxUnroll=8"); + TestFramework.runWithFlags("-XX:LoopMaxUnroll=8", "-XX:-SuperWordReductions"); } @Test diff --git a/test/hotspot/jtreg/compiler/intrinsics/math/TestFpMinMaxReductions.java b/test/hotspot/jtreg/compiler/intrinsics/math/TestFpMinMaxReductions.java new file mode 100644 index 00000000000..47e842f6137 --- /dev/null +++ b/test/hotspot/jtreg/compiler/intrinsics/math/TestFpMinMaxReductions.java @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
 * @requires os.simpleArch == "x64" & (vm.opt.UseAVX == "null" | vm.opt.UseAVX > 0)
{IRNode.MIN_F_REDUCTION_REG, ">= 1"}) + private static float testFloatMinReductionNonCounted() { + float fmin = Float.POSITIVE_INFINITY; + for (int i = 0; i < floatArray.length; i += stride) { + fmin = Math.min(fmin, floatArray[i]); + } + return fmin; + } + + @Test + @IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"}) + private static float testFloatMinReductionGlobalAccumulator() { + acc = Float.POSITIVE_INFINITY; + for (int i = 0; i < floatArray.length; i++) { + acc = Math.min(acc, floatArray[i]); + } + return acc; + } + + @Test + @IR(counts = {IRNode.MIN_F_REDUCTION_REG, ">= 1"}) + private static float testFloatMinReductionInOuterLoop() { + float fmin = Float.POSITIVE_INFINITY; + int count = 0; + for (int i = 0; i < floatArray.length; i++) { + fmin = Math.min(fmin, floatArray[i]); + for (int j = 0; j < 10; j += stride) { + count++; + } + } + return fmin + count; + } + + @Test + @IR(counts = {IRNode.MAX_F_REDUCTION_REG, ">= 1"}) + private static float testFloatMaxReduction() { + float fmax = Float.NEGATIVE_INFINITY; + for (int i = 0; i < floatArray.length; i++) { + fmax = Math.max(fmax, floatArray[i]); + } + return fmax; + } + + @Test + @IR(counts = {IRNode.MIN_D_REG, "1"}, + failOn = {IRNode.MIN_D_REDUCTION_REG}) + private static double testDoubleMin() { + return Math.min(doubleInput1, doubleInput2); + } + + @Test + @IR(counts = {IRNode.MAX_D_REG, "1"}, + failOn = {IRNode.MAX_D_REDUCTION_REG}) + private static double testDoubleMax() { + return Math.max(doubleInput1, doubleInput2); + } + + @Test + @IR(counts = {IRNode.MIN_D_REDUCTION_REG, ">= 1"}) + private static double testDoubleMinReduction() { + double fmin = Double.POSITIVE_INFINITY; + for (int i = 0; i < doubleArray.length; i++) { + fmin = Math.min(fmin, doubleArray[i]); + } + return fmin; + } + + @Test + @IR(counts = {IRNode.MAX_D_REDUCTION_REG, ">= 1"}) + private static double testDoubleMaxReduction() { + double fmax = Double.NEGATIVE_INFINITY; + for (int i = 0; i < doubleArray.length; i++) { + fmax = 
Math.max(fmax, doubleArray[i]); + } + return fmax; + } + +} diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 8c1dbaef277..e4c9a8c11f1 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -649,6 +649,26 @@ public class IRNode { beforeMatchingNameRegex(MAX, "Max(I|L)"); } + public static final String MAX_D_REDUCTION_REG = PREFIX + "MAX_D_REDUCTION_REG" + POSTFIX; + static { + machOnlyNameRegex(MAX_D_REDUCTION_REG, "maxD_reduction_reg"); + } + + public static final String MAX_D_REG = PREFIX + "MAX_D_REG" + POSTFIX; + static { + machOnlyNameRegex(MAX_D_REG, "maxD_reg"); + } + + public static final String MAX_F_REDUCTION_REG = PREFIX + "MAX_F_REDUCTION_REG" + POSTFIX; + static { + machOnlyNameRegex(MAX_F_REDUCTION_REG, "maxF_reduction_reg"); + } + + public static final String MAX_F_REG = PREFIX + "MAX_F_REG" + POSTFIX; + static { + machOnlyNameRegex(MAX_F_REG, "maxF_reg"); + } + public static final String MAX_I = PREFIX + "MAX_I" + POSTFIX; static { beforeMatchingNameRegex(MAX_I, "MaxI"); @@ -679,6 +699,26 @@ public class IRNode { beforeMatchingNameRegex(MIN, "Min(I|L)"); } + public static final String MIN_D_REDUCTION_REG = PREFIX + "MIN_D_REDUCTION_REG" + POSTFIX; + static { + machOnlyNameRegex(MIN_D_REDUCTION_REG, "minD_reduction_reg"); + } + + public static final String MIN_D_REG = PREFIX + "MIN_D_REG" + POSTFIX; + static { + machOnlyNameRegex(MIN_D_REG, "minD_reg"); + } + + public static final String MIN_F_REDUCTION_REG = PREFIX + "MIN_F_REDUCTION_REG" + POSTFIX; + static { + machOnlyNameRegex(MIN_F_REDUCTION_REG, "minF_reduction_reg"); + } + + public static final String MIN_F_REG = PREFIX + "MIN_F_REG" + POSTFIX; + static { + machOnlyNameRegex(MIN_F_REG, "minF_reg"); + } + public static final String MIN_I = PREFIX + "MIN_I" + POSTFIX; static { beforeMatchingNameRegex(MIN_I, "MinI"); diff --git 
a/test/hotspot/jtreg/compiler/loopopts/superword/TestGeneralizedReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestGeneralizedReductions.java new file mode 100644 index 00000000000..60ecaf0e4c8 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestGeneralizedReductions.java @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @bug 8287087 + * @summary Test reduction vectorizations that are enabled by performing SLP + * reduction analysis on unrolled loops. + * @library /test/lib / + * @requires vm.bits == 64 + * @run driver compiler.loopopts.superword.TestGeneralizedReductions + */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; +import jdk.test.lib.Asserts; + +public class TestGeneralizedReductions { + + private static int acc = 0; + + public static void main(String[] args) throws Exception { + // Fix maximum number of unrolls for test stability. 
+ TestFramework.runWithFlags("-XX:LoopMaxUnroll=16"); + } + + @Run(test = {"testReductionOnGlobalAccumulator", + "testReductionOnPartiallyUnrolledLoop", + "testReductionOnLargePartiallyUnrolledLoop", + "testReductionOnPartiallyUnrolledLoopWithSwappedInputs", + "testMapReductionOnGlobalAccumulator"}) + void run() { + long[] array = new long[128]; + long result; + + initArray(array); + result = testReductionOnGlobalAccumulator(array); + Asserts.assertEQ(result, 8128L, "unexpected result"); + + initArray(array); + result = testReductionOnPartiallyUnrolledLoop(array); + Asserts.assertEQ(result, 8128L, "unexpected result"); + + initArray(array); + result = testReductionOnLargePartiallyUnrolledLoop(array); + Asserts.assertEQ(result, 8128L, "unexpected result"); + + initArray(array); + result = testReductionOnPartiallyUnrolledLoopWithSwappedInputs(array); + Asserts.assertEQ(result, 8128L, "unexpected result"); + + initArray(array); + result = testMapReductionOnGlobalAccumulator(array); + Asserts.assertEQ(result, 448L, "unexpected result"); + } + + private static void initArray(long[] array) { + for (int i = 0; i < array.length; i++) { + array[i] = i; + } + } + + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"}, + counts = {IRNode.ADD_REDUCTION_VI, ">= 1"}) + private static long testReductionOnGlobalAccumulator(long[] array) { + acc = 0; + for (int i = 0; i < array.length; i++) { + acc += array[i]; + } + return acc; + } + + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"}, + counts = {IRNode.ADD_REDUCTION_VI, ">= 1"}) + private static long testReductionOnPartiallyUnrolledLoop(long[] array) { + int sum = 0; + for (int i = 0; i < array.length / 2; i++) { + sum += array[2*i]; + sum += array[2*i + 1]; + } + return sum; + } + + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"}, + counts = {IRNode.ADD_REDUCTION_VI, ">= 1"}) + private static long 
testReductionOnLargePartiallyUnrolledLoop(long[] array) { + int sum = 0; + for (int i = 0; i < array.length / 8; i++) { + sum += array[8*i]; + sum += array[8*i + 1]; + sum += array[8*i + 2]; + sum += array[8*i + 3]; + sum += array[8*i + 4]; + sum += array[8*i + 5]; + sum += array[8*i + 6]; + sum += array[8*i + 7]; + } + return sum; + } + + // This test illustrates a limitation of the current reduction analysis: it + // fails to detect reduction cycles where nodes are connected via different + // input indices (except if the differences result from C2 edge swapping). + // If this limitation is overcome in the future, the test case should be + // turned into a positive one. + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, applyIf = {"SuperWordReductions", "true"}, + failOn = {IRNode.ADD_REDUCTION_VI}) + private static long testReductionOnPartiallyUnrolledLoopWithSwappedInputs(long[] array) { + int sum = 0; + for (int i = 0; i < array.length / 2; i++) { + sum = sum + (int)array[2*i]; + sum = (int)array[2*i + 1] + sum; + } + return sum; + } + + @Test + @IR(applyIfCPUFeature = {"avx2", "true"}, + applyIfAnd = {"SuperWordReductions", "true","UsePopCountInstruction", "true"}, + counts = {IRNode.ADD_REDUCTION_VI, ">= 1", + IRNode.POPCOUNT_VL, ">= 1"}) + private static long testMapReductionOnGlobalAccumulator(long[] array) { + acc = 0; + for (int i = 0; i < array.length; i++) { + acc += Long.bitCount(array[i]); + } + return acc; + } +} diff --git a/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java b/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java index e4c3330dc9e..27ae2214157 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java +++ b/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -44,6 +44,9 @@ public class FpMinMaxIntrinsics { private Random r = new Random(); + private static int stride = 1; + private static float acc; + @Setup public void init() { c1 = s1 = step(); @@ -127,4 +130,44 @@ public class FpMinMaxIntrinsics { return result; } + + @Benchmark + public float fMinReducePartiallyUnrolled() { + float result = Float.MAX_VALUE; + for (int i = 0; i < COUNT / 2; i++) { + result = Math.min(result, floats[2*i]); + result = Math.min(result, floats[2*i + 1]); + } + return result; + } + + @Benchmark + public float fMinReduceNonCounted() { + float result = Float.MAX_VALUE; + for (int i = 0; i < COUNT; i += stride) + result = Math.min(result, floats[i]); + return result; + } + + @Benchmark + public float fMinReduceGlobalAccumulator() { + acc = Float.MAX_VALUE; + for (int i = 0; i < COUNT; i += stride) + acc = Math.min(acc, floats[i]); + return acc; + } + + @Benchmark + public float fMinReduceInOuterLoop() { + float result = Float.MAX_VALUE; + int count = 0; + for (int i = 0; i < COUNT; i++) { + result = Math.min(result, floats[i]); + for (int j = 0; j < 10; j += stride) { + count++; + } + } + return result + count; + } + } diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java index cc853ae471b..b38330f2b83 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java +++ b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -42,17 +42,17 @@ public abstract class VectorReduction { private int[] intsB; private int[] intsC; private int[] intsD; - private int resI; private long[] longsA; private long[] longsB; private long[] longsC; private long[] longsD; - private long resL; @Param("0") private int seed; private Random r = new Random(seed); + private static int globalResI; + @Setup public void init() { intsA = new int[COUNT]; @@ -75,51 +75,86 @@ public abstract class VectorReduction { } @Benchmark - public void andRedI() { + public void andRedI(Blackhole bh) { + int resI = 0xFFFF; for (int i = 0; i < COUNT; i++) { intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); resI &= intsD[i]; } + bh.consume(resI); } @Benchmark - public void orRedI() { + public void orRedI(Blackhole bh) { + int resI = 0x0000; for (int i = 0; i < COUNT; i++) { intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); resI |= intsD[i]; } + bh.consume(resI); } @Benchmark - public void xorRedI() { + public void xorRedI(Blackhole bh) { + int resI = 0x0000; for (int i = 0; i < COUNT; i++) { intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); resI ^= intsD[i]; } + bh.consume(resI); } @Benchmark - public void andRedL() { + public void andRedL(Blackhole bh) { + long resL = 0xFFFFFFFF; for (int i = 0; i < COUNT; i++) { longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]); resL &= longsD[i]; } + bh.consume(resL); } @Benchmark - public void orRedL() { + public void orRedL(Blackhole bh) { + long resL = 0x00000000; for (int i = 0; i < COUNT; i++) { longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]); resL |= longsD[i]; } + bh.consume(resL); } @Benchmark - public void xorRedL() { + public void xorRedL(Blackhole bh) { + long resL = 0x00000000; for (int i = 0; i < COUNT; i++) { longsD[i] = (longsA[i] + 
longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]); resL ^= longsD[i]; } + bh.consume(resL); + } + + @Benchmark + public void andRedIPartiallyUnrolled(Blackhole bh) { + int resI = 0xFFFF; + for (int i = 0; i < COUNT / 2; i++) { + int j = 2*i; + intsD[j] = (intsA[j] * intsB[j]) + (intsA[j] * intsC[j]) + (intsB[j] * intsC[j]); + resI &= intsD[j]; + j = 2*i + 1; + intsD[j] = (intsA[j] * intsB[j]) + (intsA[j] * intsC[j]) + (intsB[j] * intsC[j]); + resI &= intsD[j]; + } + bh.consume(resI); + } + + @Benchmark + public void andRedIOnGlobalAccumulator() { + globalResI = 0xFFFF; + for (int i = 0; i < COUNT; i++) { + intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]); + globalResI &= intsD[i]; + } } @Fork(value = 2, jvmArgsPrepend = {