diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index b14295ca15c..1ebc6408a60 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -135,9 +135,9 @@ source %{ (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || - // The vector implementation of Op_AddReductionVD/F is for the Vector API only. - // It is not suitable for auto-vectorization because it does not add the elements - // in the same order as sequential code, and FP addition is non-associative. + // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. + // They are not suitable for auto-vectorization because the result would not conform + // to the JLS, Section Evaluation Order. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || opcode == Op_MulVL) { @@ -2858,14 +2858,14 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{ %} // reduction addF -// Floating-point addition is not associative, so the rules for AddReductionVF -// on NEON can't be used to auto-vectorize floating-point reduce-add. -// Currently, on NEON, AddReductionVF is only generated by Vector API. -instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ - predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2); + +instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ + // Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVF fsrc vsrc)); effect(TEMP_DEF dst); - format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %} + format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %} ins_encode %{ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S); __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister); @@ -2873,11 +2873,13 @@ instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ ins_pipe(pipe_slow); %} -instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ - predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4); +instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ + // Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVF fsrc vsrc)); effect(TEMP_DEF dst, TEMP tmp); - format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %} + format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %} ins_encode %{ __ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister); __ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S); @@ -2886,11 +2888,21 @@ instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ ins_pipe(pipe_slow); %} +// This rule calculates the reduction result in strict order. Two cases will +// reach here: +// 1. Non strictly-ordered AddReductionVF when vector size > 128-bits. 
For example - +// AddReductionVF generated by Vector API. For vector size > 128-bits, it is more +// beneficial performance-wise to generate direct SVE instruction even if it is +// strictly ordered. +// 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by +// auto-vectorization on SVE machine. instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{ - predicate(UseSVE > 0); + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || + n->as_Reduction()->requires_strict_order()); match(Set dst_src1 (AddReductionVF dst_src1 src2)); format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %} ins_encode %{ + assert(UseSVE > 0, "must be sve"); uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2); assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_fadda($dst_src1$$FloatRegister, __ S, ptrue, $src2$$FloatRegister); @@ -2899,14 +2911,14 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{ %} // reduction addD -// Floating-point addition is not associative, so the rule for AddReductionVD -// on NEON can't be used to auto-vectorize floating-point reduce-add. -// Currently, on NEON, AddReductionVD is only generated by Vector API. -instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ - predicate(UseSVE == 0); + +instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ + // Non-strictly ordered floating-point add reduction for doubles. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(!n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVD dsrc vsrc)); effect(TEMP_DEF dst); - format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %} + format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %} ins_encode %{ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D); __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister); @@ -2914,11 +2926,21 @@ instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ ins_pipe(pipe_slow); %} +// This rule calculates the reduction result in strict order. Two cases will +// reach here: +// 1. Non strictly-ordered AddReductionVD when vector size > 128-bits. For example - +// AddReductionVD generated by Vector API. For vector size > 128-bits, it is more +// beneficial performance-wise to generate direct SVE instruction even if it is +// strictly ordered. +// 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by +// auto-vectorization on SVE machine. 
instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{ - predicate(UseSVE > 0); + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || + n->as_Reduction()->requires_strict_order()); match(Set dst_src1 (AddReductionVD dst_src1 src2)); format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %} ins_encode %{ + assert(UseSVE > 0, "must be sve"); uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2); assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_fadda($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister); diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 060bb4a11d4..29f92772368 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -125,9 +125,9 @@ source %{ (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || - // The vector implementation of Op_AddReductionVD/F is for the Vector API only. - // It is not suitable for auto-vectorization because it does not add the elements - // in the same order as sequential code, and FP addition is non-associative. + // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. + // They are not suitable for auto-vectorization because the result would not conform + // to the JLS, Section Evaluation Order. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || opcode == Op_MulVL) { @@ -1752,14 +1752,14 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(I, iRegIorL2I) REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL) // reduction addF -// Floating-point addition is not associative, so the rules for AddReductionVF -// on NEON can't be used to auto-vectorize floating-point reduce-add. -// Currently, on NEON, AddReductionVF is only generated by Vector API. -instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ - predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2); + +instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ + // Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVF fsrc vsrc)); effect(TEMP_DEF dst); - format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %} + format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %} ins_encode %{ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S); __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister); @@ -1767,11 +1767,13 @@ instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ ins_pipe(pipe_slow); %} -instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ - predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4); +instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ + // Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). 
+ predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVF fsrc vsrc)); effect(TEMP_DEF dst, TEMP tmp); - format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %} + format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %} ins_encode %{ __ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister); __ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S); @@ -1783,11 +1785,21 @@ dnl dnl REDUCE_ADD_FP_SVE($1, $2 ) dnl REDUCE_ADD_FP_SVE(type, size) define(`REDUCE_ADD_FP_SVE', ` +// This rule calculates the reduction result in strict order. Two cases will +// reach here: +// 1. Non strictly-ordered AddReductionV$1 when vector size > 128-bits. For example - +// AddReductionV$1 generated by Vector API. For vector size > 128-bits, it is more +// beneficial performance-wise to generate direct SVE instruction even if it is +// strictly ordered. +// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by +// auto-vectorization on SVE machine. instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{ - predicate(UseSVE > 0); + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || + n->as_Reduction()->requires_strict_order()); match(Set dst_src1 (AddReductionV$1 dst_src1 src2)); format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %} ins_encode %{ + assert(UseSVE > 0, "must be sve"); uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2); assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_fadda($dst_src1$$FloatRegister, __ $2, ptrue, $src2$$FloatRegister); @@ -1798,14 +1810,14 @@ dnl REDUCE_ADD_FP_SVE(F, S) // reduction addD -// Floating-point addition is not associative, so the rule for AddReductionVD -// on NEON can't be used to auto-vectorize floating-point reduce-add. -// Currently, on NEON, AddReductionVD is only generated by Vector API. -instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ - predicate(UseSVE == 0); + +instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ + // Non-strictly ordered floating-point add reduction for doubles. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(!n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVD dsrc vsrc)); effect(TEMP_DEF dst); - format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %} + format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %} ins_encode %{ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D); __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister); diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index ad603439e59..90ef4da4f1e 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -1460,7 +1460,7 @@ public: }; AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared); - // Move UnorderedReduction out of loop if possible + // Move an unordered Reduction out of loop if possible void move_unordered_reduction_out_of_loop(IdealLoopTree* loop); // Create a scheduled list of nodes control dependent on ctrl set. diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index a3227d47832..b0effb6d4f5 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4310,11 +4310,19 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) { return AutoVectorizeStatus::Success; } +// Returns true if the Reduction node is unordered. +static bool is_unordered_reduction(Node* n) { + return n->is_Reduction() && !n->as_Reduction()->requires_strict_order(); +} + // Having ReductionNodes in the loop is expensive. They need to recursively // fold together the vector values, for every vectorized loop iteration. If // we encounter the following pattern, we can vector accumulate the values // inside the loop, and only have a single UnorderedReduction after the loop. // +// Note: UnorderedReduction represents a ReductionNode which does not require +// calculating in strict order. +// // CountedLoop init // | | // +------+ | +-----------------------+ @@ -4354,21 +4362,24 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) { // wise. This is a single operation per vector_accumulator, rather than many // for a UnorderedReduction. We can then reduce the last vector_accumulator // after the loop, and also reduce the init value into it. +// // We can not do this with all reductions. Some reductions do not allow the -// reordering of operations (for example float addition). +// reordering of operations (for example float addition/multiplication require +// strict order). void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity"); - // Find all Phi nodes with UnorderedReduction on backedge. + // Find all Phi nodes with an unordered Reduction on backedge. CountedLoopNode* cl = loop->_head->as_CountedLoop(); for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) { Node* phi = cl->fast_out(j); - // We have a phi with a single use, and a UnorderedReduction on the backedge. - if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) { + // We have a phi with a single use, and an unordered Reduction on the backedge. + if (!phi->is_Phi() || phi->outcnt() != 1 || !is_unordered_reduction(phi->in(2))) { continue; } - UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction(); + ReductionNode* last_ur = phi->in(2)->as_Reduction(); + assert(!last_ur->requires_strict_order(), "must be"); // Determine types const TypeVect* vec_t = last_ur->vect_type(); @@ -4385,14 +4396,14 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { continue; // not implemented -> fails } - // Traverse up the chain of UnorderedReductions, checking that it loops back to - // the phi. Check that all UnorderedReductions only have a single use, except for + // Traverse up the chain of unordered Reductions, checking that it loops back to + // the phi. 
Check that all unordered Reductions only have a single use, except for // the last (last_ur), which only has phi as a use in the loop, and all other uses // are outside the loop. - UnorderedReductionNode* current = last_ur; - UnorderedReductionNode* first_ur = nullptr; + ReductionNode* current = last_ur; + ReductionNode* first_ur = nullptr; while (true) { - assert(current->is_UnorderedReduction(), "sanity"); + assert(!current->requires_strict_order(), "sanity"); // Expect no ctrl and a vector_input from within the loop. Node* ctrl = current->in(0); @@ -4409,7 +4420,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { break; // Chain traversal fails. } - // Expect single use of UnorderedReduction, except for last_ur. + // Expect single use of an unordered Reduction, except for last_ur. if (current == last_ur) { // Expect all uses to be outside the loop, except phi. for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) { @@ -4427,12 +4438,13 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { } } - // Expect another UnorderedReduction or phi as the scalar input. + // Expect another unordered Reduction or phi as the scalar input. Node* scalar_input = current->in(1); - if (scalar_input->is_UnorderedReduction() && + if (is_unordered_reduction(scalar_input) && scalar_input->Opcode() == current->Opcode()) { - // Move up the UnorderedReduction chain. - current = scalar_input->as_UnorderedReduction(); + // Move up the unordered Reduction chain. + current = scalar_input->as_Reduction(); + assert(!current->requires_strict_order(), "must be"); } else if (scalar_input == phi) { // Chain terminates at phi. first_ur = current; @@ -4456,7 +4468,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t); register_new_node(identity_vector, C->root()); assert(vec_t == identity_vector->vect_type(), "matching vector type"); - VectorNode::trace_new_vector(identity_vector, "UnorderedReduction"); + VectorNode::trace_new_vector(identity_vector, "Unordered Reduction"); // Turn the scalar phi into a vector phi. _igvn.rehash_node_delayed(phi); @@ -4465,7 +4477,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { phi->as_Type()->set_type(vec_t); _igvn.set_type(phi, vec_t); - // Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators. + // Traverse down the chain of unordered Reductions, and replace them with vector_accumulators. current = first_ur; while (true) { // Create vector_accumulator to replace current. @@ -4474,11 +4486,12 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t); register_new_node(vector_accumulator, cl); _igvn.replace_node(current, vector_accumulator); - VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction"); + VectorNode::trace_new_vector(vector_accumulator, "Unordered Reduction"); if (current == last_ur) { break; } - current = vector_accumulator->unique_out()->as_UnorderedReduction(); + current = vector_accumulator->unique_out()->as_Reduction(); + assert(!current->requires_strict_order(), "must be"); } // Create post-loop reduction. 
@@ -4495,7 +4508,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { } } register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl)); - VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction"); + VectorNode::trace_new_vector(post_loop_reduction, "Unordered Reduction"); assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction"); assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator"); diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index dc9dc6654b5..ae379c4833a 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -175,7 +175,6 @@ class SubTypeCheckNode; class Type; class TypeNode; class UnlockNode; -class UnorderedReductionNode; class VectorNode; class LoadVectorNode; class LoadVectorMaskedNode; @@ -739,7 +738,6 @@ public: DEFINE_CLASS_ID(ExpandV, Vector, 5) DEFINE_CLASS_ID(CompressM, Vector, 6) DEFINE_CLASS_ID(Reduction, Vector, 7) - DEFINE_CLASS_ID(UnorderedReduction, Reduction, 0) DEFINE_CLASS_ID(NegV, Vector, 8) DEFINE_CLASS_ID(Con, Type, 8) DEFINE_CLASS_ID(ConI, Con, 0) @@ -991,7 +989,6 @@ public: DEFINE_CLASS_QUERY(Sub) DEFINE_CLASS_QUERY(SubTypeCheck) DEFINE_CLASS_QUERY(Type) - DEFINE_CLASS_QUERY(UnorderedReduction) DEFINE_CLASS_QUERY(Vector) DEFINE_CLASS_QUERY(VectorMaskCmp) DEFINE_CLASS_QUERY(VectorUnbox) diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index 3ef6ae02534..b31f6ace5a6 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -1616,21 +1616,23 @@ bool LibraryCallKit::inline_vector_reduction() { } Node* init = ReductionNode::make_identity_con_scalar(gvn(), opc, elem_bt); - Node* value = nullptr; - if (mask == nullptr) { - assert(!is_masked_op, "Masked op needs the mask value never null"); - value = ReductionNode::make(opc, nullptr, init, opd, elem_bt); - } else { - if (use_predicate) { - value = ReductionNode::make(opc, nullptr, init, opd, elem_bt); - value->add_req(mask); - value->add_flag(Node::Flag_is_predicated_vector); - } else { - Node* reduce_identity = gvn().transform(VectorNode::scalar2vector(init, num_elem, Type::get_const_basic_type(elem_bt))); - value = gvn().transform(new VectorBlendNode(reduce_identity, opd, mask)); - value = ReductionNode::make(opc, nullptr, init, value, elem_bt); - } + Node* value = opd; + + assert(mask != nullptr || !is_masked_op, "Masked op needs the mask value never null"); + if (mask != nullptr && !use_predicate) { + Node* reduce_identity = gvn().transform(VectorNode::scalar2vector(init, num_elem, Type::get_const_basic_type(elem_bt))); + value = gvn().transform(new VectorBlendNode(reduce_identity, value, mask)); } + + // Make an unordered Reduction node. This affects only AddReductionVF/VD and MulReductionVF/VD, + // as these operations are allowed to be associative (not requiring strict order) in VectorAPI. 
+  value = ReductionNode::make(opc, nullptr, init, value, elem_bt, /* requires_strict_order */ false);
+
+  if (mask != nullptr && use_predicate) {
+    value->add_req(mask);
+    value->add_flag(Node::Flag_is_predicated_vector);
+  }
+  value = gvn().transform(value);

   Node* bits = nullptr;
diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp
index b14a7f7b165..d560f112039 100644
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@@ -1296,7 +1296,8 @@ int ReductionNode::opcode(int opc, BasicType bt) {
 }

 // Return the appropriate reduction node.
-ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) {
+ReductionNode* ReductionNode::make(int opc, Node* ctrl, Node* n1, Node* n2, BasicType bt,
+                                   bool requires_strict_order) {

   int vopc = opcode(opc, bt);

@@ -1306,17 +1307,17 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
   switch (vopc) {
     case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2);
     case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2);
-    case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
-    case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
+    case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2, requires_strict_order);
+    case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2, requires_strict_order);
     case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
     case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
-    case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
-    case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
-    case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
-    case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
-    case Op_AndReductionV: return new AndReductionVNode(ctrl, n1, n2);
-    case Op_OrReductionV: return new OrReductionVNode(ctrl, n1, n2);
-    case Op_XorReductionV: return new XorReductionVNode(ctrl, n1, n2);
+    case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2, requires_strict_order);
+    case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2, requires_strict_order);
+    case Op_MinReductionV: return new MinReductionVNode (ctrl, n1, n2);
+    case Op_MaxReductionV: return new MaxReductionVNode (ctrl, n1, n2);
+    case Op_AndReductionV: return new AndReductionVNode (ctrl, n1, n2);
+    case Op_OrReductionV: return new OrReductionVNode (ctrl, n1, n2);
+    case Op_XorReductionV: return new XorReductionVNode (ctrl, n1, n2);
     default: assert(false, "unknown node: %s", NodeClassNames[vopc]);
              return nullptr;
diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp
index 17c7482d88c..6c5402eb511 100644
--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@@ -203,7 +203,9 @@ class ReductionNode : public Node {
     init_class_id(Class_Reduction);
   }

-  static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt);
+  static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt,
+                             // This only affects floating-point add and mul reductions.
+                             bool requires_strict_order = true);
   static int opcode(int opc, BasicType bt);
   static bool implemented(int opc, uint vlen, BasicType bt);
   // Make an identity scalar (zero for add, one for mul, etc) for scalar opc.
@@ -225,47 +227,97 @@ class ReductionNode : public Node {
   // Needed for proper cloning.
   virtual uint size_of() const { return sizeof(*this); }
-};

-//---------------------------UnorderedReductionNode-------------------------------------
-// Order of reduction does not matter. Example int add. Not true for float add.
-class UnorderedReductionNode : public ReductionNode {
-public:
-  UnorderedReductionNode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {
-    init_class_id(Class_UnorderedReduction);
+  // Floating-point addition and multiplication are non-associative, so
+  // AddReductionVF/D and MulReductionVF/D require strict ordering
+  // in auto-vectorization. Vector API can generate AddReductionVF/D
+  // and MulReductionVF/D without strict ordering, which can benefit
+  // some platforms.
+  //
+  // Other reductions don't need strict ordering.
+  virtual bool requires_strict_order() const {
+    return false;
   }
+
+#ifndef PRODUCT
+  void dump_spec(outputStream* st) const {
+    if (requires_strict_order()) {
+      st->print("requires_strict_order");
+    } else {
+      st->print("no_strict_order");
+    }
+  }
+#endif
 };

 //------------------------------AddReductionVINode--------------------------------------
 // Vector add byte, short and int as a reduction
-class AddReductionVINode : public UnorderedReductionNode {
+class AddReductionVINode : public ReductionNode {
 public:
-  AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  AddReductionVINode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

 //------------------------------AddReductionVLNode--------------------------------------
 // Vector add long as a reduction
-class AddReductionVLNode : public UnorderedReductionNode {
+class AddReductionVLNode : public ReductionNode {
 public:
-  AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  AddReductionVLNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

 //------------------------------AddReductionVFNode--------------------------------------
 // Vector add float as a reduction
 class AddReductionVFNode : public ReductionNode {
+private:
+  // True if the add reduction operation for floats requires strict ordering.
+  // For example, this is true when the add reduction is auto-vectorized (auto-vectorization
+  // mandates strict ordering), and false when this node is generated through the Vector API
+  // (which does not impose any such ordering requirement).
+  const bool _requires_strict_order;
 public:
-  AddReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  // _requires_strict_order is set to true by default, as mandated by auto-vectorization.
+  AddReductionVFNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
+    ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
+
   virtual int Opcode() const;
+
+  virtual bool requires_strict_order() const { return _requires_strict_order; }
+
+  virtual uint hash() const { return Node::hash() + _requires_strict_order; }
+
+  virtual bool cmp(const Node& n) const {
+    return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
+  }
+
+  virtual uint size_of() const { return sizeof(*this); }
 };

 //------------------------------AddReductionVDNode--------------------------------------
 // Vector add double as a reduction
 class AddReductionVDNode : public ReductionNode {
+private:
+  // True if the add reduction operation for doubles requires strict ordering.
+  // For example, this is true when the add reduction is auto-vectorized (auto-vectorization
+  // mandates strict ordering), and false when this node is generated through the Vector API
+  // (which does not impose any such ordering requirement).
+  const bool _requires_strict_order;
 public:
-  AddReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  // _requires_strict_order is set to true by default, as mandated by auto-vectorization.
+  AddReductionVDNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
+    ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
+
   virtual int Opcode() const;
+
+  virtual bool requires_strict_order() const { return _requires_strict_order; }
+
+  virtual uint hash() const { return Node::hash() + _requires_strict_order; }
+
+  virtual bool cmp(const Node& n) const {
+    return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
+  }
+
+  virtual uint size_of() const { return sizeof(*this); }
 };

 //------------------------------SubVBNode--------------------------------------
@@ -400,34 +452,70 @@ public:

 //------------------------------MulReductionVINode--------------------------------------
 // Vector multiply byte, short and int as a reduction
-class MulReductionVINode : public UnorderedReductionNode {
+class MulReductionVINode : public ReductionNode {
 public:
-  MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  MulReductionVINode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

 //------------------------------MulReductionVLNode--------------------------------------
 // Vector multiply int as a reduction
-class MulReductionVLNode : public UnorderedReductionNode {
+class MulReductionVLNode : public ReductionNode {
 public:
-  MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  MulReductionVLNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

 //------------------------------MulReductionVFNode--------------------------------------
 // Vector multiply float as a reduction
 class MulReductionVFNode : public ReductionNode {
+  // True if the mul reduction operation for floats requires strict ordering.
+  // For example, this is true when the mul reduction is auto-vectorized (auto-vectorization
+  // mandates strict ordering), and false when this node is generated through the Vector API
+  // (which does not impose any such ordering requirement).
+  const bool _requires_strict_order;
 public:
-  MulReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  // _requires_strict_order is set to true by default, as mandated by auto-vectorization.
+  MulReductionVFNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
+    ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
+
   virtual int Opcode() const;
+
+  virtual bool requires_strict_order() const { return _requires_strict_order; }
+
+  virtual uint hash() const { return Node::hash() + _requires_strict_order; }
+
+  virtual bool cmp(const Node& n) const {
+    return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
+  }
+
+  virtual uint size_of() const { return sizeof(*this); }
 };

 //------------------------------MulReductionVDNode--------------------------------------
 // Vector multiply double as a reduction
 class MulReductionVDNode : public ReductionNode {
+  // True if the mul reduction operation for doubles requires strict ordering.
+  // For example, this is true when the mul reduction is auto-vectorized (auto-vectorization
+  // mandates strict ordering), and false when this node is generated through the Vector API
+  // (which does not impose any such ordering requirement).
+  const bool _requires_strict_order;
 public:
-  MulReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  // _requires_strict_order is set to true by default, as mandated by auto-vectorization.
+  MulReductionVDNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
+    ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
+
   virtual int Opcode() const;
+
+  virtual bool requires_strict_order() const { return _requires_strict_order; }
+
+  virtual uint hash() const { return Node::hash() + _requires_strict_order; }
+
+  virtual bool cmp(const Node& n) const {
+    return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
+  }
+
+  virtual uint size_of() const { return sizeof(*this); }
 };

 //------------------------------DivVFNode--------------------------------------
@@ -753,9 +841,9 @@ class AndVNode : public VectorNode {

 //------------------------------AndReductionVNode--------------------------------------
 // Vector and byte, short, int, long as a reduction
-class AndReductionVNode : public UnorderedReductionNode {
+class AndReductionVNode : public ReductionNode {
 public:
-  AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  AndReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

@@ -770,9 +858,9 @@ class OrVNode : public VectorNode {

 //------------------------------OrReductionVNode--------------------------------------
 // Vector xor byte, short, int, long as a reduction
-class OrReductionVNode : public UnorderedReductionNode {
+class OrReductionVNode : public ReductionNode {
 public:
-  OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  OrReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

@@ -787,25 +875,25 @@ class
XorVNode : public VectorNode { //------------------------------XorReductionVNode-------------------------------------- // Vector and int, long as a reduction -class XorReductionVNode : public UnorderedReductionNode { +class XorReductionVNode : public ReductionNode { public: - XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} + XorReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; }; //------------------------------MinReductionVNode-------------------------------------- // Vector min byte, short, int, long, float, double as a reduction -class MinReductionVNode : public UnorderedReductionNode { +class MinReductionVNode : public ReductionNode { public: - MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} + MinReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; }; //------------------------------MaxReductionVNode-------------------------------------- // Vector min byte, short, int, long, float, double as a reduction -class MaxReductionVNode : public UnorderedReductionNode { +class MaxReductionVNode : public ReductionNode { public: - MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} + MaxReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; }; diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java new file mode 100644 index 00000000000..327e6e5e12d --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2024, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; + +/* + * @test + * @bug 8320725 + * @summary Ensure strictly ordered AddReductionVF/VD and MulReductionVF/VD nodes + are generated when these operations are auto-vectorized + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestVectorFPReduction + */ + +public class TestVectorFPReduction { + + final private static int SIZE = 1024; + + private static double[] da = new double[SIZE]; + private static double[] db = new double[SIZE]; + private static float[] fa = new float[SIZE]; + private static float[] fb = new float[SIZE]; + private static float fresult; + private static double dresult; + + public static void main(String[] args) { + TestFramework.run(); + } + + @Test + @IR(failOn = {IRNode.ADD_REDUCTION_VF}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VF, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"}, + phase = CompilePhase.PRINT_IDEAL) + private static void testAddReductionVF() { + float result = 1; + for (int i = 0; i < SIZE; i++) { + result += (fa[i] + fb[i]); + } + fresult += result; + } + + @Test + @IR(failOn = {IRNode.ADD_REDUCTION_VD}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VD, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"}, + phase = CompilePhase.PRINT_IDEAL) + private static void testAddReductionVD() { + double result = 1; + for (int i = 0; i < SIZE; i++) { + result += (da[i] + db[i]); + } + dresult += result; + } + + @Test + @IR(failOn = {IRNode.MUL_REDUCTION_VF}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VF, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"}, + phase = CompilePhase.PRINT_IDEAL) + private static void testMulReductionVF() { + float result = 1; + for (int i = 0; i < SIZE; i++) { + result *= (fa[i] + fb[i]); + } + fresult += result; + } + + @Test + @IR(failOn = {IRNode.MUL_REDUCTION_VD}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VD, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"}, + phase = CompilePhase.PRINT_IDEAL) + private static void testMulReductionVD() { + double result = 1; + for (int i = 0; i < SIZE; i++) { + result *= (da[i] + db[i]); + } + dresult += result; + } +} diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorAddMulReduction.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorAddMulReduction.java new file mode 100644 index 00000000000..549d9aa5d49 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorAddMulReduction.java @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2024, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.vectorapi; + +import compiler.lib.ir_framework.*; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +import java.util.Random; + +import jdk.test.lib.Asserts; +import jdk.test.lib.Utils; + +/** + * @test + * @bug 8320725 + * @library /test/lib / + * @summary Verify non-strictly ordered AddReductionVF/VD and MulReductionVF/VD + * nodes are generated in VectorAPI + * @modules jdk.incubator.vector + * @run driver compiler.vectorapi.TestVectorAddMulReduction + */ + +public class TestVectorAddMulReduction { + + private static final int SIZE = 1024; + private static final Random RD = Utils.getRandomInstance(); + + private static float[] fa; + private static float fres; + private static double[] da; + private static double dres; + + static { + fa = new float[SIZE]; + da = new double[SIZE]; + fres = 1; + dres = 1; + for (int i = 0; i < SIZE; i++) { + fa[i] = RD.nextFloat(); + da[i] = RD.nextDouble(); + } + } + + // Test add reduction operation for floats + @ForceInline + public static void testFloatAddKernel(VectorSpecies SPECIES, float[] f) { + for (int i = 0; i < SPECIES.loopBound(f.length); i += SPECIES.length()) { + var av = FloatVector.fromArray(SPECIES, f, i); + fres += av.reduceLanes(VectorOperators.ADD); + } + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=8"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatAdd_64() { + testFloatAddKernel(FloatVector.SPECIES_64, fa); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=16"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatAdd_128() { + testFloatAddKernel(FloatVector.SPECIES_128, fa); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=32"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatAdd_256() { + testFloatAddKernel(FloatVector.SPECIES_256, fa); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=64"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatAdd_512() { + testFloatAddKernel(FloatVector.SPECIES_512, fa); + } + + // Test add reduction operation for doubles + @ForceInline + public static void testDoubleAddKernel(VectorSpecies SPECIES, double[] d) { + for (int i = 0; i < SPECIES.loopBound(d.length); i += SPECIES.length()) { + var 
av = DoubleVector.fromArray(SPECIES, d, i); + dres += av.reduceLanes(VectorOperators.ADD); + } + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=16"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testDoubleAdd_128() { + testDoubleAddKernel(DoubleVector.SPECIES_128, da); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=32"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testDoubleAdd_256() { + testDoubleAddKernel(DoubleVector.SPECIES_256, da); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=64"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testDoubleAdd_512() { + testDoubleAddKernel(DoubleVector.SPECIES_512, da); + } + + // Test mul reduction operation for floats + // On aarch64, there are no direct vector mul reduction instructions for float/double mul reduction + // and scalar instructions are emitted for 64-bit/128-bit vectors. Thus MulReductionVF/VD nodes are generated + // only for vector length of 8B/16B on vectorAPI. + @ForceInline + public static void testFloatMulKernel(VectorSpecies SPECIES, float[] f) { + for (int i = 0; i < SPECIES.loopBound(f.length); i += SPECIES.length()) { + var av = FloatVector.fromArray(SPECIES, f, i); + fres += av.reduceLanes(VectorOperators.MUL); + } + } + + @Test + @IR(counts = {IRNode.MUL_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=8"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatMul_64() { + testFloatMulKernel(FloatVector.SPECIES_64, fa); + } + + @Test + @IR(counts = {IRNode.MUL_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=16"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatMul_128() { + testFloatMulKernel(FloatVector.SPECIES_128, fa); + } + + // Test mul reduction operation for doubles + @ForceInline + public static void testDoubleMulKernel(VectorSpecies SPECIES, double[] d) { + for (int i = 0; i < SPECIES.loopBound(d.length); i += SPECIES.length()) { + var av = DoubleVector.fromArray(SPECIES, d, i); + dres += av.reduceLanes(VectorOperators.MUL); + } + } + + @Test + @IR(counts = {IRNode.MUL_REDUCTION_VD, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=16"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testDoubleMul_128() { + testDoubleMulKernel(DoubleVector.SPECIES_128, da); + } + + public static void main(String[] args) { + TestFramework.runWithFlags("--add-modules=jdk.incubator.vector"); + } +}
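
Background note (not part of the patch): AddReductionVF/VD must keep strict order under auto-vectorization because floating-point addition is not associative, so reassociating the sum can change the result. A minimal, hypothetical Java sketch (class name and values chosen only for illustration) of the difference the requires_strict_order flag protects against:

public class FPReductionOrder {
    public static void main(String[] args) {
        float a = 1e8f, b = -1e8f, c = 1e-3f;
        // Sequential (strict) order, as the JLS prescribes for scalar code:
        float strict = (a + b) + c;   // 0.001f
        // Reassociated order, as a relaxed vector reduction might compute it:
        float relaxed = a + (b + c);  // 0.0f, because c is absorbed into b
        System.out.println(strict + " vs " + relaxed);
    }
}

This is why the NEON pairwise-add rules are matched only when requires_strict_order() is false (Vector API reductions), while auto-vectorized FP add reductions fall back to the strictly ordered SVE fadda rule or stay scalar.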