diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index b14295ca15c..1ebc6408a60 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -135,9 +135,9 @@ source %{ (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || - // The vector implementation of Op_AddReductionVD/F is for the Vector API only. - // It is not suitable for auto-vectorization because it does not add the elements - // in the same order as sequential code, and FP addition is non-associative. + // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. + // They are not suitable for auto-vectorization because the result would not conform + // to the JLS, Section Evaluation Order. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || opcode == Op_MulVL) { @@ -2858,14 +2858,14 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{ %} // reduction addF -// Floating-point addition is not associative, so the rules for AddReductionVF -// on NEON can't be used to auto-vectorize floating-point reduce-add. -// Currently, on NEON, AddReductionVF is only generated by Vector API. -instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ - predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2); + +instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ + // Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVF fsrc vsrc)); effect(TEMP_DEF dst); - format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %} + format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %} ins_encode %{ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S); __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister); @@ -2873,11 +2873,13 @@ instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ ins_pipe(pipe_slow); %} -instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ - predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4); +instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ + // Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVF fsrc vsrc)); effect(TEMP_DEF dst, TEMP tmp); - format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %} + format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %} ins_encode %{ __ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister); __ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S); @@ -2886,11 +2888,21 @@ instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ ins_pipe(pipe_slow); %} +// This rule calculates the reduction result in strict order. Two cases will +// reach here: +// 1. Non strictly-ordered AddReductionVF when vector size > 128-bits. 
For example - +// AddReductionVF generated by Vector API. For vector size > 128-bits, it is more +// beneficial performance-wise to generate direct SVE instruction even if it is +// strictly ordered. +// 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by +// auto-vectorization on SVE machine. instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{ - predicate(UseSVE > 0); + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || + n->as_Reduction()->requires_strict_order()); match(Set dst_src1 (AddReductionVF dst_src1 src2)); format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %} ins_encode %{ + assert(UseSVE > 0, "must be sve"); uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2); assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_fadda($dst_src1$$FloatRegister, __ S, ptrue, $src2$$FloatRegister); @@ -2899,14 +2911,14 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{ %} // reduction addD -// Floating-point addition is not associative, so the rule for AddReductionVD -// on NEON can't be used to auto-vectorize floating-point reduce-add. -// Currently, on NEON, AddReductionVD is only generated by Vector API. -instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ - predicate(UseSVE == 0); + +instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ + // Non-strictly ordered floating-point add reduction for doubles. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(!n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVD dsrc vsrc)); effect(TEMP_DEF dst); - format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %} + format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %} ins_encode %{ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D); __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister); @@ -2914,11 +2926,21 @@ instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ ins_pipe(pipe_slow); %} +// This rule calculates the reduction result in strict order. Two cases will +// reach here: +// 1. Non strictly-ordered AddReductionVD when vector size > 128-bits. For example - +// AddReductionVD generated by Vector API. For vector size > 128-bits, it is more +// beneficial performance-wise to generate direct SVE instruction even if it is +// strictly ordered. +// 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by +// auto-vectorization on SVE machine. 
instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{ - predicate(UseSVE > 0); + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || + n->as_Reduction()->requires_strict_order()); match(Set dst_src1 (AddReductionVD dst_src1 src2)); format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %} ins_encode %{ + assert(UseSVE > 0, "must be sve"); uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2); assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_fadda($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister); diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 060bb4a11d4..29f92772368 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -125,9 +125,9 @@ source %{ (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || - // The vector implementation of Op_AddReductionVD/F is for the Vector API only. - // It is not suitable for auto-vectorization because it does not add the elements - // in the same order as sequential code, and FP addition is non-associative. + // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. + // They are not suitable for auto-vectorization because the result would not conform + // to the JLS, Section Evaluation Order. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || opcode == Op_MulVL) { @@ -1752,14 +1752,14 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(I, iRegIorL2I) REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL) // reduction addF -// Floating-point addition is not associative, so the rules for AddReductionVF -// on NEON can't be used to auto-vectorize floating-point reduce-add. -// Currently, on NEON, AddReductionVF is only generated by Vector API. -instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ - predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2); + +instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ + // Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVF fsrc vsrc)); effect(TEMP_DEF dst); - format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %} + format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %} ins_encode %{ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S); __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister); @@ -1767,11 +1767,13 @@ instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ ins_pipe(pipe_slow); %} -instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ - predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4); +instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ + // Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). 
+ predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVF fsrc vsrc)); effect(TEMP_DEF dst, TEMP tmp); - format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %} + format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %} ins_encode %{ __ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister); __ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S); @@ -1783,11 +1785,21 @@ dnl dnl REDUCE_ADD_FP_SVE($1, $2 ) dnl REDUCE_ADD_FP_SVE(type, size) define(`REDUCE_ADD_FP_SVE', ` +// This rule calculates the reduction result in strict order. Two cases will +// reach here: +// 1. Non strictly-ordered AddReductionV$1 when vector size > 128-bits. For example - +// AddReductionV$1 generated by Vector API. For vector size > 128-bits, it is more +// beneficial performance-wise to generate direct SVE instruction even if it is +// strictly ordered. +// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by +// auto-vectorization on SVE machine. instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{ - predicate(UseSVE > 0); + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || + n->as_Reduction()->requires_strict_order()); match(Set dst_src1 (AddReductionV$1 dst_src1 src2)); format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %} ins_encode %{ + assert(UseSVE > 0, "must be sve"); uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2); assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_fadda($dst_src1$$FloatRegister, __ $2, ptrue, $src2$$FloatRegister); @@ -1798,14 +1810,14 @@ dnl REDUCE_ADD_FP_SVE(F, S) // reduction addD -// Floating-point addition is not associative, so the rule for AddReductionVD -// on NEON can't be used to auto-vectorize floating-point reduce-add. -// Currently, on NEON, AddReductionVD is only generated by Vector API. -instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ - predicate(UseSVE == 0); + +instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ + // Non-strictly ordered floating-point add reduction for doubles. This rule is + // intended for the VectorAPI (which allows for non-strictly ordered add reduction). + predicate(!n->as_Reduction()->requires_strict_order()); match(Set dst (AddReductionVD dsrc vsrc)); effect(TEMP_DEF dst); - format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %} + format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %} ins_encode %{ __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D); __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister); diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index ad603439e59..90ef4da4f1e 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -1460,7 +1460,7 @@ public: }; AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared); - // Move UnorderedReduction out of loop if possible + // Move an unordered Reduction out of loop if possible void move_unordered_reduction_out_of_loop(IdealLoopTree* loop); // Create a scheduled list of nodes control dependent on ctrl set. diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index a3227d47832..b0effb6d4f5 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4310,11 +4310,19 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) { return AutoVectorizeStatus::Success; } +// Returns true if the Reduction node is unordered. +static bool is_unordered_reduction(Node* n) { + return n->is_Reduction() && !n->as_Reduction()->requires_strict_order(); +} + // Having ReductionNodes in the loop is expensive. They need to recursively // fold together the vector values, for every vectorized loop iteration. If // we encounter the following pattern, we can vector accumulate the values // inside the loop, and only have a single UnorderedReduction after the loop. // +// Note: UnorderedReduction represents a ReductionNode which does not require +// calculating in strict order. +// // CountedLoop init // | | // +------+ | +-----------------------+ @@ -4354,21 +4362,24 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) { // wise. This is a single operation per vector_accumulator, rather than many // for a UnorderedReduction. We can then reduce the last vector_accumulator // after the loop, and also reduce the init value into it. +// // We can not do this with all reductions. Some reductions do not allow the -// reordering of operations (for example float addition). +// reordering of operations (for example float addition/multiplication require +// strict order). void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity"); - // Find all Phi nodes with UnorderedReduction on backedge. + // Find all Phi nodes with an unordered Reduction on backedge. CountedLoopNode* cl = loop->_head->as_CountedLoop(); for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) { Node* phi = cl->fast_out(j); - // We have a phi with a single use, and a UnorderedReduction on the backedge. - if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) { + // We have a phi with a single use, and an unordered Reduction on the backedge. + if (!phi->is_Phi() || phi->outcnt() != 1 || !is_unordered_reduction(phi->in(2))) { continue; } - UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction(); + ReductionNode* last_ur = phi->in(2)->as_Reduction(); + assert(!last_ur->requires_strict_order(), "must be"); // Determine types const TypeVect* vec_t = last_ur->vect_type(); @@ -4385,14 +4396,14 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { continue; // not implemented -> fails } - // Traverse up the chain of UnorderedReductions, checking that it loops back to - // the phi. Check that all UnorderedReductions only have a single use, except for + // Traverse up the chain of unordered Reductions, checking that it loops back to + // the phi. 
Check that all unordered Reductions only have a single use, except for // the last (last_ur), which only has phi as a use in the loop, and all other uses // are outside the loop. - UnorderedReductionNode* current = last_ur; - UnorderedReductionNode* first_ur = nullptr; + ReductionNode* current = last_ur; + ReductionNode* first_ur = nullptr; while (true) { - assert(current->is_UnorderedReduction(), "sanity"); + assert(!current->requires_strict_order(), "sanity"); // Expect no ctrl and a vector_input from within the loop. Node* ctrl = current->in(0); @@ -4409,7 +4420,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { break; // Chain traversal fails. } - // Expect single use of UnorderedReduction, except for last_ur. + // Expect single use of an unordered Reduction, except for last_ur. if (current == last_ur) { // Expect all uses to be outside the loop, except phi. for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) { @@ -4427,12 +4438,13 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { } } - // Expect another UnorderedReduction or phi as the scalar input. + // Expect another unordered Reduction or phi as the scalar input. Node* scalar_input = current->in(1); - if (scalar_input->is_UnorderedReduction() && + if (is_unordered_reduction(scalar_input) && scalar_input->Opcode() == current->Opcode()) { - // Move up the UnorderedReduction chain. - current = scalar_input->as_UnorderedReduction(); + // Move up the unordered Reduction chain. + current = scalar_input->as_Reduction(); + assert(!current->requires_strict_order(), "must be"); } else if (scalar_input == phi) { // Chain terminates at phi. first_ur = current; @@ -4456,7 +4468,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t); register_new_node(identity_vector, C->root()); assert(vec_t == identity_vector->vect_type(), "matching vector type"); - VectorNode::trace_new_vector(identity_vector, "UnorderedReduction"); + VectorNode::trace_new_vector(identity_vector, "Unordered Reduction"); // Turn the scalar phi into a vector phi. _igvn.rehash_node_delayed(phi); @@ -4465,7 +4477,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { phi->as_Type()->set_type(vec_t); _igvn.set_type(phi, vec_t); - // Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators. + // Traverse down the chain of unordered Reductions, and replace them with vector_accumulators. current = first_ur; while (true) { // Create vector_accumulator to replace current. @@ -4474,11 +4486,12 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t); register_new_node(vector_accumulator, cl); _igvn.replace_node(current, vector_accumulator); - VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction"); + VectorNode::trace_new_vector(vector_accumulator, "Unordered Reduction"); if (current == last_ur) { break; } - current = vector_accumulator->unique_out()->as_UnorderedReduction(); + current = vector_accumulator->unique_out()->as_Reduction(); + assert(!current->requires_strict_order(), "must be"); } // Create post-loop reduction. 
@@ -4495,7 +4508,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { } } register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl)); - VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction"); + VectorNode::trace_new_vector(post_loop_reduction, "Unordered Reduction"); assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction"); assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator"); diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index dc9dc6654b5..ae379c4833a 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -175,7 +175,6 @@ class SubTypeCheckNode; class Type; class TypeNode; class UnlockNode; -class UnorderedReductionNode; class VectorNode; class LoadVectorNode; class LoadVectorMaskedNode; @@ -739,7 +738,6 @@ public: DEFINE_CLASS_ID(ExpandV, Vector, 5) DEFINE_CLASS_ID(CompressM, Vector, 6) DEFINE_CLASS_ID(Reduction, Vector, 7) - DEFINE_CLASS_ID(UnorderedReduction, Reduction, 0) DEFINE_CLASS_ID(NegV, Vector, 8) DEFINE_CLASS_ID(Con, Type, 8) DEFINE_CLASS_ID(ConI, Con, 0) @@ -991,7 +989,6 @@ public: DEFINE_CLASS_QUERY(Sub) DEFINE_CLASS_QUERY(SubTypeCheck) DEFINE_CLASS_QUERY(Type) - DEFINE_CLASS_QUERY(UnorderedReduction) DEFINE_CLASS_QUERY(Vector) DEFINE_CLASS_QUERY(VectorMaskCmp) DEFINE_CLASS_QUERY(VectorUnbox) diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index 3ef6ae02534..b31f6ace5a6 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -1616,21 +1616,23 @@ bool LibraryCallKit::inline_vector_reduction() { } Node* init = ReductionNode::make_identity_con_scalar(gvn(), opc, elem_bt); - Node* value = nullptr; - if (mask == nullptr) { - assert(!is_masked_op, "Masked op needs the mask value never null"); - value = ReductionNode::make(opc, nullptr, init, opd, elem_bt); - } else { - if (use_predicate) { - value = ReductionNode::make(opc, nullptr, init, opd, elem_bt); - value->add_req(mask); - value->add_flag(Node::Flag_is_predicated_vector); - } else { - Node* reduce_identity = gvn().transform(VectorNode::scalar2vector(init, num_elem, Type::get_const_basic_type(elem_bt))); - value = gvn().transform(new VectorBlendNode(reduce_identity, opd, mask)); - value = ReductionNode::make(opc, nullptr, init, value, elem_bt); - } + Node* value = opd; + + assert(mask != nullptr || !is_masked_op, "Masked op needs the mask value never null"); + if (mask != nullptr && !use_predicate) { + Node* reduce_identity = gvn().transform(VectorNode::scalar2vector(init, num_elem, Type::get_const_basic_type(elem_bt))); + value = gvn().transform(new VectorBlendNode(reduce_identity, value, mask)); } + + // Make an unordered Reduction node. This affects only AddReductionVF/VD and MulReductionVF/VD, + // as these operations are allowed to be associative (not requiring strict order) in VectorAPI. 
+  value = ReductionNode::make(opc, nullptr, init, value, elem_bt, /* requires_strict_order */ false);
+
+  if (mask != nullptr && use_predicate) {
+    value->add_req(mask);
+    value->add_flag(Node::Flag_is_predicated_vector);
+  }
+  value = gvn().transform(value);

   Node* bits = nullptr;
diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp
index b14a7f7b165..d560f112039 100644
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@@ -1296,7 +1296,8 @@ int ReductionNode::opcode(int opc, BasicType bt) {
 }

 // Return the appropriate reduction node.
-ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) {
+ReductionNode* ReductionNode::make(int opc, Node* ctrl, Node* n1, Node* n2, BasicType bt,
+                                   bool requires_strict_order) {

   int vopc = opcode(opc, bt);

@@ -1306,17 +1307,17 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
   switch (vopc) {
     case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2);
     case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2);
-    case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
-    case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
+    case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2, requires_strict_order);
+    case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2, requires_strict_order);
     case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
     case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
-    case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
-    case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
-    case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
-    case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
-    case Op_AndReductionV: return new AndReductionVNode(ctrl, n1, n2);
-    case Op_OrReductionV: return new OrReductionVNode(ctrl, n1, n2);
-    case Op_XorReductionV: return new XorReductionVNode(ctrl, n1, n2);
+    case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2, requires_strict_order);
+    case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2, requires_strict_order);
+    case Op_MinReductionV: return new MinReductionVNode (ctrl, n1, n2);
+    case Op_MaxReductionV: return new MaxReductionVNode (ctrl, n1, n2);
+    case Op_AndReductionV: return new AndReductionVNode (ctrl, n1, n2);
+    case Op_OrReductionV: return new OrReductionVNode (ctrl, n1, n2);
+    case Op_XorReductionV: return new XorReductionVNode (ctrl, n1, n2);
     default: assert(false, "unknown node: %s", NodeClassNames[vopc]);
              return nullptr;
diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp
index 17c7482d88c..6c5402eb511 100644
--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@@ -203,7 +203,9 @@ class ReductionNode : public Node {
     init_class_id(Class_Reduction);
   }

-  static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt);
+  static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt,
+                             // This only affects floating-point add and mul reductions.
+                             bool requires_strict_order = true);
   static int opcode(int opc, BasicType bt);
   static bool implemented(int opc, uint vlen, BasicType bt);
   // Make an identity scalar (zero for add, one for mul, etc) for scalar opc.
@@ -225,47 +227,97 @@ class ReductionNode : public Node {
   // Needed for proper cloning.
   virtual uint size_of() const { return sizeof(*this); }
-};

-//---------------------------UnorderedReductionNode-------------------------------------
-// Order of reduction does not matter. Example int add. Not true for float add.
-class UnorderedReductionNode : public ReductionNode {
-public:
-  UnorderedReductionNode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {
-    init_class_id(Class_UnorderedReduction);
+  // Floating-point addition and multiplication are non-associative, so
+  // AddReductionVF/D and MulReductionVF/D require strict ordering
+  // in auto-vectorization. Vector API can generate AddReductionVF/D
+  // and MulReductionVF/D without strict ordering, which can benefit
+  // some platforms.
+  //
+  // Other reductions don't need strict ordering.
+  virtual bool requires_strict_order() const {
+    return false;
   }
+
+#ifndef PRODUCT
+  void dump_spec(outputStream* st) const {
+    if (requires_strict_order()) {
+      st->print("requires_strict_order");
+    } else {
+      st->print("no_strict_order");
+    }
+  }
+#endif
 };

 //------------------------------AddReductionVINode--------------------------------------
 // Vector add byte, short and int as a reduction
-class AddReductionVINode : public UnorderedReductionNode {
+class AddReductionVINode : public ReductionNode {
 public:
-  AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  AddReductionVINode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

 //------------------------------AddReductionVLNode--------------------------------------
 // Vector add long as a reduction
-class AddReductionVLNode : public UnorderedReductionNode {
+class AddReductionVLNode : public ReductionNode {
 public:
-  AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  AddReductionVLNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

 //------------------------------AddReductionVFNode--------------------------------------
 // Vector add float as a reduction
 class AddReductionVFNode : public ReductionNode {
+private:
+  // True if the add reduction operation for floats requires strict ordering.
+  // For example, this is true when the add reduction is auto-vectorized (auto-vectorization
+  // mandates strict ordering), and false when this node is generated through the Vector API
+  // (which does not impose any such ordering requirement).
+  const bool _requires_strict_order;
 public:
-  AddReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  // _requires_strict_order is set to true by default, as mandated by auto-vectorization.
+  AddReductionVFNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
+    ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
+
   virtual int Opcode() const;
+
+  virtual bool requires_strict_order() const { return _requires_strict_order; }
+
+  virtual uint hash() const { return Node::hash() + _requires_strict_order; }
+
+  virtual bool cmp(const Node& n) const {
+    return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
+  }
+
+  virtual uint size_of() const { return sizeof(*this); }
 };

 //------------------------------AddReductionVDNode--------------------------------------
 // Vector add double as a reduction
 class AddReductionVDNode : public ReductionNode {
+private:
+  // True if the add reduction operation for doubles requires strict ordering.
+  // For example, this is true when the add reduction is auto-vectorized (auto-vectorization
+  // mandates strict ordering), and false when this node is generated through the Vector API
+  // (which does not impose any such ordering requirement).
+  const bool _requires_strict_order;
 public:
-  AddReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  // _requires_strict_order is set to true by default, as mandated by auto-vectorization.
+  AddReductionVDNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
+    ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
+
   virtual int Opcode() const;
+
+  virtual bool requires_strict_order() const { return _requires_strict_order; }
+
+  virtual uint hash() const { return Node::hash() + _requires_strict_order; }
+
+  virtual bool cmp(const Node& n) const {
+    return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
+  }
+
+  virtual uint size_of() const { return sizeof(*this); }
 };

 //------------------------------SubVBNode--------------------------------------
@@ -400,34 +452,70 @@ public:

 //------------------------------MulReductionVINode--------------------------------------
 // Vector multiply byte, short and int as a reduction
-class MulReductionVINode : public UnorderedReductionNode {
+class MulReductionVINode : public ReductionNode {
 public:
-  MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  MulReductionVINode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

 //------------------------------MulReductionVLNode--------------------------------------
 // Vector multiply int as a reduction
-class MulReductionVLNode : public UnorderedReductionNode {
+class MulReductionVLNode : public ReductionNode {
 public:
-  MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  MulReductionVLNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

 //------------------------------MulReductionVFNode--------------------------------------
 // Vector multiply float as a reduction
 class MulReductionVFNode : public ReductionNode {
+  // True if the mul reduction operation for floats requires strict ordering.
+  // For example, this is true when the mul reduction is auto-vectorized (auto-vectorization
+  // mandates strict ordering), and false when this node is generated through the Vector API
+  // (which does not impose any such ordering requirement).
+  const bool _requires_strict_order;
 public:
-  MulReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  // _requires_strict_order is set to true by default, as mandated by auto-vectorization.
+  MulReductionVFNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
+    ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
+
   virtual int Opcode() const;
+
+  virtual bool requires_strict_order() const { return _requires_strict_order; }
+
+  virtual uint hash() const { return Node::hash() + _requires_strict_order; }
+
+  virtual bool cmp(const Node& n) const {
+    return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
+  }
+
+  virtual uint size_of() const { return sizeof(*this); }
 };

 //------------------------------MulReductionVDNode--------------------------------------
 // Vector multiply double as a reduction
 class MulReductionVDNode : public ReductionNode {
+  // True if the mul reduction operation for doubles requires strict ordering.
+  // For example, this is true when the mul reduction is auto-vectorized (auto-vectorization
+  // mandates strict ordering), and false when this node is generated through the Vector API
+  // (which does not impose any such ordering requirement).
+  const bool _requires_strict_order;
 public:
-  MulReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  // _requires_strict_order is set to true by default, as mandated by auto-vectorization.
+  MulReductionVDNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
+    ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
+
   virtual int Opcode() const;
+
+  virtual bool requires_strict_order() const { return _requires_strict_order; }
+
+  virtual uint hash() const { return Node::hash() + _requires_strict_order; }
+
+  virtual bool cmp(const Node& n) const {
+    return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
+  }
+
+  virtual uint size_of() const { return sizeof(*this); }
 };

 //------------------------------DivVFNode--------------------------------------
@@ -753,9 +841,9 @@ class AndVNode : public VectorNode {

 //------------------------------AndReductionVNode--------------------------------------
 // Vector and byte, short, int, long as a reduction
-class AndReductionVNode : public UnorderedReductionNode {
+class AndReductionVNode : public ReductionNode {
 public:
-  AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  AndReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

@@ -770,9 +858,9 @@ class OrVNode : public VectorNode {

 //------------------------------OrReductionVNode--------------------------------------
 // Vector xor byte, short, int, long as a reduction
-class OrReductionVNode : public UnorderedReductionNode {
+class OrReductionVNode : public ReductionNode {
 public:
-  OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
+  OrReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
 };

@@ -787,25 +875,25 @@ class
XorVNode : public VectorNode { //------------------------------XorReductionVNode-------------------------------------- // Vector and int, long as a reduction -class XorReductionVNode : public UnorderedReductionNode { +class XorReductionVNode : public ReductionNode { public: - XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} + XorReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; }; //------------------------------MinReductionVNode-------------------------------------- // Vector min byte, short, int, long, float, double as a reduction -class MinReductionVNode : public UnorderedReductionNode { +class MinReductionVNode : public ReductionNode { public: - MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} + MinReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; }; //------------------------------MaxReductionVNode-------------------------------------- // Vector min byte, short, int, long, float, double as a reduction -class MaxReductionVNode : public UnorderedReductionNode { +class MaxReductionVNode : public ReductionNode { public: - MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} + MaxReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; }; diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java new file mode 100644 index 00000000000..327e6e5e12d --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2024, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; + +/* + * @test + * @bug 8320725 + * @summary Ensure strictly ordered AddReductionVF/VD and MulReductionVF/VD nodes + are generated when these operations are auto-vectorized + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestVectorFPReduction + */ + +public class TestVectorFPReduction { + + final private static int SIZE = 1024; + + private static double[] da = new double[SIZE]; + private static double[] db = new double[SIZE]; + private static float[] fa = new float[SIZE]; + private static float[] fb = new float[SIZE]; + private static float fresult; + private static double dresult; + + public static void main(String[] args) { + TestFramework.run(); + } + + @Test + @IR(failOn = {IRNode.ADD_REDUCTION_VF}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VF, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"}, + phase = CompilePhase.PRINT_IDEAL) + private static void testAddReductionVF() { + float result = 1; + for (int i = 0; i < SIZE; i++) { + result += (fa[i] + fb[i]); + } + fresult += result; + } + + @Test + @IR(failOn = {IRNode.ADD_REDUCTION_VD}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VD, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"}, + phase = CompilePhase.PRINT_IDEAL) + private static void testAddReductionVD() { + double result = 1; + for (int i = 0; i < SIZE; i++) { + result += (da[i] + db[i]); + } + dresult += result; + } + + @Test + @IR(failOn = {IRNode.MUL_REDUCTION_VF}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VF, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"}, + phase = CompilePhase.PRINT_IDEAL) + private static void testMulReductionVF() { + float result = 1; + for (int i = 0; i < SIZE; i++) { + result *= (fa[i] + fb[i]); + } + fresult += result; + } + + @Test + @IR(failOn = {IRNode.MUL_REDUCTION_VD}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VD, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"}, + phase = CompilePhase.PRINT_IDEAL) + private static void testMulReductionVD() { + double result = 1; + for (int i = 0; i < SIZE; i++) { + result *= (da[i] + db[i]); + } + dresult += result; + } +} diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorAddMulReduction.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorAddMulReduction.java new file mode 100644 index 00000000000..549d9aa5d49 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorAddMulReduction.java @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2024, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.vectorapi; + +import compiler.lib.ir_framework.*; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +import java.util.Random; + +import jdk.test.lib.Asserts; +import jdk.test.lib.Utils; + +/** + * @test + * @bug 8320725 + * @library /test/lib / + * @summary Verify non-strictly ordered AddReductionVF/VD and MulReductionVF/VD + * nodes are generated in VectorAPI + * @modules jdk.incubator.vector + * @run driver compiler.vectorapi.TestVectorAddMulReduction + */ + +public class TestVectorAddMulReduction { + + private static final int SIZE = 1024; + private static final Random RD = Utils.getRandomInstance(); + + private static float[] fa; + private static float fres; + private static double[] da; + private static double dres; + + static { + fa = new float[SIZE]; + da = new double[SIZE]; + fres = 1; + dres = 1; + for (int i = 0; i < SIZE; i++) { + fa[i] = RD.nextFloat(); + da[i] = RD.nextDouble(); + } + } + + // Test add reduction operation for floats + @ForceInline + public static void testFloatAddKernel(VectorSpecies SPECIES, float[] f) { + for (int i = 0; i < SPECIES.loopBound(f.length); i += SPECIES.length()) { + var av = FloatVector.fromArray(SPECIES, f, i); + fres += av.reduceLanes(VectorOperators.ADD); + } + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=8"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatAdd_64() { + testFloatAddKernel(FloatVector.SPECIES_64, fa); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=16"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatAdd_128() { + testFloatAddKernel(FloatVector.SPECIES_128, fa); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=32"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatAdd_256() { + testFloatAddKernel(FloatVector.SPECIES_256, fa); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=64"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatAdd_512() { + testFloatAddKernel(FloatVector.SPECIES_512, fa); + } + + // Test add reduction operation for doubles + @ForceInline + public static void testDoubleAddKernel(VectorSpecies SPECIES, double[] d) { + for (int i = 0; i < SPECIES.loopBound(d.length); i += SPECIES.length()) { + var 
av = DoubleVector.fromArray(SPECIES, d, i); + dres += av.reduceLanes(VectorOperators.ADD); + } + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=16"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testDoubleAdd_128() { + testDoubleAddKernel(DoubleVector.SPECIES_128, da); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=32"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testDoubleAdd_256() { + testDoubleAddKernel(DoubleVector.SPECIES_256, da); + } + + @Test + @IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=64"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testDoubleAdd_512() { + testDoubleAddKernel(DoubleVector.SPECIES_512, da); + } + + // Test mul reduction operation for floats + // On aarch64, there are no direct vector mul reduction instructions for float/double mul reduction + // and scalar instructions are emitted for 64-bit/128-bit vectors. Thus MulReductionVF/VD nodes are generated + // only for vector length of 8B/16B on vectorAPI. + @ForceInline + public static void testFloatMulKernel(VectorSpecies SPECIES, float[] f) { + for (int i = 0; i < SPECIES.loopBound(f.length); i += SPECIES.length()) { + var av = FloatVector.fromArray(SPECIES, f, i); + fres += av.reduceLanes(VectorOperators.MUL); + } + } + + @Test + @IR(counts = {IRNode.MUL_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=8"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatMul_64() { + testFloatMulKernel(FloatVector.SPECIES_64, fa); + } + + @Test + @IR(counts = {IRNode.MUL_REDUCTION_VF, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=16"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testFloatMul_128() { + testFloatMulKernel(FloatVector.SPECIES_128, fa); + } + + // Test mul reduction operation for doubles + @ForceInline + public static void testDoubleMulKernel(VectorSpecies SPECIES, double[] d) { + for (int i = 0; i < SPECIES.loopBound(d.length); i += SPECIES.length()) { + var av = DoubleVector.fromArray(SPECIES, d, i); + dres += av.reduceLanes(VectorOperators.MUL); + } + } + + @Test + @IR(counts = {IRNode.MUL_REDUCTION_VD, ">=1", "no_strict_order", ">=1"}, + failOn = {"requires_strict_order"}, + applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"MaxVectorSize", ">=16"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testDoubleMul_128() { + testDoubleMulKernel(DoubleVector.SPECIES_128, da); + } + + public static void main(String[] args) { + TestFramework.runWithFlags("--add-modules=jdk.incubator.vector"); + } +}
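
Background note (not part of the patch): AddReductionVF/VD must keep strict order under auto-vectorization because floating-point addition is not associative, so reassociating the sum can change the result. A minimal, hypothetical Java sketch (class name and values chosen only for illustration) of the difference the requires_strict_order flag protects against:

public class FPReductionOrder {
    public static void main(String[] args) {
        float a = 1e8f, b = -1e8f, c = 1e-3f;
        // Sequential (strict) order, as the JLS prescribes for scalar code:
        float strict = (a + b) + c;   // 0.001f
        // Reassociated order, as a relaxed vector reduction might compute it:
        float relaxed = a + (b + c);  // 0.0f, because c is absorbed into b
        System.out.println(strict + " vs " + relaxed);
    }
}

This is why the NEON pairwise-add rules are matched only when requires_strict_order() is false (Vector API reductions), while auto-vectorized FP add reductions fall back to the strictly ordered SVE fadda rule or stay scalar.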