8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction

Co-authored-by: Eric Liu <eliu@openjdk.org>
Reviewed-by: gli, epeter, aph
This commit is contained in:
Bhavana Kilambi 2024-06-11 07:16:56 +00:00 committed by Emanuel Peter
parent badf1cb9ce
commit 0e4d4a0c31
10 changed files with 575 additions and 118 deletions

View File

@ -135,9 +135,9 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
// It is not suitable for auto-vectorization because it does not add the elements
// in the same order as sequential code, and FP addition is non-associative.
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
@ -2858,14 +2858,14 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
%}
// reduction addF
// Floating-point addition is not associative, so the rules for AddReductionVF
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVF is only generated by Vector API.
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst);
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
ins_encode %{
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
@ -2873,11 +2873,13 @@ instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
ins_pipe(pipe_slow);
%}
instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
// Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
ins_encode %{
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
@ -2886,11 +2888,21 @@ instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
ins_pipe(pipe_slow);
%}
// This rule calculates the reduction result in strict order. Two cases will
// reach here:
// 1. Non strictly-ordered AddReductionVF when vector size > 128-bits. For example -
// AddReductionVF generated by Vector API. For vector size > 128-bits, it is more
// beneficial performance-wise to generate direct SVE instruction even if it is
// strictly ordered.
// 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by
// auto-vectorization on SVE machine.
instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
predicate(UseSVE > 0);
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 src2));
format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
__ sve_fadda($dst_src1$$FloatRegister, __ S, ptrue, $src2$$FloatRegister);
@ -2899,14 +2911,14 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
%}
// reduction addD
// Floating-point addition is not associative, so the rule for AddReductionVD
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVD is only generated by Vector API.
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
predicate(UseSVE == 0);
instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(!n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVD dsrc vsrc));
effect(TEMP_DEF dst);
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
ins_encode %{
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
@ -2914,11 +2926,21 @@ instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
ins_pipe(pipe_slow);
%}
// This rule calculates the reduction result in strict order. Two cases will
// reach here:
// 1. Non strictly-ordered AddReductionVD when vector size > 128-bits. For example -
// AddReductionVD generated by Vector API. For vector size > 128-bits, it is more
// beneficial performance-wise to generate direct SVE instruction even if it is
// strictly ordered.
// 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by
// auto-vectorization on SVE machine.
instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
predicate(UseSVE > 0);
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVD dst_src1 src2));
format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
__ sve_fadda($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister);

View File

@ -125,9 +125,9 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
// It is not suitable for auto-vectorization because it does not add the elements
// in the same order as sequential code, and FP addition is non-associative.
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
@ -1752,14 +1752,14 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(I, iRegIorL2I)
REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)
// reduction addF
// Floating-point addition is not associative, so the rules for AddReductionVF
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVF is only generated by Vector API.
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst);
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
ins_encode %{
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
@ -1767,11 +1767,13 @@ instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
ins_pipe(pipe_slow);
%}
instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
// Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
ins_encode %{
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
@ -1783,11 +1785,21 @@ dnl
dnl REDUCE_ADD_FP_SVE($1, $2 )
dnl REDUCE_ADD_FP_SVE(type, size)
define(`REDUCE_ADD_FP_SVE', `
// This rule calculates the reduction result in strict order. Two cases will
// reach here:
// 1. Non strictly-ordered AddReductionV$1 when vector size > 128-bits. For example -
// AddReductionV$1 generated by Vector API. For vector size > 128-bits, it is more
// beneficial performance-wise to generate direct SVE instruction even if it is
// strictly ordered.
// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
// auto-vectorization on SVE machine.
instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
predicate(UseSVE > 0);
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
__ sve_fadda($dst_src1$$FloatRegister, __ $2, ptrue, $src2$$FloatRegister);
@ -1798,14 +1810,14 @@ dnl
REDUCE_ADD_FP_SVE(F, S)
// reduction addD
// Floating-point addition is not associative, so the rule for AddReductionVD
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVD is only generated by Vector API.
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
predicate(UseSVE == 0);
instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(!n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVD dsrc vsrc));
effect(TEMP_DEF dst);
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
ins_encode %{
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1998, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -1460,7 +1460,7 @@ public:
};
AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared);
// Move UnorderedReduction out of loop if possible
// Move an unordered Reduction out of loop if possible
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
// Create a scheduled list of nodes control dependent on ctrl set.

View File

@ -4310,11 +4310,19 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
return AutoVectorizeStatus::Success;
}
// Tells whether n is a Reduction node that does not require strict ordering,
// i.e. a Reduction for which requires_strict_order() is false.
static bool is_unordered_reduction(Node* n) {
  if (!n->is_Reduction()) {
    return false;
  }
  return !n->as_Reduction()->requires_strict_order();
}
// Having ReductionNodes in the loop is expensive. They need to recursively
// fold together the vector values, for every vectorized loop iteration. If
// we encounter the following pattern, we can vector accumulate the values
// inside the loop, and only have a single UnorderedReduction after the loop.
//
// Note: UnorderedReduction represents a ReductionNode which does not require
// calculating in strict order.
//
// CountedLoop init
// | |
// +------+ | +-----------------------+
@ -4354,21 +4362,24 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
// wise. This is a single operation per vector_accumulator, rather than many
// for an UnorderedReduction. We can then reduce the last vector_accumulator
// after the loop, and also reduce the init value into it.
//
// We can not do this with all reductions. Some reductions do not allow the
// reordering of operations (for example float addition).
// reordering of operations (for example float addition/multiplication require
// strict order).
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");
// Find all Phi nodes with UnorderedReduction on backedge.
// Find all Phi nodes with an unordered Reduction on backedge.
CountedLoopNode* cl = loop->_head->as_CountedLoop();
for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
Node* phi = cl->fast_out(j);
// We have a phi with a single use, and a UnorderedReduction on the backedge.
if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) {
// We have a phi with a single use, and an unordered Reduction on the backedge.
if (!phi->is_Phi() || phi->outcnt() != 1 || !is_unordered_reduction(phi->in(2))) {
continue;
}
UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction();
ReductionNode* last_ur = phi->in(2)->as_Reduction();
assert(!last_ur->requires_strict_order(), "must be");
// Determine types
const TypeVect* vec_t = last_ur->vect_type();
@ -4385,14 +4396,14 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
continue; // not implemented -> fails
}
// Traverse up the chain of UnorderedReductions, checking that it loops back to
// the phi. Check that all UnorderedReductions only have a single use, except for
// Traverse up the chain of unordered Reductions, checking that it loops back to
// the phi. Check that all unordered Reductions only have a single use, except for
// the last (last_ur), which only has phi as a use in the loop, and all other uses
// are outside the loop.
UnorderedReductionNode* current = last_ur;
UnorderedReductionNode* first_ur = nullptr;
ReductionNode* current = last_ur;
ReductionNode* first_ur = nullptr;
while (true) {
assert(current->is_UnorderedReduction(), "sanity");
assert(!current->requires_strict_order(), "sanity");
// Expect no ctrl and a vector_input from within the loop.
Node* ctrl = current->in(0);
@ -4409,7 +4420,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
break; // Chain traversal fails.
}
// Expect single use of UnorderedReduction, except for last_ur.
// Expect single use of an unordered Reduction, except for last_ur.
if (current == last_ur) {
// Expect all uses to be outside the loop, except phi.
for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
@ -4427,12 +4438,13 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
}
}
// Expect another UnorderedReduction or phi as the scalar input.
// Expect another unordered Reduction or phi as the scalar input.
Node* scalar_input = current->in(1);
if (scalar_input->is_UnorderedReduction() &&
if (is_unordered_reduction(scalar_input) &&
scalar_input->Opcode() == current->Opcode()) {
// Move up the UnorderedReduction chain.
current = scalar_input->as_UnorderedReduction();
// Move up the unordered Reduction chain.
current = scalar_input->as_Reduction();
assert(!current->requires_strict_order(), "must be");
} else if (scalar_input == phi) {
// Chain terminates at phi.
first_ur = current;
@ -4456,7 +4468,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t);
register_new_node(identity_vector, C->root());
assert(vec_t == identity_vector->vect_type(), "matching vector type");
VectorNode::trace_new_vector(identity_vector, "UnorderedReduction");
VectorNode::trace_new_vector(identity_vector, "Unordered Reduction");
// Turn the scalar phi into a vector phi.
_igvn.rehash_node_delayed(phi);
@ -4465,7 +4477,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
phi->as_Type()->set_type(vec_t);
_igvn.set_type(phi, vec_t);
// Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators.
// Traverse down the chain of unordered Reductions, and replace them with vector_accumulators.
current = first_ur;
while (true) {
// Create vector_accumulator to replace current.
@ -4474,11 +4486,12 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
register_new_node(vector_accumulator, cl);
_igvn.replace_node(current, vector_accumulator);
VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction");
VectorNode::trace_new_vector(vector_accumulator, "Unordered Reduction");
if (current == last_ur) {
break;
}
current = vector_accumulator->unique_out()->as_UnorderedReduction();
current = vector_accumulator->unique_out()->as_Reduction();
assert(!current->requires_strict_order(), "must be");
}
// Create post-loop reduction.
@ -4495,7 +4508,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
}
}
register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction");
VectorNode::trace_new_vector(post_loop_reduction, "Unordered Reduction");
assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");

View File

@ -175,7 +175,6 @@ class SubTypeCheckNode;
class Type;
class TypeNode;
class UnlockNode;
class UnorderedReductionNode;
class VectorNode;
class LoadVectorNode;
class LoadVectorMaskedNode;
@ -739,7 +738,6 @@ public:
DEFINE_CLASS_ID(ExpandV, Vector, 5)
DEFINE_CLASS_ID(CompressM, Vector, 6)
DEFINE_CLASS_ID(Reduction, Vector, 7)
DEFINE_CLASS_ID(UnorderedReduction, Reduction, 0)
DEFINE_CLASS_ID(NegV, Vector, 8)
DEFINE_CLASS_ID(Con, Type, 8)
DEFINE_CLASS_ID(ConI, Con, 0)
@ -991,7 +989,6 @@ public:
DEFINE_CLASS_QUERY(Sub)
DEFINE_CLASS_QUERY(SubTypeCheck)
DEFINE_CLASS_QUERY(Type)
DEFINE_CLASS_QUERY(UnorderedReduction)
DEFINE_CLASS_QUERY(Vector)
DEFINE_CLASS_QUERY(VectorMaskCmp)
DEFINE_CLASS_QUERY(VectorUnbox)

View File

@ -1616,21 +1616,23 @@ bool LibraryCallKit::inline_vector_reduction() {
}
Node* init = ReductionNode::make_identity_con_scalar(gvn(), opc, elem_bt);
Node* value = nullptr;
if (mask == nullptr) {
assert(!is_masked_op, "Masked op needs the mask value never null");
value = ReductionNode::make(opc, nullptr, init, opd, elem_bt);
} else {
if (use_predicate) {
value = ReductionNode::make(opc, nullptr, init, opd, elem_bt);
Node* value = opd;
assert(mask != nullptr || !is_masked_op, "Masked op needs the mask value never null");
if (mask != nullptr && !use_predicate) {
Node* reduce_identity = gvn().transform(VectorNode::scalar2vector(init, num_elem, Type::get_const_basic_type(elem_bt)));
value = gvn().transform(new VectorBlendNode(reduce_identity, value, mask));
}
// Make an unordered Reduction node. This affects only AddReductionVF/VD and MulReductionVF/VD,
// as the VectorAPI allows these operations to be reassociated (no strict order is required).
value = ReductionNode::make(opc, nullptr, init, value, elem_bt, /* requires_strict_order */ false);
if (mask != nullptr && use_predicate) {
value->add_req(mask);
value->add_flag(Node::Flag_is_predicated_vector);
} else {
Node* reduce_identity = gvn().transform(VectorNode::scalar2vector(init, num_elem, Type::get_const_basic_type(elem_bt)));
value = gvn().transform(new VectorBlendNode(reduce_identity, opd, mask));
value = ReductionNode::make(opc, nullptr, init, value, elem_bt);
}
}
value = gvn().transform(value);
Node* bits = nullptr;

View File

@ -1296,7 +1296,8 @@ int ReductionNode::opcode(int opc, BasicType bt) {
}
// Return the appropriate reduction node.
ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) {
ReductionNode* ReductionNode::make(int opc, Node* ctrl, Node* n1, Node* n2, BasicType bt,
bool requires_strict_order) {
int vopc = opcode(opc, bt);
@ -1306,17 +1307,17 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
switch (vopc) {
case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2);
case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2);
case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2, requires_strict_order);
case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2, requires_strict_order);
case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
case Op_AndReductionV: return new AndReductionVNode(ctrl, n1, n2);
case Op_OrReductionV: return new OrReductionVNode(ctrl, n1, n2);
case Op_XorReductionV: return new XorReductionVNode(ctrl, n1, n2);
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2, requires_strict_order);
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2, requires_strict_order);
case Op_MinReductionV: return new MinReductionVNode (ctrl, n1, n2);
case Op_MaxReductionV: return new MaxReductionVNode (ctrl, n1, n2);
case Op_AndReductionV: return new AndReductionVNode (ctrl, n1, n2);
case Op_OrReductionV: return new OrReductionVNode (ctrl, n1, n2);
case Op_XorReductionV: return new XorReductionVNode (ctrl, n1, n2);
default:
assert(false, "unknown node: %s", NodeClassNames[vopc]);
return nullptr;

View File

@ -203,7 +203,9 @@ class ReductionNode : public Node {
init_class_id(Class_Reduction);
}
static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt);
static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt,
// This only affects floating-point add and mul reductions.
bool requires_strict_order = true);
static int opcode(int opc, BasicType bt);
static bool implemented(int opc, uint vlen, BasicType bt);
// Make an identity scalar (zero for add, one for mul, etc) for scalar opc.
@ -225,47 +227,97 @@ class ReductionNode : public Node {
// Needed for proper cloning.
virtual uint size_of() const { return sizeof(*this); }
};
//---------------------------UnorderedReductionNode-------------------------------------
// Order of reduction does not matter. Example int add. Not true for float add.
class UnorderedReductionNode : public ReductionNode {
public:
UnorderedReductionNode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {
init_class_id(Class_UnorderedReduction);
// Reports whether this reduction must be evaluated in strict (sequential) order.
//
// Floating-point addition and multiplication are non-associative, so
// AddReductionVF/D and MulReductionVF/D require strict ordering
// in auto-vectorization. Vector API can generate AddReductionVF/D
// and MulReductionVF/D without strict ordering, which can benefit
// some platforms.
//
// Other reductions don't need strict ordering, hence the default of false
// here. The floating-point add and mul reduction nodes override this method
// and return their own _requires_strict_order flag.
virtual bool requires_strict_order() const {
return false;
}
#ifndef PRODUCT
// Prints the ordering requirement of this reduction to st:
// "requires_strict_order" or "no_strict_order".
void dump_spec(outputStream* st) const {
  if (!requires_strict_order()) {
    st->print("no_strict_order");
    return;
  }
  st->print("requires_strict_order");
}
#endif
};
//------------------------------AddReductionVINode--------------------------------------
// Vector add byte, short and int as a reduction
class AddReductionVINode : public UnorderedReductionNode {
class AddReductionVINode : public ReductionNode {
public:
AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
AddReductionVINode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------AddReductionVLNode--------------------------------------
// Vector add long as a reduction
class AddReductionVLNode : public UnorderedReductionNode {
class AddReductionVLNode : public ReductionNode {
public:
AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
AddReductionVLNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------AddReductionVFNode--------------------------------------
// Vector add float as a reduction
class AddReductionVFNode : public ReductionNode {
private:
// True if add reduction operation for floats requires strict ordering.
// As an example - The value is true when add reduction for floats is auto-vectorized
// as auto-vectorization mandates strict ordering but the value is false when this node
// is generated through VectorAPI as VectorAPI does not impose any such rules on ordering.
const bool _requires_strict_order;
public:
AddReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
// _requires_strict_order defaults to true, as mandated by auto-vectorization.
AddReductionVFNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
virtual int Opcode() const;
virtual bool requires_strict_order() const { return _requires_strict_order; }
virtual uint hash() const { return Node::hash() + _requires_strict_order; }
virtual bool cmp(const Node& n) const {
return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
}
virtual uint size_of() const { return sizeof(*this); }
};
//------------------------------AddReductionVDNode--------------------------------------
// Vector add double as a reduction
class AddReductionVDNode : public ReductionNode {
private:
// True if add reduction operation for doubles requires strict ordering.
// As an example - The value is true when add reduction for doubles is auto-vectorized
// as auto-vectorization mandates strict ordering but the value is false when this node
// is generated through VectorAPI as VectorAPI does not impose any such rules on ordering.
const bool _requires_strict_order;
public:
AddReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
// _requires_strict_order defaults to true, as mandated by auto-vectorization.
AddReductionVDNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
virtual int Opcode() const;
virtual bool requires_strict_order() const { return _requires_strict_order; }
virtual uint hash() const { return Node::hash() + _requires_strict_order; }
virtual bool cmp(const Node& n) const {
return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
}
virtual uint size_of() const { return sizeof(*this); }
};
//------------------------------SubVBNode--------------------------------------
@ -400,34 +452,70 @@ public:
//------------------------------MulReductionVINode--------------------------------------
// Vector multiply byte, short and int as a reduction
class MulReductionVINode : public UnorderedReductionNode {
class MulReductionVINode : public ReductionNode {
public:
MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
MulReductionVINode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------MulReductionVLNode--------------------------------------
// Vector multiply long as a reduction
class MulReductionVLNode : public UnorderedReductionNode {
class MulReductionVLNode : public ReductionNode {
public:
MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
MulReductionVLNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------MulReductionVFNode--------------------------------------
// Vector multiply float as a reduction
class MulReductionVFNode : public ReductionNode {
// True if mul reduction operation for floats requires strict ordering.
// As an example - The value is true when mul reduction for floats is auto-vectorized
// as auto-vectorization mandates strict ordering but the value is false when this node
// is generated through VectorAPI as VectorAPI does not impose any such rules on ordering.
const bool _requires_strict_order;
public:
MulReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
// _requires_strict_order defaults to true, as mandated by auto-vectorization.
MulReductionVFNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
virtual int Opcode() const;
virtual bool requires_strict_order() const { return _requires_strict_order; }
virtual uint hash() const { return Node::hash() + _requires_strict_order; }
virtual bool cmp(const Node& n) const {
return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
}
virtual uint size_of() const { return sizeof(*this); }
};
//------------------------------MulReductionVDNode--------------------------------------
// Vector multiply double as a reduction
class MulReductionVDNode : public ReductionNode {
  // True if mul reduction operation for doubles requires strict ordering.
  // As an example - The value is true when mul reduction for doubles is auto-vectorized
  // as auto-vectorization mandates strict ordering but the value is false when this node
  // is generated through VectorAPI as VectorAPI does not impose any such rules on ordering.
  const bool _requires_strict_order;
 public:
  // _requires_strict_order is set to true by default as mandated by auto-vectorization.
  // This must be the only constructor: an additional three-argument overload would be
  // ambiguous with the defaulted fourth parameter and would leave the const flag
  // uninitialized.
  MulReductionVDNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
    ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}

  virtual int Opcode() const;

  // Returns the ordering flag captured at construction time.
  virtual bool requires_strict_order() const { return _requires_strict_order; }

  // The flag takes part in both hashing and comparison, so two nodes that differ
  // only in their ordering requirement hash and compare as different.
  virtual uint hash() const { return Node::hash() + _requires_strict_order; }
  virtual bool cmp(const Node& n) const {
    return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
  }

  // Reports the size of this class, which includes the extra flag field.
  virtual uint size_of() const { return sizeof(*this); }
};
//------------------------------DivVFNode--------------------------------------
@ -753,9 +841,9 @@ class AndVNode : public VectorNode {
//------------------------------AndReductionVNode--------------------------------------
// Vector and byte, short, int, long as a reduction
class AndReductionVNode : public UnorderedReductionNode {
class AndReductionVNode : public ReductionNode {
public:
AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
AndReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
@ -770,9 +858,9 @@ class OrVNode : public VectorNode {
//------------------------------OrReductionVNode--------------------------------------
// Vector or byte, short, int, long as a reduction
class OrReductionVNode : public UnorderedReductionNode {
class OrReductionVNode : public ReductionNode {
public:
OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
OrReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
@ -787,25 +875,25 @@ class XorVNode : public VectorNode {
//------------------------------XorReductionVNode--------------------------------------
// Vector xor int, long as a reduction
class XorReductionVNode : public UnorderedReductionNode {
class XorReductionVNode : public ReductionNode {
public:
XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
XorReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------MinReductionVNode--------------------------------------
// Vector min byte, short, int, long, float, double as a reduction
class MinReductionVNode : public UnorderedReductionNode {
class MinReductionVNode : public ReductionNode {
public:
MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
MinReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------MaxReductionVNode--------------------------------------
// Vector max byte, short, int, long, float, double as a reduction
class MaxReductionVNode : public UnorderedReductionNode {
class MaxReductionVNode : public ReductionNode {
public:
MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
MaxReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};

View File

@ -0,0 +1,111 @@
/*
* Copyright (c) 2024, Arm Limited. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
/*
* @test
* @bug 8320725
* @summary Ensure strictly ordered AddReductionVF/VD and MulReductionVF/VD nodes
* are generated when these operations are auto-vectorized
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestVectorFPReduction
*/
// IR-verification test for floating-point add and mul reductions written as plain scalar loops.
// Every test method carries two @IR rules:
//   1. with CPU feature "asimd" present and "sve" absent, the corresponding reduction node
//      must not appear at all;
//   2. with "sve" or "sse2" present, the PRINT_IDEAL output must contain the reduction node and
//      the string "requires_strict_order" at least once each, and must not contain
//      "no_strict_order".
public class TestVectorFPReduction {
// Length of every input array and trip count of every test loop.
final private static int SIZE = 1024;
// Input arrays. Nothing in this class writes to them, so all elements keep their default of zero.
private static double[] da = new double[SIZE];
private static double[] db = new double[SIZE];
private static float[] fa = new float[SIZE];
private static float[] fb = new float[SIZE];
// Static sinks that each test adds its loop result to.
// NOTE(review): presumably this keeps the reduction loop from being treated as dead code - confirm.
private static float fresult;
private static double dresult;
// Entry point: hands control to the IR test framework with no extra arguments.
public static void main(String[] args) {
TestFramework.run();
}
// Float add reduction: accumulates fa[i] + fb[i] over SIZE elements, starting from 1.
// Checked node: ADD_REDUCTION_VF.
@Test
@IR(failOn = {IRNode.ADD_REDUCTION_VF},
applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
@IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VF, ">=1"},
failOn = {"no_strict_order"},
applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},
phase = CompilePhase.PRINT_IDEAL)
private static void testAddReductionVF() {
float result = 1;
for (int i = 0; i < SIZE; i++) {
result += (fa[i] + fb[i]);
}
fresult += result;
}
// Double add reduction: accumulates da[i] + db[i] over SIZE elements, starting from 1.
// Checked node: ADD_REDUCTION_VD.
@Test
@IR(failOn = {IRNode.ADD_REDUCTION_VD},
applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
@IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VD, ">=1"},
failOn = {"no_strict_order"},
applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},
phase = CompilePhase.PRINT_IDEAL)
private static void testAddReductionVD() {
double result = 1;
for (int i = 0; i < SIZE; i++) {
result += (da[i] + db[i]);
}
dresult += result;
}
// Float mul reduction: multiplies the accumulator by fa[i] + fb[i] over SIZE elements,
// starting from 1. Checked node: MUL_REDUCTION_VF.
@Test
@IR(failOn = {IRNode.MUL_REDUCTION_VF},
applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
@IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VF, ">=1"},
failOn = {"no_strict_order"},
applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},
phase = CompilePhase.PRINT_IDEAL)
private static void testMulReductionVF() {
float result = 1;
for (int i = 0; i < SIZE; i++) {
result *= (fa[i] + fb[i]);
}
fresult += result;
}
// Double mul reduction: multiplies the accumulator by da[i] + db[i] over SIZE elements,
// starting from 1. Checked node: MUL_REDUCTION_VD.
@Test
@IR(failOn = {IRNode.MUL_REDUCTION_VD},
applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
@IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VD, ">=1"},
failOn = {"no_strict_order"},
applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},
phase = CompilePhase.PRINT_IDEAL)
private static void testMulReductionVD() {
double result = 1;
for (int i = 0; i < SIZE; i++) {
result *= (da[i] + db[i]);
}
dresult += result;
}
}

View File

@ -0,0 +1,211 @@
/*
* Copyright (c) 2024, Arm Limited. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.vectorapi;
import compiler.lib.ir_framework.*;
import jdk.incubator.vector.DoubleVector;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;
import java.util.Random;
import jdk.test.lib.Asserts;
import jdk.test.lib.Utils;
/**
* @test
* @bug 8320725
* @library /test/lib /
* @summary Verify non-strictly ordered AddReductionVF/VD and MulReductionVF/VD
* nodes are generated in VectorAPI
* @modules jdk.incubator.vector
* @run driver compiler.vectorapi.TestVectorAddMulReduction
*/
public class TestVectorAddMulReduction {

    /** Length of both input arrays. */
    private static final int SIZE = 1024;
    private static final Random RD = Utils.getRandomInstance();

    // Inputs and result sinks. They are assigned once in the static initializer below;
    // the kernels then add every lane reduction into fres / dres.
    private static float[] fa;
    private static float fres;
    private static double[] da;
    private static double dres;

    static {
        fa = new float[SIZE];
        da = new double[SIZE];
        fres = 1;
        dres = 1;
        for (int i = 0; i < SIZE; i++) {
            fa[i] = RD.nextFloat();
            da[i] = RD.nextDouble();
        }
    }

    // Every @IR rule in this class requires, at the PRINT_IDEAL phase, at least one occurrence
    // of the reduction node and of the string "no_strict_order", and no occurrence of
    // "requires_strict_order". A rule applies when "asimd" or "sse2" is available and
    // MaxVectorSize is at least the byte width of the species used by the test.

    // Test add reduction operation for floats

    /**
     * Loads {@code f} one vector at a time and adds each vector's ADD lane reduction to
     * {@code fres}. Elements beyond {@code SPECIES.loopBound(f.length)} are not visited.
     *
     * @param SPECIES float species that fixes the vector width
     * @param f       input array
     */
    @ForceInline
    public static void testFloatAddKernel(VectorSpecies<Float> SPECIES, float[] f) {
        for (int i = 0; i < SPECIES.loopBound(f.length); i += SPECIES.length()) {
            var av = FloatVector.fromArray(SPECIES, f, i);
            fres += av.reduceLanes(VectorOperators.ADD);
        }
    }

    @Test
    @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=8"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testFloatAdd_64() {
        testFloatAddKernel(FloatVector.SPECIES_64, fa);
    }

    @Test
    @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=16"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testFloatAdd_128() {
        testFloatAddKernel(FloatVector.SPECIES_128, fa);
    }

    @Test
    @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=32"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testFloatAdd_256() {
        testFloatAddKernel(FloatVector.SPECIES_256, fa);
    }

    @Test
    @IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=64"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testFloatAdd_512() {
        testFloatAddKernel(FloatVector.SPECIES_512, fa);
    }

    // Test add reduction operation for doubles

    /**
     * Loads {@code d} one vector at a time and adds each vector's ADD lane reduction to
     * {@code dres}. Elements beyond {@code SPECIES.loopBound(d.length)} are not visited.
     *
     * @param SPECIES double species that fixes the vector width
     * @param d       input array
     */
    @ForceInline
    public static void testDoubleAddKernel(VectorSpecies<Double> SPECIES, double[] d) {
        for (int i = 0; i < SPECIES.loopBound(d.length); i += SPECIES.length()) {
            var av = DoubleVector.fromArray(SPECIES, d, i);
            dres += av.reduceLanes(VectorOperators.ADD);
        }
    }

    @Test
    @IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=16"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testDoubleAdd_128() {
        testDoubleAddKernel(DoubleVector.SPECIES_128, da);
    }

    @Test
    @IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=32"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testDoubleAdd_256() {
        testDoubleAddKernel(DoubleVector.SPECIES_256, da);
    }

    @Test
    @IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=64"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testDoubleAdd_512() {
        testDoubleAddKernel(DoubleVector.SPECIES_512, da);
    }

    // Test mul reduction operation for floats
    // On aarch64, there are no direct vector mul reduction instructions for float/double mul reduction
    // and scalar instructions are emitted for 64-bit/128-bit vectors. Thus MulReductionVF/VD nodes are generated
    // only for vector length of 8B/16B on vectorAPI.

    /**
     * Loads {@code f} one vector at a time and adds each vector's MUL lane reduction to
     * {@code fres}. Elements beyond {@code SPECIES.loopBound(f.length)} are not visited.
     *
     * @param SPECIES float species that fixes the vector width
     * @param f       input array
     */
    @ForceInline
    public static void testFloatMulKernel(VectorSpecies<Float> SPECIES, float[] f) {
        for (int i = 0; i < SPECIES.loopBound(f.length); i += SPECIES.length()) {
            var av = FloatVector.fromArray(SPECIES, f, i);
            fres += av.reduceLanes(VectorOperators.MUL);
        }
    }

    @Test
    @IR(counts = {IRNode.MUL_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=8"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testFloatMul_64() {
        testFloatMulKernel(FloatVector.SPECIES_64, fa);
    }

    @Test
    @IR(counts = {IRNode.MUL_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=16"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testFloatMul_128() {
        testFloatMulKernel(FloatVector.SPECIES_128, fa);
    }

    // Test mul reduction operation for doubles

    /**
     * Loads {@code d} one vector at a time and adds each vector's MUL lane reduction to
     * {@code dres}. Elements beyond {@code SPECIES.loopBound(d.length)} are not visited.
     *
     * @param SPECIES double species that fixes the vector width
     * @param d       input array
     */
    @ForceInline
    public static void testDoubleMulKernel(VectorSpecies<Double> SPECIES, double[] d) {
        for (int i = 0; i < SPECIES.loopBound(d.length); i += SPECIES.length()) {
            var av = DoubleVector.fromArray(SPECIES, d, i);
            dres += av.reduceLanes(VectorOperators.MUL);
        }
    }

    @Test
    @IR(counts = {IRNode.MUL_REDUCTION_VD, ">=1", "no_strict_order", ">=1"},
        failOn = {"requires_strict_order"},
        applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
        applyIf = {"MaxVectorSize", ">=16"},
        phase = CompilePhase.PRINT_IDEAL)
    public static void testDoubleMul_128() {
        testDoubleMulKernel(DoubleVector.SPECIES_128, da);
    }

    /** Runs the IR framework with the incubating vector module added. */
    public static void main(String[] args) {
        TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");
    }
}