8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction
Co-authored-by: Eric Liu <eliu@openjdk.org> Reviewed-by: gli, epeter, aph
This commit is contained in:
parent
badf1cb9ce
commit
0e4d4a0c31
@ -135,9 +135,9 @@ source %{
|
||||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
|
||||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
|
||||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
|
||||
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
|
||||
// It is not suitable for auto-vectorization because it does not add the elements
|
||||
// in the same order as sequential code, and FP addition is non-associative.
|
||||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
|
||||
// They are not suitable for auto-vectorization because the result would not conform
|
||||
// to the JLS, Section Evaluation Order.
|
||||
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
|
||||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
|
||||
opcode == Op_MulVL) {
|
||||
@ -2858,14 +2858,14 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
|
||||
%}
|
||||
|
||||
// reduction addF
|
||||
// Floating-point addition is not associative, so the rules for AddReductionVF
|
||||
// on NEON can't be used to auto-vectorize floating-point reduce-add.
|
||||
// Currently, on NEON, AddReductionVF is only generated by Vector API.
|
||||
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
|
||||
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
|
||||
|
||||
instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
|
||||
// Non-strictly ordered floating-point add reduction for a 64-bit vector. This rule is
|
||||
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
|
||||
predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
|
||||
match(Set dst (AddReductionVF fsrc vsrc));
|
||||
effect(TEMP_DEF dst);
|
||||
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
|
||||
format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
|
||||
ins_encode %{
|
||||
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
|
||||
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
|
||||
@ -2873,11 +2873,13 @@ instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
|
||||
ins_pipe(pipe_slow);
|
||||
%}
|
||||
|
||||
instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
|
||||
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
|
||||
instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
|
||||
// Non-strictly ordered floating-point add reduction for a 128-bit vector. This rule is
|
||||
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
|
||||
predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
|
||||
match(Set dst (AddReductionVF fsrc vsrc));
|
||||
effect(TEMP_DEF dst, TEMP tmp);
|
||||
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
|
||||
format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
|
||||
ins_encode %{
|
||||
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
|
||||
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
|
||||
@ -2886,11 +2888,21 @@ instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
|
||||
ins_pipe(pipe_slow);
|
||||
%}
|
||||
|
||||
// This rule calculates the reduction result in strict order. Two cases will
|
||||
// reach here:
|
||||
// 1. Non strictly-ordered AddReductionVF when vector size > 128-bits. For example -
|
||||
// AddReductionVF generated by Vector API. For vector size > 128-bits, it is more
|
||||
// beneficial performance-wise to generate direct SVE instruction even if it is
|
||||
// strictly ordered.
|
||||
// 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by
|
||||
// auto-vectorization on SVE machine.
|
||||
instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
|
||||
predicate(UseSVE > 0);
|
||||
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
|
||||
n->as_Reduction()->requires_strict_order());
|
||||
match(Set dst_src1 (AddReductionVF dst_src1 src2));
|
||||
format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %}
|
||||
ins_encode %{
|
||||
assert(UseSVE > 0, "must be sve");
|
||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
|
||||
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
|
||||
__ sve_fadda($dst_src1$$FloatRegister, __ S, ptrue, $src2$$FloatRegister);
|
||||
@ -2899,14 +2911,14 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
|
||||
%}
|
||||
|
||||
// reduction addD
|
||||
// Floating-point addition is not associative, so the rule for AddReductionVD
|
||||
// on NEON can't be used to auto-vectorize floating-point reduce-add.
|
||||
// Currently, on NEON, AddReductionVD is only generated by Vector API.
|
||||
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
|
||||
predicate(UseSVE == 0);
|
||||
|
||||
instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
|
||||
// Non-strictly ordered floating-point add reduction for doubles. This rule is
|
||||
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
|
||||
predicate(!n->as_Reduction()->requires_strict_order());
|
||||
match(Set dst (AddReductionVD dsrc vsrc));
|
||||
effect(TEMP_DEF dst);
|
||||
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
|
||||
format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
|
||||
ins_encode %{
|
||||
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
|
||||
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
|
||||
@ -2914,11 +2926,21 @@ instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
|
||||
ins_pipe(pipe_slow);
|
||||
%}
|
||||
|
||||
// This rule calculates the reduction result in strict order. Two cases will
|
||||
// reach here:
|
||||
// 1. Non strictly-ordered AddReductionVD when vector size > 128-bits. For example -
|
||||
// AddReductionVD generated by Vector API. For vector size > 128-bits, it is more
|
||||
// beneficial performance-wise to generate direct SVE instruction even if it is
|
||||
// strictly ordered.
|
||||
// 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by
|
||||
// auto-vectorization on SVE machine.
|
||||
instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
|
||||
predicate(UseSVE > 0);
|
||||
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
|
||||
n->as_Reduction()->requires_strict_order());
|
||||
match(Set dst_src1 (AddReductionVD dst_src1 src2));
|
||||
format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %}
|
||||
ins_encode %{
|
||||
assert(UseSVE > 0, "must be sve");
|
||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
|
||||
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
|
||||
__ sve_fadda($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister);
|
||||
|
@ -125,9 +125,9 @@ source %{
|
||||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
|
||||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
|
||||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
|
||||
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
|
||||
// It is not suitable for auto-vectorization because it does not add the elements
|
||||
// in the same order as sequential code, and FP addition is non-associative.
|
||||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
|
||||
// They are not suitable for auto-vectorization because the result would not conform
|
||||
// to the JLS, Section Evaluation Order.
|
||||
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
|
||||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
|
||||
opcode == Op_MulVL) {
|
||||
@ -1752,14 +1752,14 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(I, iRegIorL2I)
|
||||
REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)
|
||||
|
||||
// reduction addF
|
||||
// Floating-point addition is not associative, so the rules for AddReductionVF
|
||||
// on NEON can't be used to auto-vectorize floating-point reduce-add.
|
||||
// Currently, on NEON, AddReductionVF is only generated by Vector API.
|
||||
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
|
||||
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
|
||||
|
||||
instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
|
||||
// Non-strictly ordered floating-point add reduction for a 64-bit vector. This rule is
|
||||
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
|
||||
predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
|
||||
match(Set dst (AddReductionVF fsrc vsrc));
|
||||
effect(TEMP_DEF dst);
|
||||
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
|
||||
format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
|
||||
ins_encode %{
|
||||
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
|
||||
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
|
||||
@ -1767,11 +1767,13 @@ instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
|
||||
ins_pipe(pipe_slow);
|
||||
%}
|
||||
|
||||
instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
|
||||
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
|
||||
instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
|
||||
// Non-strictly ordered floating-point add reduction for a 128-bit vector. This rule is
|
||||
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
|
||||
predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
|
||||
match(Set dst (AddReductionVF fsrc vsrc));
|
||||
effect(TEMP_DEF dst, TEMP tmp);
|
||||
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
|
||||
format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
|
||||
ins_encode %{
|
||||
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
|
||||
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
|
||||
@ -1783,11 +1785,21 @@ dnl
|
||||
dnl REDUCE_ADD_FP_SVE($1, $2 )
|
||||
dnl REDUCE_ADD_FP_SVE(type, size)
|
||||
define(`REDUCE_ADD_FP_SVE', `
|
||||
// This rule calculates the reduction result in strict order. Two cases will
|
||||
// reach here:
|
||||
// 1. Non strictly-ordered AddReductionV$1 when vector size > 128-bits. For example -
|
||||
// AddReductionV$1 generated by Vector API. For vector size > 128-bits, it is more
|
||||
// beneficial performance-wise to generate direct SVE instruction even if it is
|
||||
// strictly ordered.
|
||||
// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
|
||||
// auto-vectorization on SVE machine.
|
||||
instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
|
||||
predicate(UseSVE > 0);
|
||||
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
|
||||
n->as_Reduction()->requires_strict_order());
|
||||
match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
|
||||
format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
|
||||
ins_encode %{
|
||||
assert(UseSVE > 0, "must be sve");
|
||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
|
||||
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
|
||||
__ sve_fadda($dst_src1$$FloatRegister, __ $2, ptrue, $src2$$FloatRegister);
|
||||
@ -1798,14 +1810,14 @@ dnl
|
||||
REDUCE_ADD_FP_SVE(F, S)
|
||||
|
||||
// reduction addD
|
||||
// Floating-point addition is not associative, so the rule for AddReductionVD
|
||||
// on NEON can't be used to auto-vectorize floating-point reduce-add.
|
||||
// Currently, on NEON, AddReductionVD is only generated by Vector API.
|
||||
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
|
||||
predicate(UseSVE == 0);
|
||||
|
||||
instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
|
||||
// Non-strictly ordered floating-point add reduction for doubles. This rule is
|
||||
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
|
||||
predicate(!n->as_Reduction()->requires_strict_order());
|
||||
match(Set dst (AddReductionVD dsrc vsrc));
|
||||
effect(TEMP_DEF dst);
|
||||
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
|
||||
format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
|
||||
ins_encode %{
|
||||
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
|
||||
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1998, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -1460,7 +1460,7 @@ public:
|
||||
};
|
||||
AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared);
|
||||
|
||||
// Move UnorderedReduction out of loop if possible
|
||||
// Move an unordered Reduction out of loop if possible
|
||||
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
|
||||
|
||||
// Create a scheduled list of nodes control dependent on ctrl set.
|
||||
|
@ -4310,11 +4310,19 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
|
||||
return AutoVectorizeStatus::Success;
|
||||
}
|
||||
|
||||
// Returns true if the Reduction node is unordered.
|
||||
static bool is_unordered_reduction(Node* n) {
|
||||
return n->is_Reduction() && !n->as_Reduction()->requires_strict_order();
|
||||
}
|
||||
|
||||
// Having ReductionNodes in the loop is expensive. They need to recursively
|
||||
// fold together the vector values, for every vectorized loop iteration. If
|
||||
// we encounter the following pattern, we can vector accumulate the values
|
||||
// inside the loop, and only have a single UnorderedReduction after the loop.
|
||||
//
|
||||
// Note: UnorderedReduction represents a ReductionNode which does not require
|
||||
// calculating in strict order.
|
||||
//
|
||||
// CountedLoop init
|
||||
// | |
|
||||
// +------+ | +-----------------------+
|
||||
@ -4354,21 +4362,24 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
|
||||
// wise. This is a single operation per vector_accumulator, rather than many
|
||||
// for a UnorderedReduction. We can then reduce the last vector_accumulator
|
||||
// after the loop, and also reduce the init value into it.
|
||||
//
|
||||
// We can not do this with all reductions. Some reductions do not allow the
|
||||
// reordering of operations (for example float addition).
|
||||
// reordering of operations (for example float addition/multiplication require
|
||||
// strict order).
|
||||
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
|
||||
assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");
|
||||
|
||||
// Find all Phi nodes with UnorderedReduction on backedge.
|
||||
// Find all Phi nodes with an unordered Reduction on backedge.
|
||||
CountedLoopNode* cl = loop->_head->as_CountedLoop();
|
||||
for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
|
||||
Node* phi = cl->fast_out(j);
|
||||
// We have a phi with a single use, and a UnorderedReduction on the backedge.
|
||||
if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) {
|
||||
// We have a phi with a single use, and an unordered Reduction on the backedge.
|
||||
if (!phi->is_Phi() || phi->outcnt() != 1 || !is_unordered_reduction(phi->in(2))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction();
|
||||
ReductionNode* last_ur = phi->in(2)->as_Reduction();
|
||||
assert(!last_ur->requires_strict_order(), "must be");
|
||||
|
||||
// Determine types
|
||||
const TypeVect* vec_t = last_ur->vect_type();
|
||||
@ -4385,14 +4396,14 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
|
||||
continue; // not implemented -> fails
|
||||
}
|
||||
|
||||
// Traverse up the chain of UnorderedReductions, checking that it loops back to
|
||||
// the phi. Check that all UnorderedReductions only have a single use, except for
|
||||
// Traverse up the chain of unordered Reductions, checking that it loops back to
|
||||
// the phi. Check that all unordered Reductions only have a single use, except for
|
||||
// the last (last_ur), which only has phi as a use in the loop, and all other uses
|
||||
// are outside the loop.
|
||||
UnorderedReductionNode* current = last_ur;
|
||||
UnorderedReductionNode* first_ur = nullptr;
|
||||
ReductionNode* current = last_ur;
|
||||
ReductionNode* first_ur = nullptr;
|
||||
while (true) {
|
||||
assert(current->is_UnorderedReduction(), "sanity");
|
||||
assert(!current->requires_strict_order(), "sanity");
|
||||
|
||||
// Expect no ctrl and a vector_input from within the loop.
|
||||
Node* ctrl = current->in(0);
|
||||
@ -4409,7 +4420,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
|
||||
break; // Chain traversal fails.
|
||||
}
|
||||
|
||||
// Expect single use of UnorderedReduction, except for last_ur.
|
||||
// Expect single use of an unordered Reduction, except for last_ur.
|
||||
if (current == last_ur) {
|
||||
// Expect all uses to be outside the loop, except phi.
|
||||
for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
|
||||
@ -4427,12 +4438,13 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
|
||||
}
|
||||
}
|
||||
|
||||
// Expect another UnorderedReduction or phi as the scalar input.
|
||||
// Expect another unordered Reduction or phi as the scalar input.
|
||||
Node* scalar_input = current->in(1);
|
||||
if (scalar_input->is_UnorderedReduction() &&
|
||||
if (is_unordered_reduction(scalar_input) &&
|
||||
scalar_input->Opcode() == current->Opcode()) {
|
||||
// Move up the UnorderedReduction chain.
|
||||
current = scalar_input->as_UnorderedReduction();
|
||||
// Move up the unordered Reduction chain.
|
||||
current = scalar_input->as_Reduction();
|
||||
assert(!current->requires_strict_order(), "must be");
|
||||
} else if (scalar_input == phi) {
|
||||
// Chain terminates at phi.
|
||||
first_ur = current;
|
||||
@ -4456,7 +4468,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
|
||||
VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t);
|
||||
register_new_node(identity_vector, C->root());
|
||||
assert(vec_t == identity_vector->vect_type(), "matching vector type");
|
||||
VectorNode::trace_new_vector(identity_vector, "UnorderedReduction");
|
||||
VectorNode::trace_new_vector(identity_vector, "Unordered Reduction");
|
||||
|
||||
// Turn the scalar phi into a vector phi.
|
||||
_igvn.rehash_node_delayed(phi);
|
||||
@ -4465,7 +4477,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
|
||||
phi->as_Type()->set_type(vec_t);
|
||||
_igvn.set_type(phi, vec_t);
|
||||
|
||||
// Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators.
|
||||
// Traverse down the chain of unordered Reductions, and replace them with vector_accumulators.
|
||||
current = first_ur;
|
||||
while (true) {
|
||||
// Create vector_accumulator to replace current.
|
||||
@ -4474,11 +4486,12 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
|
||||
VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
|
||||
register_new_node(vector_accumulator, cl);
|
||||
_igvn.replace_node(current, vector_accumulator);
|
||||
VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction");
|
||||
VectorNode::trace_new_vector(vector_accumulator, "Unordered Reduction");
|
||||
if (current == last_ur) {
|
||||
break;
|
||||
}
|
||||
current = vector_accumulator->unique_out()->as_UnorderedReduction();
|
||||
current = vector_accumulator->unique_out()->as_Reduction();
|
||||
assert(!current->requires_strict_order(), "must be");
|
||||
}
|
||||
|
||||
// Create post-loop reduction.
|
||||
@ -4495,7 +4508,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
|
||||
}
|
||||
}
|
||||
register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
|
||||
VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction");
|
||||
VectorNode::trace_new_vector(post_loop_reduction, "Unordered Reduction");
|
||||
|
||||
assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
|
||||
assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
|
||||
|
@ -175,7 +175,6 @@ class SubTypeCheckNode;
|
||||
class Type;
|
||||
class TypeNode;
|
||||
class UnlockNode;
|
||||
class UnorderedReductionNode;
|
||||
class VectorNode;
|
||||
class LoadVectorNode;
|
||||
class LoadVectorMaskedNode;
|
||||
@ -739,7 +738,6 @@ public:
|
||||
DEFINE_CLASS_ID(ExpandV, Vector, 5)
|
||||
DEFINE_CLASS_ID(CompressM, Vector, 6)
|
||||
DEFINE_CLASS_ID(Reduction, Vector, 7)
|
||||
DEFINE_CLASS_ID(UnorderedReduction, Reduction, 0)
|
||||
DEFINE_CLASS_ID(NegV, Vector, 8)
|
||||
DEFINE_CLASS_ID(Con, Type, 8)
|
||||
DEFINE_CLASS_ID(ConI, Con, 0)
|
||||
@ -991,7 +989,6 @@ public:
|
||||
DEFINE_CLASS_QUERY(Sub)
|
||||
DEFINE_CLASS_QUERY(SubTypeCheck)
|
||||
DEFINE_CLASS_QUERY(Type)
|
||||
DEFINE_CLASS_QUERY(UnorderedReduction)
|
||||
DEFINE_CLASS_QUERY(Vector)
|
||||
DEFINE_CLASS_QUERY(VectorMaskCmp)
|
||||
DEFINE_CLASS_QUERY(VectorUnbox)
|
||||
|
@ -1616,21 +1616,23 @@ bool LibraryCallKit::inline_vector_reduction() {
|
||||
}
|
||||
|
||||
Node* init = ReductionNode::make_identity_con_scalar(gvn(), opc, elem_bt);
|
||||
Node* value = nullptr;
|
||||
if (mask == nullptr) {
|
||||
assert(!is_masked_op, "Masked op needs the mask value never null");
|
||||
value = ReductionNode::make(opc, nullptr, init, opd, elem_bt);
|
||||
} else {
|
||||
if (use_predicate) {
|
||||
value = ReductionNode::make(opc, nullptr, init, opd, elem_bt);
|
||||
Node* value = opd;
|
||||
|
||||
assert(mask != nullptr || !is_masked_op, "Masked op needs the mask value never null");
|
||||
if (mask != nullptr && !use_predicate) {
|
||||
Node* reduce_identity = gvn().transform(VectorNode::scalar2vector(init, num_elem, Type::get_const_basic_type(elem_bt)));
|
||||
value = gvn().transform(new VectorBlendNode(reduce_identity, value, mask));
|
||||
}
|
||||
|
||||
// Make an unordered Reduction node. This affects only AddReductionVF/VD and MulReductionVF/VD,
|
||||
// because the Vector API permits these operations to be reassociated (no strict order required).
|
||||
value = ReductionNode::make(opc, nullptr, init, value, elem_bt, /* requires_strict_order */ false);
|
||||
|
||||
if (mask != nullptr && use_predicate) {
|
||||
value->add_req(mask);
|
||||
value->add_flag(Node::Flag_is_predicated_vector);
|
||||
} else {
|
||||
Node* reduce_identity = gvn().transform(VectorNode::scalar2vector(init, num_elem, Type::get_const_basic_type(elem_bt)));
|
||||
value = gvn().transform(new VectorBlendNode(reduce_identity, opd, mask));
|
||||
value = ReductionNode::make(opc, nullptr, init, value, elem_bt);
|
||||
}
|
||||
}
|
||||
|
||||
value = gvn().transform(value);
|
||||
|
||||
Node* bits = nullptr;
|
||||
|
@ -1296,7 +1296,8 @@ int ReductionNode::opcode(int opc, BasicType bt) {
|
||||
}
|
||||
|
||||
// Return the appropriate reduction node.
|
||||
ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) {
|
||||
ReductionNode* ReductionNode::make(int opc, Node* ctrl, Node* n1, Node* n2, BasicType bt,
|
||||
bool requires_strict_order) {
|
||||
|
||||
int vopc = opcode(opc, bt);
|
||||
|
||||
@ -1306,17 +1307,17 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
|
||||
switch (vopc) {
|
||||
case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2);
|
||||
case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2);
|
||||
case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
|
||||
case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
|
||||
case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2, requires_strict_order);
|
||||
case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2, requires_strict_order);
|
||||
case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
|
||||
case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
|
||||
case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
|
||||
case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
|
||||
case Op_AndReductionV: return new AndReductionVNode(ctrl, n1, n2);
|
||||
case Op_OrReductionV: return new OrReductionVNode(ctrl, n1, n2);
|
||||
case Op_XorReductionV: return new XorReductionVNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2, requires_strict_order);
|
||||
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2, requires_strict_order);
|
||||
case Op_MinReductionV: return new MinReductionVNode (ctrl, n1, n2);
|
||||
case Op_MaxReductionV: return new MaxReductionVNode (ctrl, n1, n2);
|
||||
case Op_AndReductionV: return new AndReductionVNode (ctrl, n1, n2);
|
||||
case Op_OrReductionV: return new OrReductionVNode (ctrl, n1, n2);
|
||||
case Op_XorReductionV: return new XorReductionVNode (ctrl, n1, n2);
|
||||
default:
|
||||
assert(false, "unknown node: %s", NodeClassNames[vopc]);
|
||||
return nullptr;
|
||||
|
@ -203,7 +203,9 @@ class ReductionNode : public Node {
|
||||
init_class_id(Class_Reduction);
|
||||
}
|
||||
|
||||
static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt);
|
||||
static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt,
|
||||
// This only affects floating-point add and mul reductions.
|
||||
bool requires_strict_order = true);
|
||||
static int opcode(int opc, BasicType bt);
|
||||
static bool implemented(int opc, uint vlen, BasicType bt);
|
||||
// Make an identity scalar (zero for add, one for mul, etc) for scalar opc.
|
||||
@ -225,47 +227,97 @@ class ReductionNode : public Node {
|
||||
|
||||
// Needed for proper cloning.
|
||||
virtual uint size_of() const { return sizeof(*this); }
|
||||
};
|
||||
|
||||
//---------------------------UnorderedReductionNode-------------------------------------
|
||||
// Order of reduction does not matter. Example int add. Not true for float add.
|
||||
class UnorderedReductionNode : public ReductionNode {
|
||||
public:
|
||||
UnorderedReductionNode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {
|
||||
init_class_id(Class_UnorderedReduction);
|
||||
// Floating-point addition and multiplication are non-associative, so
|
||||
// AddReductionVF/D and MulReductionVF/D require strict ordering
|
||||
// in auto-vectorization. Vector API can generate AddReductionVF/D
|
||||
// and MulReductionVF/VD without strict ordering, which can benefit
|
||||
// some platforms.
|
||||
//
|
||||
// Other reductions don't need strict ordering.
|
||||
virtual bool requires_strict_order() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
void dump_spec(outputStream* st) const {
|
||||
if (requires_strict_order()) {
|
||||
st->print("requires_strict_order");
|
||||
} else {
|
||||
st->print("no_strict_order");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVINode--------------------------------------
|
||||
// Vector add byte, short and int as a reduction
|
||||
class AddReductionVINode : public UnorderedReductionNode {
|
||||
class AddReductionVINode : public ReductionNode {
|
||||
public:
|
||||
AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
|
||||
AddReductionVINode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVLNode--------------------------------------
|
||||
// Vector add long as a reduction
|
||||
class AddReductionVLNode : public UnorderedReductionNode {
|
||||
class AddReductionVLNode : public ReductionNode {
|
||||
public:
|
||||
AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
|
||||
AddReductionVLNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVFNode--------------------------------------
|
||||
// Vector add float as a reduction
|
||||
class AddReductionVFNode : public ReductionNode {
|
||||
private:
|
||||
// True if add reduction operation for floats requires strict ordering.
|
||||
// As an example - The value is true when add reduction for floats is auto-vectorized
|
||||
// as auto-vectorization mandates strict ordering but the value is false when this node
|
||||
// is generated through VectorAPI as VectorAPI does not impose any such rules on ordering.
|
||||
const bool _requires_strict_order;
|
||||
public:
|
||||
AddReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
// _requires_strict_order is set to true by default, as mandated by auto-vectorization.
|
||||
AddReductionVFNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
|
||||
ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
|
||||
|
||||
virtual int Opcode() const;
|
||||
|
||||
virtual bool requires_strict_order() const { return _requires_strict_order; }
|
||||
|
||||
virtual uint hash() const { return Node::hash() + _requires_strict_order; }
|
||||
|
||||
virtual bool cmp(const Node& n) const {
|
||||
return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
|
||||
}
|
||||
|
||||
virtual uint size_of() const { return sizeof(*this); }
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVDNode--------------------------------------
|
||||
// Vector add double as a reduction
|
||||
class AddReductionVDNode : public ReductionNode {
|
||||
private:
|
||||
// True if add reduction operation for doubles requires strict ordering.
|
||||
// As an example - The value is true when add reduction for doubles is auto-vectorized
|
||||
// as auto-vectorization mandates strict ordering but the value is false when this node
|
||||
// is generated through VectorAPI as VectorAPI does not impose any such rules on ordering.
|
||||
const bool _requires_strict_order;
|
||||
public:
|
||||
AddReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
//_requires_strict_order is set to true by default as mandated by auto-vectorization
|
||||
AddReductionVDNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
|
||||
ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
|
||||
|
||||
virtual int Opcode() const;
|
||||
|
||||
virtual bool requires_strict_order() const { return _requires_strict_order; }
|
||||
|
||||
virtual uint hash() const { return Node::hash() + _requires_strict_order; }
|
||||
|
||||
virtual bool cmp(const Node& n) const {
|
||||
return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
|
||||
}
|
||||
|
||||
virtual uint size_of() const { return sizeof(*this); }
|
||||
};
|
||||
|
||||
//------------------------------SubVBNode--------------------------------------
|
||||
@ -400,34 +452,70 @@ public:
|
||||
|
||||
//------------------------------MulReductionVINode--------------------------------------
|
||||
// Vector multiply byte, short and int as a reduction
|
||||
class MulReductionVINode : public UnorderedReductionNode {
|
||||
class MulReductionVINode : public ReductionNode {
|
||||
public:
|
||||
MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
|
||||
MulReductionVINode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulReductionVLNode--------------------------------------
|
||||
// Vector multiply long as a reduction
|
||||
class MulReductionVLNode : public UnorderedReductionNode {
|
||||
class MulReductionVLNode : public ReductionNode {
|
||||
public:
|
||||
MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
|
||||
MulReductionVLNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulReductionVFNode--------------------------------------
|
||||
// Vector multiply float as a reduction
|
||||
class MulReductionVFNode : public ReductionNode {
|
||||
// True if mul reduction operation for floats requires strict ordering.
|
||||
// As an example - The value is true when mul reduction for floats is auto-vectorized
|
||||
// as auto-vectorization mandates strict ordering but the value is false when this node
|
||||
// is generated through VectorAPI as VectorAPI does not impose any such rules on ordering.
|
||||
const bool _requires_strict_order;
|
||||
public:
|
||||
MulReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
//_requires_strict_order is set to true by default as mandated by auto-vectorization
|
||||
MulReductionVFNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
|
||||
ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
|
||||
|
||||
virtual int Opcode() const;
|
||||
|
||||
virtual bool requires_strict_order() const { return _requires_strict_order; }
|
||||
|
||||
virtual uint hash() const { return Node::hash() + _requires_strict_order; }
|
||||
|
||||
virtual bool cmp(const Node& n) const {
|
||||
return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
|
||||
}
|
||||
|
||||
virtual uint size_of() const { return sizeof(*this); }
|
||||
};
|
||||
|
||||
//------------------------------MulReductionVDNode--------------------------------------
|
||||
// Vector multiply double as a reduction
|
||||
class MulReductionVDNode : public ReductionNode {
|
||||
// True if mul reduction operation for doubles requires strict ordering.
|
||||
// As an example - The value is true when mul reduction for doubles is auto-vectorized
|
||||
// as auto-vectorization mandates strict ordering but the value is false when this node
|
||||
// is generated through VectorAPI as VectorAPI does not impose any such rules on ordering.
|
||||
const bool _requires_strict_order;
|
||||
public:
|
||||
MulReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
//_requires_strict_order is set to true by default as mandated by auto-vectorization
|
||||
MulReductionVDNode(Node* ctrl, Node* in1, Node* in2, bool requires_strict_order = true) :
|
||||
ReductionNode(ctrl, in1, in2), _requires_strict_order(requires_strict_order) {}
|
||||
|
||||
virtual int Opcode() const;
|
||||
|
||||
virtual bool requires_strict_order() const { return _requires_strict_order; }
|
||||
|
||||
virtual uint hash() const { return Node::hash() + _requires_strict_order; }
|
||||
|
||||
virtual bool cmp(const Node& n) const {
|
||||
return Node::cmp(n) && _requires_strict_order == ((ReductionNode&)n).requires_strict_order();
|
||||
}
|
||||
|
||||
virtual uint size_of() const { return sizeof(*this); }
|
||||
};
|
||||
|
||||
//------------------------------DivVFNode--------------------------------------
|
||||
@ -753,9 +841,9 @@ class AndVNode : public VectorNode {
|
||||
|
||||
//------------------------------AndReductionVNode--------------------------------------
|
||||
// Vector and byte, short, int, long as a reduction
|
||||
class AndReductionVNode : public UnorderedReductionNode {
|
||||
class AndReductionVNode : public ReductionNode {
|
||||
public:
|
||||
AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
|
||||
AndReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
@ -770,9 +858,9 @@ class OrVNode : public VectorNode {
|
||||
|
||||
//------------------------------OrReductionVNode--------------------------------------
|
||||
// Vector or byte, short, int, long as a reduction
|
||||
class OrReductionVNode : public UnorderedReductionNode {
|
||||
class OrReductionVNode : public ReductionNode {
|
||||
public:
|
||||
OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
|
||||
OrReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
@ -787,25 +875,25 @@ class XorVNode : public VectorNode {
|
||||
|
||||
//------------------------------XorReductionVNode--------------------------------------
|
||||
// Vector xor int, long as a reduction
|
||||
class XorReductionVNode : public UnorderedReductionNode {
|
||||
class XorReductionVNode : public ReductionNode {
|
||||
public:
|
||||
XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
|
||||
XorReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MinReductionVNode--------------------------------------
|
||||
// Vector min byte, short, int, long, float, double as a reduction
|
||||
class MinReductionVNode : public UnorderedReductionNode {
|
||||
class MinReductionVNode : public ReductionNode {
|
||||
public:
|
||||
MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
|
||||
MinReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MaxReductionVNode--------------------------------------
|
||||
// Vector max byte, short, int, long, float, double as a reduction
|
||||
class MaxReductionVNode : public UnorderedReductionNode {
|
||||
class MaxReductionVNode : public ReductionNode {
|
||||
public:
|
||||
MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
|
||||
MaxReductionVNode(Node* ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
|
@ -0,0 +1,111 @@
|
||||
/*
|
||||
* Copyright (c) 2024, Arm Limited. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package compiler.loopopts.superword;
|
||||
|
||||
import compiler.lib.ir_framework.*;
|
||||
|
||||
/*
|
||||
* @test
|
||||
* @bug 8320725
|
||||
* @summary Ensure strictly ordered AddReductionVF/VD and MulReductionVF/VD nodes
|
||||
*          are generated when these operations are auto-vectorized
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.loopopts.superword.TestVectorFPReduction
|
||||
*/
|
||||
|
||||
public class TestVectorFPReduction {
|
||||
|
||||
final private static int SIZE = 1024;
|
||||
|
||||
private static double[] da = new double[SIZE];
|
||||
private static double[] db = new double[SIZE];
|
||||
private static float[] fa = new float[SIZE];
|
||||
private static float[] fb = new float[SIZE];
|
||||
private static float fresult;
|
||||
private static double dresult;
|
||||
|
||||
public static void main(String[] args) {
|
||||
TestFramework.run();
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(failOn = {IRNode.ADD_REDUCTION_VF},
|
||||
applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
|
||||
@IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VF, ">=1"},
|
||||
failOn = {"no_strict_order"},
|
||||
applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
private static void testAddReductionVF() {
|
||||
float result = 1;
|
||||
for (int i = 0; i < SIZE; i++) {
|
||||
result += (fa[i] + fb[i]);
|
||||
}
|
||||
fresult += result;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(failOn = {IRNode.ADD_REDUCTION_VD},
|
||||
applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
|
||||
@IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VD, ">=1"},
|
||||
failOn = {"no_strict_order"},
|
||||
applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
private static void testAddReductionVD() {
|
||||
double result = 1;
|
||||
for (int i = 0; i < SIZE; i++) {
|
||||
result += (da[i] + db[i]);
|
||||
}
|
||||
dresult += result;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(failOn = {IRNode.MUL_REDUCTION_VF},
|
||||
applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
|
||||
@IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VF, ">=1"},
|
||||
failOn = {"no_strict_order"},
|
||||
applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
private static void testMulReductionVF() {
|
||||
float result = 1;
|
||||
for (int i = 0; i < SIZE; i++) {
|
||||
result *= (fa[i] + fb[i]);
|
||||
}
|
||||
fresult += result;
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(failOn = {IRNode.MUL_REDUCTION_VD},
|
||||
applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
|
||||
@IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VD, ">=1"},
|
||||
failOn = {"no_strict_order"},
|
||||
applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
private static void testMulReductionVD() {
|
||||
double result = 1;
|
||||
for (int i = 0; i < SIZE; i++) {
|
||||
result *= (da[i] + db[i]);
|
||||
}
|
||||
dresult += result;
|
||||
}
|
||||
}
|
@ -0,0 +1,211 @@
|
||||
/*
|
||||
* Copyright (c) 2024, Arm Limited. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package compiler.vectorapi;
|
||||
|
||||
import compiler.lib.ir_framework.*;
|
||||
|
||||
import jdk.incubator.vector.DoubleVector;
|
||||
import jdk.incubator.vector.FloatVector;
|
||||
import jdk.incubator.vector.VectorOperators;
|
||||
import jdk.incubator.vector.VectorSpecies;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
import jdk.test.lib.Asserts;
|
||||
import jdk.test.lib.Utils;
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8320725
|
||||
* @library /test/lib /
|
||||
* @summary Verify non-strictly ordered AddReductionVF/VD and MulReductionVF/VD
|
||||
* nodes are generated in VectorAPI
|
||||
* @modules jdk.incubator.vector
|
||||
* @run driver compiler.vectorapi.TestVectorAddMulReduction
|
||||
*/
|
||||
|
||||
public class TestVectorAddMulReduction {
|
||||
|
||||
private static final int SIZE = 1024;
|
||||
private static final Random RD = Utils.getRandomInstance();
|
||||
|
||||
private static float[] fa;
|
||||
private static float fres;
|
||||
private static double[] da;
|
||||
private static double dres;
|
||||
|
||||
static {
|
||||
fa = new float[SIZE];
|
||||
da = new double[SIZE];
|
||||
fres = 1;
|
||||
dres = 1;
|
||||
for (int i = 0; i < SIZE; i++) {
|
||||
fa[i] = RD.nextFloat();
|
||||
da[i] = RD.nextDouble();
|
||||
}
|
||||
}
|
||||
|
||||
// Test add reduction operation for floats
|
||||
@ForceInline
|
||||
public static void testFloatAddKernel(VectorSpecies SPECIES, float[] f) {
|
||||
for (int i = 0; i < SPECIES.loopBound(f.length); i += SPECIES.length()) {
|
||||
var av = FloatVector.fromArray(SPECIES, f, i);
|
||||
fres += av.reduceLanes(VectorOperators.ADD);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=8"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testFloatAdd_64() {
|
||||
testFloatAddKernel(FloatVector.SPECIES_64, fa);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=16"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testFloatAdd_128() {
|
||||
testFloatAddKernel(FloatVector.SPECIES_128, fa);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=32"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testFloatAdd_256() {
|
||||
testFloatAddKernel(FloatVector.SPECIES_256, fa);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.ADD_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=64"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testFloatAdd_512() {
|
||||
testFloatAddKernel(FloatVector.SPECIES_512, fa);
|
||||
}
|
||||
|
||||
// Test add reduction operation for doubles
|
||||
@ForceInline
|
||||
public static void testDoubleAddKernel(VectorSpecies SPECIES, double[] d) {
|
||||
for (int i = 0; i < SPECIES.loopBound(d.length); i += SPECIES.length()) {
|
||||
var av = DoubleVector.fromArray(SPECIES, d, i);
|
||||
dres += av.reduceLanes(VectorOperators.ADD);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=16"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testDoubleAdd_128() {
|
||||
testDoubleAddKernel(DoubleVector.SPECIES_128, da);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=32"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testDoubleAdd_256() {
|
||||
testDoubleAddKernel(DoubleVector.SPECIES_256, da);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.ADD_REDUCTION_VD, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=64"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testDoubleAdd_512() {
|
||||
testDoubleAddKernel(DoubleVector.SPECIES_512, da);
|
||||
}
|
||||
|
||||
// Test mul reduction operation for floats
|
||||
// On aarch64, there are no direct vector mul reduction instructions for float/double mul reduction
|
||||
// and scalar instructions are emitted for 64-bit/128-bit vectors. Thus MulReductionVF/VD nodes are generated
|
||||
// only for vector length of 8B/16B on vectorAPI.
|
||||
@ForceInline
|
||||
public static void testFloatMulKernel(VectorSpecies SPECIES, float[] f) {
|
||||
for (int i = 0; i < SPECIES.loopBound(f.length); i += SPECIES.length()) {
|
||||
var av = FloatVector.fromArray(SPECIES, f, i);
|
||||
fres += av.reduceLanes(VectorOperators.MUL);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MUL_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=8"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testFloatMul_64() {
|
||||
testFloatMulKernel(FloatVector.SPECIES_64, fa);
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MUL_REDUCTION_VF, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=16"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testFloatMul_128() {
|
||||
testFloatMulKernel(FloatVector.SPECIES_128, fa);
|
||||
}
|
||||
|
||||
// Test mul reduction operation for doubles
|
||||
@ForceInline
|
||||
public static void testDoubleMulKernel(VectorSpecies SPECIES, double[] d) {
|
||||
for (int i = 0; i < SPECIES.loopBound(d.length); i += SPECIES.length()) {
|
||||
var av = DoubleVector.fromArray(SPECIES, d, i);
|
||||
dres += av.reduceLanes(VectorOperators.MUL);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.MUL_REDUCTION_VD, ">=1", "no_strict_order", ">=1"},
|
||||
failOn = {"requires_strict_order"},
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
||||
applyIf = {"MaxVectorSize", ">=16"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
public static void testDoubleMul_128() {
|
||||
testDoubleMulKernel(DoubleVector.SPECIES_128, da);
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user