8240248: Extend superword reduction optimizations for x86

Add support for and, or, and xor reductions

Co-authored-by: Shravya Rukmannagari <shravya.rukmannagari@intel.com>
Reviewed-by: vlivanov, thartmann
Sandhya Viswanathan 2020-03-23 10:26:40 -07:00
parent 75a8b7fa83
commit 398ce2948c
13 changed files with 1089 additions and 702 deletions
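Background for the diffs below: C2's SuperWord pass already recognized add/mul (and min/max) reductions; this change teaches it to also vectorize bitwise and/or/xor reductions over int and long arrays on x86. A minimal sketch of the kind of loop that now qualifies (class, method, and array names are illustrative, not taken from the patch):

// Illustrative only: scalar reduction loops this change lets C2's SuperWord
// pass vectorize (names are made up for the example).
class LogicalReductions {
    static int andReduce(int[] a) {
        int res = -1;                    // identity value for AND
        for (int i = 0; i < a.length; i++) {
            res &= a[i];                 // loop-carried AndI -> AndReductionV
        }
        return res;
    }

    static long xorReduce(long[] a) {
        long res = 0L;                   // identity value for XOR
        for (int i = 0; i < a.length; i++) {
            res ^= a[i];                 // loop-carried XorL -> XorReductionV
        }
        return res;
    }
}

With reductions enabled (see the -XX:+SuperWordReductions runs in the new test at the bottom), the accumulation opcodes (AndI/AndL, OrI/OrL, XorI/XorL) are mapped to the new AndReductionV/OrReductionV/XorReductionV nodes added in the vectornode changes further down.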

View File

@@ -40,10 +40,10 @@ mkdir -p $BUILD_DIR $JAR_DIR
cd $JAR_DIR
rm -f *
wget http://central.maven.org/maven2/org/apache/commons/commons-math3/$COMMONS_MATH3_VERSION/commons-math3-$COMMONS_MATH3_VERSION.jar
wget http://central.maven.org/maven2/net/sf/jopt-simple/jopt-simple/$JOPT_SIMPLE_VERSION/jopt-simple-$JOPT_SIMPLE_VERSION.jar
wget http://central.maven.org/maven2/org/openjdk/jmh/jmh-core/$JMH_VERSION/jmh-core-$JMH_VERSION.jar
wget http://central.maven.org/maven2/org/openjdk/jmh/jmh-generator-annprocess/$JMH_VERSION/jmh-generator-annprocess-$JMH_VERSION.jar
wget https://repo.maven.apache.org/maven2/org/apache/commons/commons-math3/$COMMONS_MATH3_VERSION/commons-math3-$COMMONS_MATH3_VERSION.jar
wget https://repo.maven.apache.org/maven2/net/sf/jopt-simple/jopt-simple/$JOPT_SIMPLE_VERSION/jopt-simple-$JOPT_SIMPLE_VERSION.jar
wget https://repo.maven.apache.org/maven2/org/openjdk/jmh/jmh-core/$JMH_VERSION/jmh-core-$JMH_VERSION.jar
wget https://repo.maven.apache.org/maven2/org/openjdk/jmh/jmh-generator-annprocess/$JMH_VERSION/jmh-generator-annprocess-$JMH_VERSION.jar
tar -cvzf ../$BUNDLE_NAME *

View File

@@ -4161,7 +4161,245 @@ void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRe
vpsrlq(dst, nds, src, vector_len);
}
}
// Reductions for vectors of ints, longs, floats, and doubles.
void MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
int vector_len = Assembler::AVX_128bit;
switch (opcode) {
case Op_AndReductionV: pand(dst, src); break;
case Op_OrReductionV: por (dst, src); break;
case Op_XorReductionV: pxor(dst, src); break;
case Op_AddReductionVF: addss(dst, src); break;
case Op_AddReductionVD: addsd(dst, src); break;
case Op_AddReductionVI: paddd(dst, src); break;
case Op_AddReductionVL: paddq(dst, src); break;
case Op_MulReductionVF: mulss(dst, src); break;
case Op_MulReductionVD: mulsd(dst, src); break;
case Op_MulReductionVI: pmulld(dst, src); break;
case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
default: assert(false, "wrong opcode");
}
}
void MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
int vector_len = Assembler::AVX_256bit;
switch (opcode) {
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
default: assert(false, "wrong opcode");
}
}
void MacroAssembler::reduce_fp(int opcode, int vlen,
XMMRegister dst, XMMRegister src,
XMMRegister vtmp1, XMMRegister vtmp2) {
switch (opcode) {
case Op_AddReductionVF:
case Op_MulReductionVF:
reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
break;
case Op_AddReductionVD:
case Op_MulReductionVD:
reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
break;
default: assert(false, "wrong opcode");
}
}
void MacroAssembler::reduceI(int opcode, int vlen,
Register dst, Register src1, XMMRegister src2,
XMMRegister vtmp1, XMMRegister vtmp2) {
switch (vlen) {
case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
default: assert(false, "wrong vector length");
}
}
#ifdef _LP64
void MacroAssembler::reduceL(int opcode, int vlen,
Register dst, Register src1, XMMRegister src2,
XMMRegister vtmp1, XMMRegister vtmp2) {
switch (vlen) {
case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
default: assert(false, "wrong vector length");
}
}
#endif // _LP64
void MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
switch (vlen) {
case 2:
assert(vtmp2 == xnoreg, "");
reduce2F(opcode, dst, src, vtmp1);
break;
case 4:
assert(vtmp2 == xnoreg, "");
reduce4F(opcode, dst, src, vtmp1);
break;
case 8:
reduce8F(opcode, dst, src, vtmp1, vtmp2);
break;
case 16:
reduce16F(opcode, dst, src, vtmp1, vtmp2);
break;
default: assert(false, "wrong vector length");
}
}
void MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
switch (vlen) {
case 2:
assert(vtmp2 == xnoreg, "");
reduce2D(opcode, dst, src, vtmp1);
break;
case 4:
reduce4D(opcode, dst, src, vtmp1, vtmp2);
break;
case 8:
reduce8D(opcode, dst, src, vtmp1, vtmp2);
break;
default: assert(false, "wrong vector length");
}
}
void MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
if (opcode == Op_AddReductionVI) {
if (vtmp1 != src2) {
movdqu(vtmp1, src2);
}
phaddd(vtmp1, vtmp1);
} else {
pshufd(vtmp1, src2, 0x1);
reduce_operation_128(opcode, vtmp1, src2);
}
movdl(vtmp2, src1);
reduce_operation_128(opcode, vtmp1, vtmp2);
movdl(dst, vtmp1);
}
void MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
if (opcode == Op_AddReductionVI) {
if (vtmp1 != src2) {
movdqu(vtmp1, src2);
}
phaddd(vtmp1, src2);
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
} else {
pshufd(vtmp2, src2, 0xE);
reduce_operation_128(opcode, vtmp2, src2);
reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
}
void MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
if (opcode == Op_AddReductionVI) {
vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
vextracti128_high(vtmp2, vtmp1);
vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
} else {
vextracti128_high(vtmp1, src2);
reduce_operation_128(opcode, vtmp1, src2);
reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
}
void MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
vextracti64x4_high(vtmp2, src2);
reduce_operation_256(opcode, vtmp2, vtmp2, src2);
reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#ifdef _LP64
void MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
pshufd(vtmp2, src2, 0xE);
reduce_operation_128(opcode, vtmp2, src2);
movdq(vtmp1, src1);
reduce_operation_128(opcode, vtmp1, vtmp2);
movdq(dst, vtmp1);
}
void MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
vextracti128_high(vtmp1, src2);
reduce_operation_128(opcode, vtmp1, src2);
reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
void MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
vextracti64x4_high(vtmp2, src2);
reduce_operation_256(opcode, vtmp2, vtmp2, src2);
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#endif // _LP64
void MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
reduce_operation_128(opcode, dst, src);
pshufd(vtmp, src, 0x1);
reduce_operation_128(opcode, dst, vtmp);
}
void MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
reduce2F(opcode, dst, src, vtmp);
pshufd(vtmp, src, 0x2);
reduce_operation_128(opcode, dst, vtmp);
pshufd(vtmp, src, 0x3);
reduce_operation_128(opcode, dst, vtmp);
}
void MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
reduce4F(opcode, dst, src, vtmp2);
vextractf128_high(vtmp2, src);
reduce4F(opcode, dst, vtmp2, vtmp1);
}
void MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
reduce8F(opcode, dst, src, vtmp1, vtmp2);
vextracti64x4_high(vtmp1, src);
reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
void MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
reduce_operation_128(opcode, dst, src);
pshufd(vtmp, src, 0xE);
reduce_operation_128(opcode, dst, vtmp);
}
void MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
reduce2D(opcode, dst, src, vtmp2);
vextractf128_high(vtmp2, src);
reduce2D(opcode, dst, vtmp2, vtmp1);
}
void MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
reduce4D(opcode, dst, src, vtmp1, vtmp2);
vextracti64x4_high(vtmp1, src);
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
#endif
//-------------------------------------------------------------------------------------------
void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
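The reduce2I..reduce16I and reduce2L..reduce8L helpers added above share one strategy: extract the upper half of the vector (vextracti64x4_high / vextracti128_high / pshufd), combine it with the lower half using the requested operation, and repeat until a single lane remains, which is then folded into the scalar input src1 (the AddReductionVI case uses phaddd/vphaddd instead). Below is a hedged scalar model of that strategy in Java rather than HotSpot C++; the class, method, and functional-interface choices are illustrative only.

import java.util.function.IntBinaryOperator;

// Scalar model (illustrative, not HotSpot code) of the lane-halving reduction
// emitted by reduce2I..reduce16I: fold the upper half of the lanes into the
// lower half until one lane is left, then combine it with the scalar input.
class LaneHalvingModel {
    static int reduceLanes(IntBinaryOperator op, int src1, int[] lanes) {
        int n = lanes.length;                     // 2, 4, 8 or 16 lanes
        while (n > 1) {
            n >>= 1;                              // vextract*/pshufd selects the upper half
            for (int i = 0; i < n; i++) {
                lanes[i] = op.applyAsInt(lanes[i], lanes[i + n]); // pand/por/pxor/paddd/pmulld
            }
        }
        return op.applyAsInt(src1, lanes[0]);     // movdl/movdq + final combine with src1
    }
}

For example, reduceLanes((x, y) -> x & y, -1, lanes) mirrors an AndReductionV over a 4-lane int vector.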

View File

@@ -1649,8 +1649,48 @@ public:
void vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vshiftq(int opcode, XMMRegister dst, XMMRegister src);
void vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
// Reductions for vectors of ints, longs, floats, and doubles.
// dst = src1 + reduce(op, src2) using vtmp as temps
void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#endif // _LP64
// dst = reduce(op, src2) using vtmp as temps
void reduce_fp(int opcode, int vlen,
XMMRegister dst, XMMRegister src,
XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg);
private:
void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
void reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#endif // _LP64
void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src);
void reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);
#endif
public:
// C2 compiled method's prolog code.
void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub);

View File

@@ -1277,16 +1277,14 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_AddReductionVL:
if (UseAVX < 3) { // only EVEX : vector connectivity becomes an issue here
return false;
}
break;
case Op_AbsVB:
case Op_AbsVS:
case Op_AbsVI:
case Op_AddReductionVI:
if (UseSSE < 3 || !VM_Version::supports_ssse3()) { // requires at least SSSE3
case Op_AndReductionV:
case Op_OrReductionV:
case Op_XorReductionV:
if (UseSSE < 3) { // requires at least SSSE3
return false;
}
break;
@@ -1675,6 +1673,12 @@ static inline uint vector_length(const MachNode* n) {
return vt->length();
}
static inline uint vector_length(const MachNode* use, MachOper* opnd) {
uint def_idx = use->operand_index(opnd);
Node* def = use->in(def_idx);
return def->bottom_type()->is_vect()->length();
}
static inline uint vector_length_in_bytes(const MachNode* n) {
const TypeVect* vt = n->bottom_type()->is_vect();
return vt->length_in_bytes();
@@ -3592,709 +3596,168 @@ instruct ReplD_zero(vec dst, immD0 zero) %{
%}
// ====================REDUCTION ARITHMETIC=======================================
// =======================Int Reduction==========================================
// =======================AddReductionVI==========================================
instruct vadd2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
n->in(2)->bottom_type()->is_vect()->length() < 16);
match(Set dst (AddReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add2I_reduction $dst,$src1,$src2" %}
match(Set dst (MulReductionVI src1 src2));
match(Set dst (AndReductionV src1 src2));
match(Set dst ( OrReductionV src1 src2));
match(Set dst (XorReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
if (UseAVX > 2) {
int vector_len = Assembler::AVX_128bit;
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else if (VM_Version::supports_avxonly()) {
int vector_len = Assembler::AVX_128bit;
__ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseSSE > 2, "required");
__ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
__ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
__ movdl($tmp$$XMMRegister, $src1$$Register);
__ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
__ movdl($dst$$Register, $tmp$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src2);
__ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
instruct reduction16I(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
n->in(2)->bottom_type()->is_vect()->length() == 16);
match(Set dst (AddReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add4I_reduction $dst,$src1,$src2" %}
match(Set dst (MulReductionVI src1 src2));
match(Set dst (AndReductionV src1 src2));
match(Set dst ( OrReductionV src1 src2));
match(Set dst (XorReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
if (UseAVX > 2) {
int vector_len = Assembler::AVX_128bit;
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else if (VM_Version::supports_avxonly()) {
int vector_len = Assembler::AVX_128bit;
__ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
__ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseSSE > 2, "required");
__ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
__ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
__ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src2);
__ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd8I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (AddReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add8I_reduction $dst,$src1,$src2" %}
ins_encode %{
if (UseAVX > 2) {
int vector_len = Assembler::AVX_128bit;
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseAVX > 0, "");
int vector_len = Assembler::AVX_256bit;
__ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
__ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vadd16I_reduction_reg(rRegI dst, rRegI src1, legVec src2, legVec tmp, legVec tmp2, legVec tmp3) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
match(Set dst (AddReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
format %{ "vector_add16I_reduction $dst,$src1,$src2" %}
ins_encode %{
__ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
__ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
__ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================AddReductionVL==========================================
// =======================Long Reduction==========================================
#ifdef _LP64
instruct vadd2L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
instruct reductionL(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
n->in(2)->bottom_type()->is_vect()->length() < 8);
match(Set dst (AddReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add2L_reduction $dst,$src1,$src2" %}
match(Set dst (MulReductionVL src1 src2));
match(Set dst (AndReductionV src1 src2));
match(Set dst ( OrReductionV src1 src2));
match(Set dst (XorReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdq($tmp2$$XMMRegister, $src1$$Register);
__ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src2);
__ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd4L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
instruct reduction8L(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
n->in(2)->bottom_type()->is_vect()->length() == 8);
match(Set dst (AddReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add4L_reduction $dst,$src1,$src2" %}
match(Set dst (MulReductionVL src1 src2));
match(Set dst (AndReductionV src1 src2));
match(Set dst ( OrReductionV src1 src2));
match(Set dst (XorReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
__ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($tmp$$XMMRegister, $src1$$Register);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd8L_reduction_reg(rRegL dst, rRegL src1, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (AddReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_addL_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
__ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($tmp$$XMMRegister, $src1$$Register);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src2);
__ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
#endif // _LP64
// =======================AddReductionVF==========================================
// =======================Float Reduction==========================================
instruct vadd2F_reduction_reg(regF dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (AddReductionVF dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_add2F_reduction $dst,$dst,$src2" %}
instruct reductionF128(regF dst, vec src, vec vtmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() <= 4);
match(Set dst (AddReductionVF dst src));
match(Set dst (MulReductionVF dst src));
effect(TEMP dst, TEMP vtmp);
format %{ "vector_reduction_fp $dst,$src ; using $vtmp as TEMP" %}
ins_encode %{
if (UseAVX > 0) {
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ addss($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd4F_reduction_reg(regF dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (AddReductionVF dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_add4F_reduction $dst,$dst,$src2" %}
instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
match(Set dst (AddReductionVF dst src));
match(Set dst (MulReductionVF dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
if (UseAVX > 0) {
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ addss($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd8F_reduction_reg(regF dst, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (AddReductionVF dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_add8F_reduction $dst,$dst,$src2" %}
instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16);
match(Set dst (AddReductionVF dst src));
match(Set dst (MulReductionVF dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd16F_reduction_reg(regF dst, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
match(Set dst (AddReductionVF dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_add16F_reduction $dst,$dst,$src2" %}
// =======================Double Reduction==========================================
instruct reduction2D(regD dst, vec src, vec vtmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2);
match(Set dst (AddReductionVD dst src));
match(Set dst (MulReductionVD dst src));
effect(TEMP dst, TEMP vtmp);
format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================AddReductionVD==========================================
instruct vadd2D_reduction_reg(regD dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (AddReductionVD dst src2));
effect(TEMP tmp, TEMP dst);
format %{ "vector_add2D_reduction $dst,$src2" %}
instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4);
match(Set dst (AddReductionVD dst src));
match(Set dst (MulReductionVD dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
if (UseAVX > 0) {
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ addsd($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd4D_reduction_reg(regD dst, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (AddReductionVD dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_add4D_reduction $dst,$dst,$src2" %}
instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
match(Set dst (AddReductionVD dst src));
match(Set dst (MulReductionVD dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd8D_reduction_reg(regD dst, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (AddReductionVD dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_add8D_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================MulReductionVI==========================================
instruct vmul2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (MulReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul2I_reduction $dst,$src1,$src2" %}
ins_encode %{
if (UseAVX > 0) {
int vector_len = Assembler::AVX_128bit;
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseSSE > 3, "required");
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
__ movdl($tmp$$XMMRegister, $src1$$Register);
__ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (MulReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul4I_reduction $dst,$src1,$src2" %}
ins_encode %{
if (UseAVX > 0) {
int vector_len = Assembler::AVX_128bit;
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseSSE > 3, "required");
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
__ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ movdl($tmp$$XMMRegister, $src1$$Register);
__ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul8I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (MulReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul8I_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(UseAVX > 1, "required");
int vector_len = Assembler::AVX_128bit;
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul16I_reduction_reg(rRegI dst, rRegI src1, legVec src2, legVec tmp, legVec tmp2, legVec tmp3) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
match(Set dst (MulReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
format %{ "vector_mul16I_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
__ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
__ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================MulReductionVL==========================================
#ifdef _LP64
instruct vmul2L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (MulReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul2L_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(VM_Version::supports_avx512dq(), "required");
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdq($tmp2$$XMMRegister, $src1$$Register);
__ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul4L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (MulReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul4L_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(VM_Version::supports_avx512dq(), "required");
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
__ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($tmp$$XMMRegister, $src1$$Register);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul8L_reduction_reg(rRegL dst, rRegL src1, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (MulReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul8L_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(VM_Version::supports_avx512dq(), "required");
__ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
__ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($tmp$$XMMRegister, $src1$$Register);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
#endif
// =======================MulReductionVF==========================================
instruct vmul2F_reduction_reg(regF dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (MulReductionVF dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_mul2F_reduction $dst,$dst,$src2" %}
ins_encode %{
if (UseAVX > 0) {
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ mulss($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul4F_reduction_reg(regF dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (MulReductionVF dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_mul4F_reduction $dst,$dst,$src2" %}
ins_encode %{
if (UseAVX > 0) {
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ mulss($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul8F_reduction_reg(regF dst, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (MulReductionVF dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_mul8F_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul16F_reduction_reg(regF dst, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
match(Set dst (MulReductionVF dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_mul16F_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================MulReductionVD==========================================
instruct vmul2D_reduction_reg(regD dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (MulReductionVD dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_mul2D_reduction $dst,$dst,$src2" %}
ins_encode %{
if (UseAVX > 0) {
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul4D_reduction_reg(regD dst, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 2
match(Set dst (MulReductionVD dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_mul4D_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul8D_reduction_reg(regD dst, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 2
match(Set dst (MulReductionVD dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_mul8D_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
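Note on the float/double patterns above: dst stays both an input and an output (match(Set dst (AddReductionVF dst src))), and the reduce2F..reduce16F / reduce2D..reduce8D helpers fold the accumulator with the lanes strictly in lane order, so the vectorized form keeps the same left-to-right floating point evaluation as the scalar loop. A scalar sketch of that ordered reduction, in illustrative Java rather than HotSpot code:

// Illustrative scalar equivalent of the ordered FP reduction performed by
// reduce2F..reduce16F: the accumulator is combined with each lane strictly in
// lane order, matching the scalar loop's evaluation order.
class OrderedFloatReduction {
    static float addReduce(float dst, float[] lanes) {
        for (int i = 0; i < lanes.length; i++) {
            dst += lanes[i];   // addss/vaddss with lane i (pshufd/vextractf fetches the lane)
        }
        return dst;
    }
}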

View File

@@ -4059,6 +4059,11 @@ int MatchRule::is_expensive() const {
strcmp(opType,"MulReductionVL")==0 ||
strcmp(opType,"MulReductionVF")==0 ||
strcmp(opType,"MulReductionVD")==0 ||
strcmp(opType,"MinReductionV")==0 ||
strcmp(opType,"MaxReductionV")==0 ||
strcmp(opType,"AndReductionV")==0 ||
strcmp(opType,"OrReductionV")==0 ||
strcmp(opType,"XorReductionV")==0 ||
0 /* 0 to line up columns nicely */ )
return 1;
}
@@ -4161,12 +4166,13 @@ bool MatchRule::is_vector() const {
"AddReductionVF", "AddReductionVD",
"MulReductionVI", "MulReductionVL",
"MulReductionVF", "MulReductionVD",
"MaxReductionV", "MinReductionV",
"AndReductionV", "OrReductionV", "XorReductionV",
"MulAddVS2VI",
"LShiftCntV","RShiftCntV",
"LShiftVB","LShiftVS","LShiftVI","LShiftVL",
"RShiftVB","RShiftVS","RShiftVI","RShiftVL",
"URShiftVB","URShiftVS","URShiftVI","URShiftVL",
"MaxReductionV", "MinReductionV",
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
"RoundDoubleModeV","LoadVector","StoreVector",
"FmaVD", "FmaVF","PopCountVI",

View File

@@ -375,8 +375,11 @@ macro(URShiftVS)
macro(URShiftVI)
macro(URShiftVL)
macro(AndV)
macro(AndReductionV)
macro(OrV)
macro(OrReductionV)
macro(XorV)
macro(XorReductionV)
macro(MinV)
macro(MaxV)
macro(MinReductionV)

View File

@@ -3034,6 +3034,9 @@ void Compile::final_graph_reshaping_main_switch(Node* n, Final_Reshape_Counts& f
case Op_MulReductionVD:
case Op_MinReductionV:
case Op_MaxReductionV:
case Op_AndReductionV:
case Op_OrReductionV:
case Op_XorReductionV:
break;
case Op_PackB:

View File

@@ -673,7 +673,30 @@ int ReductionNode::opcode(int opc, BasicType bt) {
assert(bt == T_DOUBLE, "must be");
vopc = Op_MaxReductionV;
break;
// TODO: add MulL for targets that support it
case Op_AndI:
assert(bt == T_INT, "must be");
vopc = Op_AndReductionV;
break;
case Op_AndL:
assert(bt == T_LONG, "must be");
vopc = Op_AndReductionV;
break;
case Op_OrI:
assert(bt == T_INT, "must be");
vopc = Op_OrReductionV;
break;
case Op_OrL:
assert(bt == T_LONG, "must be");
vopc = Op_OrReductionV;
break;
case Op_XorI:
assert(bt == T_INT, "must be");
vopc = Op_XorReductionV;
break;
case Op_XorL:
assert(bt == T_LONG, "must be");
vopc = Op_XorReductionV;
break;
default:
break;
}
@@ -697,8 +720,11 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
case Op_AndReductionV: return new AndReductionVNode(ctrl, n1, n2);
case Op_OrReductionV: return new OrReductionVNode(ctrl, n1, n2);
case Op_XorReductionV: return new XorReductionVNode(ctrl, n1, n2);
default:
fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
return NULL;

View File

@@ -145,6 +145,15 @@ class ReductionNode : public Node {
static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt);
static int opcode(int opc, BasicType bt);
static bool implemented(int opc, uint vlen, BasicType bt);
virtual const Type* bottom_type() const {
BasicType vbt = in(2)->bottom_type()->is_vect()->element_basic_type();
return Type::get_const_basic_type(vbt);
}
virtual uint ideal_reg() const {
return bottom_type()->ideal_reg();
}
};
//------------------------------AddReductionVINode--------------------------------------
@@ -613,6 +622,30 @@ class XorVNode : public VectorNode {
virtual int Opcode() const;
};
//------------------------------AndReductionVNode--------------------------------------
// Vector and int, long as a reduction
class AndReductionVNode : public ReductionNode {
public:
AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------OrReductionVNode--------------------------------------
// Vector or int, long as a reduction
class OrReductionVNode : public ReductionNode {
public:
OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------XorReductionVNode--------------------------------------
// Vector xor int, long as a reduction
class XorReductionVNode : public ReductionNode {
public:
XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------MinVNode--------------------------------------
// Vector min
class MinVNode : public VectorNode {
@@ -635,26 +668,6 @@ class MinReductionVNode : public ReductionNode {
public:
MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
virtual const Type* bottom_type() const {
BasicType bt = in(1)->bottom_type()->basic_type();
if (bt == T_FLOAT) {
return Type::FLOAT;
} else if (bt == T_DOUBLE) {
return Type::DOUBLE;
}
assert(false, "unsupported basic type");
return NULL;
}
virtual uint ideal_reg() const {
BasicType bt = in(1)->bottom_type()->basic_type();
if (bt == T_FLOAT) {
return Op_RegF;
} else if (bt == T_DOUBLE) {
return Op_RegD;
}
assert(false, "unsupported basic type");
return 0;
}
};
//------------------------------MaxReductionVNode--------------------------------------
@ -663,26 +676,6 @@ class MaxReductionVNode : public ReductionNode {
public:
MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
virtual const Type* bottom_type() const {
BasicType bt = in(1)->bottom_type()->basic_type();
if (bt == T_FLOAT) {
return Type::FLOAT;
} else {
return Type::DOUBLE;
}
assert(false, "unsupported basic type");
return NULL;
}
virtual uint ideal_reg() const {
BasicType bt = in(1)->bottom_type()->basic_type();
if (bt == T_FLOAT) {
return Op_RegF;
} else {
return Op_RegD;
}
assert(false, "unsupported basic type");
return 0;
}
};
//================================= M E M O R Y ===============================

View File

@ -1820,8 +1820,11 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
declare_c2_type(URShiftVINode, VectorNode) \
declare_c2_type(URShiftVLNode, VectorNode) \
declare_c2_type(AndVNode, VectorNode) \
declare_c2_type(AndReductionVNode, ReductionNode) \
declare_c2_type(OrVNode, VectorNode) \
declare_c2_type(OrReductionVNode, ReductionNode) \
declare_c2_type(XorVNode, VectorNode) \
declare_c2_type(XorReductionVNode, ReductionNode) \
declare_c2_type(MaxVNode, VectorNode) \
declare_c2_type(MinVNode, VectorNode) \
declare_c2_type(MaxReductionVNode, ReductionNode) \

View File

@ -0,0 +1,238 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8240248
* @summary Add C2 x86 Superword support for scalar logical reduction optimizations : int test
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=2
* compiler.loopopts.superword.RedTest_int
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=2
* compiler.loopopts.superword.RedTest_int
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=4
* compiler.loopopts.superword.RedTest_int
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=4
* compiler.loopopts.superword.RedTest_int
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=8
* compiler.loopopts.superword.RedTest_int
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=8
* compiler.loopopts.superword.RedTest_int
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=16
* compiler.loopopts.superword.RedTest_int
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=16
* compiler.loopopts.superword.RedTest_int
*/
package compiler.loopopts.superword;
public class RedTest_int {
static final int NUM = 1024;
static final int ITER = 8000;
public static void main(String[] args) throws Exception {
int[] a = new int[NUM];
int[] b = new int[NUM];
int[] c = new int[NUM];
int[] d = new int[NUM];
reductionInit1(a, b, c);
int total = 0;
int valid = 0;
for (int j = 0; j < ITER; j++) {
total = sumReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid += d[j];
}
testCorrectness(total, valid, "Add Reduction");
valid = 0;
for (int j = 0; j < ITER; j++) {
total = orReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid |= d[j];
}
testCorrectness(total, valid, "Or Reduction");
valid = -1;
for (int j = 0; j < ITER; j++) {
total = andReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid &= d[j];
}
testCorrectness(total, valid, "And Reduction");
valid = -1;
for (int j = 0; j < ITER; j++) {
total = xorReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid ^= d[j];
}
testCorrectness(total, valid, "Xor Reduction");
reductionInit2(a, b, c);
valid = 1;
for (int j = 0; j < ITER; j++) {
total = mulReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid *= d[j];
}
testCorrectness(total, valid, "Mul Reduction");
}
public static void reductionInit1(
int[] a,
int[] b,
int[] c) {
for (int i = 0; i < a.length; i++) {
a[i] = (i%2) + 0x4099;
b[i] = (i%2) + 0x1033;
c[i] = (i%2) + 0x455;
}
}
public static void reductionInit2(
int[] a,
int[] b,
int[] c) {
for (int i = 0; i < a.length; i++) {
a[i] = 0x11;
b[i] = 0x12;
c[i] = 0x13;
}
}
public static int sumReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = 0;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total += d[i];
}
return total;
}
public static int orReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = 0;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total |= d[i];
}
return total;
}
public static int andReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = -1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total &= d[i];
}
return total;
}
public static int xorReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = -1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total ^= d[i];
}
return total;
}
public static int mulReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = 1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total = total*d[i];
}
return total;
}
public static void testCorrectness(
int total,
int valid,
String op) throws Exception {
if (total == valid) {
System.out.println(op + ": Success");
} else {
System.out.println("Invalid total: " + total);
System.out.println("Expected value = " + valid);
throw new Exception(op + ": Failed");
}
}
}
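One detail of the validation scheme above worth spelling out: for the and and xor kernels both the kernel and the scalar check loop start from -1, so the comparison holds because both sides replay the identical fold over d, not because -1 is an identity for both operators (it is for `&`, but not for `^`). A tiny sketch of that check pattern, with made-up data:

// Same seed, same fold on both sides, so the results must agree even when the
// seed is not the operation's identity (e.g. -1 for xor). Values are arbitrary.
int[] d = {3, 5, 7};
int kernel = -1, expected = -1;
for (int v : d) kernel ^= v;         // what the (possibly vectorized) kernel computes
for (int v : d) expected ^= v;       // scalar replay used as the reference value
if (kernel != expected) throw new AssertionError("xor reduction mismatch");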

View File

@ -0,0 +1,238 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8240248
* @summary Add C2 x86 Superword support for scalar logical reduction optimizations : long test
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=2
* compiler.loopopts.superword.RedTest_long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=2
* compiler.loopopts.superword.RedTest_long
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=4
* compiler.loopopts.superword.RedTest_long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=4
* compiler.loopopts.superword.RedTest_long
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=8
* compiler.loopopts.superword.RedTest_long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=8
* compiler.loopopts.superword.RedTest_long
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=16
* compiler.loopopts.superword.RedTest_long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=16
* compiler.loopopts.superword.RedTest_long
*/
package compiler.loopopts.superword;
public class RedTest_long {
static final int NUM = 512;
static final int ITER = 8000;
public static void main(String[] args) throws Exception {
long[] a = new long[NUM];
long[] b = new long[NUM];
long[] c = new long[NUM];
long[] d = new long[NUM];
reductionInit1(a, b, c);
long total = 0;
long valid = 0;
for (int j = 0; j < ITER; j++) {
total = sumReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid += d[j];
}
testCorrectness(total, valid, "Add Reduction");
valid = 0;
for (int j = 0; j < ITER; j++) {
total = orReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid |= d[j];
}
testCorrectness(total, valid, "Or Reduction");
valid = -1;
for (int j = 0; j < ITER; j++) {
total = andReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid &= d[j];
}
testCorrectness(total, valid, "And Reduction");
valid = -1;
for (int j = 0; j < ITER; j++) {
total = xorReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid ^= d[j];
}
testCorrectness(total, valid, "Xor Reduction");
reductionInit2(a, b, c);
valid = 1;
for (int j = 0; j < ITER; j++) {
total = mulReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid *= d[j];
}
testCorrectness(total, valid, "Mul Reduction");
}
public static void reductionInit1(
long[] a,
long[] b,
long[] c) {
for (int i = 0; i < a.length; i++) {
a[i] = (i%2) + 0x4099;
b[i] = (i%2) + 0x1033;
c[i] = (i%2) + 0x455;
}
}
public static void reductionInit2(
long[] a,
long[] b,
long[] c) {
for (int i = 0; i < a.length; i++) {
a[i] = 0x11;
b[i] = 0x12;
c[i] = 0x13;
}
}
public static long sumReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = 0;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total += d[i];
}
return total;
}
public static long orReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = 0;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total |= d[i];
}
return total;
}
public static long andReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = -1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total &= d[i];
}
return total;
}
public static long xorReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = -1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total ^= d[i];
}
return total;
}
public static long mulReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = 1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total = total*d[i];
}
return total;
}
public static void testCorrectness(
long total,
long valid,
String op) throws Exception {
if (total == valid) {
System.out.println(op + ": Success");
} else {
System.out.println("Invalid total: " + total);
System.out.println("Expected value = " + valid);
throw new Exception(op + ": Failed");
}
}
}

View File

@ -0,0 +1,136 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.vm.compiler;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.*;
import java.util.concurrent.TimeUnit;
import java.util.Random;
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
public abstract class VectorReduction {
@Param({"512"})
public int COUNT;
private int[] intsA;
private int[] intsB;
private int[] intsC;
private int[] intsD;
private int resI;
private long[] longsA;
private long[] longsB;
private long[] longsC;
private long[] longsD;
private long resL;
@Param("0")
private int seed;
private Random r = new Random(seed);
@Setup
public void init() {
intsA = new int[COUNT];
intsB = new int[COUNT];
intsC = new int[COUNT];
intsD = new int[COUNT];
longsA = new long[COUNT];
longsB = new long[COUNT];
longsC = new long[COUNT];
longsD = new long[COUNT];
for (int i = 0; i < COUNT; i++) {
intsA[i] = r.nextInt();
intsB[i] = r.nextInt();
intsC[i] = r.nextInt();
longsA[i] = r.nextLong();
longsB[i] = r.nextLong();
longsC[i] = r.nextLong();
}
}
@Benchmark
public void andRedI() {
for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
resI &= intsD[i];
}
}
@Benchmark
public void orRedI() {
for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
resI |= intsD[i];
}
}
@Benchmark
public void xorRedI() {
for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
resI ^= intsD[i];
}
}
@Benchmark
public void andRedL() {
for (int i = 0; i < COUNT; i++) {
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
resL &= longsD[i];
}
}
@Benchmark
public void orRedL() {
for (int i = 0; i < COUNT; i++) {
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
resL |= longsD[i];
}
}
@Benchmark
public void xorRedL() {
for (int i = 0; i < COUNT; i++) {
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
resL ^= longsD[i];
}
}
@Fork(value = 1, jvmArgsPrepend = {
"-XX:+UseSuperWord"
})
public static class WithSuperword extends VectorReduction {
}
@Fork(value = 1, jvmArgsPrepend = {
"-XX:-UseSuperWord"
})
public static class NoSuperword extends VectorReduction {
}
}
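To run this micro outside the regular harness, a stand-alone JMH launcher along the following lines should work; the class name RunVectorReduction is hypothetical, and the include pattern assumes the default JMH naming of the nested WithSuperword/NoSuperword classes:

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class RunVectorReduction {
    public static void main(String[] args) throws RunnerException {
        // Matches both VectorReduction.WithSuperword and VectorReduction.NoSuperword,
        // so the +UseSuperWord and -UseSuperWord forks are measured back to back.
        Options opt = new OptionsBuilder()
                .include(".*VectorReduction.*")
                .build();
        new Runner(opt).run();
    }
}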