8240248: Extend superword reduction optimizations for x86
Add support for and, or, xor reduction.
Co-authored-by: Shravya Rukmannagari <shravya.rukmannagari@intel.com>
Reviewed-by: vlivanov, thartmann
This commit is contained in:
parent
75a8b7fa83
commit
398ce2948c
@ -40,10 +40,10 @@ mkdir -p $BUILD_DIR $JAR_DIR
|
||||
cd $JAR_DIR
|
||||
rm -f *
|
||||
|
||||
wget http://central.maven.org/maven2/org/apache/commons/commons-math3/$COMMONS_MATH3_VERSION/commons-math3-$COMMONS_MATH3_VERSION.jar
|
||||
wget http://central.maven.org/maven2/net/sf/jopt-simple/jopt-simple/$JOPT_SIMPLE_VERSION/jopt-simple-$JOPT_SIMPLE_VERSION.jar
|
||||
wget http://central.maven.org/maven2/org/openjdk/jmh/jmh-core/$JMH_VERSION/jmh-core-$JMH_VERSION.jar
|
||||
wget http://central.maven.org/maven2/org/openjdk/jmh/jmh-generator-annprocess/$JMH_VERSION/jmh-generator-annprocess-$JMH_VERSION.jar
|
||||
wget https://repo.maven.apache.org/maven2/org/apache/commons/commons-math3/$COMMONS_MATH3_VERSION/commons-math3-$COMMONS_MATH3_VERSION.jar
|
||||
wget https://repo.maven.apache.org/maven2/net/sf/jopt-simple/jopt-simple/$JOPT_SIMPLE_VERSION/jopt-simple-$JOPT_SIMPLE_VERSION.jar
|
||||
wget https://repo.maven.apache.org/maven2/org/openjdk/jmh/jmh-core/$JMH_VERSION/jmh-core-$JMH_VERSION.jar
|
||||
wget https://repo.maven.apache.org/maven2/org/openjdk/jmh/jmh-generator-annprocess/$JMH_VERSION/jmh-generator-annprocess-$JMH_VERSION.jar
|
||||
|
||||
tar -cvzf ../$BUNDLE_NAME *
|
||||
|
||||
|
@ -4161,7 +4161,245 @@ void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRe
|
||||
vpsrlq(dst, nds, src, vector_len);
|
||||
}
|
||||
}
|
||||
|
||||
// Reductions for vectors of ints, longs, floats, and doubles.
|
||||
|
||||
void MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
|
||||
int vector_len = Assembler::AVX_128bit;
|
||||
|
||||
switch (opcode) {
|
||||
case Op_AndReductionV: pand(dst, src); break;
|
||||
case Op_OrReductionV: por (dst, src); break;
|
||||
case Op_XorReductionV: pxor(dst, src); break;
|
||||
|
||||
case Op_AddReductionVF: addss(dst, src); break;
|
||||
case Op_AddReductionVD: addsd(dst, src); break;
|
||||
case Op_AddReductionVI: paddd(dst, src); break;
|
||||
case Op_AddReductionVL: paddq(dst, src); break;
|
||||
|
||||
case Op_MulReductionVF: mulss(dst, src); break;
|
||||
case Op_MulReductionVD: mulsd(dst, src); break;
|
||||
case Op_MulReductionVI: pmulld(dst, src); break;
|
||||
case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
|
||||
|
||||
default: assert(false, "wrong opcode");
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
|
||||
int vector_len = Assembler::AVX_256bit;
|
||||
|
||||
switch (opcode) {
|
||||
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
|
||||
case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
|
||||
case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
|
||||
|
||||
case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
|
||||
case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
|
||||
|
||||
case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
|
||||
case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
|
||||
|
||||
default: assert(false, "wrong opcode");
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::reduce_fp(int opcode, int vlen,
|
||||
XMMRegister dst, XMMRegister src,
|
||||
XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
switch (opcode) {
|
||||
case Op_AddReductionVF:
|
||||
case Op_MulReductionVF:
|
||||
reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
|
||||
break;
|
||||
|
||||
case Op_AddReductionVD:
|
||||
case Op_MulReductionVD:
|
||||
reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
|
||||
break;
|
||||
|
||||
default: assert(false, "wrong opcode");
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::reduceI(int opcode, int vlen,
|
||||
Register dst, Register src1, XMMRegister src2,
|
||||
XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
switch (vlen) {
|
||||
case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
|
||||
case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
|
||||
case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
|
||||
case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
|
||||
|
||||
default: assert(false, "wrong vector length");
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _LP64
|
||||
void MacroAssembler::reduceL(int opcode, int vlen,
|
||||
Register dst, Register src1, XMMRegister src2,
|
||||
XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
switch (vlen) {
|
||||
case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
|
||||
case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
|
||||
case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
|
||||
|
||||
default: assert(false, "wrong vector length");
|
||||
}
|
||||
}
|
||||
#endif // _LP64
|
||||
|
||||
void MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
switch (vlen) {
|
||||
case 2:
|
||||
assert(vtmp2 == xnoreg, "");
|
||||
reduce2F(opcode, dst, src, vtmp1);
|
||||
break;
|
||||
case 4:
|
||||
assert(vtmp2 == xnoreg, "");
|
||||
reduce4F(opcode, dst, src, vtmp1);
|
||||
break;
|
||||
case 8:
|
||||
reduce8F(opcode, dst, src, vtmp1, vtmp2);
|
||||
break;
|
||||
case 16:
|
||||
reduce16F(opcode, dst, src, vtmp1, vtmp2);
|
||||
break;
|
||||
default: assert(false, "wrong vector length");
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
switch (vlen) {
|
||||
case 2:
|
||||
assert(vtmp2 == xnoreg, "");
|
||||
reduce2D(opcode, dst, src, vtmp1);
|
||||
break;
|
||||
case 4:
|
||||
reduce4D(opcode, dst, src, vtmp1, vtmp2);
|
||||
break;
|
||||
case 8:
|
||||
reduce8D(opcode, dst, src, vtmp1, vtmp2);
|
||||
break;
|
||||
default: assert(false, "wrong vector length");
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
if (opcode == Op_AddReductionVI) {
|
||||
if (vtmp1 != src2) {
|
||||
movdqu(vtmp1, src2);
|
||||
}
|
||||
phaddd(vtmp1, vtmp1);
|
||||
} else {
|
||||
pshufd(vtmp1, src2, 0x1);
|
||||
reduce_operation_128(opcode, vtmp1, src2);
|
||||
}
|
||||
movdl(vtmp2, src1);
|
||||
reduce_operation_128(opcode, vtmp1, vtmp2);
|
||||
movdl(dst, vtmp1);
|
||||
}
|
||||
|
||||
void MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
if (opcode == Op_AddReductionVI) {
|
||||
if (vtmp1 != src2) {
|
||||
movdqu(vtmp1, src2);
|
||||
}
|
||||
phaddd(vtmp1, src2);
|
||||
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
|
||||
} else {
|
||||
pshufd(vtmp2, src2, 0xE);
|
||||
reduce_operation_128(opcode, vtmp2, src2);
|
||||
reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
if (opcode == Op_AddReductionVI) {
|
||||
vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
|
||||
vextracti128_high(vtmp2, vtmp1);
|
||||
vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
|
||||
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
|
||||
} else {
|
||||
vextracti128_high(vtmp1, src2);
|
||||
reduce_operation_128(opcode, vtmp1, src2);
|
||||
reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
vextracti64x4_high(vtmp2, src2);
|
||||
reduce_operation_256(opcode, vtmp2, vtmp2, src2);
|
||||
reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
|
||||
}
|
||||
|
||||
#ifdef _LP64
|
||||
// Reduces the two long lanes of src2 with the operation selected by 'opcode',
// combines the result with the scalar in src1 and writes it to dst.
// src2 may be the same register as vtmp1: it is consumed before vtmp1 is
// overwritten with the scalar.
void MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Shuffle selector that places the upper 64 bits into the lower 64 bits.
  const int upper_to_lower = 0xE;

  // vtmp2 = lane 1 <op> lane 0.
  pshufd(vtmp2, src2, upper_to_lower);
  reduce_operation_128(opcode, vtmp2, src2);

  // vtmp1 = src1 <op> vtmp2, then move the low lane to the result register.
  movdq(vtmp1, src1);
  reduce_operation_128(opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
|
||||
|
||||
void MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
vextracti128_high(vtmp1, src2);
|
||||
reduce_operation_128(opcode, vtmp1, src2);
|
||||
reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
|
||||
}
|
||||
|
||||
void MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
vextracti64x4_high(vtmp2, src2);
|
||||
reduce_operation_256(opcode, vtmp2, vtmp2, src2);
|
||||
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
|
||||
}
|
||||
#endif // _LP64
|
||||
|
||||
// Accumulates float lanes 0 and 1 of src into dst, in that order:
// dst = (dst <op> src[0]) <op> src[1]. dst is therefore an input as well as
// the output. vtmp is clobbered.
void MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  // Shuffle selector that places lane 1 into lane 0.
  const int lane1_to_low = 0x1;

  reduce_operation_128(opcode, dst, src);
  pshufd(vtmp, src, lane1_to_low);
  reduce_operation_128(opcode, dst, vtmp);
}
|
||||
|
||||
// Accumulates float lanes 0..3 of src into dst in ascending lane order.
// Lanes 0 and 1 go through reduce2F; lanes 2 and 3 are each shuffled down to
// lane 0 of vtmp and combined. vtmp is clobbered.
void MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  // Shuffle selectors that place lane 2 / lane 3 into lane 0.
  const int lane2_to_low = 0x2;
  const int lane3_to_low = 0x3;

  reduce2F(opcode, dst, src, vtmp);

  pshufd(vtmp, src, lane2_to_low);
  reduce_operation_128(opcode, dst, vtmp);

  pshufd(vtmp, src, lane3_to_low);
  reduce_operation_128(opcode, dst, vtmp);
}
|
||||
|
||||
void MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
reduce4F(opcode, dst, src, vtmp2);
|
||||
vextractf128_high(vtmp2, src);
|
||||
reduce4F(opcode, dst, vtmp2, vtmp1);
|
||||
}
|
||||
|
||||
void MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
reduce8F(opcode, dst, src, vtmp1, vtmp2);
|
||||
vextracti64x4_high(vtmp1, src);
|
||||
reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
|
||||
}
|
||||
|
||||
// Accumulates double lanes 0 and 1 of src into dst, in that order:
// dst = (dst <op> src[0]) <op> src[1]. dst is therefore an input as well as
// the output. vtmp is clobbered.
void MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  // Shuffle selector that places the upper 64 bits into the lower 64 bits.
  const int upper_to_lower = 0xE;

  reduce_operation_128(opcode, dst, src);
  pshufd(vtmp, src, upper_to_lower);
  reduce_operation_128(opcode, dst, vtmp);
}
|
||||
|
||||
void MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
reduce2D(opcode, dst, src, vtmp2);
|
||||
vextractf128_high(vtmp2, src);
|
||||
reduce2D(opcode, dst, vtmp2, vtmp1);
|
||||
}
|
||||
|
||||
void MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
|
||||
reduce4D(opcode, dst, src, vtmp1, vtmp2);
|
||||
vextracti64x4_high(vtmp1, src);
|
||||
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
|
||||
}
|
||||
#endif
|
||||
|
||||
//-------------------------------------------------------------------------------------------
|
||||
|
||||
void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
|
||||
|
@ -1649,8 +1649,48 @@ public:
|
||||
void vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vshiftq(int opcode, XMMRegister dst, XMMRegister src);
|
||||
void vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
|
||||
// Reductions for vectors of ints, longs, floats, and doubles.
|
||||
|
||||
// dst = src1 op reduce(op, src2), where op is selected by opcode; vtmp1 and vtmp2 are temps
|
||||
void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
#ifdef _LP64
|
||||
void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
#endif // _LP64
|
||||
|
||||
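// dst = dst op reduce(op, src), where op is selected by opcode; vtmp1 and vtmp2 are temps
// (vtmp2 must stay xnoreg for vector lengths that need only one temp)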
|
||||
void reduce_fp(int opcode, int vlen,
|
||||
XMMRegister dst, XMMRegister src,
|
||||
XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg);
|
||||
private:
|
||||
void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
|
||||
void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
|
||||
#ifdef _LP64
|
||||
void reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
#endif // _LP64
|
||||
|
||||
void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
|
||||
void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
|
||||
void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
|
||||
void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
|
||||
void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
|
||||
|
||||
void reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src);
|
||||
void reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);
|
||||
#endif
|
||||
|
||||
public:
|
||||
// C2 compiled method's prolog code.
|
||||
void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub);
|
||||
|
||||
|
@ -1277,16 +1277,14 @@ const bool Matcher::match_rule_supported(int opcode) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_AddReductionVL:
|
||||
if (UseAVX < 3) { // only EVEX : vector connectivity becomes an issue here
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_AbsVB:
|
||||
case Op_AbsVS:
|
||||
case Op_AbsVI:
|
||||
case Op_AddReductionVI:
|
||||
if (UseSSE < 3 || !VM_Version::supports_ssse3()) { // requires at least SSSE3
|
||||
case Op_AndReductionV:
|
||||
case Op_OrReductionV:
|
||||
case Op_XorReductionV:
|
||||
if (UseSSE < 3) { // requires at least SSSE3
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
@ -1675,6 +1673,12 @@ static inline uint vector_length(const MachNode* n) {
|
||||
return vt->length();
|
||||
}
|
||||
|
||||
// Returns the element count of the vector value that feeds operand 'opnd' of
// the node 'use': the operand is mapped to its input index, and the length is
// read from that input node's vector type.
static inline uint vector_length(const MachNode* use, MachOper* opnd) {
  const uint input_idx = use->operand_index(opnd);
  const TypeVect* input_vt = use->in(input_idx)->bottom_type()->is_vect();
  return input_vt->length();
}
|
||||
|
||||
static inline uint vector_length_in_bytes(const MachNode* n) {
|
||||
const TypeVect* vt = n->bottom_type()->is_vect();
|
||||
return vt->length_in_bytes();
|
||||
@ -3592,709 +3596,168 @@ instruct ReplD_zero(vec dst, immD0 zero) %{
|
||||
%}
|
||||
|
||||
// ====================REDUCTION ARITHMETIC=======================================
|
||||
// =======================Int Reduction==========================================
|
||||
|
||||
// =======================AddReductionVI==========================================
|
||||
|
||||
instruct vadd2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
|
||||
instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
|
||||
n->in(2)->bottom_type()->is_vect()->length() < 16);
|
||||
match(Set dst (AddReductionVI src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_add2I_reduction $dst,$src1,$src2" %}
|
||||
match(Set dst (MulReductionVI src1 src2));
|
||||
match(Set dst (AndReductionV src1 src2));
|
||||
match(Set dst ( OrReductionV src1 src2));
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 2) {
|
||||
int vector_len = Assembler::AVX_128bit;
|
||||
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
|
||||
__ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
} else if (VM_Version::supports_avxonly()) {
|
||||
int vector_len = Assembler::AVX_128bit;
|
||||
__ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
|
||||
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
} else {
|
||||
assert(UseSSE > 2, "required");
|
||||
__ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
|
||||
__ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ movdl($tmp$$XMMRegister, $src1$$Register);
|
||||
__ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ movdl($dst$$Register, $tmp$$XMMRegister);
|
||||
}
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src2);
|
||||
__ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vadd4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
|
||||
instruct reduction16I(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
|
||||
n->in(2)->bottom_type()->is_vect()->length() == 16);
|
||||
match(Set dst (AddReductionVI src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_add4I_reduction $dst,$src1,$src2" %}
|
||||
match(Set dst (MulReductionVI src1 src2));
|
||||
match(Set dst (AndReductionV src1 src2));
|
||||
match(Set dst ( OrReductionV src1 src2));
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 2) {
|
||||
int vector_len = Assembler::AVX_128bit;
|
||||
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
|
||||
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
} else if (VM_Version::supports_avxonly()) {
|
||||
int vector_len = Assembler::AVX_128bit;
|
||||
__ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
|
||||
__ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
|
||||
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
} else {
|
||||
assert(UseSSE > 2, "required");
|
||||
__ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
|
||||
__ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
}
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src2);
|
||||
__ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vadd8I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
|
||||
match(Set dst (AddReductionVI src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_add8I_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 2) {
|
||||
int vector_len = Assembler::AVX_128bit;
|
||||
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
|
||||
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
|
||||
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
|
||||
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
} else {
|
||||
assert(UseAVX > 0, "");
|
||||
int vector_len = Assembler::AVX_256bit;
|
||||
__ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
|
||||
__ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
}
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vadd16I_reduction_reg(rRegI dst, rRegI src1, legVec src2, legVec tmp, legVec tmp2, legVec tmp3) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
|
||||
match(Set dst (AddReductionVI src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
|
||||
format %{ "vector_add16I_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
__ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
|
||||
__ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
|
||||
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
|
||||
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
|
||||
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
|
||||
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// =======================AddReductionVL==========================================
|
||||
// =======================Long Reduction==========================================
|
||||
|
||||
#ifdef _LP64
|
||||
instruct vadd2L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
|
||||
instruct reductionL(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
|
||||
n->in(2)->bottom_type()->is_vect()->length() < 8);
|
||||
match(Set dst (AddReductionVL src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_add2L_reduction $dst,$src1,$src2" %}
|
||||
match(Set dst (MulReductionVL src1 src2));
|
||||
match(Set dst (AndReductionV src1 src2));
|
||||
match(Set dst ( OrReductionV src1 src2));
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 2, "required");
|
||||
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
__ movdq($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
__ movdq($dst$$Register, $tmp2$$XMMRegister);
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src2);
|
||||
__ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vadd4L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
|
||||
instruct reduction8L(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
|
||||
n->in(2)->bottom_type()->is_vect()->length() == 8);
|
||||
match(Set dst (AddReductionVL src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_add4L_reduction $dst,$src1,$src2" %}
|
||||
match(Set dst (MulReductionVL src1 src2));
|
||||
match(Set dst (AndReductionV src1 src2));
|
||||
match(Set dst ( OrReductionV src1 src2));
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 2, "required");
|
||||
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
__ movdq($tmp$$XMMRegister, $src1$$Register);
|
||||
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
__ movdq($dst$$Register, $tmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vadd8L_reduction_reg(rRegL dst, rRegL src1, legVec src2, legVec tmp, legVec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
|
||||
match(Set dst (AddReductionVL src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_addL_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 2, "required");
|
||||
__ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
|
||||
__ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
__ movdq($tmp$$XMMRegister, $src1$$Register);
|
||||
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
__ movdq($dst$$Register, $tmp2$$XMMRegister);
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src2);
|
||||
__ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
#endif // _LP64
|
||||
|
||||
// =======================AddReductionVF==========================================
|
||||
// =======================Float Reduction==========================================
|
||||
|
||||
instruct vadd2F_reduction_reg(regF dst, vec src2, vec tmp) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
|
||||
match(Set dst (AddReductionVF dst src2));
|
||||
effect(TEMP dst, TEMP tmp);
|
||||
format %{ "vector_add2F_reduction $dst,$dst,$src2" %}
|
||||
instruct reductionF128(regF dst, vec src, vec vtmp) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() <= 4);
|
||||
match(Set dst (AddReductionVF dst src));
|
||||
match(Set dst (MulReductionVF dst src));
|
||||
effect(TEMP dst, TEMP vtmp);
|
||||
format %{ "vector_reduction_fp $dst,$src ; using $vtmp as TEMP" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 0) {
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
} else {
|
||||
assert(UseSSE > 0, "required");
|
||||
__ addss($dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
}
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src);
|
||||
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vadd4F_reduction_reg(regF dst, vec src2, vec tmp) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
|
||||
match(Set dst (AddReductionVF dst src2));
|
||||
effect(TEMP dst, TEMP tmp);
|
||||
format %{ "vector_add4F_reduction $dst,$dst,$src2" %}
|
||||
instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
|
||||
match(Set dst (AddReductionVF dst src));
|
||||
match(Set dst (MulReductionVF dst src));
|
||||
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 0) {
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
} else {
|
||||
assert(UseSSE > 0, "required");
|
||||
__ addss($dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
|
||||
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
|
||||
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
}
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src);
|
||||
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
|
||||
instruct vadd8F_reduction_reg(regF dst, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
|
||||
match(Set dst (AddReductionVF dst src2));
|
||||
effect(TEMP tmp, TEMP dst, TEMP tmp2);
|
||||
format %{ "vector_add8F_reduction $dst,$dst,$src2" %}
|
||||
instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16);
|
||||
match(Set dst (AddReductionVF dst src));
|
||||
match(Set dst (MulReductionVF dst src));
|
||||
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 0, "required");
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src);
|
||||
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vadd16F_reduction_reg(regF dst, legVec src2, legVec tmp, legVec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
|
||||
match(Set dst (AddReductionVF dst src2));
|
||||
effect(TEMP tmp, TEMP dst, TEMP tmp2);
|
||||
format %{ "vector_add16F_reduction $dst,$dst,$src2" %}
|
||||
// =======================Double Reduction==========================================
|
||||
|
||||
instruct reduction2D(regD dst, vec src, vec vtmp) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2);
|
||||
match(Set dst (AddReductionVD dst src));
|
||||
match(Set dst (MulReductionVD dst src));
|
||||
effect(TEMP dst, TEMP vtmp);
|
||||
format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 2, "required");
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
|
||||
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src);
|
||||
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// =======================AddReductionVD==========================================
|
||||
|
||||
instruct vadd2D_reduction_reg(regD dst, vec src2, vec tmp) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
|
||||
match(Set dst (AddReductionVD dst src2));
|
||||
effect(TEMP tmp, TEMP dst);
|
||||
format %{ "vector_add2D_reduction $dst,$src2" %}
|
||||
instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4);
|
||||
match(Set dst (AddReductionVD dst src));
|
||||
match(Set dst (MulReductionVD dst src));
|
||||
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 0) {
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
} else {
|
||||
assert(UseSSE > 0, "required");
|
||||
__ addsd($dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
}
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src);
|
||||
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vadd4D_reduction_reg(regD dst, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
|
||||
match(Set dst (AddReductionVD dst src2));
|
||||
effect(TEMP tmp, TEMP dst, TEMP tmp2);
|
||||
format %{ "vector_add4D_reduction $dst,$dst,$src2" %}
|
||||
instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
|
||||
match(Set dst (AddReductionVD dst src));
|
||||
match(Set dst (MulReductionVD dst src));
|
||||
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 0, "required");
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Add-reduction of an 8-lane double vector: matches (AddReductionVD dst src2)
// when src2 has 8 elements. $dst is both the incoming scalar and the result;
// each lane of $src2 is added to it in turn with scalar vaddsd.
// $tmp and $tmp2 are scratch vector registers.
instruct vadd8D_reduction_reg(regD dst, legVec src2, legVec tmp, legVec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
|
||||
match(Set dst (AddReductionVD dst src2));
|
||||
effect(TEMP tmp, TEMP dst, TEMP tmp2);
|
||||
format %{ "vector_add8D_reduction $dst,$dst,$src2" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 2, "required");
|
||||
// Lanes 0 and 1: low double directly, then pshufd 0xE brings the upper
// 64 bits of the 128-bit chunk down to the low 64 bits.
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
// Same two-step pattern for 128-bit chunks 1, 2 and 3 of src2.
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// =======================MulReductionVI==========================================
|
||||
|
||||
// Multiply-reduction of a 2-lane int vector: matches (MulReductionVI src1 src2)
// when src2 has 2 elements. The result written to $dst is taken from lane 0 of
// src1 * src2[0] * src2[1]. $tmp and $tmp2 are scratch vector registers.
instruct vmul2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
|
||||
match(Set dst (MulReductionVI src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_mul2I_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 0) {
|
||||
int vector_len = Assembler::AVX_128bit;
|
||||
// pshufd 0x1 puts src2[1] into lane 0, so lane 0 of the product is src2[0]*src2[1].
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
|
||||
__ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
// Fold in the scalar input src1 and move lane 0 back to the integer register.
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
} else {
|
||||
// Non-AVX path: same computation with the two-operand (destructive) pmulld.
assert(UseSSE > 3, "required");
|
||||
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
|
||||
__ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
|
||||
__ movdl($tmp$$XMMRegister, $src1$$Register);
|
||||
__ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
}
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Multiply-reduction of a 4-lane int vector: matches (MulReductionVI src1 src2)
// when src2 has 4 elements. The vector is folded in halves (4 -> 2 -> 1 lanes),
// multiplied by the scalar src1, and lane 0 is moved to $dst.
// $tmp and $tmp2 are scratch vector registers.
instruct vmul4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
|
||||
match(Set dst (MulReductionVI src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_mul4I_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 0) {
|
||||
int vector_len = Assembler::AVX_128bit;
|
||||
// pshufd 0xE moves lanes 2,3 onto lanes 0,1; the product pairs lane 0 with 2 and 1 with 3.
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
// pshufd 0x1 moves lane 1 onto lane 0 for the final pairwise multiply.
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
|
||||
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
// Fold in the scalar input src1 and move lane 0 back to the integer register.
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
} else {
|
||||
// Non-AVX path: same folding with the two-operand (destructive) pmulld.
assert(UseSSE > 3, "required");
|
||||
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
|
||||
__ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ movdl($tmp$$XMMRegister, $src1$$Register);
|
||||
__ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
}
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Multiply-reduction of an 8-lane int vector: matches (MulReductionVI src1 src2)
// when src2 has 8 elements. The vector is folded 8 -> 4 -> 2 -> 1 lanes using
// 128-bit vpmulld, multiplied by the scalar src1, and lane 0 is moved to $dst.
// $tmp and $tmp2 are scratch vector registers.
instruct vmul8I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
|
||||
match(Set dst (MulReductionVI src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_mul8I_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 1, "required");
|
||||
int vector_len = Assembler::AVX_128bit;
|
||||
// Going by the helper's name, $tmp receives the upper 128 bits of src2; multiply with the lower 128 bits.
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
|
||||
// Fold lanes 2,3 onto 0,1 (0xE), then lane 1 onto lane 0 (0x1).
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
|
||||
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
|
||||
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
// Fold in the scalar input src1 and move lane 0 back to the integer register.
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Multiply-reduction of a 16-lane int vector: matches (MulReductionVI src1 src2)
// when src2 has 16 elements. The vector is folded 16 -> 8 -> 4 -> 2 -> 1 lanes,
// multiplied by the scalar src1, and lane 0 is moved to $dst.
// $tmp, $tmp2 and $tmp3 are scratch vector registers.
// NOTE(review): the literal vector-length arguments 1 and 0 presumably stand for
// Assembler::AVX_256bit and Assembler::AVX_128bit - confirm against the Assembler enum.
instruct vmul16I_reduction_reg(rRegI dst, rRegI src1, legVec src2, legVec tmp, legVec tmp2, legVec tmp3) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
|
||||
match(Set dst (MulReductionVI src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
|
||||
format %{ "vector_mul16I_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 2, "required");
|
||||
// Going by the helper names: upper 256 bits times lower 256 bits, then upper 128 times lower 128.
__ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
|
||||
__ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
|
||||
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
|
||||
// Fold lanes 2,3 onto 0,1 (0xE), then lane 1 onto lane 0 (0x1).
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
|
||||
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
|
||||
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
// Fold in the scalar input src1 and move lane 0 back to the integer register.
__ movdl($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
__ movdl($dst$$Register, $tmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// =======================MulReductionVL==========================================
|
||||
|
||||
#ifdef _LP64
|
||||
// Multiply-reduction of a 2-lane long vector: matches (MulReductionVL src1 src2)
// when src2 has 2 elements. Computes src1 * src2[0] * src2[1] in the low 64 bits
// and moves it to $dst. Uses vpmullq, hence the AVX512DQ assert.
// $tmp and $tmp2 are scratch vector registers.
instruct vmul2L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
|
||||
match(Set dst (MulReductionVL src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_mul2L_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
assert(VM_Version::supports_avx512dq(), "required");
|
||||
// pshufd 0xE moves the upper 64 bits onto the low 64 bits, pairing element 0 with element 1.
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
// Fold in the scalar input src1 and move the low 64 bits back to the integer register.
__ movdq($tmp2$$XMMRegister, $src1$$Register);
|
||||
__ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
|
||||
__ movdq($dst$$Register, $tmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Multiply-reduction of a 4-lane long vector: matches (MulReductionVL src1 src2)
// when src2 has 4 elements. The vector is folded 4 -> 2 -> 1 elements with
// vpmullq, multiplied by the scalar src1, and the low 64 bits are moved to $dst.
// $tmp and $tmp2 are scratch vector registers.
instruct vmul4L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
|
||||
match(Set dst (MulReductionVL src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_mul4L_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
assert(VM_Version::supports_avx512dq(), "required");
|
||||
// Going by the helper's name, $tmp receives the upper 128 bits of src2; multiply with the lower 128 bits.
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
|
||||
// pshufd 0xE moves the upper 64 bits onto the low 64 bits for the last pairwise multiply.
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
// Fold in the scalar input src1 and move the low 64 bits back to the integer register.
__ movdq($tmp$$XMMRegister, $src1$$Register);
|
||||
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
__ movdq($dst$$Register, $tmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Multiply-reduction of an 8-lane long vector: matches (MulReductionVL src1 src2)
// when src2 has 8 elements. The vector is folded 8 -> 4 -> 2 -> 1 elements with
// vpmullq, multiplied by the scalar src1, and the low 64 bits are moved to $dst.
// $tmp and $tmp2 are scratch vector registers.
// NOTE(review): the literal vector-length arguments 1 and 0 presumably stand for
// Assembler::AVX_256bit and Assembler::AVX_128bit - confirm against the Assembler enum.
instruct vmul8L_reduction_reg(rRegL dst, rRegL src1, legVec src2, legVec tmp, legVec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
|
||||
match(Set dst (MulReductionVL src1 src2));
|
||||
effect(TEMP tmp, TEMP tmp2);
|
||||
format %{ "vector_mul8L_reduction $dst,$src1,$src2" %}
|
||||
ins_encode %{
|
||||
assert(VM_Version::supports_avx512dq(), "required");
|
||||
// Going by the helper names: upper 256 bits times lower 256 bits, then upper 128 times lower 128.
__ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
|
||||
__ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
// pshufd 0xE moves the upper 64 bits onto the low 64 bits for the last pairwise multiply.
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
// Fold in the scalar input src1 and move the low 64 bits back to the integer register.
__ movdq($tmp$$XMMRegister, $src1$$Register);
|
||||
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
|
||||
__ movdq($dst$$Register, $tmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
#endif
|
||||
|
||||
// =======================MulReductionVF==========================================
|
||||
|
||||
// Multiply-reduction of a 2-lane float vector: matches (MulReductionVF dst src2)
// when src2 has 2 elements. $dst is both the incoming scalar and the result;
// it is multiplied by src2[0] and then by src2[1] with scalar (v)mulss.
// $tmp is a scratch vector register.
instruct vmul2F_reduction_reg(regF dst, vec src2, vec tmp) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
|
||||
match(Set dst (MulReductionVF dst src2));
|
||||
effect(TEMP dst, TEMP tmp);
|
||||
format %{ "vector_mul2F_reduction $dst,$dst,$src2" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 0) {
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
// pshufd 0x01 brings src2[1] into lane 0 so the scalar multiply can consume it.
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
} else {
|
||||
// Non-AVX path: same sequence with the two-operand mulss.
assert(UseSSE > 0, "required");
|
||||
__ mulss($dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
}
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Multiply-reduction of a 4-lane float vector: matches (MulReductionVF dst src2)
// when src2 has 4 elements. $dst is both the incoming scalar and the result;
// it is multiplied by src2[0], src2[1], src2[2], src2[3] in that order with
// scalar (v)mulss. $tmp is a scratch vector register.
instruct vmul4F_reduction_reg(regF dst, vec src2, vec tmp) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
|
||||
match(Set dst (MulReductionVF dst src2));
|
||||
effect(TEMP dst, TEMP tmp);
|
||||
format %{ "vector_mul4F_reduction $dst,$dst,$src2" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 0) {
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
// pshufd 0x01 / 0x02 / 0x03 bring lane 1 / 2 / 3 of src2 into lane 0 in turn.
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
} else {
|
||||
// Non-AVX path: same sequence with the two-operand mulss.
assert(UseSSE > 0, "required");
|
||||
__ mulss($dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
|
||||
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
|
||||
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
}
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Multiply-reduction of an 8-lane float vector: matches (MulReductionVF dst src2)
// when src2 has 8 elements. $dst is both the incoming scalar and the result;
// it is multiplied by each lane of src2 in lane order with scalar vmulss.
// $tmp and $tmp2 are scratch vector registers.
instruct vmul8F_reduction_reg(regF dst, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
|
||||
match(Set dst (MulReductionVF dst src2));
|
||||
effect(TEMP tmp, TEMP dst, TEMP tmp2);
|
||||
format %{ "vector_mul8F_reduction $dst,$dst,$src2" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 0, "required");
|
||||
// Lanes 0..3: lane 0 directly, then pshufd 0x01 / 0x02 / 0x03 bring lanes 1..3 into lane 0.
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
// Lanes 4..7: going by the helper's name, $tmp2 receives the upper 128 bits of src2; same pattern.
__ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Multiply-reduction of a 16-lane float vector: matches (MulReductionVF dst src2)
// when src2 has 16 elements. $dst is both the incoming scalar and the result;
// it is multiplied by each lane of src2 in lane order with scalar vmulss,
// one 128-bit chunk (4 floats) at a time. $tmp and $tmp2 are scratch vector registers.
instruct vmul16F_reduction_reg(regF dst, legVec src2, legVec tmp, legVec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
|
||||
match(Set dst (MulReductionVF dst src2));
|
||||
effect(TEMP tmp, TEMP dst, TEMP tmp2);
|
||||
format %{ "vector_mul16F_reduction $dst,$dst,$src2" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 2, "required");
|
||||
// Chunk 0 (lanes 0..3): lane 0 directly, then pshufd 0x01 / 0x02 / 0x03 for lanes 1..3.
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
// Chunk 1 (lanes 4..7): extract 128-bit chunk 0x1 into $tmp2, same pattern.
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
// Chunk 2 (lanes 8..11).
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
// Chunk 3 (lanes 12..15).
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
|
||||
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// =======================MulReductionVD==========================================
|
||||
|
||||
// Multiply-reduction of a 2-lane double vector: matches (MulReductionVD dst src2)
// when src2 has 2 elements. $dst is both the incoming scalar and the result;
// it is multiplied by src2[0] and then by src2[1] with scalar (v)mulsd.
// $tmp is a scratch vector register.
instruct vmul2D_reduction_reg(regD dst, vec src2, vec tmp) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
|
||||
match(Set dst (MulReductionVD dst src2));
|
||||
effect(TEMP dst, TEMP tmp);
|
||||
format %{ "vector_mul2D_reduction $dst,$dst,$src2" %}
|
||||
ins_encode %{
|
||||
if (UseAVX > 0) {
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
// pshufd 0xE moves the upper 64 bits (src2[1]) onto the low 64 bits.
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
} else {
|
||||
// Non-AVX path: same sequence with the two-operand mulsd.
assert(UseSSE > 0, "required");
|
||||
__ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
}
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
|
||||
// Multiply-reduction of a 4-lane double vector: matches (MulReductionVD dst src2)
// when src2 has 4 elements. $dst is both the incoming scalar and the result;
// it is multiplied by each lane of src2 in lane order with scalar vmulsd.
// $tmp and $tmp2 are scratch vector registers.
instruct vmul4D_reduction_reg(regD dst, vec src2, vec tmp, vec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
|
||||
match(Set dst (MulReductionVD dst src2));
|
||||
effect(TEMP tmp, TEMP dst, TEMP tmp2);
|
||||
format %{ "vector_mul4D_reduction $dst,$dst,$src2" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 0, "required");
|
||||
// Lanes 0 and 1: low double directly, then pshufd 0xE brings the upper 64 bits down.
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
// Lanes 2 and 3: going by the helper's name, $tmp2 receives the upper 128 bits of src2; same pattern.
__ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vmul8D_reduction_reg(regD dst, legVec src2, legVec tmp, legVec tmp2) %{
|
||||
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 2
|
||||
match(Set dst (MulReductionVD dst src2));
|
||||
effect(TEMP tmp, TEMP dst, TEMP tmp2);
|
||||
format %{ "vector_mul8D_reduction $dst,$dst,$src2" %}
|
||||
ins_encode %{
|
||||
assert(UseAVX > 0, "required");
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
|
||||
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
|
||||
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen = vector_length(this, $src);
|
||||
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
@ -4059,6 +4059,11 @@ int MatchRule::is_expensive() const {
|
||||
strcmp(opType,"MulReductionVL")==0 ||
|
||||
strcmp(opType,"MulReductionVF")==0 ||
|
||||
strcmp(opType,"MulReductionVD")==0 ||
|
||||
strcmp(opType,"MinReductionV")==0 ||
|
||||
strcmp(opType,"MaxReductionV")==0 ||
|
||||
strcmp(opType,"AndReductionV")==0 ||
|
||||
strcmp(opType,"OrReductionV")==0 ||
|
||||
strcmp(opType,"XorReductionV")==0 ||
|
||||
0 /* 0 to line up columns nicely */ )
|
||||
return 1;
|
||||
}
|
||||
@ -4161,12 +4166,13 @@ bool MatchRule::is_vector() const {
|
||||
"AddReductionVF", "AddReductionVD",
|
||||
"MulReductionVI", "MulReductionVL",
|
||||
"MulReductionVF", "MulReductionVD",
|
||||
"MaxReductionV", "MinReductionV",
|
||||
"AndReductionV", "OrReductionV", "XorReductionV",
|
||||
"MulAddVS2VI",
|
||||
"LShiftCntV","RShiftCntV",
|
||||
"LShiftVB","LShiftVS","LShiftVI","LShiftVL",
|
||||
"RShiftVB","RShiftVS","RShiftVI","RShiftVL",
|
||||
"URShiftVB","URShiftVS","URShiftVI","URShiftVL",
|
||||
"MaxReductionV", "MinReductionV",
|
||||
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
|
||||
"RoundDoubleModeV","LoadVector","StoreVector",
|
||||
"FmaVD", "FmaVF","PopCountVI",
|
||||
|
@ -375,8 +375,11 @@ macro(URShiftVS)
|
||||
macro(URShiftVI)
|
||||
macro(URShiftVL)
|
||||
macro(AndV)
|
||||
macro(AndReductionV)
|
||||
macro(OrV)
|
||||
macro(OrReductionV)
|
||||
macro(XorV)
|
||||
macro(XorReductionV)
|
||||
macro(MinV)
|
||||
macro(MaxV)
|
||||
macro(MinReductionV)
|
||||
|
@ -3034,6 +3034,9 @@ void Compile::final_graph_reshaping_main_switch(Node* n, Final_Reshape_Counts& f
|
||||
case Op_MulReductionVD:
|
||||
case Op_MinReductionV:
|
||||
case Op_MaxReductionV:
|
||||
case Op_AndReductionV:
|
||||
case Op_OrReductionV:
|
||||
case Op_XorReductionV:
|
||||
break;
|
||||
|
||||
case Op_PackB:
|
||||
|
@ -673,7 +673,30 @@ int ReductionNode::opcode(int opc, BasicType bt) {
|
||||
assert(bt == T_DOUBLE, "must be");
|
||||
vopc = Op_MaxReductionV;
|
||||
break;
|
||||
// TODO: add MulL for targets that support it
|
||||
case Op_AndI:
|
||||
assert(bt == T_INT, "must be");
|
||||
vopc = Op_AndReductionV;
|
||||
break;
|
||||
case Op_AndL:
|
||||
assert(bt == T_LONG, "must be");
|
||||
vopc = Op_AndReductionV;
|
||||
break;
|
||||
case Op_OrI:
|
||||
assert(bt == T_INT, "must be");
|
||||
vopc = Op_OrReductionV;
|
||||
break;
|
||||
case Op_OrL:
|
||||
assert(bt == T_LONG, "must be");
|
||||
vopc = Op_OrReductionV;
|
||||
break;
|
||||
case Op_XorI:
|
||||
assert(bt == T_INT, "must be");
|
||||
vopc = Op_XorReductionV;
|
||||
break;
|
||||
case Op_XorL:
|
||||
assert(bt == T_LONG, "must be");
|
||||
vopc = Op_XorReductionV;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -697,8 +720,11 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
|
||||
case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
|
||||
case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
|
||||
case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
|
||||
case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
|
||||
case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
|
||||
case Op_AndReductionV: return new AndReductionVNode(ctrl, n1, n2);
|
||||
case Op_OrReductionV: return new OrReductionVNode(ctrl, n1, n2);
|
||||
case Op_XorReductionV: return new XorReductionVNode(ctrl, n1, n2);
|
||||
default:
|
||||
fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
|
||||
return NULL;
|
||||
|
@ -145,6 +145,15 @@ class ReductionNode : public Node {
|
||||
static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt);
|
||||
static int opcode(int opc, BasicType bt);
|
||||
static bool implemented(int opc, uint vlen, BasicType bt);
|
||||
|
||||
virtual const Type* bottom_type() const {
|
||||
BasicType vbt = in(2)->bottom_type()->is_vect()->element_basic_type();
|
||||
return Type::get_const_basic_type(vbt);
|
||||
}
|
||||
|
||||
virtual uint ideal_reg() const {
|
||||
return bottom_type()->ideal_reg();
|
||||
}
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVINode--------------------------------------
|
||||
@ -613,6 +622,30 @@ class XorVNode : public VectorNode {
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AndReductionVNode--------------------------------------
|
||||
// Reduction node for a bitwise AND with int or long elements.
// ReductionNode::opcode() selects Op_AndReductionV for Op_AndI (T_INT) and
// Op_AndL (T_LONG), and ReductionNode::make() instantiates this class for
// that opcode.
|
||||
class AndReductionVNode : public ReductionNode {
|
||||
public:
|
||||
// Passes ctrl, in1 and in2 straight through to the ReductionNode constructor.
// NOTE(review): presumably in1 is the scalar input and in2 the vector input
// (ReductionNode::bottom_type() reads the vector type from in(2)) - confirm.
AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
// Only declared here; the definition is not visible in this view.
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------OrReductionVNode--------------------------------------
|
||||
// Reduction node for a bitwise OR with int or long elements.
// ReductionNode::opcode() selects Op_OrReductionV for Op_OrI (T_INT) and
// Op_OrL (T_LONG), and ReductionNode::make() instantiates this class for
// that opcode.
|
||||
class OrReductionVNode : public ReductionNode {
|
||||
public:
|
||||
// Passes ctrl, in1 and in2 straight through to the ReductionNode constructor.
// NOTE(review): presumably in1 is the scalar input and in2 the vector input
// (ReductionNode::bottom_type() reads the vector type from in(2)) - confirm.
OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
// Only declared here; the definition is not visible in this view.
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------XorReductionVNode--------------------------------------
|
||||
// Reduction node for a bitwise XOR with int or long elements.
// ReductionNode::opcode() selects Op_XorReductionV for Op_XorI (T_INT) and
// Op_XorL (T_LONG), and ReductionNode::make() instantiates this class for
// that opcode.
|
||||
class XorReductionVNode : public ReductionNode {
|
||||
public:
|
||||
// Passes ctrl, in1 and in2 straight through to the ReductionNode constructor.
// NOTE(review): presumably in1 is the scalar input and in2 the vector input
// (ReductionNode::bottom_type() reads the vector type from in(2)) - confirm.
XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
// Only declared here; the definition is not visible in this view.
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MinVNode--------------------------------------
|
||||
// Vector min
|
||||
class MinVNode : public VectorNode {
|
||||
@ -635,26 +668,6 @@ class MinReductionVNode : public ReductionNode {
|
||||
public:
|
||||
MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const {
|
||||
BasicType bt = in(1)->bottom_type()->basic_type();
|
||||
if (bt == T_FLOAT) {
|
||||
return Type::FLOAT;
|
||||
} else if (bt == T_DOUBLE) {
|
||||
return Type::DOUBLE;
|
||||
}
|
||||
assert(false, "unsupported basic type");
|
||||
return NULL;
|
||||
}
|
||||
virtual uint ideal_reg() const {
|
||||
BasicType bt = in(1)->bottom_type()->basic_type();
|
||||
if (bt == T_FLOAT) {
|
||||
return Op_RegF;
|
||||
} else if (bt == T_DOUBLE) {
|
||||
return Op_RegD;
|
||||
}
|
||||
assert(false, "unsupported basic type");
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
//------------------------------MaxReductionVNode--------------------------------------
|
||||
@ -663,26 +676,6 @@ class MaxReductionVNode : public ReductionNode {
|
||||
public:
|
||||
MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const {
|
||||
BasicType bt = in(1)->bottom_type()->basic_type();
|
||||
if (bt == T_FLOAT) {
|
||||
return Type::FLOAT;
|
||||
} else {
|
||||
return Type::DOUBLE;
|
||||
}
|
||||
assert(false, "unsupported basic type");
|
||||
return NULL;
|
||||
}
|
||||
virtual uint ideal_reg() const {
|
||||
BasicType bt = in(1)->bottom_type()->basic_type();
|
||||
if (bt == T_FLOAT) {
|
||||
return Op_RegF;
|
||||
} else {
|
||||
return Op_RegD;
|
||||
}
|
||||
assert(false, "unsupported basic type");
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
//================================= M E M O R Y ===============================
|
||||
|
@ -1820,8 +1820,11 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
|
||||
declare_c2_type(URShiftVINode, VectorNode) \
|
||||
declare_c2_type(URShiftVLNode, VectorNode) \
|
||||
declare_c2_type(AndVNode, VectorNode) \
|
||||
declare_c2_type(AndReductionVNode, ReductionNode) \
|
||||
declare_c2_type(OrVNode, VectorNode) \
|
||||
declare_c2_type(OrReductionVNode, ReductionNode) \
|
||||
declare_c2_type(XorVNode, VectorNode) \
|
||||
declare_c2_type(XorReductionVNode, ReductionNode) \
|
||||
declare_c2_type(MaxVNode, VectorNode) \
|
||||
declare_c2_type(MinVNode, VectorNode) \
|
||||
declare_c2_type(MaxReductionVNode, ReductionNode) \
|
||||
|
238
test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java
Normal file
238
test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java
Normal file
@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8240248
|
||||
* @summary Add C2 x86 Superword support for scalar logical reduction optimizations : int test
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-TieredCompilation
|
||||
* -XX:+SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=2
|
||||
* compiler.loopopts.superword.RedTest_int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=2
|
||||
* compiler.loopopts.superword.RedTest_int
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-TieredCompilation
|
||||
* -XX:+SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=4
|
||||
* compiler.loopopts.superword.RedTest_int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=4
|
||||
* compiler.loopopts.superword.RedTest_int
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-TieredCompilation
|
||||
* -XX:+SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=8
|
||||
* compiler.loopopts.superword.RedTest_int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=8
|
||||
* compiler.loopopts.superword.RedTest_int
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-TieredCompilation
|
||||
* -XX:+SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=16
|
||||
* compiler.loopopts.superword.RedTest_int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=16
|
||||
* compiler.loopopts.superword.RedTest_int
|
||||
*/
|
||||
|
||||
package compiler.loopopts.superword;
|
||||
|
||||
public class RedTest_int {
|
||||
static final int NUM = 1024;
|
||||
static final int ITER = 8000;
|
||||
public static void main(String[] args) throws Exception {
|
||||
int[] a = new int[NUM];
|
||||
int[] b = new int[NUM];
|
||||
int[] c = new int[NUM];
|
||||
int[] d = new int[NUM];
|
||||
reductionInit1(a, b, c);
|
||||
int total = 0;
|
||||
int valid = 0;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = sumReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid += d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "Add Reduction");
|
||||
|
||||
valid = 0;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = orReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid |= d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "Or Reduction");
|
||||
|
||||
valid = -1;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = andReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid &= d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "And Reduction");
|
||||
|
||||
valid = -1;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = xorReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid ^= d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "Xor Reduction");
|
||||
|
||||
reductionInit2(a, b, c);
|
||||
valid = 1;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = mulReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid *= d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "Mul Reduction");
|
||||
}
|
||||
|
||||
public static void reductionInit1(
|
||||
int[] a,
|
||||
int[] b,
|
||||
int[] c) {
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
a[i] = (i%2) + 0x4099;
|
||||
b[i] = (i%2) + 0x1033;
|
||||
c[i] = (i%2) + 0x455;
|
||||
}
|
||||
}
|
||||
|
||||
public static void reductionInit2(
|
||||
int[] a,
|
||||
int[] b,
|
||||
int[] c) {
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
a[i] = 0x11;
|
||||
b[i] = 0x12;
|
||||
c[i] = 0x13;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static int sumReductionImplement(
|
||||
int[] a,
|
||||
int[] b,
|
||||
int[] c,
|
||||
int[] d) {
|
||||
int total = 0;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total += d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static int orReductionImplement(
|
||||
int[] a,
|
||||
int[] b,
|
||||
int[] c,
|
||||
int[] d) {
|
||||
int total = 0;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total |= d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static int andReductionImplement(
|
||||
int[] a,
|
||||
int[] b,
|
||||
int[] c,
|
||||
int[] d) {
|
||||
int total = -1;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total &= d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static int xorReductionImplement(
|
||||
int[] a,
|
||||
int[] b,
|
||||
int[] c,
|
||||
int[] d) {
|
||||
int total = -1;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total ^= d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static int mulReductionImplement(
|
||||
int[] a,
|
||||
int[] b,
|
||||
int[] c,
|
||||
int[] d) {
|
||||
int total = 1;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total = total*d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static void testCorrectness(
|
||||
int total,
|
||||
int valid,
|
||||
String op) throws Exception {
|
||||
if (total == valid) {
|
||||
System.out.println(op + ": Success");
|
||||
} else {
|
||||
System.out.println("Invalid total: " + total);
|
||||
System.out.println("Expected value = " + valid);
|
||||
throw new Exception(op + ": Failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
238
test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java
Normal file
238
test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java
Normal file
@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8240248
|
||||
* @summary Add C2 x86 Superword support for scalar logical reduction optimizations : long test
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-TieredCompilation
|
||||
* -XX:+SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=2
|
||||
* compiler.loopopts.superword.RedTest_long
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=2
|
||||
* compiler.loopopts.superword.RedTest_long
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-TieredCompilation
|
||||
* -XX:+SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=4
|
||||
* compiler.loopopts.superword.RedTest_long
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=4
|
||||
* compiler.loopopts.superword.RedTest_long
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-TieredCompilation
|
||||
* -XX:+SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=8
|
||||
* compiler.loopopts.superword.RedTest_long
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=8
|
||||
* compiler.loopopts.superword.RedTest_long
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-TieredCompilation
|
||||
* -XX:+SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=16
|
||||
* compiler.loopopts.superword.RedTest_long
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
|
||||
* -XX:CompileThresholdScaling=0.1
|
||||
* -XX:-SuperWordReductions
|
||||
* -XX:LoopMaxUnroll=16
|
||||
* compiler.loopopts.superword.RedTest_long
|
||||
*/
|
||||
|
||||
package compiler.loopopts.superword;
|
||||
|
||||
public class RedTest_long {
|
||||
static final int NUM = 512;
|
||||
static final int ITER = 8000;
|
||||
public static void main(String[] args) throws Exception {
|
||||
long[] a = new long[NUM];
|
||||
long[] b = new long[NUM];
|
||||
long[] c = new long[NUM];
|
||||
long[] d = new long[NUM];
|
||||
reductionInit1(a, b, c);
|
||||
long total = 0;
|
||||
long valid = 0;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = sumReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid += d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "Add Reduction");
|
||||
|
||||
valid = 0;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = orReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid |= d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "Or Reduction");
|
||||
|
||||
valid = -1;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = andReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid &= d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "And Reduction");
|
||||
|
||||
valid = -1;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = xorReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid ^= d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "Xor Reduction");
|
||||
|
||||
reductionInit2(a, b, c);
|
||||
valid = 1;
|
||||
for (int j = 0; j < ITER; j++) {
|
||||
total = mulReductionImplement(a, b, c, d);
|
||||
}
|
||||
for (int j = 0; j < d.length; j++) {
|
||||
valid *= d[j];
|
||||
}
|
||||
testCorrectness(total, valid, "Mul Reduction");
|
||||
}
|
||||
|
||||
public static void reductionInit1(
|
||||
long[] a,
|
||||
long[] b,
|
||||
long[] c) {
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
a[i] = (i%2) + 0x4099;
|
||||
b[i] = (i%2) + 0x1033;
|
||||
c[i] = (i%2) + 0x455;
|
||||
}
|
||||
}
|
||||
|
||||
public static void reductionInit2(
|
||||
long[] a,
|
||||
long[] b,
|
||||
long[] c) {
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
a[i] = 0x11;
|
||||
b[i] = 0x12;
|
||||
c[i] = 0x13;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static long sumReductionImplement(
|
||||
long[] a,
|
||||
long[] b,
|
||||
long[] c,
|
||||
long[] d) {
|
||||
long total = 0;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total += d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static long orReductionImplement(
|
||||
long[] a,
|
||||
long[] b,
|
||||
long[] c,
|
||||
long[] d) {
|
||||
long total = 0;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total |= d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static long andReductionImplement(
|
||||
long[] a,
|
||||
long[] b,
|
||||
long[] c,
|
||||
long[] d) {
|
||||
long total = -1;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total &= d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static long xorReductionImplement(
|
||||
long[] a,
|
||||
long[] b,
|
||||
long[] c,
|
||||
long[] d) {
|
||||
long total = -1;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total ^= d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static long mulReductionImplement(
|
||||
long[] a,
|
||||
long[] b,
|
||||
long[] c,
|
||||
long[] d) {
|
||||
long total = 1;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total = total*d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public static void testCorrectness(
|
||||
long total,
|
||||
long valid,
|
||||
String op) throws Exception {
|
||||
if (total == valid) {
|
||||
System.out.println(op + ": Success");
|
||||
} else {
|
||||
System.out.println("Invalid total: " + total);
|
||||
System.out.println("Expected value = " + valid);
|
||||
throw new Exception(op + ": Failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
136
test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java
Normal file
136
test/micro/org/openjdk/bench/vm/compiler/VectorReduction.java
Normal file
@ -0,0 +1,136 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
package org.openjdk.bench.vm.compiler;
|
||||
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
import org.openjdk.jmh.infra.*;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.Random;
|
||||
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||
@State(Scope.Thread)
|
||||
public abstract class VectorReduction {
|
||||
@Param({"512"})
|
||||
public int COUNT;
|
||||
|
||||
private int[] intsA;
|
||||
private int[] intsB;
|
||||
private int[] intsC;
|
||||
private int[] intsD;
|
||||
private int resI;
|
||||
private long[] longsA;
|
||||
private long[] longsB;
|
||||
private long[] longsC;
|
||||
private long[] longsD;
|
||||
private long resL;
|
||||
|
||||
@Param("0")
|
||||
private int seed;
|
||||
private Random r = new Random(seed);
|
||||
|
||||
@Setup
|
||||
public void init() {
|
||||
intsA = new int[COUNT];
|
||||
intsB = new int[COUNT];
|
||||
intsC = new int[COUNT];
|
||||
intsD = new int[COUNT];
|
||||
longsA = new long[COUNT];
|
||||
longsB = new long[COUNT];
|
||||
longsC = new long[COUNT];
|
||||
longsD = new long[COUNT];
|
||||
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
intsA[i] = r.nextInt();
|
||||
intsB[i] = r.nextInt();
|
||||
intsC[i] = r.nextInt();
|
||||
longsA[i] = r.nextLong();
|
||||
longsB[i] = r.nextLong();
|
||||
longsC[i] = r.nextLong();
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void andRedI() {
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
|
||||
resI &= intsD[i];
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void orRedI() {
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
|
||||
resI |= intsD[i];
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void xorRedI() {
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
|
||||
resI ^= intsD[i];
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void andRedL() {
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
|
||||
resL &= longsD[i];
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void orRedL() {
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
|
||||
resL |= longsD[i];
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void xorRedL() {
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
|
||||
resL ^= longsD[i];
|
||||
}
|
||||
}
|
||||
|
||||
@Fork(value = 1, jvmArgsPrepend = {
|
||||
"-XX:+UseSuperWord"
|
||||
})
|
||||
public static class WithSuperword extends VectorReduction {
|
||||
|
||||
}
|
||||
|
||||
@Fork(value = 1, jvmArgsPrepend = {
|
||||
"-XX:-UseSuperWord"
|
||||
})
|
||||
public static class NoSuperword extends VectorReduction {
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user