8240248: Extend superword reduction optimizations for x86

Add support for and, or, and xor reductions

Co-authored-by: Shravya Rukmannagari <shravya.rukmannagari@intel.com>
Reviewed-by: vlivanov, thartmann
Sandhya Viswanathan 2020-03-23 10:26:40 -07:00
parent 75a8b7fa83
commit 398ce2948c
13 changed files with 1089 additions and 702 deletions
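Background for the diffs below: C2's SuperWord pass already recognized add/mul (and min/max) reductions; this change teaches it to also vectorize bitwise and/or/xor reductions over int and long arrays on x86. A minimal sketch of the kind of loop that now qualifies (class, method, and array names are illustrative, not taken from the patch):

// Illustrative only: scalar reduction loops this change lets C2's SuperWord
// pass vectorize (names are made up for the example).
class LogicalReductions {
    static int andReduce(int[] a) {
        int res = -1;                    // identity value for AND
        for (int i = 0; i < a.length; i++) {
            res &= a[i];                 // loop-carried AndI -> AndReductionV
        }
        return res;
    }

    static long xorReduce(long[] a) {
        long res = 0L;                   // identity value for XOR
        for (int i = 0; i < a.length; i++) {
            res ^= a[i];                 // loop-carried XorL -> XorReductionV
        }
        return res;
    }
}

With reductions enabled (see the -XX:+SuperWordReductions runs in the new test at the bottom), the accumulation opcodes (AndI/AndL, OrI/OrL, XorI/XorL) are mapped to the new AndReductionV/OrReductionV/XorReductionV nodes added in the vectornode changes further down.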

View File

@@ -40,10 +40,10 @@ mkdir -p $BUILD_DIR $JAR_DIR
cd $JAR_DIR
rm -f *
wget http://central.maven.org/maven2/org/apache/commons/commons-math3/$COMMONS_MATH3_VERSION/commons-math3-$COMMONS_MATH3_VERSION.jar
wget http://central.maven.org/maven2/net/sf/jopt-simple/jopt-simple/$JOPT_SIMPLE_VERSION/jopt-simple-$JOPT_SIMPLE_VERSION.jar
wget http://central.maven.org/maven2/org/openjdk/jmh/jmh-core/$JMH_VERSION/jmh-core-$JMH_VERSION.jar
wget http://central.maven.org/maven2/org/openjdk/jmh/jmh-generator-annprocess/$JMH_VERSION/jmh-generator-annprocess-$JMH_VERSION.jar
wget https://repo.maven.apache.org/maven2/org/apache/commons/commons-math3/$COMMONS_MATH3_VERSION/commons-math3-$COMMONS_MATH3_VERSION.jar
wget https://repo.maven.apache.org/maven2/net/sf/jopt-simple/jopt-simple/$JOPT_SIMPLE_VERSION/jopt-simple-$JOPT_SIMPLE_VERSION.jar
wget https://repo.maven.apache.org/maven2/org/openjdk/jmh/jmh-core/$JMH_VERSION/jmh-core-$JMH_VERSION.jar
wget https://repo.maven.apache.org/maven2/org/openjdk/jmh/jmh-generator-annprocess/$JMH_VERSION/jmh-generator-annprocess-$JMH_VERSION.jar
tar -cvzf ../$BUNDLE_NAME *

View File

@@ -4161,7 +4161,245 @@ void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRe
vpsrlq(dst, nds, src, vector_len);
}
}
// Reductions for vectors of ints, longs, floats, and doubles.
void MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
int vector_len = Assembler::AVX_128bit;
switch (opcode) {
case Op_AndReductionV: pand(dst, src); break;
case Op_OrReductionV: por (dst, src); break;
case Op_XorReductionV: pxor(dst, src); break;
case Op_AddReductionVF: addss(dst, src); break;
case Op_AddReductionVD: addsd(dst, src); break;
case Op_AddReductionVI: paddd(dst, src); break;
case Op_AddReductionVL: paddq(dst, src); break;
case Op_MulReductionVF: mulss(dst, src); break;
case Op_MulReductionVD: mulsd(dst, src); break;
case Op_MulReductionVI: pmulld(dst, src); break;
case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
default: assert(false, "wrong opcode");
}
}
void MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
int vector_len = Assembler::AVX_256bit;
switch (opcode) {
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
default: assert(false, "wrong opcode");
}
}
void MacroAssembler::reduce_fp(int opcode, int vlen,
XMMRegister dst, XMMRegister src,
XMMRegister vtmp1, XMMRegister vtmp2) {
switch (opcode) {
case Op_AddReductionVF:
case Op_MulReductionVF:
reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
break;
case Op_AddReductionVD:
case Op_MulReductionVD:
reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
break;
default: assert(false, "wrong opcode");
}
}
void MacroAssembler::reduceI(int opcode, int vlen,
Register dst, Register src1, XMMRegister src2,
XMMRegister vtmp1, XMMRegister vtmp2) {
switch (vlen) {
case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
default: assert(false, "wrong vector length");
}
}
#ifdef _LP64
void MacroAssembler::reduceL(int opcode, int vlen,
Register dst, Register src1, XMMRegister src2,
XMMRegister vtmp1, XMMRegister vtmp2) {
switch (vlen) {
case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
default: assert(false, "wrong vector length");
}
}
#endif // _LP64
void MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
switch (vlen) {
case 2:
assert(vtmp2 == xnoreg, "");
reduce2F(opcode, dst, src, vtmp1);
break;
case 4:
assert(vtmp2 == xnoreg, "");
reduce4F(opcode, dst, src, vtmp1);
break;
case 8:
reduce8F(opcode, dst, src, vtmp1, vtmp2);
break;
case 16:
reduce16F(opcode, dst, src, vtmp1, vtmp2);
break;
default: assert(false, "wrong vector length");
}
}
void MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
switch (vlen) {
case 2:
assert(vtmp2 == xnoreg, "");
reduce2D(opcode, dst, src, vtmp1);
break;
case 4:
reduce4D(opcode, dst, src, vtmp1, vtmp2);
break;
case 8:
reduce8D(opcode, dst, src, vtmp1, vtmp2);
break;
default: assert(false, "wrong vector length");
}
}
void MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
if (opcode == Op_AddReductionVI) {
if (vtmp1 != src2) {
movdqu(vtmp1, src2);
}
phaddd(vtmp1, vtmp1);
} else {
pshufd(vtmp1, src2, 0x1);
reduce_operation_128(opcode, vtmp1, src2);
}
movdl(vtmp2, src1);
reduce_operation_128(opcode, vtmp1, vtmp2);
movdl(dst, vtmp1);
}
void MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
if (opcode == Op_AddReductionVI) {
if (vtmp1 != src2) {
movdqu(vtmp1, src2);
}
phaddd(vtmp1, src2);
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
} else {
pshufd(vtmp2, src2, 0xE);
reduce_operation_128(opcode, vtmp2, src2);
reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
}
void MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
if (opcode == Op_AddReductionVI) {
vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
vextracti128_high(vtmp2, vtmp1);
vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
} else {
vextracti128_high(vtmp1, src2);
reduce_operation_128(opcode, vtmp1, src2);
reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
}
void MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
vextracti64x4_high(vtmp2, src2);
reduce_operation_256(opcode, vtmp2, vtmp2, src2);
reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#ifdef _LP64
void MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
pshufd(vtmp2, src2, 0xE);
reduce_operation_128(opcode, vtmp2, src2);
movdq(vtmp1, src1);
reduce_operation_128(opcode, vtmp1, vtmp2);
movdq(dst, vtmp1);
}
void MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
vextracti128_high(vtmp1, src2);
reduce_operation_128(opcode, vtmp1, src2);
reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
void MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
vextracti64x4_high(vtmp2, src2);
reduce_operation_256(opcode, vtmp2, vtmp2, src2);
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#endif // _LP64
void MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
reduce_operation_128(opcode, dst, src);
pshufd(vtmp, src, 0x1);
reduce_operation_128(opcode, dst, vtmp);
}
void MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
reduce2F(opcode, dst, src, vtmp);
pshufd(vtmp, src, 0x2);
reduce_operation_128(opcode, dst, vtmp);
pshufd(vtmp, src, 0x3);
reduce_operation_128(opcode, dst, vtmp);
}
void MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
reduce4F(opcode, dst, src, vtmp2);
vextractf128_high(vtmp2, src);
reduce4F(opcode, dst, vtmp2, vtmp1);
}
void MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
reduce8F(opcode, dst, src, vtmp1, vtmp2);
vextracti64x4_high(vtmp1, src);
reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
void MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
reduce_operation_128(opcode, dst, src);
pshufd(vtmp, src, 0xE);
reduce_operation_128(opcode, dst, vtmp);
}
void MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
reduce2D(opcode, dst, src, vtmp2);
vextractf128_high(vtmp2, src);
reduce2D(opcode, dst, vtmp2, vtmp1);
}
void MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
reduce4D(opcode, dst, src, vtmp1, vtmp2);
vextracti64x4_high(vtmp1, src);
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
#endif
//-------------------------------------------------------------------------------------------
void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
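The reduce2I..reduce16I and reduce2L..reduce8L helpers added above share one strategy: extract the upper half of the vector (vextracti64x4_high / vextracti128_high / pshufd), combine it with the lower half using the requested operation, and repeat until a single lane remains, which is then folded into the scalar input src1 (the AddReductionVI case uses phaddd/vphaddd instead). Below is a hedged scalar model of that strategy in Java rather than HotSpot C++; the class, method, and functional-interface choices are illustrative only.

import java.util.function.IntBinaryOperator;

// Scalar model (illustrative, not HotSpot code) of the lane-halving reduction
// emitted by reduce2I..reduce16I: fold the upper half of the lanes into the
// lower half until one lane is left, then combine it with the scalar input.
class LaneHalvingModel {
    static int reduceLanes(IntBinaryOperator op, int src1, int[] lanes) {
        int n = lanes.length;                     // 2, 4, 8 or 16 lanes
        while (n > 1) {
            n >>= 1;                              // vextract*/pshufd selects the upper half
            for (int i = 0; i < n; i++) {
                lanes[i] = op.applyAsInt(lanes[i], lanes[i + n]); // pand/por/pxor/paddd/pmulld
            }
        }
        return op.applyAsInt(src1, lanes[0]);     // movdl/movdq + final combine with src1
    }
}

For example, reduceLanes((x, y) -> x & y, -1, lanes) mirrors an AndReductionV over a 4-lane int vector.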

View File

@@ -1649,8 +1649,48 @@ public:
void vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vshiftq(int opcode, XMMRegister dst, XMMRegister src);
void vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
// Reductions for vectors of ints, longs, floats, and doubles.
// dst = src1 + reduce(op, src2) using vtmp as temps
void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#endif // _LP64
// dst = reduce(op, src2) using vtmp as temps
void reduce_fp(int opcode, int vlen,
XMMRegister dst, XMMRegister src,
XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg);
private:
void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
void reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#endif // _LP64
void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src);
void reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);
#endif
public:
// C2 compiled method's prolog code.
void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub);

View File

@@ -1277,16 +1277,14 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_AddReductionVL:
if (UseAVX < 3) { // only EVEX : vector connectivity becomes an issue here
return false;
}
break;
case Op_AbsVB:
case Op_AbsVS:
case Op_AbsVI:
case Op_AddReductionVI:
if (UseSSE < 3 || !VM_Version::supports_ssse3()) { // requires at least SSSE3
case Op_AndReductionV:
case Op_OrReductionV:
case Op_XorReductionV:
if (UseSSE < 3) { // requires at least SSSE3
return false;
}
break;
@@ -1675,6 +1673,12 @@ static inline uint vector_length(const MachNode* n) {
return vt->length();
}
static inline uint vector_length(const MachNode* use, MachOper* opnd) {
uint def_idx = use->operand_index(opnd);
Node* def = use->in(def_idx);
return def->bottom_type()->is_vect()->length();
}
static inline uint vector_length_in_bytes(const MachNode* n) {
const TypeVect* vt = n->bottom_type()->is_vect();
return vt->length_in_bytes();
@@ -3592,709 +3596,168 @@ instruct ReplD_zero(vec dst, immD0 zero) %{
%}
// ====================REDUCTION ARITHMETIC=======================================
// =======================Int Reduction==========================================
// =======================AddReductionVI==========================================
instruct vadd2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
n->in(2)->bottom_type()->is_vect()->length() < 16);
match(Set dst (AddReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add2I_reduction $dst,$src1,$src2" %}
match(Set dst (MulReductionVI src1 src2));
match(Set dst (AndReductionV src1 src2));
match(Set dst ( OrReductionV src1 src2));
match(Set dst (XorReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
if (UseAVX > 2) {
int vector_len = Assembler::AVX_128bit;
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else if (VM_Version::supports_avxonly()) {
int vector_len = Assembler::AVX_128bit;
__ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseSSE > 2, "required");
__ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
__ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
__ movdl($tmp$$XMMRegister, $src1$$Register);
__ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
__ movdl($dst$$Register, $tmp$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src2);
__ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
instruct reduction16I(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT &&
n->in(2)->bottom_type()->is_vect()->length() == 16);
match(Set dst (AddReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add4I_reduction $dst,$src1,$src2" %}
match(Set dst (MulReductionVI src1 src2));
match(Set dst (AndReductionV src1 src2));
match(Set dst ( OrReductionV src1 src2));
match(Set dst (XorReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
if (UseAVX > 2) {
int vector_len = Assembler::AVX_128bit;
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else if (VM_Version::supports_avxonly()) {
int vector_len = Assembler::AVX_128bit;
__ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
__ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseSSE > 2, "required");
__ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
__ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
__ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src2);
__ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd8I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (AddReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add8I_reduction $dst,$src1,$src2" %}
ins_encode %{
if (UseAVX > 2) {
int vector_len = Assembler::AVX_128bit;
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseAVX > 0, "");
int vector_len = Assembler::AVX_256bit;
__ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
__ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vadd16I_reduction_reg(rRegI dst, rRegI src1, legVec src2, legVec tmp, legVec tmp2, legVec tmp3) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
match(Set dst (AddReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
format %{ "vector_add16I_reduction $dst,$src1,$src2" %}
ins_encode %{
__ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
__ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
__ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================AddReductionVL==========================================
// =======================Long Reduction==========================================
#ifdef _LP64
instruct vadd2L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
instruct reductionL(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
n->in(2)->bottom_type()->is_vect()->length() < 8);
match(Set dst (AddReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add2L_reduction $dst,$src1,$src2" %}
match(Set dst (MulReductionVL src1 src2));
match(Set dst (AndReductionV src1 src2));
match(Set dst ( OrReductionV src1 src2));
match(Set dst (XorReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdq($tmp2$$XMMRegister, $src1$$Register);
__ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src2);
__ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd4L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
instruct reduction8L(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG &&
n->in(2)->bottom_type()->is_vect()->length() == 8);
match(Set dst (AddReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_add4L_reduction $dst,$src1,$src2" %}
match(Set dst (MulReductionVL src1 src2));
match(Set dst (AndReductionV src1 src2));
match(Set dst ( OrReductionV src1 src2));
match(Set dst (XorReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
__ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($tmp$$XMMRegister, $src1$$Register);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd8L_reduction_reg(rRegL dst, rRegL src1, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (AddReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_addL_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
__ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($tmp$$XMMRegister, $src1$$Register);
__ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src2);
__ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
#endif // _LP64
// =======================AddReductionVF==========================================
// =======================Float Reduction==========================================
instruct vadd2F_reduction_reg(regF dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (AddReductionVF dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_add2F_reduction $dst,$dst,$src2" %}
instruct reductionF128(regF dst, vec src, vec vtmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() <= 4);
match(Set dst (AddReductionVF dst src));
match(Set dst (MulReductionVF dst src));
effect(TEMP dst, TEMP vtmp);
format %{ "vector_reduction_fp $dst,$src ; using $vtmp as TEMP" %}
ins_encode %{
if (UseAVX > 0) {
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ addss($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd4F_reduction_reg(regF dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (AddReductionVF dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_add4F_reduction $dst,$dst,$src2" %}
instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
match(Set dst (AddReductionVF dst src));
match(Set dst (MulReductionVF dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
if (UseAVX > 0) {
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ addss($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ addss($dst$$XMMRegister, $tmp$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd8F_reduction_reg(regF dst, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (AddReductionVF dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_add8F_reduction $dst,$dst,$src2" %}
instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16);
match(Set dst (AddReductionVF dst src));
match(Set dst (MulReductionVF dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd16F_reduction_reg(regF dst, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
match(Set dst (AddReductionVF dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_add16F_reduction $dst,$dst,$src2" %}
// =======================Double Reduction==========================================
instruct reduction2D(regD dst, vec src, vec vtmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2);
match(Set dst (AddReductionVD dst src));
match(Set dst (MulReductionVD dst src));
effect(TEMP dst, TEMP vtmp);
format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================AddReductionVD==========================================
instruct vadd2D_reduction_reg(regD dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (AddReductionVD dst src2));
effect(TEMP tmp, TEMP dst);
format %{ "vector_add2D_reduction $dst,$src2" %}
instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4);
match(Set dst (AddReductionVD dst src));
match(Set dst (MulReductionVD dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
if (UseAVX > 0) {
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ addsd($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
}
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd4D_reduction_reg(regD dst, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (AddReductionVD dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_add4D_reduction $dst,$dst,$src2" %}
instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8);
match(Set dst (AddReductionVD dst src));
match(Set dst (MulReductionVD dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vadd8D_reduction_reg(regD dst, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (AddReductionVD dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_add8D_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================MulReductionVI==========================================
instruct vmul2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (MulReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul2I_reduction $dst,$src1,$src2" %}
ins_encode %{
if (UseAVX > 0) {
int vector_len = Assembler::AVX_128bit;
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseSSE > 3, "required");
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
__ movdl($tmp$$XMMRegister, $src1$$Register);
__ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (MulReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul4I_reduction $dst,$src1,$src2" %}
ins_encode %{
if (UseAVX > 0) {
int vector_len = Assembler::AVX_128bit;
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
} else {
assert(UseSSE > 3, "required");
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
__ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ movdl($tmp$$XMMRegister, $src1$$Register);
__ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul8I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (MulReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul8I_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(UseAVX > 1, "required");
int vector_len = Assembler::AVX_128bit;
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul16I_reduction_reg(rRegI dst, rRegI src1, legVec src2, legVec tmp, legVec tmp2, legVec tmp3) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
match(Set dst (MulReductionVI src1 src2));
effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
format %{ "vector_mul16I_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
__ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
__ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
__ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($tmp2$$XMMRegister, $src1$$Register);
__ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdl($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================MulReductionVL==========================================
#ifdef _LP64
instruct vmul2L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (MulReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul2L_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(VM_Version::supports_avx512dq(), "required");
__ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdq($tmp2$$XMMRegister, $src1$$Register);
__ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul4L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (MulReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul4L_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(VM_Version::supports_avx512dq(), "required");
__ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
__ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($tmp$$XMMRegister, $src1$$Register);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul8L_reduction_reg(rRegL dst, rRegL src1, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (MulReductionVL src1 src2));
effect(TEMP tmp, TEMP tmp2);
format %{ "vector_mul8L_reduction $dst,$src1,$src2" %}
ins_encode %{
assert(VM_Version::supports_avx512dq(), "required");
__ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
__ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($tmp$$XMMRegister, $src1$$Register);
__ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
__ movdq($dst$$Register, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
#endif
// =======================MulReductionVF==========================================
instruct vmul2F_reduction_reg(regF dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (MulReductionVF dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_mul2F_reduction $dst,$dst,$src2" %}
ins_encode %{
if (UseAVX > 0) {
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ mulss($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul4F_reduction_reg(regF dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 4
match(Set dst (MulReductionVF dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_mul4F_reduction $dst,$dst,$src2" %}
ins_encode %{
if (UseAVX > 0) {
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ mulss($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul8F_reduction_reg(regF dst, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 8
match(Set dst (MulReductionVF dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_mul8F_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul16F_reduction_reg(regF dst, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); // vector_length(src2) == 16
match(Set dst (MulReductionVF dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_mul16F_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 2, "required");
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
__ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// =======================MulReductionVD==========================================
instruct vmul2D_reduction_reg(regD dst, vec src2, vec tmp) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); // vector_length(src2) == 2
match(Set dst (MulReductionVD dst src2));
effect(TEMP dst, TEMP tmp);
format %{ "vector_mul2D_reduction $dst,$dst,$src2" %}
ins_encode %{
if (UseAVX > 0) {
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
} else {
assert(UseSSE > 0, "required");
__ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
}
%}
ins_pipe( pipe_slow );
%}
instruct vmul4D_reduction_reg(regD dst, vec src2, vec tmp, vec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); // vector_length(src2) == 2
match(Set dst (MulReductionVD dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_mul4D_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
instruct vmul8D_reduction_reg(regD dst, legVec src2, legVec tmp, legVec tmp2) %{
predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); // vector_length(src2) == 2
match(Set dst (MulReductionVD dst src2));
effect(TEMP tmp, TEMP dst, TEMP tmp2);
format %{ "vector_mul8D_reduction $dst,$dst,$src2" %}
ins_encode %{
assert(UseAVX > 0, "required");
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
__ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
__ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
__ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
int opcode = this->ideal_Opcode();
int vlen = vector_length(this, $src);
__ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
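Note on the float/double patterns above: dst stays both an input and an output (match(Set dst (AddReductionVF dst src))), and the reduce2F..reduce16F / reduce2D..reduce8D helpers fold the accumulator with the lanes strictly in lane order, so the vectorized form keeps the same left-to-right floating point evaluation as the scalar loop. A scalar sketch of that ordered reduction, in illustrative Java rather than HotSpot code:

// Illustrative scalar equivalent of the ordered FP reduction performed by
// reduce2F..reduce16F: the accumulator is combined with each lane strictly in
// lane order, matching the scalar loop's evaluation order.
class OrderedFloatReduction {
    static float addReduce(float dst, float[] lanes) {
        for (int i = 0; i < lanes.length; i++) {
            dst += lanes[i];   // addss/vaddss with lane i (pshufd/vextractf fetches the lane)
        }
        return dst;
    }
}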

View File

@@ -4059,6 +4059,11 @@ int MatchRule::is_expensive() const {
strcmp(opType,"MulReductionVL")==0 ||
strcmp(opType,"MulReductionVF")==0 ||
strcmp(opType,"MulReductionVD")==0 ||
strcmp(opType,"MinReductionV")==0 ||
strcmp(opType,"MaxReductionV")==0 ||
strcmp(opType,"AndReductionV")==0 ||
strcmp(opType,"OrReductionV")==0 ||
strcmp(opType,"XorReductionV")==0 ||
0 /* 0 to line up columns nicely */ )
return 1;
}
@@ -4161,12 +4166,13 @@ bool MatchRule::is_vector() const {
"AddReductionVF", "AddReductionVD",
"MulReductionVI", "MulReductionVL",
"MulReductionVF", "MulReductionVD",
"MaxReductionV", "MinReductionV",
"AndReductionV", "OrReductionV", "XorReductionV",
"MulAddVS2VI",
"LShiftCntV","RShiftCntV",
"LShiftVB","LShiftVS","LShiftVI","LShiftVL",
"RShiftVB","RShiftVS","RShiftVI","RShiftVL",
"URShiftVB","URShiftVS","URShiftVI","URShiftVL",
"MaxReductionV", "MinReductionV",
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
"RoundDoubleModeV","LoadVector","StoreVector",
"FmaVD", "FmaVF","PopCountVI",

View File

@@ -375,8 +375,11 @@ macro(URShiftVS)
macro(URShiftVI)
macro(URShiftVL)
macro(AndV)
macro(AndReductionV)
macro(OrV)
macro(OrReductionV)
macro(XorV)
macro(XorReductionV)
macro(MinV)
macro(MaxV)
macro(MinReductionV)

View File

@@ -3034,6 +3034,9 @@ void Compile::final_graph_reshaping_main_switch(Node* n, Final_Reshape_Counts& f
case Op_MulReductionVD:
case Op_MinReductionV:
case Op_MaxReductionV:
case Op_AndReductionV:
case Op_OrReductionV:
case Op_XorReductionV:
break;
case Op_PackB:

View File

@@ -673,7 +673,30 @@ int ReductionNode::opcode(int opc, BasicType bt) {
assert(bt == T_DOUBLE, "must be");
vopc = Op_MaxReductionV;
break;
// TODO: add MulL for targets that support it
case Op_AndI:
assert(bt == T_INT, "must be");
vopc = Op_AndReductionV;
break;
case Op_AndL:
assert(bt == T_LONG, "must be");
vopc = Op_AndReductionV;
break;
case Op_OrI:
assert(bt == T_INT, "must be");
vopc = Op_OrReductionV;
break;
case Op_OrL:
assert(bt == T_LONG, "must be");
vopc = Op_OrReductionV;
break;
case Op_XorI:
assert(bt == T_INT, "must be");
vopc = Op_XorReductionV;
break;
case Op_XorL:
assert(bt == T_LONG, "must be");
vopc = Op_XorReductionV;
break;
default:
break;
}
@@ -697,8 +720,11 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
case Op_MinReductionV: return new MinReductionVNode(ctrl, n1, n2);
case Op_MaxReductionV: return new MaxReductionVNode(ctrl, n1, n2);
case Op_AndReductionV: return new AndReductionVNode(ctrl, n1, n2);
case Op_OrReductionV: return new OrReductionVNode(ctrl, n1, n2);
case Op_XorReductionV: return new XorReductionVNode(ctrl, n1, n2);
default:
fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
return NULL;

View File

@@ -145,6 +145,15 @@ class ReductionNode : public Node {
static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt);
static int opcode(int opc, BasicType bt);
static bool implemented(int opc, uint vlen, BasicType bt);
virtual const Type* bottom_type() const {
BasicType vbt = in(2)->bottom_type()->is_vect()->element_basic_type();
return Type::get_const_basic_type(vbt);
}
virtual uint ideal_reg() const {
return bottom_type()->ideal_reg();
}
};
//------------------------------AddReductionVINode--------------------------------------
@@ -613,6 +622,30 @@ class XorVNode : public VectorNode {
virtual int Opcode() const;
};
//------------------------------AndReductionVNode--------------------------------------
// Vector and int, long as a reduction
class AndReductionVNode : public ReductionNode {
public:
AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------OrReductionVNode--------------------------------------
// Vector or int, long as a reduction
class OrReductionVNode : public ReductionNode {
public:
OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------XorReductionVNode--------------------------------------
// Vector xor int, long as a reduction
class XorReductionVNode : public ReductionNode {
public:
XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------MinVNode--------------------------------------
// Vector min
class MinVNode : public VectorNode {
@@ -635,26 +668,6 @@ class MinReductionVNode : public ReductionNode {
public:
MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
virtual const Type* bottom_type() const {
BasicType bt = in(1)->bottom_type()->basic_type();
if (bt == T_FLOAT) {
return Type::FLOAT;
} else if (bt == T_DOUBLE) {
return Type::DOUBLE;
}
assert(false, "unsupported basic type");
return NULL;
}
virtual uint ideal_reg() const {
BasicType bt = in(1)->bottom_type()->basic_type();
if (bt == T_FLOAT) {
return Op_RegF;
} else if (bt == T_DOUBLE) {
return Op_RegD;
}
assert(false, "unsupported basic type");
return 0;
}
};
//------------------------------MaxReductionVNode--------------------------------------
@ -663,26 +676,6 @@ class MaxReductionVNode : public ReductionNode {
public:
MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
virtual const Type* bottom_type() const {
BasicType bt = in(1)->bottom_type()->basic_type();
if (bt == T_FLOAT) {
return Type::FLOAT;
} else {
return Type::DOUBLE;
}
assert(false, "unsupported basic type");
return NULL;
}
virtual uint ideal_reg() const {
BasicType bt = in(1)->bottom_type()->basic_type();
if (bt == T_FLOAT) {
return Op_RegF;
} else {
return Op_RegD;
}
assert(false, "unsupported basic type");
return 0;
}
};
//================================= M E M O R Y ===============================

View File

@ -1820,8 +1820,11 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
declare_c2_type(URShiftVINode, VectorNode) \
declare_c2_type(URShiftVLNode, VectorNode) \
declare_c2_type(AndVNode, VectorNode) \
declare_c2_type(AndReductionVNode, ReductionNode) \
declare_c2_type(OrVNode, VectorNode) \
declare_c2_type(OrReductionVNode, ReductionNode) \
declare_c2_type(XorVNode, VectorNode) \
declare_c2_type(XorReductionVNode, ReductionNode) \
declare_c2_type(MaxVNode, VectorNode) \
declare_c2_type(MinVNode, VectorNode) \
declare_c2_type(MaxReductionVNode, ReductionNode) \

View File

@ -0,0 +1,238 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8240248
* @summary Add C2 x86 Superword support for scalar logical reduction optimizations : int test
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=2
* compiler.loopopts.superword.RedTest_int
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=2
* compiler.loopopts.superword.RedTest_int
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=4
* compiler.loopopts.superword.RedTest_int
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=4
* compiler.loopopts.superword.RedTest_int
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=8
* compiler.loopopts.superword.RedTest_int
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=8
* compiler.loopopts.superword.RedTest_int
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=16
* compiler.loopopts.superword.RedTest_int
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=16
* compiler.loopopts.superword.RedTest_int
*/
package compiler.loopopts.superword;
public class RedTest_int {
static final int NUM = 1024;
static final int ITER = 8000;
public static void main(String[] args) throws Exception {
int[] a = new int[NUM];
int[] b = new int[NUM];
int[] c = new int[NUM];
int[] d = new int[NUM];
reductionInit1(a, b, c);
int total = 0;
int valid = 0;
for (int j = 0; j < ITER; j++) {
total = sumReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid += d[j];
}
testCorrectness(total, valid, "Add Reduction");
valid = 0;
for (int j = 0; j < ITER; j++) {
total = orReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid |= d[j];
}
testCorrectness(total, valid, "Or Reduction");
valid = -1;
for (int j = 0; j < ITER; j++) {
total = andReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid &= d[j];
}
testCorrectness(total, valid, "And Reduction");
valid = -1;
for (int j = 0; j < ITER; j++) {
total = xorReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid ^= d[j];
}
testCorrectness(total, valid, "Xor Reduction");
reductionInit2(a, b, c);
valid = 1;
for (int j = 0; j < ITER; j++) {
total = mulReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid *= d[j];
}
testCorrectness(total, valid, "Mul Reduction");
}
public static void reductionInit1(
int[] a,
int[] b,
int[] c) {
for (int i = 0; i < a.length; i++) {
a[i] = (i%2) + 0x4099;
b[i] = (i%2) + 0x1033;
c[i] = (i%2) + 0x455;
}
}
public static void reductionInit2(
int[] a,
int[] b,
int[] c) {
for (int i = 0; i < a.length; i++) {
a[i] = 0x11;
b[i] = 0x12;
c[i] = 0x13;
}
}
public static int sumReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = 0;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total += d[i];
}
return total;
}
public static int orReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = 0;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total |= d[i];
}
return total;
}
public static int andReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = -1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total &= d[i];
}
return total;
}
public static int xorReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = -1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total ^= d[i];
}
return total;
}
public static int mulReductionImplement(
int[] a,
int[] b,
int[] c,
int[] d) {
int total = 1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total = total*d[i];
}
return total;
}
public static void testCorrectness(
int total,
int valid,
String op) throws Exception {
if (total == valid) {
System.out.println(op + ": Success");
} else {
System.out.println("Invalid total: " + total);
System.out.println("Expected value = " + valid);
throw new Exception(op + ": Failed");
}
}
}
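One detail of the validation scheme above worth spelling out: for the and and xor kernels both the kernel and the scalar check loop start from -1, so the comparison holds because both sides replay the identical fold over d, not because -1 is an identity for both operators (it is for `&`, but not for `^`). A tiny sketch of that check pattern, with made-up data:

// Same seed, same fold on both sides, so the results must agree even when the
// seed is not the operation's identity (e.g. -1 for xor). Values are arbitrary.
int[] d = {3, 5, 7};
int kernel = -1, expected = -1;
for (int v : d) kernel ^= v;         // what the (possibly vectorized) kernel computes
for (int v : d) expected ^= v;       // scalar replay used as the reference value
if (kernel != expected) throw new AssertionError("xor reduction mismatch");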

View File

@ -0,0 +1,238 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8240248
* @summary Add C2 x86 Superword support for scalar logical reduction optimizations : long test
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=2
* compiler.loopopts.superword.RedTest_long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=2
* compiler.loopopts.superword.RedTest_long
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=4
* compiler.loopopts.superword.RedTest_long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=4
* compiler.loopopts.superword.RedTest_long
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=8
* compiler.loopopts.superword.RedTest_long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=8
* compiler.loopopts.superword.RedTest_long
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-TieredCompilation
* -XX:+SuperWordReductions
* -XX:LoopMaxUnroll=16
* compiler.loopopts.superword.RedTest_long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
* -XX:CompileThresholdScaling=0.1
* -XX:-SuperWordReductions
* -XX:LoopMaxUnroll=16
* compiler.loopopts.superword.RedTest_long
*/
package compiler.loopopts.superword;
public class RedTest_long {
static final int NUM = 512;
static final int ITER = 8000;
public static void main(String[] args) throws Exception {
long[] a = new long[NUM];
long[] b = new long[NUM];
long[] c = new long[NUM];
long[] d = new long[NUM];
reductionInit1(a, b, c);
long total = 0;
long valid = 0;
for (int j = 0; j < ITER; j++) {
total = sumReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid += d[j];
}
testCorrectness(total, valid, "Add Reduction");
valid = 0;
for (int j = 0; j < ITER; j++) {
total = orReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid |= d[j];
}
testCorrectness(total, valid, "Or Reduction");
valid = -1;
for (int j = 0; j < ITER; j++) {
total = andReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid &= d[j];
}
testCorrectness(total, valid, "And Reduction");
valid = -1;
for (int j = 0; j < ITER; j++) {
total = xorReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid ^= d[j];
}
testCorrectness(total, valid, "Xor Reduction");
reductionInit2(a, b, c);
valid = 1;
for (int j = 0; j < ITER; j++) {
total = mulReductionImplement(a, b, c, d);
}
for (int j = 0; j < d.length; j++) {
valid *= d[j];
}
testCorrectness(total, valid, "Mul Reduction");
}
public static void reductionInit1(
long[] a,
long[] b,
long[] c) {
for (int i = 0; i < a.length; i++) {
a[i] = (i%2) + 0x4099;
b[i] = (i%2) + 0x1033;
c[i] = (i%2) + 0x455;
}
}
public static void reductionInit2(
long[] a,
long[] b,
long[] c) {
for (int i = 0; i < a.length; i++) {
a[i] = 0x11;
b[i] = 0x12;
c[i] = 0x13;
}
}
public static long sumReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = 0;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total += d[i];
}
return total;
}
public static long orReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = 0;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total |= d[i];
}
return total;
}
public static long andReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = -1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total &= d[i];
}
return total;
}
public static long xorReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = -1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total ^= d[i];
}
return total;
}
public static long mulReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d) {
long total = 1;
for (int i = 0; i < a.length; i++) {
d[i] = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total = total*d[i];
}
return total;
}
public static void testCorrectness(
long total,
long valid,
String op) throws Exception {
if (total == valid) {
System.out.println(op + ": Success");
} else {
System.out.println("Invalid total: " + total);
System.out.println("Expected value = " + valid);
throw new Exception(op + ": Failed");
}
}
}

View File

@ -0,0 +1,136 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.vm.compiler;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.*;
import java.util.concurrent.TimeUnit;
import java.util.Random;
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
public abstract class VectorReduction {
@Param({"512"})
public int COUNT;
private int[] intsA;
private int[] intsB;
private int[] intsC;
private int[] intsD;
private int resI;
private long[] longsA;
private long[] longsB;
private long[] longsC;
private long[] longsD;
private long resL;
@Param("0")
private int seed;
private Random r = new Random(seed);
@Setup
public void init() {
intsA = new int[COUNT];
intsB = new int[COUNT];
intsC = new int[COUNT];
intsD = new int[COUNT];
longsA = new long[COUNT];
longsB = new long[COUNT];
longsC = new long[COUNT];
longsD = new long[COUNT];
for (int i = 0; i < COUNT; i++) {
intsA[i] = r.nextInt();
intsB[i] = r.nextInt();
intsC[i] = r.nextInt();
longsA[i] = r.nextLong();
longsB[i] = r.nextLong();
longsC[i] = r.nextLong();
}
}
@Benchmark
public void andRedI() {
for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
resI &= intsD[i];
}
}
@Benchmark
public void orRedI() {
for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
resI |= intsD[i];
}
}
@Benchmark
public void xorRedI() {
for (int i = 0; i < COUNT; i++) {
intsD[i] = (intsA[i] * intsB[i]) + (intsA[i] * intsC[i]) + (intsB[i] * intsC[i]);
resI ^= intsD[i];
}
}
@Benchmark
public void andRedL() {
for (int i = 0; i < COUNT; i++) {
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
resL &= longsD[i];
}
}
@Benchmark
public void orRedL() {
for (int i = 0; i < COUNT; i++) {
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
resL |= longsD[i];
}
}
@Benchmark
public void xorRedL() {
for (int i = 0; i < COUNT; i++) {
longsD[i] = (longsA[i] + longsB[i]) + (longsA[i] + longsC[i]) + (longsB[i] + longsC[i]);
resL ^= longsD[i];
}
}
@Fork(value = 1, jvmArgsPrepend = {
"-XX:+UseSuperWord"
})
public static class WithSuperword extends VectorReduction {
}
@Fork(value = 1, jvmArgsPrepend = {
"-XX:-UseSuperWord"
})
public static class NoSuperword extends VectorReduction {
}
}
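To run this micro outside the regular harness, a stand-alone JMH launcher along the following lines should work; the class name RunVectorReduction is hypothetical, and the include pattern assumes the default JMH naming of the nested WithSuperword/NoSuperword classes:

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class RunVectorReduction {
    public static void main(String[] args) throws RunnerException {
        // Matches both VectorReduction.WithSuperword and VectorReduction.NoSuperword,
        // so the +UseSuperWord and -UseSuperWord forks are measured back to back.
        Options opt = new OptionsBuilder()
                .include(".*VectorReduction.*")
                .build();
        new Runner(opt).run();
    }
}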