8285790: AArch64: Merge C2 NEON and SVE matching rules
Co-authored-by: Ningsheng Jian <njian@openjdk.org>
Co-authored-by: Eric Liu <eliu@openjdk.org>
Reviewed-by: adinn, aph, xgong
parent da477b1366
commit 0cc66aeae8
@@ -142,8 +142,7 @@ ifeq ($(call check-jvm-feature, compiler2), true)
  ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64)
    AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \
      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_neon.ad \
      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_sve.ad \
      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_vector.ad \
      )))
  endif
(Several file diffs suppressed because they are too large.)

6361  src/hotspot/cpu/aarch64/aarch64_vector.ad     (new file; diff suppressed because it is too large)
4701  src/hotspot/cpu/aarch64/aarch64_vector_ad.m4  (new file; diff suppressed because it is too large)
@@ -1004,7 +1004,7 @@ void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType
    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
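As an aside, the loop above reads more easily against a scalar sketch (illustrative only, not part of the patch): each 8-lane group is first compressed to 8 bits and then OR-ed into the result at byte offset idx, which is what the orr with LSL #(idx * 8) does. The helper name below is hypothetical.

#include <cstdint>

// Hypothetical scalar model: join already-compressed 8-bit lane groups into
// one 64-bit mask word; group idx lands in bits [8*idx, 8*idx + 7], mirroring
// orr(dst, dst, rscratch1, Assembler::LSL, idx << 3) in the loop above.
uint64_t join_mask_groups(const uint8_t* group_bits, int groups) {
  uint64_t dst = group_bits[0];
  for (int idx = 1; idx < groups; idx++) {
    dst |= static_cast<uint64_t>(group_bits[idx]) << (idx * 8);
  }
  return dst;
}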
@@ -1108,6 +1108,7 @@ void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicTyp
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, int cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
@@ -1145,6 +1146,61 @@ void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
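For reference, the lane-wise semantics that neon_vector_extend and neon_vector_narrow implement with sxtl/xtn can be sketched in plain scalar code (illustrative only, not HotSpot code): widening sign-extends each lane, narrowing keeps each lane's low bits.

#include <cstdint>
#include <vector>

// Scalar model of a 4B -> 4I extend: sign-extend every byte lane to an int
// lane (what the chained sxtl instructions do lane-wise).
std::vector<int32_t> extend_bytes_to_ints(const std::vector<int8_t>& src) {
  std::vector<int32_t> dst(src.size());
  for (size_t i = 0; i < src.size(); i++) {
    dst[i] = static_cast<int32_t>(src[i]);
  }
  return dst;
}

// Scalar model of a 4I -> 4B narrow: keep the low 8 bits of every int lane
// (what the xtn instructions do lane-wise).
std::vector<int8_t> narrow_ints_to_bytes(const std::vector<int32_t>& src) {
  std::vector<int8_t> dst(src.size());
  for (size_t i = 0; i < src.size(); i++) {
    dst[i] = static_cast<int8_t>(src[i]);
  }
  return dst;
}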
@@ -1257,6 +1313,275 @@ void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
  }
}

// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}

// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1]
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}

// Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    case T_FLOAT:
      fmuls(dst, fsrc, vsrc);
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);
      if (isQ) {
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);
      }
      break;
    case T_DOUBLE:
      assert(isQ, "unsupported");
      fmuld(dst, fsrc, vsrc);
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, is_min ? LT : GT);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, is_min ? LT : GT);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
    } else {
      is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
    }
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, is_min ? LT : GT);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
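The T_BYTE case of neon_reduce_mul_integral is the least obvious one; a scalar sketch of the intended result (not part of the patch) is the product of isrc and every lane, reduced modulo 2^8 and sign-extended, which is why the pairwise mulv tree may combine lanes in any order and the trailing sxtb steps re-normalize the scalar.

#include <cstdint>

// Scalar model of the byte multiply-reduction: fold every lane into the
// initial value, wrapping to 8 bits and sign-extending, matching the
// wrapping 8-bit mulv lanes plus the final sxtb steps above.
int32_t reduce_mul_bytes(int32_t isrc, const int8_t* lanes, int lane_cnt) {
  int32_t acc = isrc;
  for (int i = 0; i < lane_cnt; i++) {
    acc = static_cast<int8_t>(acc * lanes[i]);  // wrap mod 2^8, sign-extend
  }
  return acc;
}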
@@ -1267,12 +1592,14 @@ void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt,
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      smov(dst, tmp, size, 0);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
@@ -1285,45 +1612,57 @@ void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt,
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
        andr(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
        orr(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
        eor(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        smov(dst, tmp, size, 0);
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
@@ -1331,12 +1670,15 @@ void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt,
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        smov(dst, tmp, size, 0);
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
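For the Max/Min cases the net effect is simply the maximum or minimum over the scalar input and all vector lanes; the only subtlety in the assembly above is choosing umov vs. smov so that sub-long lanes are extended correctly before the compare. A scalar sketch for long lanes (illustrative only, not part of the patch):

#include <algorithm>
#include <cstdint>

// Scalar model of Op_MaxReductionV on long lanes: fold max over the initial
// scalar (src1) and each lane of the vector (src2).
int64_t reduce_max_longs(int64_t src1, const int64_t* lanes, int lane_cnt) {
  int64_t dst = src1;
  for (int i = 0; i < lane_cnt; i++) {
    dst = std::max(dst, lanes[i]);
  }
  return dst;
}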
@@ -1573,23 +1915,32 @@ void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src,

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
                                             bool is_signed, FloatRegister vtmp) {
  assert(UseSVE > 0 && size != Q, "unsupported");
  assert(!(is_signed && size == D), "signed extract (D) not supported.");
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    is_signed ? smov(dst, src, size, idx) : umov(dst, src, size, idx);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    is_signed ? smov(dst, vtmp, size, 0) : umov(dst, vtmp, size, 0);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
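The signature change above replaces the explicit is_signed flag with the element type: byte and short lanes come out sign-extended (smov), while int and long lanes are moved out whole (umov). A scalar sketch of that convention (illustrative only, with a hypothetical helper name):

#include <cstdint>

// Hypothetical scalar model of the extraction convention: sub-int lanes are
// sign-extended, int/long lanes are copied as-is (the zero-filled upper bits
// of an int are irrelevant to the 32-bit result).
int64_t extract_lane(const void* lanes, int idx, int elem_bytes) {
  switch (elem_bytes) {
    case 1:  return static_cast<const int8_t*>(lanes)[idx];   // like smov B
    case 2:  return static_cast<const int16_t*>(lanes)[idx];  // like smov H
    case 4:  return static_cast<const uint32_t*>(lanes)[idx]; // like umov S
    default: return static_cast<const int64_t*>(lanes)[idx];  // like umov D
  }
}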
@@ -1620,8 +1971,10 @@ void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src,
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister ptmp, SIMD_RegVariant T) {
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
@@ -1632,7 +1985,7 @@ void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, F
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid arrangement");
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
@@ -1642,12 +1995,12 @@ void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, F

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, ptmp, T, ptrue, tmp2, tmp1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, ptmp, 0.5);
    sve_fadd(tmp1, T, ptmp, src);
    sve_frintm(dst, T, ptmp, tmp1);
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);
@@ -31,6 +31,9 @@
  // Return true if the phase output is in the scratch emit size mode.
  virtual bool in_scratch_emit_size() override;

  void neon_reduce_logical_helper(int opc, bool sf, Register Rd, Register Rn, Register Rm,
                                  enum shift_kind kind = Assembler::LSL, unsigned shift = 0);

 public:
  void emit_entry_barrier_stub(C2EntryBarrierStub* stub);
  static int entry_barrier_stub_size();
@@ -84,6 +87,13 @@

  void sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp);

  // Vector cast
  void neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                          FloatRegister src, BasicType src_bt);

  void neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                          FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes);

  void sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                         FloatRegister src, SIMD_RegVariant src_size);

@@ -96,6 +106,27 @@
  void sve_vmaskcast_narrow(PRegister dst, PRegister src,
                            uint dst_element_length_in_bytes, uint src_element_lenght_in_bytes);

  // Vector reduction
  void neon_reduce_add_integral(Register dst, BasicType bt,
                                Register isrc, FloatRegister vsrc,
                                unsigned vector_length_in_bytes, FloatRegister vtmp);

  void neon_reduce_mul_integral(Register dst, BasicType bt,
                                Register isrc, FloatRegister vsrc,
                                unsigned vector_length_in_bytes,
                                FloatRegister vtmp1, FloatRegister vtmp2);

  void neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                          FloatRegister fsrc, FloatRegister vsrc,
                          unsigned vector_length_in_bytes, FloatRegister vtmp);

  void neon_reduce_logical(int opc, Register dst, BasicType bt, Register isrc,
                           FloatRegister vsrc, unsigned vector_length_in_bytes);

  void neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                   Register isrc, FloatRegister vsrc,
                                   unsigned vector_length_in_bytes, FloatRegister vtmp);

  void sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                           FloatRegister src2, PRegister pg, FloatRegister tmp);

@@ -107,15 +138,15 @@

  // Extract a scalar element from an sve vector at position 'idx'.
  // The input elements in src are expected to be of integral type.
  void sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
                            bool is_signed, FloatRegister vtmp);
  void sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                            int idx, FloatRegister vtmp);

  // java.lang.Math::round intrinsics
  void vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                         FloatRegister tmp2, FloatRegister tmp3,
                         SIMD_Arrangement T);
  void vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                        FloatRegister tmp2, PRegister ptmp,
                        FloatRegister tmp2, PRegister pgtmp,
                        SIMD_RegVariant T);

  // Pack active elements of src, under the control of mask, into the
@@ -52,8 +52,8 @@
  // the cpu only look at the lower 5/6 bits anyway?
  static const bool need_masked_shift_count = false;

  // No support for generic vector operands.
  static const bool supports_generic_vector_operands = false;
  // aarch64 supports generic vector operands: vReg.
  static const bool supports_generic_vector_operands = true;

  static constexpr bool isSimpleConstant64(jlong value) {
    // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?.
@@ -137,7 +137,7 @@ class FloatRegisterImpl: public AbstractRegisterImpl {
 public:
  enum {
    number_of_registers = 32,
    max_slots_per_register = 8,
    max_slots_per_register = 4,
    save_slots_per_register = 2,
    slots_per_neon_register = 4,
    extra_save_slots_per_neon_register = slots_per_neon_register - save_slots_per_register
@@ -167,6 +167,12 @@ public:
  static void initialize_cpu_information(void);

  static bool use_rop_protection() { return _rop_protection; }

  // For common 64/128-bit unpredicated vector operations, we may prefer
  // emitting NEON instructions rather than the corresponding SVE instructions.
  static bool use_neon_for_vector(int vector_length_in_bytes) {
    return vector_length_in_bytes <= 16;
  }
};

#endif // CPU_AARCH64_VM_VERSION_AARCH64_HPP
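The new helper above encodes the policy that drives the merged rules: NEON for common 64/128-bit vectors, SVE for anything longer. A self-contained sketch of that policy (the driver below is purely illustrative):

#include <cstdio>

// Same predicate as VM_Version::use_neon_for_vector above: vectors of at
// most 16 bytes are emitted with NEON instructions, longer ones with SVE.
static bool use_neon_for_vector(int vector_length_in_bytes) {
  return vector_length_in_bytes <= 16;
}

int main() {
  const int lengths[] = {8, 16, 32, 64};
  for (int len : lengths) {
    std::printf("%2d-byte vector -> %s\n", len,
                use_neon_for_vector(len) ? "NEON" : "SVE");
  }
  return 0;
}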
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1998, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -2274,6 +2274,9 @@ private:
#if defined(PPC64)
    if (strcmp(rep_var,"$VectorRegister") == 0) return "as_VectorRegister";
    if (strcmp(rep_var,"$VectorSRegister") == 0) return "as_VectorSRegister";
#endif
#if defined(AARCH64)
    if (strcmp(rep_var,"$PRegister") == 0) return "as_PRegister";
#endif
    return NULL;
  }
@@ -135,6 +135,14 @@ public:
    return ::as_VectorSRegister(reg(ra_, node, idx));
  }
#endif
#if defined(AARCH64)
  PRegister as_PRegister(PhaseRegAlloc* ra_, const Node* node) const {
    return ::as_PRegister(reg(ra_, node));
  }
  PRegister as_PRegister(PhaseRegAlloc* ra_, const Node* node, int idx) const {
    return ::as_PRegister(reg(ra_, node, idx));
  }
#endif

  virtual intptr_t constant() const;
  virtual relocInfo::relocType constant_reloc() const;
@@ -99,7 +99,7 @@ class RegMask {
  // requirement is internal to the allocator, and independent of any
  // particular platform.
  enum { SlotsPerLong = 2,
         SlotsPerVecA = RISCV_ONLY(4) NOT_RISCV(8),
         SlotsPerVecA = 4,
         SlotsPerVecS = 1,
         SlotsPerVecD = 2,
         SlotsPerVecX = 4,
@@ -84,7 +84,7 @@ public class AllBitsSetVectorMatchRuleTest {

    @Test
    @Warmup(10000)
    @IR(counts = { "bic", " >= 1" })
    @IR(counts = { "vand_notI", " >= 1" })
    public static void testAllBitsSetVector() {
        IntVector av = IntVector.fromArray(I_SPECIES, ia, 0);
        IntVector bv = IntVector.fromArray(I_SPECIES, ib, 0);
@@ -98,7 +98,7 @@ public class AllBitsSetVectorMatchRuleTest {

    @Test
    @Warmup(10000)
    @IR(counts = { "bic", " >= 1" })
    @IR(counts = { "and_notL", " >= 1" })
    public static void testAllBitsSetMask() {
        VectorMask<Long> avm = VectorMask.fromArray(L_SPECIES, ma, 0);
        VectorMask<Long> bvm = VectorMask.fromArray(L_SPECIES, mb, 0);
@@ -224,7 +224,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mla", ">= 1" })
    @IR(counts = { "vmla_masked", ">= 1" })
    public static void testByteMultiplyAddMasked() {
        VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += B_SPECIES.length()) {
@@ -237,7 +237,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mls", ">= 1" })
    @IR(counts = { "vmls_masked", ">= 1" })
    public static void testByteMultiplySubMasked() {
        VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += B_SPECIES.length()) {
@@ -250,7 +250,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mla", ">= 1" })
    @IR(counts = { "vmla_masked", ">= 1" })
    public static void testShortMultiplyAddMasked() {
        VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
@@ -263,7 +263,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mls", ">= 1" })
    @IR(counts = { "vmls_masked", ">= 1" })
    public static void testShortMultiplySubMasked() {
        VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
@@ -276,7 +276,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mla", ">= 1" })
    @IR(counts = { "vmla_masked", ">= 1" })
    public static void testIntMultiplyAddMasked() {
        VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
@@ -289,7 +289,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mls", ">= 1" })
    @IR(counts = { "vmls_masked", ">= 1" })
    public static void testIntMultiplySubMasked() {
        VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
@@ -302,7 +302,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mla", ">= 1" })
    @IR(counts = { "vmla_masked", ">= 1" })
    public static void testLongMultiplyAddMasked() {
        VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
@@ -315,7 +315,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mls", ">= 1" })
    @IR(counts = { "vmls_masked", ">= 1" })
    public static void testLongMultiplySubMasked() {
        VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
@@ -328,7 +328,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fmsb", ">= 1" })
    @IR(counts = { "vfmsb_masked", ">= 1" })
    public static void testFloatMultiplySubMasked() {
        VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
@@ -341,7 +341,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fnmad", ">= 1" })
    @IR(counts = { "vfnmad_masked", ">= 1" })
    public static void testFloatNegatedMultiplyAddMasked() {
        VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
@@ -354,7 +354,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fnmsb", ">= 1" })
    @IR(counts = { "vfnmsb_masked", ">= 1" })
    public static void testFloatNegatedMultiplySubMasked() {
        VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
@@ -367,7 +367,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fmsb", ">= 1" })
    @IR(counts = { "vfmsb_masked", ">= 1" })
    public static void testDoubleMultiplySubMasked() {
        VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
@@ -380,7 +380,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fnmad", ">= 1" })
    @IR(counts = { "vfnmad_masked", ">= 1" })
    public static void testDoubleNegatedMultiplyAddMasked() {
        VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
@@ -393,7 +393,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fnmsb", ">= 1" })
    @IR(counts = { "vfnmsb_masked", ">= 1" })
    public static void testDoubleNegatedMultiplySubMasked() {
        VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
@@ -77,7 +77,7 @@ public class VectorMaskedNotTest {

    @Test
    @Warmup(10000)
    @IR(counts = { "sve_not", ">= 1" })
    @IR(counts = { "vnotI_masked", ">= 1" })
    public static void testIntNotMasked() {
        VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
        IntVector av = IntVector.fromArray(I_SPECIES, ia, 0);
@@ -95,7 +95,7 @@ public class VectorMaskedNotTest {

    @Test
    @Warmup(10000)
    @IR(counts = { "sve_not", ">= 1" })
    @IR(counts = { "vnotL_masked", ">= 1" })
    public static void testLongNotMasked() {
        VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
        LongVector av = LongVector.fromArray(L_SPECIES, la, 0);