8282541: AArch64: Auto-vectorize Math.round API

Reviewed-by: njian, ngasson, adinn
This commit is contained in:
Andrew Haley 2022-04-26 13:43:12 +00:00
parent 8de3c65545
commit a7b5157375
20 changed files with 1197 additions and 611 deletions

@ -15141,6 +15141,30 @@ instruct convL2D_reg_reg(vRegD dst, iRegL src) %{
ins_pipe(fp_l2d);
%}
// Scalar intrinsic for java.lang.Math.round(double): result is the
// closest long, with ties rounding toward positive infinity.
// TEMP_DEF dst: dst is written before src is dead.  ftmp holds the 0.5
// addend used on the small-negative-input path.  KILL cr: the emitted
// sequence performs a compare (see MacroAssembler::java_round_double).
instruct round_double_reg(iRegLNoSp dst, vRegD src, vRegD ftmp, rFlagsReg cr)
%{
  match(Set dst (RoundD src));
  effect(TEMP_DEF dst, TEMP ftmp, KILL cr);
  format %{ "java_round_double $dst,$src"%}
  ins_encode %{
    __ java_round_double($dst$$Register, as_FloatRegister($src$$reg),
                         as_FloatRegister($ftmp$$reg));
  %}
  ins_pipe(pipe_slow);
%}
// Scalar intrinsic for java.lang.Math.round(float): result is the
// closest int, with ties rounding toward positive infinity.
// Same temp/flag requirements as round_double_reg above; delegates to
// MacroAssembler::java_round_float.
instruct round_float_reg(iRegINoSp dst, vRegF src, vRegF ftmp, rFlagsReg cr)
%{
  match(Set dst (RoundF src));
  effect(TEMP_DEF dst, TEMP ftmp, KILL cr);
  format %{ "java_round_float $dst,$src"%}
  ins_encode %{
    __ java_round_float($dst$$Register, as_FloatRegister($src$$reg),
                        as_FloatRegister($ftmp$$reg));
  %}
  ins_pipe(pipe_slow);
%}
// stack <-> reg and reg <-> reg shuffles with no conversion
instruct MoveF2I_stack_reg(iRegINoSp dst, stackSlotF src) %{

@ -570,6 +570,52 @@ instruct vcvt2Dto2F(vecD dst, vecX src)
ins_pipe(pipe_class_default);
%}
// Vector Math.round: 2 floats -> 2 ints in a 64-bit (D) vector.
// NEON-only form (UseSVE == 0); the three temps are consumed by
// C2_MacroAssembler::vector_round_neon (0.5 addend, threshold
// broadcast, and per-lane selection mask).
instruct vroundvecD2Fto2I(vecD dst, vecD src, vecD tmp1, vecD tmp2, vecD tmp3)
%{
  predicate(UseSVE == 0 &&
            n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (RoundVF src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vround  $dst, T2S, $src\t# round vecD 2F to 2I vector" %}
  ins_encode %{
    __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                         as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                         as_FloatRegister($tmp3$$reg), __ T2S);
  %}
  ins_pipe(pipe_class_default);
%}
// Vector Math.round: 4 floats -> 4 ints in a 128-bit (X) vector.
// NEON-only form (UseSVE == 0); see vector_round_neon for the temps.
instruct vroundvecX4Fto4I(vecX dst, vecX src, vecX tmp1, vecX tmp2, vecX tmp3)
%{
  predicate(UseSVE == 0 &&
            n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (RoundVF src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vround  $dst, T4S, $src\t# round vecX 4F to 4I vector" %}
  ins_encode %{
    __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                         as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                         as_FloatRegister($tmp3$$reg), __ T4S);
  %}
  ins_pipe(pipe_class_default);
%}
// Vector Math.round: 2 doubles -> 2 longs in a 128-bit (X) vector.
// NEON-only form (UseSVE == 0); see vector_round_neon for the temps.
instruct vroundvecX2Dto2L(vecX dst, vecX src, vecX tmp1, vecX tmp2, vecX tmp3)
%{
  predicate(UseSVE == 0 &&
            n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (RoundVD src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vround  $dst, T2D, $src\t# round vecX 2D to 2L vector" %}
  ins_encode %{
    __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                         as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                         as_FloatRegister($tmp3$$reg), __ T2D);
  %}
  ins_pipe(pipe_class_default);
%}
// ------------------------------ Reduction -------------------------------
instruct reduce_add8B(iRegINoSp dst, iRegIorL2I isrc, vecD vsrc, vecD tmp)

@ -349,6 +349,25 @@ VECTOR_CAST_F2F(F, D, X, D, fcvtl, 2S, 2D)
VECTOR_CAST_F2F(D, F, D, X, fcvtn, 2D, 2S)
dnl
dnl VECTOR_JAVA_FROUND($1,     $2,      $3,       $4,          $5,    $6,  $7     )
dnl   $1 = F|D (source op kind, selects RoundVF/RoundVD)
dnl   $2 = source vector shorthand (2F/4F/2D)   $3 = dest element (I/L)
dnl   $4 = SIMD arrangement (T2S/T4S/T2D)       $5 = lane count
dnl   $6 = dest basic type (INT/LONG)           $7 = register class (vecD/vecX)
dnl Expands to a NEON-only (UseSVE == 0) Math.round vector instruct that
dnl delegates to C2_MacroAssembler::vector_round_neon.
define(`VECTOR_JAVA_FROUND', `
instruct vround$7$2to$5$3($7 dst, $7 src, $7 tmp1, $7 tmp2, $7 tmp3)
%{
  predicate(UseSVE == 0 &&
            n->as_Vector()->length() == $5 && n->bottom_type()->is_vect()->element_basic_type() == T_$6);
  match(Set dst (RoundV$1 src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vround  $dst, $4, $src\t# round $7 $2 to $5$3 vector" %}
  ins_encode %{
    __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                         as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                         as_FloatRegister($tmp3$$reg), __ $4);
  %}
  ins_pipe(pipe_class_default);
%}')dnl                $1 $2  $3 $4   $5 $6    $7
VECTOR_JAVA_FROUND(F, 2F, I, T2S, 2, INT, vecD)
VECTOR_JAVA_FROUND(F, 4F, I, T4S, 4, INT, vecX)
VECTOR_JAVA_FROUND(D, 2D, L, T2D, 2, LONG, vecX)
// ------------------------------ Reduction -------------------------------
dnl
define(`REDUCE_ADD_BORS', `

@ -162,7 +162,6 @@ source %{
}
return op_sve_supported(opcode, vlen, bt);
}
%}
definitions %{
@ -3277,6 +3276,54 @@ instruct vroundD(vReg dst, vReg src, immI rmode) %{
ins_pipe(pipe_slow);
%}
// Vector Math.round(float) -> int when SVE is available.  For vectors
// wider than 16 bytes the full SVE sequence is used (vector_round_sve,
// which needs a governing predicate temp); 8/16-byte vectors fall back
// to the cheaper NEON sequence with an arrangement derived from the
// actual vector length.
instruct vroundFtoI(vReg dst, vReg src, vReg tmp1, vReg tmp2, vReg tmp3, pRegGov ptmp)
%{
  predicate(UseSVE > 0);
  match(Set dst (RoundVF src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp);
  format %{ "sve_vround  $dst, S, $src\t# round F to I vector" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int vlen = Matcher::vector_length_in_bytes(this);
    if (vlen > 16) {
      __ vector_round_sve(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                          as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                          as_PRegister($ptmp$$reg), __ S);
    } else {
      __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                           as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                           as_FloatRegister($tmp3$$reg),
                           __ esize2arrangement(type2aelembytes(bt),
                                                /*isQ*/ vlen == 16));
    }
  %}
  ins_pipe(pipe_class_default);
%}
// Vector Math.round(double) -> long when SVE is available.  Mirrors
// vroundFtoI: SVE sequence for vectors wider than 16 bytes, NEON
// sequence otherwise.
instruct vroundDtoL(vReg dst, vReg src, vReg tmp1, vReg tmp2, vReg tmp3, pRegGov ptmp)
%{
  predicate(UseSVE > 0);
  match(Set dst (RoundVD src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp);
  format %{ "sve_vround  $dst, D, $src\t# round D to L vector" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int vlen = Matcher::vector_length_in_bytes(this);
    if (vlen > 16) {
      __ vector_round_sve(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                          as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                          as_PRegister($ptmp$$reg), __ D);
    } else {
      __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                           as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                           as_FloatRegister($tmp3$$reg),
                           __ esize2arrangement(type2aelembytes(bt),
                                                /*isQ*/ vlen == 16));
    }
  %}
  ins_pipe(pipe_class_default);
%}
// vector replicate
instruct replicateB(vReg dst, iRegIorL2I src) %{

@ -157,7 +157,6 @@ source %{
}
return op_sve_supported(opcode, vlen, bt);
}
%}
definitions %{
@ -1793,6 +1792,32 @@ instruct vroundD(vReg dst, vReg src, immI rmode) %{
%}
ins_pipe(pipe_slow);
%}
dnl VECTOR_JAVA_FROUND($1, $2, $3, $4, $5, $6, $7)
dnl   $1 = F|D (selects RoundVF/RoundVD)  $3 = dest element (I/L)
dnl   $4 = SVE register variant (S/D)     $7 = register class (vReg)
dnl   ($2, $5, $6 are kept for symmetry with the NEON macro; only
dnl    $1/$3/$4/$7 are expanded here.)
dnl Expands to an SVE (UseSVE > 0) Math.round vector instruct that picks
dnl the SVE or NEON helper based on the actual vector length in bytes.
define(`VECTOR_JAVA_FROUND', `
instruct vround$1to$3($7 dst, $7 src, $7 tmp1, $7 tmp2, $7 tmp3, pRegGov ptmp)
%{
  predicate(UseSVE > 0);
  match(Set dst (RoundV$1 src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp);
  format %{ "sve_vround  $dst, $4, $src\t# round $1 to $3 vector" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int vlen = Matcher::vector_length_in_bytes(this);
    if (vlen > 16) {
      __ vector_round_sve(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                          as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                          as_PRegister($ptmp$$reg), __ $4);
    } else {
      __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                           as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                           as_FloatRegister($tmp3$$reg),
                           __ esize2arrangement(type2aelembytes(bt),
                                                /*isQ*/ vlen == 16));
    }
  %}
  ins_pipe(pipe_class_default);
%}')dnl             $1 $2  $3 $4 $5 $6    $7
VECTOR_JAVA_FROUND(F, 8F, I, S, 8, INT, vReg)
VECTOR_JAVA_FROUND(D, 4D, L, D, 4, LONG, vReg)
dnl
dnl REPLICATE($1, $2, $3, $4, $5 )
dnl REPLICATE(insn_name, op_name, reg_src, size, min_vec_len)

@ -306,14 +306,6 @@ public:
assert_cond((bits & mask) == mask);
return (insn & mask) >> lsb;
}
void fixed(unsigned value, unsigned mask) {
assert_cond ((mask & bits) == 0);
#ifdef ASSERT
bits |= mask;
#endif
insn |= value;
}
};
#define starti Instruction_aarch64 current_insn(this);
@ -698,7 +690,6 @@ public:
#define zrf current_insn.zrf
#define prf current_insn.prf
#define pgrf current_insn.pgrf
#define fixed current_insn.fixed
typedef void (Assembler::* uncond_branch_insn)(address dest);
typedef void (Assembler::* compare_and_branch_insn)(Register Rt, address dest);
@ -1085,7 +1076,7 @@ public:
// A more convenient access to dmb for our purposes
enum Membar_mask_bits {
// We can use ISH for a barrier because the ARM ARM says "This
// We can use ISH for a barrier because the Arm ARM says "This
// architecture assumes that all Processing Elements that use the
// same operating system or hypervisor are in the same Inner
// Shareable shareability domain."
@ -2082,46 +2073,55 @@ public:
#undef INSN
// Floating-point<->integer conversions
void float_int_convert(unsigned op31, unsigned type,
void float_int_convert(unsigned sflag, unsigned ftype,
unsigned rmode, unsigned opcode,
Register Rd, Register Rn) {
starti;
f(op31, 31, 29);
f(sflag, 31);
f(0b00, 30, 29);
f(0b11110, 28, 24);
f(type, 23, 22), f(1, 21), f(rmode, 20, 19);
f(ftype, 23, 22), f(1, 21), f(rmode, 20, 19);
f(opcode, 18, 16), f(0b000000, 15, 10);
zrf(Rn, 5), zrf(Rd, 0);
}
#define INSN(NAME, op31, type, rmode, opcode) \
void NAME(Register Rd, FloatRegister Vn) { \
float_int_convert(op31, type, rmode, opcode, Rd, as_Register(Vn)); \
#define INSN(NAME, sflag, ftype, rmode, opcode) \
void NAME(Register Rd, FloatRegister Vn) { \
float_int_convert(sflag, ftype, rmode, opcode, Rd, as_Register(Vn)); \
}
INSN(fcvtzsw, 0b000, 0b00, 0b11, 0b000);
INSN(fcvtzs, 0b100, 0b00, 0b11, 0b000);
INSN(fcvtzdw, 0b000, 0b01, 0b11, 0b000);
INSN(fcvtzd, 0b100, 0b01, 0b11, 0b000);
INSN(fcvtzsw, 0b0, 0b00, 0b11, 0b000);
INSN(fcvtzs, 0b1, 0b00, 0b11, 0b000);
INSN(fcvtzdw, 0b0, 0b01, 0b11, 0b000);
INSN(fcvtzd, 0b1, 0b01, 0b11, 0b000);
INSN(fmovs, 0b000, 0b00, 0b00, 0b110);
INSN(fmovd, 0b100, 0b01, 0b00, 0b110);
// RoundToNearestTiesAway
INSN(fcvtassw, 0b0, 0b00, 0b00, 0b100); // float -> signed word
INSN(fcvtasd, 0b1, 0b01, 0b00, 0b100); // double -> signed xword
INSN(fmovhid, 0b100, 0b10, 0b01, 0b110);
// RoundTowardsNegative
INSN(fcvtmssw, 0b0, 0b00, 0b10, 0b000); // float -> signed word
INSN(fcvtmsd, 0b1, 0b01, 0b10, 0b000); // double -> signed xword
INSN(fmovs, 0b0, 0b00, 0b00, 0b110);
INSN(fmovd, 0b1, 0b01, 0b00, 0b110);
INSN(fmovhid, 0b1, 0b10, 0b01, 0b110);
#undef INSN
#define INSN(NAME, op31, type, rmode, opcode) \
#define INSN(NAME, sflag, type, rmode, opcode) \
void NAME(FloatRegister Vd, Register Rn) { \
float_int_convert(op31, type, rmode, opcode, as_Register(Vd), Rn); \
float_int_convert(sflag, type, rmode, opcode, as_Register(Vd), Rn); \
}
INSN(fmovs, 0b000, 0b00, 0b00, 0b111);
INSN(fmovd, 0b100, 0b01, 0b00, 0b111);
INSN(fmovs, 0b0, 0b00, 0b00, 0b111);
INSN(fmovd, 0b1, 0b01, 0b00, 0b111);
INSN(scvtfws, 0b000, 0b00, 0b00, 0b010);
INSN(scvtfs, 0b100, 0b00, 0b00, 0b010);
INSN(scvtfwd, 0b000, 0b01, 0b00, 0b010);
INSN(scvtfd, 0b100, 0b01, 0b00, 0b010);
INSN(scvtfws, 0b0, 0b00, 0b00, 0b010);
INSN(scvtfs, 0b1, 0b00, 0b00, 0b010);
INSN(scvtfwd, 0b0, 0b01, 0b00, 0b010);
INSN(scvtfd, 0b1, 0b01, 0b00, 0b010);
// INSN(fmovhid, 0b100, 0b10, 0b01, 0b111);
@ -2510,6 +2510,7 @@ public:
#undef INSN
// Advanced SIMD modified immediate
#define INSN(NAME, op0, cmode0) \
void NAME(FloatRegister Vd, SIMD_Arrangement T, unsigned imm8, unsigned lsl = 0) { \
unsigned cmode = cmode0; \
@ -2537,7 +2538,22 @@ public:
#undef INSN
#define INSN(NAME, op1, op2, op3) \
// Advanced SIMD FMOV (vector, immediate): broadcast a floating-point
// constant to every lane of Vd.  The constant must be representable as
// an 8-bit AdvSIMD FP immediate -- pack(imm) produces that encoding,
// split across bits 18:16 and 9:5.
#define INSN(NAME, op, cmode)                                           \
  void NAME(FloatRegister Vd, SIMD_Arrangement T, double imm) {         \
    unsigned imm8 = pack(imm);                                          \
    starti;                                                             \
    f(0, 31), f((int)T & 1, 30), f(op, 29), f(0b0111100000, 28, 19);    \
    f(imm8 >> 5, 18, 16), f(cmode, 15, 12), f(0x01, 11, 10), f(imm8 & 0b11111, 9, 5); \
    rf(Vd, 0);                                                          \
  }

  INSN(fmovs, 0, 0b1111);
  INSN(fmovd, 1, 0b1111);

#undef INSN
// Advanced SIMD three same
#define INSN(NAME, op1, op2, op3) \
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
starti; \
assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); \
@ -2984,7 +3000,9 @@ public:
INSN(frintn, 0, 0b00, 0b01, 0b11000);
INSN(frintm, 0, 0b00, 0b01, 0b11001);
INSN(frintp, 0, 0b10, 0b01, 0b11000);
INSN(fcvtas, 0, 0b00, 0b01, 0b11100);
INSN(fcvtzs, 0, 0b10, 0b01, 0b11011);
INSN(fcvtms, 0, 0b00, 0b01, 0b11011);
#undef ASSERTION
#define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H || T == T2S || T == T4S)
@ -3154,6 +3172,7 @@ public:
INSN(sve_fneg, 0b00000100, 0b011101101);
INSN(sve_frintm, 0b01100101, 0b000010101); // floating-point round to integral value, toward minus infinity
INSN(sve_frintn, 0b01100101, 0b000000101); // floating-point round to integral value, nearest with ties to even
INSN(sve_frinta, 0b01100101, 0b000100101); // floating-point round to integral value, nearest with ties to away
INSN(sve_frintp, 0b01100101, 0b000001101); // floating-point round to integral value, toward plus infinity
INSN(sve_fsqrt, 0b01100101, 0b001101101);
INSN(sve_fsub, 0b01100101, 0b000001100);
@ -3449,8 +3468,9 @@ public:
pgrf(Pg, 10), srf(Rn, 5), rf(Zd, 0);
}
// SVE copy signed integer immediate to vector elements (predicated)
void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, int imm8, bool isMerge) {
private:
void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, int imm8,
bool isMerge, bool isFloat) {
starti;
assert(T != Q, "invalid size");
int sh = 0;
@ -3464,7 +3484,17 @@ public:
}
int m = isMerge ? 1 : 0;
f(0b00000101, 31, 24), f(T, 23, 22), f(0b01, 21, 20);
prf(Pg, 16), f(0b0, 15), f(m, 14), f(sh, 13), sf(imm8, 12, 5), rf(Zd, 0);
prf(Pg, 16), f(isFloat ? 1 : 0, 15), f(m, 14), f(sh, 13), sf(imm8, 12, 5), rf(Zd, 0);
}
public:
  // SVE copy signed integer immediate to vector elements (predicated)
  void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, int imm8, bool isMerge) {
    sve_cpy(Zd, T, Pg, imm8, isMerge, /*isFloat*/false);
  }

  // SVE copy floating-point immediate to vector elements (predicated).
  // Always the merging form.  pack(d) encodes d as an 8-bit FP
  // immediate -- presumably only encodable constants are legal here;
  // TODO confirm pack() rejects others.
  void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, double d) {
    sve_cpy(Zd, T, Pg, checked_cast<int8_t>(pack(d)), /*isMerge*/true, /*isFloat*/true);
  }
// SVE conditionally select elements from two vectors
@ -3528,6 +3558,29 @@ void sve_cmp(Condition cond, PRegister Pd, SIMD_RegVariant T,
f(cond_op & 0x1, 4), prf(Pd, 0);
}
  // SVE Floating-point compare vector with zero.
  // Sets Pd[i] for each active (Pg) element of Zn satisfying
  // "Zn[i] <cond> 0.0".  Only a zero immediate is supported by the
  // encoding, hence the guarantee on d.
  void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
               PRegister Pg, FloatRegister Zn, double d) {
    starti;
    assert(T != Q, "invalid size");
    guarantee(d == 0.0, "invalid immediate");
    // 3-bit comparison opcode, split across instruction fields below:
    // bits 2:1 go to 17:16, bit 0 goes to bit 4.
    int cond_op;
    switch(cond) {
      case EQ: cond_op = 0b100; break;
      case GT: cond_op = 0b001; break;
      case GE: cond_op = 0b000; break;
      case LT: cond_op = 0b010; break;
      case LE: cond_op = 0b011; break;
      case NE: cond_op = 0b110; break;
      default:
        ShouldNotReachHere();
    }
    f(0b01100101, 31, 24), f(T, 23, 22), f(0b0100, 21, 18),
    f((cond_op >> 1) & 0x3, 17, 16), f(0b001, 15, 13),
    pgrf(Pg, 10), rf(Zn, 5);
    f(cond_op & 0x1, 4), prf(Pd, 0);
  }
// SVE unpack vector elements
#define INSN(NAME, op) \
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn) { \

@ -1267,3 +1267,74 @@ void C2_MacroAssembler::sve_ptrue_lanecnt(PRegister dst, SIMD_RegVariant size, i
ShouldNotReachHere();
}
}
// java.lang.Math::round intrinsics
// Vector (NEON) counterpart of MacroAssembler::java_round_float/_double.
// Per lane: result = fcvtas(src) (round to nearest, ties away from
// zero), except for "small" negative inputs (|src| < 2^23 for float,
// < 2^52 for double) which instead get floor(src + 0.5) so that ties
// round toward positive infinity, as Java requires.
// Clobbers rscratch1.
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);

  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // 2^23: smallest float magnitude at which no fraction can remain
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      // 2^52: smallest double magnitude at which no fraction can remain
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)
  fcvtas(dst, T, src);
  // dst = round(src), ties to away
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  // Unsigned compare of the -src bit patterns against the 2^23/2^52
  // pattern: a lane's flag bits end up all-ones when src >= 0 or
  // |src| >= the threshold -- exactly the lanes for which the fcvtas
  // (ties-away) result is already correct.
  cmhs(tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags
  // Keep dst bits where tmp3 is set; take floor(src + 0.5) bits (tmp1)
  // where it is clear.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
// SVE variant of vector_round_neon above: round to nearest with ties
// toward positive infinity for each active lane.
// Strategy: take frinta (round, ties away) as the default, then for
// lanes holding small-magnitude negative values (|src| < 2^23 / 2^52,
// selected via predicate ptmp) recompute as floor(src + 0.5).
// Clobbers rscratch1; the final fcvtzs converts the already-integral
// value to a signed integer lane.
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister ptmp, SIMD_RegVariant T) {
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // 2^23: smallest float magnitude with no fractional part
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      // 2^52: smallest double magnitude with no fractional part
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid arrangement");
  }
  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  // ptmp selects lanes where -src is in [0, threshold), i.e. small
  // negative inputs needing the round-half-up correction.
  sve_cmp(HS, ptmp, T, ptrue, tmp2, tmp1);
  // EQ here means the compare set no lanes -- skip the fix-up entirely.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, ptmp, 0.5);
    sve_fadd(tmp1, T, ptmp, src);
    sve_frintm(dst, T, ptmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

@ -103,4 +103,12 @@
sve_lastb(dst, size, pg, src);
}
// java.lang.Math::round intrinsics
void vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, FloatRegister tmp3,
SIMD_Arrangement T);
void vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, PRegister ptmp,
SIMD_RegVariant T);
#endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP

@ -5178,6 +5178,56 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
csel(res, res, zr, EQ);
}
// java.lang.Math.round(double a)
// Returns the closest long to the argument, with ties rounding to
// positive infinity. This requires some fiddling for corner
// cases. We take care to avoid double rounding in e.g. (jlong)(a + 0.5).
// Clobbers rscratch1, rscratch2 and the condition flags; ftmp is a
// scratch FP register for the 0.5 addend.
void MacroAssembler::java_round_double(Register dst, FloatRegister src,
                                       FloatRegister ftmp) {
  Label DONE;
  BLOCK_COMMENT("java_round_double: { ");
  fmovd(rscratch1, src);
  // Use RoundToNearestTiesAway unless src small and -ve.
  fcvtasd(dst, src);
  // Test if src >= 0 || abs(src) >= 0x1.0p52
  // Flipping the sign bit folds both tests into one unsigned compare:
  // non-negative inputs get their MSB set (always >= rscratch2), while
  // negative inputs are compared by magnitude against the 2^52 pattern.
  eor(rscratch1, rscratch1, UCONST64(1) << 63); // flip sign bit
  mov(rscratch2, julong_cast(0x1.0p52));
  cmp(rscratch1, rscratch2);
  br(HS, DONE); {
    // src < 0 && abs(src) < 0x1.0p52
    // src may have a fractional part, so add 0.5
    fmovd(ftmp, 0.5);
    faddd(ftmp, src, ftmp);
    // Convert double to jlong, use RoundTowardsNegative
    fcvtmsd(dst, ftmp);
  }
  bind(DONE);
  BLOCK_COMMENT("} java_round_double");
}
// java.lang.Math.round(float a): closest int, ties toward positive
// infinity.  Same structure as java_round_double above, with the
// float no-fraction threshold 2^23.  Clobbers rscratch1, rscratch2 and
// the condition flags.
void MacroAssembler::java_round_float(Register dst, FloatRegister src,
                                      FloatRegister ftmp) {
  Label DONE;
  BLOCK_COMMENT("java_round_float: { ");
  fmovs(rscratch1, src);
  // Use RoundToNearestTiesAway unless src small and -ve.
  fcvtassw(dst, src);
  // Test if src >= 0 || abs(src) >= 0x1.0p23
  // See java_round_double: flipping the sign bit reduces both tests
  // to a single unsigned compare.
  eor(rscratch1, rscratch1, 0x80000000); // flip sign bit
  mov(rscratch2, jint_cast(0x1.0p23f));
  cmp(rscratch1, rscratch2);
  br(HS, DONE); {
    // src < 0 && |src| < 0x1.0p23
    // src may have a fractional part, so add 0.5
    fmovs(ftmp, 0.5f);
    fadds(ftmp, src, ftmp);
    // Convert float to jint, use RoundTowardsNegative
    fcvtmssw(dst, ftmp);
  }
  bind(DONE);
  BLOCK_COMMENT("} java_round_float");
}
// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,

@ -877,6 +877,10 @@ public:
// Round up to a power of two
void round_to(Register reg, int modulus);
// java.lang.Math::round intrinsics
void java_round_double(Register dst, FloatRegister src, FloatRegister ftmp);
void java_round_float(Register dst, FloatRegister src, FloatRegister ftmp);
// allocation
void eden_allocate(
Register obj, // result: pointer to object after successful allocation

@ -165,8 +165,23 @@
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 15;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
  // Math.round (RoundF/RoundD) expands to a multi-instruction
  // sequence on aarch64; report a non-trivial size so C2's
  // pre-selection heuristics account for it.  Everything else is
  // assumed cheap.
  if (vopc == Op_RoundF || vopc == Op_RoundD) {
    return 15;
  }
  return 0;
}
#endif // CPU_AARCH64_MATCHER_AARCH64_HPP

@ -155,9 +155,25 @@
// Implements a variant of EncodeISOArrayNode that encode ASCII only
static const bool supports_encode_ascii_array = false;
// Returns pre-selection estimated cost of a vector operation.
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 30;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
  // Scalar Math.round (RoundF/RoundD) is given a large size estimate
  // (30) on this platform; all other scalar ops are treated as cheap.
  static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
    switch(vopc) {
      default: return 0;
      case Op_RoundF: // fall through
      case Op_RoundD: {
        return 30;
      }
    }
  }
#endif // CPU_ARM_MATCHER_ARM_HPP

@ -164,10 +164,25 @@
// Implements a variant of EncodeISOArrayNode that encode ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated cost of a vector operation.
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 30;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
  // Scalar Math.round (RoundF/RoundD) is given a large size estimate
  // (30) on this platform; all other scalar ops are treated as cheap.
  static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
    switch(vopc) {
      default: return 0;
      case Op_RoundF: // fall through
      case Op_RoundD: {
        return 30;
      }
    }
  }
#endif // CPU_PPC_MATCHER_PPC_HPP

@ -163,7 +163,23 @@
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 30;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
  // Scalar Math.round (RoundF/RoundD) is given a large size estimate
  // (30) on this platform; all other scalar ops are treated as cheap.
  static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
    switch(vopc) {
      default: return 0;
      case Op_RoundF: // fall through
      case Op_RoundD: {
        return 30;
      }
    }
  }
#endif // CPU_RISCV_MATCHER_RISCV_HPP

@ -153,9 +153,25 @@
// Implements a variant of EncodeISOArrayNode that encode ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated cost of a vector operation.
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 30;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
  // Scalar Math.round (RoundF/RoundD) is given a large size estimate
  // (30) on this platform; all other scalar ops are treated as cheap.
  static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
    switch(vopc) {
      default: return 0;
      case Op_RoundF: // fall through
      case Op_RoundD: {
        return 30;
      }
    }
  }
#endif // CPU_S390_MATCHER_S390_HPP

@ -183,12 +183,26 @@
// Implements a variant of EncodeISOArrayNode that encode ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated cost of a vector operation.
// Returns pre-selection estimated size of a vector operation.
  // Pre-selection size estimates for vector ops on x86.  Population
  // count is expensive without AVX-512 VPOPCNTDQ; vector Math.round
  // (RoundVF/RoundVD) always expands to a long sequence.
  static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
    switch(vopc) {
      default: return 0;
      case Op_PopCountVI: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 50;
      case Op_PopCountVL: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 40;
      case Op_RoundVF: // fall through
      case Op_RoundVD: {
        return 30;
      }
    }
  }
// Returns pre-selection estimated size of a scalar operation.
static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
switch(vopc) {
default: return 0;
case Op_RoundF: // fall through
case Op_RoundD: {
return 30;
}
}
}

@ -970,10 +970,12 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
case Op_ModL: body_size += 30; break;
case Op_DivL: body_size += 30; break;
case Op_MulL: body_size += 10; break;
case Op_RoundF: body_size += 30; break;
case Op_RoundD: body_size += 30; break;
case Op_RoundVF: body_size += 30; break;
case Op_RoundVD: body_size += 30; break;
case Op_RoundF:
case Op_RoundD: {
body_size += Matcher::scalar_op_pre_select_sz_estimate(n->Opcode(), n->bottom_type()->basic_type());
} break;
case Op_RoundVF:
case Op_RoundVD:
case Op_PopCountVI:
case Op_PopCountVL: {
const TypeVect* vt = n->bottom_type()->is_vect();

@ -462,6 +462,29 @@ class SVEBinaryImmOp(Instruction):
return (formatStr
% tuple([Instruction.astr(self)] + Regs + [self.immed]))
class SVEComparisonWithZero(Instruction):
    """Golden-test generator for SVE FP compare-with-zero (fcm<cond>).

    For a given condition mnemonic it produces a matched pair of
    strings: the C++ call into the assembler (cstr) and the expected
    disassembly (astr).
    """

    def __init__(self, arg):
        # arg is the condition name, e.g. "EQ", "GT" -- used both as the
        # Assembler::<cond> enum and (lower-cased) as the mnemonic suffix.
        Instruction.__init__(self, "fcm")
        self.condition = arg
        self.dest = OperandFactory.create('p').generate()
        self.reg = SVEVectorRegister().generate()
        # Element width restricted to S/D (variants 2..3).
        self._width = RegVariant(2, 3)
        self.preg = OperandFactory.create('P').generate()

    def generate(self):
        return Instruction.generate(self)

    def cstr(self):
        # e.g.  __ sve_fcm(Assembler::EQ, p0, __ S, p1, z2, 0.0);
        return ("%s(%s, %s, %s, %s, %s, 0.0);"
                % ("__ sve_" + self._name, "Assembler::" + self.condition,
                   str(self.dest), self._width.cstr(), str(self.preg), str(self.reg)))

    def astr(self):
        # e.g.  fcmeq  p0.s, p1/z, z2.s, #0.0
        val = ("%s%s\t%s%s, %s/z, %s%s, #0.0"
               % (self._name, self.condition.lower(), str(self.dest), self._width.astr(),
                  str(self.preg), str(self.reg), self._width.astr()))
        return val
class MultiOp():
def multipleForms(self):
@ -1444,6 +1467,8 @@ generate(FloatConvertOp, [["fcvtzsw", "fcvtzs", "ws"], ["fcvtzs", "fcvtzs", "xs"
["fcvtzdw", "fcvtzs", "wd"], ["fcvtzd", "fcvtzs", "xd"],
["scvtfws", "scvtf", "sw"], ["scvtfs", "scvtf", "sx"],
["scvtfwd", "scvtf", "dw"], ["scvtfd", "scvtf", "dx"],
["fcvtassw", "fcvtas", "ws"], ["fcvtasd", "fcvtas", "xd"],
["fcvtmssw", "fcvtms", "ws"], ["fcvtmsd", "fcvtms", "xd"],
["fmovs", "fmov", "ws"], ["fmovd", "fmov", "xd"],
["fmovs", "fmov", "sw"], ["fmovd", "fmov", "dx"]])
@ -1590,6 +1615,8 @@ generate(ThreeRegNEONOp,
["fcmge", "fcmge", "2D"],
])
generate(SVEComparisonWithZero, ["EQ", "GT", "GE", "LT", "LE", "NE"])
generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);", "ccmn\txzr, xzr, #3, LE"],
["ccmnw", "__ ccmnw(zr, zr, 5u, Assembler::EQ);", "ccmn\twzr, wzr, #5, EQ"],
["ccmp", "__ ccmp(zr, 1, 4u, Assembler::NE);", "ccmp\txzr, 1, #4, NE"],
@ -1613,8 +1640,12 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);",
["umov", "__ umov(r0, v1, __ H, 2);", "umov\tw0, v1.h[2]"],
["umov", "__ umov(r0, v1, __ B, 3);", "umov\tw0, v1.b[3]"],
["fmov", "__ fmovhid(r0, v1);", "fmov\tx0, v1.d[1]"],
["fmov", "__ fmovs(v9, __ T2S, 0.5f);", "fmov\tv9.2s, 0.5"],
["fmov", "__ fmovd(v14, __ T2D, 0.5f);", "fmov\tv14.2d, 0.5"],
["ld1", "__ ld1(v31, v0, __ T2D, Address(__ post(r1, r0)));", "ld1\t{v31.2d, v0.2d}, [x1], x0"],
["fcvtzs", "__ fcvtzs(v0, __ T4S, v1);", "fcvtzs\tv0.4s, v1.4s"],
["fcvtzs", "__ fcvtzs(v0, __ T2S, v1);", "fcvtzs\tv0.2s, v1.2s"],
["fcvtas", "__ fcvtas(v2, __ T4S, v3);", "fcvtas\tv2.4s, v3.4s"],
["fcvtms", "__ fcvtms(v4, __ T2D, v5);", "fcvtms\tv4.2d, v5.2d"],
# SVE instructions
["cpy", "__ sve_cpy(z0, __ S, p0, v1);", "mov\tz0.s, p0/m, s1"],
["cpy", "__ sve_cpy(z0, __ B, p0, 127, true);", "mov\tz0.b, p0/m, 127"],

File diff suppressed because it is too large Load Diff

@ -0,0 +1,94 @@
/*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8282541
* @summary Auto-vectorize Math.round API
* @requires vm.compiler2.enabled
* @requires os.simpleArch == "aarch64"
* @library /test/lib /
* @run driver compiler.vectorization.TestRoundVectAArch64
*/
package compiler.vectorization;
import compiler.lib.ir_framework.*;
/**
 * IR-framework test checking that loops over {@code Math.round} on
 * float/double arrays are auto-vectorized on aarch64: the compiled IR
 * must contain RoundVF / RoundVD nodes.
 */
public class TestRoundVectAArch64 {
  // Array length and iteration count chosen so the kernels reach C2.
  private static final int ARRLEN = 1024;
  private static final int ITERS  = 11000;

  private static double [] dinp;
  private static long   [] lout;
  private static float  [] finp;
  private static int    [] iout;

  public static void main(String args[]) {
    // Only meaningful on aarch64; elsewhere the run is skipped.
    if (System.getProperty("os.arch").equals("aarch64")) {
      TestFramework.runWithFlags("-XX:-TieredCompilation",
                                 "-XX:CompileThresholdScaling=0.3");
    }
    System.out.println("PASSED");
  }

  // Kernel under test: must be compiled with a RoundVD vector node.
  @Test
  @IR(counts = {"RoundVD" , " > 0 "})
  public void test_round_double(long[] lout, double[] dinp) {
    for (int i = 0; i < lout.length; i+=1) {
      lout[i] = Math.round(dinp[i]);
    }
  }

  // Driver: fills the inputs and invokes the kernel enough times to
  // trigger compilation.
  @Run(test = {"test_round_double"}, mode = RunMode.STANDALONE)
  public void kernel_test_round_double() {
    dinp = new double[ARRLEN];
    lout = new long[ARRLEN];
    for(int i = 0 ; i < ARRLEN; i++) {
      dinp[i] = (double)i*1.4;
    }
    for (int i = 0; i < ITERS; i++) {
      test_round_double(lout , dinp);
    }
  }

  // Kernel under test: must be compiled with a RoundVF vector node.
  @Test
  @IR(counts = {"RoundVF" , " > 0 "})
  public void test_round_float(int[] iout, float[] finp) {
    for (int i = 0; i < finp.length; i+=1) {
      iout[i] = Math.round(finp[i]);
    }
  }

  // Driver for the float kernel.
  @Run(test = {"test_round_float"}, mode = RunMode.STANDALONE)
  public void kernel_test_round() {
    finp = new float[ARRLEN];
    iout = new int[ARRLEN];
    for(int i = 0 ; i < ARRLEN; i++) {
      finp[i] = (float)i*1.4f;
    }
    for (int i = 0; i < ITERS; i++) {
      test_round_float(iout , finp);
    }
  }
}