8279508: Auto-vectorize Math.round API

Reviewed-by: sviswanathan, aph
2022-04-02 18:00:33 +00:00 · 2022-04-02 18:00:33 +00:00 · 003ec21f3c
commit 003ec21f3c
parent c1e67b6603
25 changed files with 765 additions and 90 deletions
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@ -1995,6 +1995,13 @@ void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
  emit_int16(0x2C, (0xC0 | encode));
 }

+void Assembler::cvtss2sil(Register dst, XMMRegister src) {  // F3 0F 2D /r: scalar float -> 32-bit int; cvttss2sil is the truncating form
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  InstructionAttr attrs(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  const int reg_bits = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attrs);
+  emit_int16(0x2D, 0xC0 | reg_bits);  // opcode byte followed by the register-direct ModRM byte
+}
+
 void Assembler::cvttss2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
@ -2088,6 +2095,21 @@ void Assembler::vcvttps2dq(XMMRegister dst, XMMRegister src, int vector_len) {
  emit_int16(0x5B, (0xC0 | encode));
 }

+void Assembler::vcvtps2dq(XMMRegister dst, XMMRegister src, int vector_len) {  // 66 0F 5B /r: packed float -> packed 32-bit int
+  assert(vector_len <= AVX_256bit ? VM_Version::supports_avx() : VM_Version::supports_evex(), "");
+  InstructionAttr attrs(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  const int reg_bits = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attrs);
+  emit_int16(0x5B, 0xC0 | reg_bits);  // opcode byte followed by the register-direct ModRM byte
+}
+
+void Assembler::evcvtpd2qq(XMMRegister dst, XMMRegister src, int vector_len) {  // EVEX 66 0F (W=1) 7B /r: packed double -> packed 64-bit int
+  assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "");
+  InstructionAttr attrs(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attrs.set_is_evex_instruction();  // force the EVEX encoding
+  const int reg_bits = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attrs);
+  emit_int16(0x7B, 0xC0 | reg_bits);  // opcode byte followed by the register-direct ModRM byte
+}
+
 void Assembler::evcvtqq2ps(XMMRegister dst, XMMRegister src, int vector_len) {
  assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "");
  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
@ -6493,7 +6515,6 @@ void Assembler::vrndscalepd(XMMRegister dst, Address src, int32_t rmode, int vec
  emit_int8((rmode));
 }

-
 void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) {
  assert(VM_Version::supports_avx(), "");
  InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
@ -12285,6 +12306,13 @@ void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
  emit_int16(0x2C, (0xC0 | encode));
 }

+void Assembler::cvtsd2siq(Register dst, XMMRegister src) {  // F2 0F 2D /r with rex_w set: scalar double -> 64-bit int; cvttsd2siq is the truncating form
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionAttr attrs(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  const int reg_bits = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attrs);
+  emit_int16(0x2D, 0xC0 | reg_bits);  // opcode byte followed by the register-direct ModRM byte
+}
+
 void Assembler::cvttss2siq(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@ -1149,6 +1149,7 @@ private:
  void cvtss2sd(XMMRegister dst, Address src);

  // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
+  void cvtsd2siq(Register dst, XMMRegister src);
  void cvttsd2sil(Register dst, Address src);
  void cvttsd2sil(Register dst, XMMRegister src);
  void cvttsd2siq(Register dst, Address src);
@ -1157,6 +1158,7 @@ private:
  // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer
  void cvttss2sil(Register dst, XMMRegister src);
  void cvttss2siq(Register dst, XMMRegister src);
+  void cvtss2sil(Register dst, XMMRegister src);

  // Convert vector double to int
  void cvttpd2dq(XMMRegister dst, XMMRegister src);
@ -1166,6 +1168,7 @@ private:
  void vcvtpd2ps(XMMRegister dst, XMMRegister src, int vector_len);

  // Convert vector float and int
+  void vcvtps2dq(XMMRegister dst, XMMRegister src, int vector_len);
  void vcvttps2dq(XMMRegister dst, XMMRegister src, int vector_len);

  // Convert vector long to vector FP
@ -1173,6 +1176,7 @@ private:
  void evcvtqq2pd(XMMRegister dst, XMMRegister src, int vector_len);

  // Convert vector double to long
+  void evcvtpd2qq(XMMRegister dst, XMMRegister src, int vector_len);
  void evcvttpd2qq(XMMRegister dst, XMMRegister src, int vector_len);

  // Evex casts with truncation
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@ -4061,41 +4061,18 @@ void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
 }

 /*
- * Algorithm for vector D2L and F2I conversions:-
- * a) Perform vector D2L/F2I cast.
- * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
- *    It signifies that source value could be any of the special floating point
- *    values(NaN,-Inf,Inf,Max,-Min).
- * c) Set destination to zero if source is NaN value.
- * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
+ * The following routine patches the result of a cast for special floating point inputs (NaN/Inf/-Inf/Max/Min).
+ * If src is NaN, the result is 0.
+ * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
+ * the result is equal to the value of Integer.MIN_VALUE.
+ * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
+ * the result is equal to the value of Integer.MAX_VALUE.
 */
-
-void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
-                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
-                                            Register scratch, int vec_enc) {
-  Label done;
-  evcvttpd2qq(dst, src, vec_enc);
-  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch);
-  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
-  kortestwl(ktmp1, ktmp1);
-  jccb(Assembler::equal, done);
-
-  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
-  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
-  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
-
-  kxorwl(ktmp1, ktmp1, ktmp2);
-  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
-  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
-  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
-  bind(done);
-}
-
-void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
+void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
-                                           AddressLiteral float_sign_flip, Register scratch, int vec_enc) {
+                                                            Register scratch, AddressLiteral float_sign_flip,
+                                                            int vec_enc) {
  Label done;
-  vcvttps2dq(dst, src, vec_enc);
  vmovdqu(xtmp1, float_sign_flip, scratch, vec_enc);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
@ -4120,11 +4097,11 @@ void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMM
  bind(done);
 }

-void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
-                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
-                                            Register scratch, int vec_enc) {
+void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
+                                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
+                                                             Register scratch, AddressLiteral float_sign_flip,
+                                                             int vec_enc) {
  Label done;
-  vcvttps2dq(dst, src, vec_enc);
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, scratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
@ -4141,6 +4118,115 @@ void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XM
  bind(done);
 }

+/*
+ * The following routine patches the result of a cast for special floating point inputs (NaN/Inf/-Inf/Max/Min).
+ * If src is NaN, the result is 0.
+ * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
+ * the result is equal to the value of Long.MIN_VALUE.
+ * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
+ * the result is equal to the value of Long.MAX_VALUE.
+ */
+void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
+                                                              XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
+                                                              Register scratch, AddressLiteral double_sign_flip,
+                                                              int vec_enc) {
+  Label done;
+  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch);  // xtmp1 = sign-flip pattern supplied by the caller
+  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);                            // ktmp1 = lanes of dst equal to that pattern
+  kortestwl(ktmp1, ktmp1);
+  jccb(Assembler::equal, done);                                     // no such lane: dst needs no patching
+
+  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);                              // xtmp2 = 0
+  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);        // ktmp2 = lanes where src compares unordered with itself (NaN)
+  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);                      // NaN lanes of dst become 0 (merge-masked move)
+
+  kxorwl(ktmp1, ktmp1, ktmp2);                                      // take the NaN lanes out of the candidate mask
+  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);    // of the remaining candidates, keep lanes where src is not less than 0
+  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);                   // truth table 0x11 with both sources xtmp1: xtmp2 = ~xtmp1
+  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);                      // those lanes get the complemented pattern (the maximum value)
+  bind(done);
+}
+
+/*
+ * Algorithm for vector D2L and F2I conversions:-
+ * a) Perform vector D2L/F2I cast.
+ * b) Choose fast path if none of the result vector lanes contains the 0x80000000 value.
+ *    Such a lane signifies that the source value could be any of the special floating
+ *    point values (NaN, -Inf, Inf, Max, -Min).
+ * c) Set destination to zero if source is NaN value.
+ * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
+ */
+
+void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
+                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
+                                            Register scratch, int vec_enc) {
+  evcvttpd2qq(dst, src, vec_enc);  // step a) of the algorithm note: the raw vector D2L cast
+  vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, double_sign_flip, vec_enc);  // steps b)-d); note the helper takes scratch before the sign-flip literal
+}
+
+void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
+                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
+                                           AddressLiteral float_sign_flip, Register scratch, int vec_enc) {
+  vcvttps2dq(dst, src, vec_enc);  // step a) of the algorithm note: the raw vector F2I cast
+  vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, scratch, float_sign_flip, vec_enc);  // steps b)-d); note the helper takes scratch before the sign-flip literal
+}
+
+void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
+                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
+                                            Register scratch, int vec_enc) {
+  vcvttps2dq(dst, src, vec_enc);  // step a) of the algorithm note: the raw vector F2I cast
+  vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, float_sign_flip, vec_enc);  // steps b)-d), using mask registers; the helper takes scratch before the sign-flip literal
+}
+
+#ifdef _LP64
+void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
+                                                 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
+                                                 AddressLiteral new_mxcsr, Register scratch, int vec_enc) {
+  // Computes floor(val + 0.5) per lane: new_mxcsr (MXCSR.RC = round toward -inf) is loaded for the add and the
+  // conversion, and the standard MXCSR (mxcsr_std) is reloaded once the special-case lanes have been patched.
+  ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
+  ldmxcsr(new_mxcsr, scratch);             // switch the rounding mode; scratch is the fallback address register
+  mov64(scratch, julong_cast(0.5L));       // presumably the raw 64-bit pattern of 0.5 -- TODO confirm julong_cast semantics
+  evpbroadcastq(xtmp1, scratch, vec_enc);  // xtmp1 = that value in every 64-bit lane
+  vaddpd(xtmp1, src , xtmp1, vec_enc);     // xtmp1 = src + 0.5
+  evcvtpd2qq(dst, xtmp1, vec_enc);         // non-truncating double -> long conversion of the sum
+  vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, double_sign_flip, vec_enc);  // patched against the original src, not the sum
+  ldmxcsr(mxcsr_std, scratch);             // put the standard MXCSR back
+}
+
+void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
+                                                KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
+                                                AddressLiteral new_mxcsr, Register scratch, int vec_enc) {
+  // Computes floor(val + 0.5) per lane: new_mxcsr (MXCSR.RC = round toward -inf) is loaded for the add and the
+  // conversion, and the standard MXCSR (mxcsr_std) is reloaded once the special-case lanes have been patched.
+  ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
+  ldmxcsr(new_mxcsr, scratch);          // switch the rounding mode; scratch is the fallback address register
+  movl(scratch, jint_cast(0.5));        // presumably the raw 32-bit pattern of 0.5f -- TODO confirm jint_cast semantics
+  movq(xtmp1, scratch);                 // move it into the low lane of xtmp1
+  vbroadcastss(xtmp1, xtmp1, vec_enc);  // xtmp1 = that value in every 32-bit lane
+  vaddps(xtmp1, src , xtmp1, vec_enc);  // xtmp1 = src + 0.5
+  vcvtps2dq(dst, xtmp1, vec_enc);       // non-truncating float -> int conversion of the sum
+  vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, float_sign_flip, vec_enc);  // patched against the original src, not the sum
+  ldmxcsr(mxcsr_std, scratch);          // put the standard MXCSR back
+}
+
+void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
+                                               XMMRegister xtmp3, XMMRegister xtmp4, AddressLiteral float_sign_flip,
+                                               AddressLiteral new_mxcsr, Register scratch, int vec_enc) {
+  // Computes floor(val + 0.5) per lane: new_mxcsr (MXCSR.RC = round toward -inf) is loaded for the add and the
+  // conversion, and the standard MXCSR (mxcsr_std) is reloaded once the special-case lanes have been patched.
+  ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
+  ldmxcsr(new_mxcsr, scratch);          // switch the rounding mode; scratch is the fallback address register
+  movl(scratch, jint_cast(0.5));        // presumably the raw 32-bit pattern of 0.5f -- TODO confirm jint_cast semantics
+  movq(xtmp1, scratch);                 // move it into the low lane of xtmp1
+  vbroadcastss(xtmp1, xtmp1, vec_enc);  // xtmp1 = that value in every 32-bit lane
+  vaddps(xtmp1, src , xtmp1, vec_enc);  // xtmp1 = src + 0.5
+  vcvtps2dq(dst, xtmp1, vec_enc);       // non-truncating float -> int conversion of the sum
+  vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, scratch, float_sign_flip, vec_enc);  // vector-register variant of the fix-up, against the original src
+  ldmxcsr(mxcsr_std, scratch);          // put the standard MXCSR back
+}
+#endif
+
 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                             BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@ -303,6 +303,7 @@ public:
                           KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                           Register scratch, int vec_enc);

+
  void vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                           KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                           Register scratch, int vec_enc);
@ -310,6 +311,33 @@ public:
  void vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                            BasicType from_elem_bt, BasicType to_elem_bt);

+  void vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
+                                             KRegister ktmp1, KRegister ktmp2, Register scratch, AddressLiteral double_sign_flip,
+                                             int vec_enc);
+
+  void vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
+                                            KRegister ktmp1, KRegister ktmp2, Register scratch, AddressLiteral float_sign_flip,
+                                            int vec_enc);
+
+  void vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
+                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
+                                           Register scratch, AddressLiteral float_sign_flip,
+                                           int vec_enc);
+
+#ifdef _LP64
+  void vector_round_double_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
+                                KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
+                                AddressLiteral new_mxcsr, Register scratch, int vec_enc);
+
+  void vector_round_float_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,  // vector float -> int rounding, mask-register variant
+                               KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,        // named as in the definition (was double_sign_flip)
+                               AddressLiteral new_mxcsr, Register scratch, int vec_enc);
+
+  void vector_round_float_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
+                              XMMRegister xtmp3, XMMRegister xtmp4, AddressLiteral float_sign_flip,
+                              AddressLiteral new_mxcsr, Register scratch, int vec_enc);
+#endif
+
  void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
                  bool merge, BasicType bt, int vlen_enc);

--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@ -2252,12 +2252,12 @@ void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
 }

-void MacroAssembler::ldmxcsr(AddressLiteral src) {
+void MacroAssembler::ldmxcsr(AddressLiteral src, Register scratchReg) {  // loads MXCSR from src; scratchReg is used only when src is not reachable
   if (reachable(src)) {
     Assembler::ldmxcsr(as_Address(src));
   } else {
-    lea(rscratch1, src);
-    Assembler::ldmxcsr(Address(rscratch1, 0));
+    lea(scratchReg, src);  // materialize the address in the caller-chosen register instead of the fixed rscratch1
+    Assembler::ldmxcsr(Address(scratchReg, 0));
   }
  }

@ -9120,6 +9120,80 @@ void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
  bind(done);
 }

+void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
+  // Scalar float rounding: a line-by-line assembly translation of the java.lang.Math.round(float) algorithm.
+  // sarl(dst) below takes no immediate count, so the `rcx` argument is expected to be the real rcx (shift by CL).
+  const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
+  const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
+  const int32_t FloatConsts_EXP_BIAS = 127;
+  const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
+  const int32_t MINUS_32 = -32;  // same bits as 0xFFFFFFE0, without narrowing an unsigned literal into int32_t
+  Label L_special_case, L_block1, L_exit;
+  movl(rtmp, FloatConsts_EXP_BIT_MASK);
+  movdl(dst, src);                                // dst = raw bits of src
+  andl(dst, rtmp);
+  sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);   // dst = biased exponent
+  movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
+  subl(rtmp, dst);                                // rtmp = shift
+  movl(rcx, rtmp);                                // keep the shift count for the variable shift below
+  movl(dst, MINUS_32);
+  testl(rtmp, dst);                               // (shift & -32) != 0: shift is outside [0, 31]
+  jccb(Assembler::notEqual, L_special_case);
+  movdl(dst, src);
+  andl(dst, FloatConsts_SIGNIF_BIT_MASK);
+  orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);      // r = significand bits with the bit above them set
+  movdl(rtmp, src);
+  testl(rtmp, rtmp);
+  jccb(Assembler::greaterEqual, L_block1);        // raw bits non-negative: keep r as is
+  negl(dst);                                      // raw bits negative: r = -r
+  bind(L_block1);
+  sarl(dst);                                      // r >> shift
+  addl(dst, 0x1);
+  sarl(dst, 0x1);                                 // ((r >> shift) + 1) >> 1
+  jmp(L_exit);
+  bind(L_special_case);
+  convert_f2i(dst, src);                          // out-of-range shift: fall back to the plain float -> int conversion
+  bind(L_exit);
+}
+
+void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
+  // Scalar double rounding: a line-by-line assembly translation of the java.lang.Math.round(double) algorithm.
+  // sarq(dst) below takes no immediate count, so the `rcx` argument is expected to be the real rcx (shift by CL).
+  const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
+  const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
+  const int64_t DoubleConsts_EXP_BIAS = 1023;
+  const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
+  const int64_t MINUS_64 = -64;  // same bits as 0xFFFFFFFFFFFFFFC0, without narrowing an unsigned literal into int64_t
+  Label L_special_case, L_block1, L_exit;
+  mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
+  movq(dst, src);                                 // dst = raw bits of src
+  andq(dst, rtmp);
+  sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);  // dst = biased exponent
+  mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
+  subq(rtmp, dst);                                // rtmp = shift
+  movq(rcx, rtmp);                                // keep the shift count for the variable shift below
+  mov64(dst, MINUS_64);
+  testq(rtmp, dst);                               // (shift & -64) != 0: shift is outside [0, 63]
+  jccb(Assembler::notEqual, L_special_case);
+  movq(dst, src);
+  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
+  andq(dst, rtmp);
+  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
+  orq(dst, rtmp);                                 // r = significand bits with the bit above them set
+  movq(rtmp, src);
+  testq(rtmp, rtmp);
+  jccb(Assembler::greaterEqual, L_block1);        // raw bits non-negative: keep r as is
+  negq(dst);                                      // raw bits negative: r = -r
+  bind(L_block1);
+  sarq(dst);                                      // r >> shift
+  addq(dst, 0x1);
+  sarq(dst, 0x1);                                 // ((r >> shift) + 1) >> 1
+  jmp(L_exit);
+  bind(L_special_case);
+  convert_d2l(dst, src);                          // out-of-range shift: fall back to the plain double -> long conversion
+  bind(L_exit);
+}
+
 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  cvttsd2siq(dst, src);
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@ -906,7 +906,7 @@ public:
  void fld_x(AddressLiteral src);

  void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
-  void ldmxcsr(AddressLiteral src);
+  void ldmxcsr(AddressLiteral src, Register scratchReg = rscratch1);

 #ifdef _LP64
 private:
@ -1994,6 +1994,8 @@ public:
  void convert_d2i(Register dst, XMMRegister src);
  void convert_f2l(Register dst, XMMRegister src);
  void convert_d2l(Register dst, XMMRegister src);
+  void round_double(Register dst, XMMRegister src, Register rtmp, Register rcx);
+  void round_float(Register dst, XMMRegister src, Register rtmp, Register rcx);

  void cache_wb(Address line);
  void cache_wbsync(bool is_pre);
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@ -1468,6 +1468,16 @@ const bool Matcher::match_rule_supported(int opcode) {
        return false;
      }
      break;
+    case Op_RoundVF:
+      if (UseAVX < 2) { // enabled for AVX2 only
+        return false;
+      }
+      break;
+    case Op_RoundVD:
+      if (UseAVX < 3) {
+        return false;  // enabled for AVX3 only
+      }
+      break;
    case Op_CompareAndSwapL:
 #ifdef _LP64
    case Op_CompareAndSwapP:
@ -1572,6 +1582,12 @@ const bool Matcher::match_rule_supported(int opcode) {
         return false;
      }
      break;
+    case Op_RoundF:
+    case Op_RoundD:
+      if (!is_LP64) {
+        return false;
+      }
+      break;
    case Op_CopySignD:
    case Op_CopySignF:
      if (UseAVX < 3 || !is_LP64)  {
@ -1817,6 +1833,11 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
        return false;
      }
      break;
+    case Op_RoundVD:
+      if (!VM_Version::supports_avx512dq()) {
+        return false;
+      }
+      break;
    case Op_VectorCastF2X:
      if (is_subword_type(bt) || bt == T_LONG) {
        return false;
@ -7173,13 +7194,14 @@ instruct vcastFtoD_reg(vec dst, vec src) %{
  ins_pipe( pipe_slow );
 %}

-instruct vcastFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{
+
+instruct castFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vl() &&
            Matcher::vector_length_in_bytes(n) < 64 &&
            Matcher::vector_element_basic_type(n) == T_INT);
  match(Set dst (VectorCastF2X src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr);
-  format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
+  format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $scratch as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vector_castF2I_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
@ -7189,13 +7211,13 @@ instruct vcastFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, ve
  ins_pipe( pipe_slow );
 %}

-instruct vcastFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
+instruct castFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
  predicate((VM_Version::supports_avx512vl() ||
             Matcher::vector_length_in_bytes(n) == 64) &&
             Matcher::vector_element_basic_type(n) == T_INT);
  match(Set dst (VectorCastF2X src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
-  format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
+  format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
@ -7216,11 +7238,11 @@ instruct vcastDtoF_reg(vec dst, vec src) %{
  ins_pipe( pipe_slow );
 %}

-instruct vcastDtoL_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
+instruct castDtoL_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (VectorCastD2X src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
-  format %{ "vector_cast_d2l $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
+  format %{ "vector_cast_d2l $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vector_castD2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
@ -7246,6 +7268,56 @@ instruct vucast(vec dst, vec src) %{
  ins_pipe( pipe_slow );
 %}

+#ifdef _LP64
+instruct vround_float_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{
+  predicate(!VM_Version::supports_avx512vl() &&
+            Matcher::vector_length_in_bytes(n) < 64 &&
+            Matcher::vector_element_basic_type(n) == T_INT);
+  match(Set dst (RoundVF src)); // RoundVF with int result lanes; this rule covers the non-AVX512VL, sub-64-byte case
+  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr);
+  format %{ "vector_round_float $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $scratch as TEMP" %}
+  ins_encode %{
+    int vlen_enc = vector_length_encoding(this);
+    InternalAddress new_mxcsr = $constantaddress((jint)0x3F80); // MXCSR image handed to the routine as new_mxcsr (its rounding-mode setting)
+    __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
+                              $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
+                              ExternalAddress(vector_float_signflip()), new_mxcsr, $scratch$$Register, vlen_enc);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vround_float_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
+  predicate((VM_Version::supports_avx512vl() ||
+             Matcher::vector_length_in_bytes(n) == 64) &&
+             Matcher::vector_element_basic_type(n) == T_INT);
+  match(Set dst (RoundVF src)); // RoundVF with int result lanes; this rule covers AVX512VL or full 64-byte vectors
+  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
+  format %{ "vector_round_float $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
+  ins_encode %{
+    int vlen_enc = vector_length_encoding(this);
+    InternalAddress new_mxcsr = $constantaddress((jint)0x3F80); // MXCSR image handed to the routine as new_mxcsr (its rounding-mode setting)
+    __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
+                               $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
+                               ExternalAddress(vector_float_signflip()), new_mxcsr, $scratch$$Register, vlen_enc);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vround_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
+  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
+  match(Set dst (RoundVD src)); // RoundVD with long result lanes; the only RoundVD rule in this hunk
+  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
+  format %{ "vector_round_long $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
+  ins_encode %{
+    int vlen_enc = vector_length_encoding(this);
+    InternalAddress new_mxcsr = $constantaddress((jint)0x3F80); // MXCSR image handed to the routine as new_mxcsr (its rounding-mode setting)
+    __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
+                                $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
+                                ExternalAddress(vector_double_signflip()), new_mxcsr, $scratch$$Register, vlen_enc);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif
 // --------------------------------- VectorMaskCmp --------------------------------------

 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@ -10821,6 +10821,28 @@ instruct convD2L_reg_reg(rRegL dst, regD src, rFlagsReg cr)
  ins_pipe(pipe_slow);
 %}

+instruct round_double_reg(rRegL dst, regD src, rRegL rtmp, rcx_RegL rcx, rFlagsReg cr)
+%{
+  match(Set dst (RoundD src)); // scalar RoundD: double in an XMM register -> long in a general register
+  effect(TEMP dst, TEMP rtmp, TEMP rcx, KILL cr); // rcx is pinned via rcx_RegL and passed on as the routine's rcx argument
+  format %{ "round_double $dst,$src \t! using $rtmp and $rcx as TEMP"%}
+  ins_encode %{
+    __ round_double($dst$$Register, $src$$XMMRegister, $rtmp$$Register, $rcx$$Register);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct round_float_reg(rRegI dst, regF src, rRegL rtmp, rcx_RegL rcx, rFlagsReg cr)
+%{
+  match(Set dst (RoundF src)); // scalar RoundF: float in an XMM register -> int in a general register
+  effect(TEMP dst, TEMP rtmp, TEMP rcx, KILL cr); // rcx is pinned via rcx_RegL and passed on as the routine's rcx argument
+  format %{ "round_float $dst,$src\t! using $rtmp and $rcx as TEMP" %}
+  ins_encode %{
+    __ round_float($dst$$Register, $src$$XMMRegister, $rtmp$$Register, $rcx$$Register);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct convI2F_reg_reg(regF dst, rRegI src)
 %{
  predicate(!UseXmmI2F);
--- a/src/hotspot/share/adlc/formssel.cpp
+++ b/src/hotspot/share/adlc/formssel.cpp
@ -4239,6 +4239,7 @@ bool MatchRule::is_vector() const {
    "FmaVD","FmaVF","PopCountVI", "PopCountVL", "VectorLongToMask",
    // Next are vector mask ops.
    "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast",
+    "RoundVF", "RoundVD",
    // Next are not supported currently.
    "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
    "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -134,6 +134,7 @@ class methodHandle;
  do_name(log_name,"log")       do_name(log10_name,"log10")     do_name(pow_name,"pow")                                 \
  do_name(exp_name,"exp")       do_name(min_name,"min")         do_name(max_name,"max")                                 \
  do_name(floor_name, "floor")  do_name(ceil_name, "ceil")      do_name(rint_name, "rint")                              \
+  do_name(round_name, "round")                                                                                          \
                                                                                                                        \
  do_name(addExact_name,"addExact")                                                                                     \
  do_name(decrementExact_name,"decrementExact")                                                                         \
@ -185,6 +186,8 @@ class methodHandle;
  do_intrinsic(_minF,                     java_lang_Math,         min_name,           float2_float_signature,    F_S)   \
  do_intrinsic(_maxD,                     java_lang_Math,         max_name,           double2_double_signature,  F_S)   \
  do_intrinsic(_minD,                     java_lang_Math,         min_name,           double2_double_signature,  F_S)   \
+  do_intrinsic(_roundD,                   java_lang_Math,         round_name,         double_long_signature,     F_S)   \
+  do_intrinsic(_roundF,                   java_lang_Math,         round_name,         float_int_signature,       F_S)   \
  do_intrinsic(_dcopySign,                java_lang_Math,         copySign_name,      double2_double_signature,  F_S)   \
  do_intrinsic(_fcopySign,                java_lang_Math,         copySign_name,      float2_float_signature,    F_S)   \
  do_intrinsic(_dsignum,                  java_lang_Math,         signum_name,        double_double_signature,   F_S)   \
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -523,6 +523,8 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
  case vmIntrinsics::_dlog:
  case vmIntrinsics::_dlog10:
  case vmIntrinsics::_dpow:
+  case vmIntrinsics::_roundD:
+  case vmIntrinsics::_roundF:
  case vmIntrinsics::_min:
  case vmIntrinsics::_max:
  case vmIntrinsics::_min_strict:
--- a/src/hotspot/share/opto/classes.hpp
+++ b/src/hotspot/share/opto/classes.hpp
@ -311,6 +311,8 @@ macro(SignumD)
 macro(SignumF)
 macro(SqrtD)
 macro(SqrtF)
+macro(RoundF)
+macro(RoundD)
 macro(Start)
 macro(StartOSR)
 macro(StoreB)
@ -446,6 +448,8 @@ macro(ReplicateI)
 macro(ReplicateL)
 macro(ReplicateF)
 macro(ReplicateD)
+macro(RoundVF)
+macro(RoundVD)
 macro(Extract)
 macro(ExtractB)
 macro(ExtractUB)
--- a/src/hotspot/share/opto/convertnode.hpp
+++ b/src/hotspot/share/opto/convertnode.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -81,6 +81,14 @@ class ConvD2LNode : public Node {
  virtual uint ideal_reg() const { return Op_RegL; }
 };

+class RoundDNode : public Node {
+  public:
+  RoundDNode( Node *dbl ) : Node(0,dbl) {}
+  virtual int Opcode() const;
+  virtual const Type *bottom_type() const { return TypeLong::LONG; }
+  virtual uint ideal_reg() const { return Op_RegL; }
+};
+
 //------------------------------ConvF2DNode------------------------------------
 // Convert Float to a Double.
 class ConvF2DNode : public Node {
@ -105,6 +113,7 @@ class ConvF2INode : public Node {
  virtual uint  ideal_reg() const { return Op_RegI; }
 };

+
 //------------------------------ConvF2LNode------------------------------------
 // Convert float to long
 class ConvF2LNode : public Node {
@ -141,6 +150,14 @@ class ConvI2FNode : public Node {
  virtual uint  ideal_reg() const { return Op_RegF; }
 };

+class RoundFNode : public Node {
+  public:
+  RoundFNode( Node *in1 ) : Node(0,in1) {}
+  virtual int Opcode() const;
+  virtual const Type *bottom_type() const { return TypeInt::INT; }
+  virtual uint  ideal_reg() const { return Op_RegI; }
+};
+
 //------------------------------ConvI2LNode------------------------------------
 // Convert integer to long
 class ConvI2LNode : public TypeNode {
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -269,6 +269,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
  case vmIntrinsics::_dcopySign:
  case vmIntrinsics::_fcopySign:
  case vmIntrinsics::_dsignum:
+  case vmIntrinsics::_roundF:
+  case vmIntrinsics::_roundD:
  case vmIntrinsics::_fsignum:                  return inline_math_native(intrinsic_id());

  case vmIntrinsics::_notify:
@ -1605,6 +1607,7 @@ Node* LibraryCallKit::round_double_node(Node* n) {
 // public static double Math.sqrt(double)
 // public static double Math.log(double)
 // public static double Math.log10(double)
+// public static long Math.round(double)
 bool LibraryCallKit::inline_double_math(vmIntrinsics::ID id) {
  Node* arg = round_double_node(argument(0));
  Node* n = NULL;
@ -1616,6 +1619,7 @@ bool LibraryCallKit::inline_double_math(vmIntrinsics::ID id) {
  case vmIntrinsics::_ceil:   n = RoundDoubleModeNode::make(_gvn, arg, RoundDoubleModeNode::rmode_ceil); break;
  case vmIntrinsics::_floor:  n = RoundDoubleModeNode::make(_gvn, arg, RoundDoubleModeNode::rmode_floor); break;
  case vmIntrinsics::_rint:   n = RoundDoubleModeNode::make(_gvn, arg, RoundDoubleModeNode::rmode_rint); break;
+  case vmIntrinsics::_roundD: n = new RoundDNode(arg); break;
  case vmIntrinsics::_dcopySign: n = CopySignDNode::make(_gvn, arg, round_double_node(argument(2))); break;
  case vmIntrinsics::_dsignum: n = SignumDNode::make(_gvn, arg); break;
  default:  fatal_unexpected_iid(id);  break;
@ -1637,6 +1641,7 @@ bool LibraryCallKit::inline_math(vmIntrinsics::ID id) {
  case vmIntrinsics::_labs:   n = new AbsLNode(                arg);  break;
  case vmIntrinsics::_fcopySign: n = new CopySignFNode(arg, argument(1)); break;
  case vmIntrinsics::_fsignum: n = SignumFNode::make(_gvn, arg); break;
+  case vmIntrinsics::_roundF: n = new RoundFNode(arg); break;
  default:  fatal_unexpected_iid(id);  break;
  }
  set_result(_gvn.transform(n));
@ -1752,9 +1757,11 @@ bool LibraryCallKit::inline_math_native(vmIntrinsics::ID id) {
      runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dlog10), "LOG10");

    // These intrinsics are supported on all hardware
+  case vmIntrinsics::_roundD: return Matcher::match_rule_supported(Op_RoundD) ? inline_double_math(id) : false;
  case vmIntrinsics::_ceil:
  case vmIntrinsics::_floor:
  case vmIntrinsics::_rint:   return Matcher::match_rule_supported(Op_RoundDoubleMode) ? inline_double_math(id) : false;
+
  case vmIntrinsics::_dsqrt:
  case vmIntrinsics::_dsqrt_strict:
                              return Matcher::match_rule_supported(Op_SqrtD) ? inline_double_math(id) : false;
@ -1774,6 +1781,7 @@ bool LibraryCallKit::inline_math_native(vmIntrinsics::ID id) {
  case vmIntrinsics::_fcopySign: return inline_math(id);
  case vmIntrinsics::_dsignum: return Matcher::match_rule_supported(Op_SignumD) ? inline_double_math(id) : false;
  case vmIntrinsics::_fsignum: return Matcher::match_rule_supported(Op_SignumF) ? inline_math(id) : false;
+  case vmIntrinsics::_roundF: return Matcher::match_rule_supported(Op_RoundF) ? inline_math(id) : false;

   // These intrinsics are not yet correctly implemented
  case vmIntrinsics::_datan2:
--- a/src/hotspot/share/opto/loopTransform.cpp
+++ b/src/hotspot/share/opto/loopTransform.cpp
@ -970,6 +970,10 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
      case Op_ModL: body_size += 30; break;
      case Op_DivL: body_size += 30; break;
      case Op_MulL: body_size += 10; break;
+      case Op_RoundF: body_size += 30; break;
+      case Op_RoundD: body_size += 30; break;
+      case Op_RoundVF: body_size += 30; break;
+      case Op_RoundVD: body_size += 30; break;
      case Op_PopCountVI:
      case Op_PopCountVL: {
        const TypeVect* vt = n->bottom_type()->is_vect();
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@ -2563,6 +2563,7 @@ bool SuperWord::output() {
                 opc == Op_AbsF || opc == Op_AbsD ||
                 opc == Op_AbsI || opc == Op_AbsL ||
                 opc == Op_NegF || opc == Op_NegD ||
+                 opc == Op_RoundF || opc == Op_RoundD ||
                 opc == Op_PopCountI || opc == Op_PopCountL) {
        assert(n->req() == 2, "only one input expected");
        Node* in = vector_opd(p, 1);
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@ -157,6 +157,10 @@ int VectorNode::opcode(int sopc, BasicType bt) {
    return (bt == T_FLOAT ? Op_SqrtVF : 0);
  case Op_SqrtD:
    return (bt == T_DOUBLE ? Op_SqrtVD : 0);
+  case Op_RoundF:
+    return (bt == T_INT ? Op_RoundVF : 0);
+  case Op_RoundD:
+    return (bt == T_LONG ? Op_RoundVD : 0);
  case Op_PopCountI:
    // Unimplemented for subword types since bit count changes
    // depending on size of lane (and sign bit).
@ -585,6 +589,9 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, const TypeVect* vt, b
  case Op_SqrtVF: return new SqrtVFNode(n1, vt);
  case Op_SqrtVD: return new SqrtVDNode(n1, vt);

+  case Op_RoundVF: return new RoundVFNode(n1, vt);
+  case Op_RoundVD: return new RoundVDNode(n1, vt);
+
  case Op_PopCountVI: return new PopCountVINode(n1, vt);
  case Op_PopCountVL: return new PopCountVLNode(n1, vt);
  case Op_RotateLeftV: return new RotateLeftVNode(n1, n2, vt);
--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@ -1544,6 +1544,14 @@ class VectorCastD2XNode : public VectorCastNode {
  virtual int Opcode() const;
 };

+class RoundVFNode : public VectorNode {
+ public:
+  RoundVFNode(Node* in, const TypeVect* vt) :VectorNode(in, vt) {
+    assert(in->bottom_type()->is_vect()->element_basic_type() == T_FLOAT, "must be float");
+  }
+  virtual int Opcode() const;
+};
+
 class VectorUCastB2XNode : public VectorCastNode {
 public:
  VectorUCastB2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
@ -1552,6 +1560,14 @@ class VectorUCastB2XNode : public VectorCastNode {
  virtual int Opcode() const;
 };

+class RoundVDNode : public VectorNode {
+ public:
+  RoundVDNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {
+    assert(in->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE, "must be double");
+  }
+  virtual int Opcode() const;
+};
+
 class VectorUCastS2XNode : public VectorCastNode {
 public:
  VectorUCastS2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
--- a/src/java.base/share/classes/java/lang/Math.java
+++ b/src/java.base/share/classes/java/lang/Math.java
@ -753,6 +753,7 @@ public final class Math {
     * @see     java.lang.Integer#MAX_VALUE
     * @see     java.lang.Integer#MIN_VALUE
     */
+    @IntrinsicCandidate
    public static int round(float a) {
        int intBits = Float.floatToRawIntBits(a);
        int biasedExp = (intBits & FloatConsts.EXP_BIT_MASK)
@ -802,6 +803,7 @@ public final class Math {
     * @see     java.lang.Long#MAX_VALUE
     * @see     java.lang.Long#MIN_VALUE
     */
+    @IntrinsicCandidate
    public static long round(double a) {
        long longBits = Double.doubleToRawLongBits(a);
        long biasedExp = (longBits & DoubleConsts.EXP_BIT_MASK)
--- a/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java
+++ b/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -52,6 +52,8 @@ public class TestDoubleVect {

  static int test() {
    double[] a0 = new double[ARRLEN];
+    long  [] l0 = new long[ARRLEN];
+
    double[] a1 = new double[ARRLEN];
    double[] a2 = new double[ARRLEN];
    double[] a3 = new double[ARRLEN];
@ -91,6 +93,7 @@ public class TestDoubleVect {
      test_ceil(a0, a1);
      test_floor(a0, a1);
      test_sqrt(a0, a1);
+      test_round(l0, a1);
    }
    // Test and verify results
    System.out.println("Verification");
@ -355,6 +358,7 @@ public class TestDoubleVect {
        errn += verify("test_negc: ", i, a0[i], (double)(-((double)(ADD_INIT+i))));
      }

+
      // To test -ve and +ve Zero scenarios.
      double [] other_corner_cases     = { -0.0, 0.0, 9.007199254740992E15 };
      double [] other_corner_cases_res = new double[3];
@ -421,6 +425,35 @@ public class TestDoubleVect {
      for (int i=8; i<ARRLEN; i++) {
        errn += verify("test_sqrt: ", i, a0[i], Math.sqrt((double)(ADD_INIT+i)));
      }
+
+      a1[6] = +0x1.fffffffffffffp-2;
+      a1[7] = +0x1.0p-1;
+      a1[8] = +0x1.0000000000001p-1;
+      a1[9] = -0x1.fffffffffffffp-2;
+      a1[10] = -0x1.0p-1;
+      a1[11] = -0x1.0000000000001p-1;
+      a1[12] = 1.7976931348623157E19;
+      a1[13] = -1.7976931348623157E19;
+
+      test_round(l0, a1);
+      errn += verify("test_round: ", 0, l0[0], 0L);
+      errn += verify("test_round: ", 1, l0[1], Long.MAX_VALUE);
+      errn += verify("test_round: ", 2, l0[2], Long.MIN_VALUE);
+      errn += verify("test_round: ", 3, l0[3], Long.MAX_VALUE);
+      errn += verify("test_round: ", 4, l0[4], 0L);
+      errn += verify("test_round: ", 5, l0[5], 0L);
+
+      errn += verify("test_round: ", 6, l0[6], 0L);
+      errn += verify("test_round: ", 7, l0[7], 1L);
+      errn += verify("test_round: ", 8, l0[8], 1L);
+      errn += verify("test_round: ", 9, l0[9], 0L);
+      errn += verify("test_round: ", 10, l0[10], 0L);
+      errn += verify("test_round: ", 11, l0[11], -1L);
+      errn += verify("test_round: ", 12, l0[12], Long.MAX_VALUE);
+      errn += verify("test_round: ", 13, l0[13], Long.MIN_VALUE);
+      for (int i=14; i<ARRLEN; i++) {
+        errn += verify("test_round: ", i, l0[i], Math.round((double)(ADD_INIT+i)));
+      }
    }

    if (errn > 0)
@ -564,6 +597,12 @@ public class TestDoubleVect {
    end = System.currentTimeMillis();
    System.out.println("test_sqrt_n: " + (end - start));

+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_round(l0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_round_n: " + (end - start));
    return errn;
  }

@ -691,6 +730,20 @@ public class TestDoubleVect {
    }
  }

+  static void test_round(long[] a0, double[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = Math.round(a1[i]);
+    }
+  }
+
+  static int verify(String text, int i, long elem, long val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+
  static int verify(String text, int i, double elem, double val) {
    if (elem != val && !(Double.isNaN(elem) && Double.isNaN(val))) {
      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
--- a/test/hotspot/jtreg/compiler/c2/cr6340864/TestFloatVect.java
+++ b/test/hotspot/jtreg/compiler/c2/cr6340864/TestFloatVect.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -52,6 +52,7 @@ public class TestFloatVect {

  static int test() {
    float[] a0 = new float[ARRLEN];
+    int[] i0 = new int[ARRLEN];
    float[] a1 = new float[ARRLEN];
    float[] a2 = new float[ARRLEN];
    float[] a3 = new float[ARRLEN];
@ -88,7 +89,9 @@ public class TestFloatVect {
      test_diva(a0, a1, a3);
      test_negc(a0, a1);
      test_sqrt(a0, a1);
+      test_round(i0, a1);
    }
+
    // Test and verify results
    System.out.println("Verification");
    int errn = 0;
@ -369,6 +372,35 @@ public class TestFloatVect {
        errn += verify("test_sqrt: ", i, a0[i], (float)(Math.sqrt((double)(ADD_INIT+i))));
      }

+      a1[6] = +0x1.fffffep-2f;
+      a1[7] = +0x1.0p-1f;
+      a1[8] = +0x1.000002p-1f;
+      a1[9] = -0x1.fffffep-2f;
+      a1[10] = -0x1.0p-1f;
+      a1[11] = -0x1.000002p-1f;
+      a1[12] = 3.4028235E10f;
+      a1[13] = -3.4028235E10f;
+
+      test_round(i0, a1);
+      errn += verify("test_round: ", 0, i0[0], 0);
+      errn += verify("test_round: ", 1, i0[1], Integer.MAX_VALUE);
+      errn += verify("test_round: ", 2, i0[2], Integer.MIN_VALUE);
+      errn += verify("test_round: ", 3, i0[3], Integer.MAX_VALUE);
+      errn += verify("test_round: ", 4, i0[4], 0);
+      errn += verify("test_round: ", 5, i0[5], 0);
+      errn += verify("test_round: ", 6, i0[6], 0);
+      errn += verify("test_round: ", 7, i0[7], 1);
+      errn += verify("test_round: ", 8, i0[8], 1);
+      errn += verify("test_round: ", 9, i0[9], 0);
+      errn += verify("test_round: ", 10, i0[10], 0);
+      errn += verify("test_round: ", 11, i0[11], -1);
+      errn += verify("test_round: ", 12, i0[12], Integer.MAX_VALUE);
+      errn += verify("test_round: ", 13, i0[13], Integer.MIN_VALUE);
+
+      for (int i=14; i<ARRLEN; i++) {
+        errn += verify("test_round: ", i, i0[i], Math.round(((float)(ADD_INIT+i))));
+      }
+
    }

    if (errn > 0)
@ -512,6 +544,12 @@ public class TestFloatVect {
    end = System.currentTimeMillis();
    System.out.println("test_sqrt_n: " + (end - start));

+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_round(i0, a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_round_n: " + (end - start));
    return errn;
  }

@ -609,6 +647,20 @@ public class TestFloatVect {
    }
  }

+  static void test_round(int[] a0, float[] a1) {
+    for (int i = 0; i < a0.length; i+=1) {
+      a0[i] = Math.round(a1[i]);
+    }
+  }
+
+  static int verify(String text, int i, int elem, int val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+
  static int verify(String text, int i, float elem, float val) {
    if (elem != val && !(Float.isNaN(elem) && Float.isNaN(val))) {
      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
--- a/test/hotspot/jtreg/compiler/vectorization/TestRoundVectDouble.java
+++ b/test/hotspot/jtreg/compiler/vectorization/TestRoundVectDouble.java
@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8279508
+ * @summary Auto-vectorize Math.round API
+ * @requires vm.compiler2.enabled
+ * @requires vm.cpu.features ~= ".*avx512dq.*"
+ * @requires os.simpleArch == "x64"
+ * @library /test/lib /
+ * @run driver compiler.vectorization.TestRoundVectDouble
+ */
+
+package compiler.vectorization;
+
+import compiler.lib.ir_framework.*;
+
+public class TestRoundVectDouble {
+  private static final int ARRLEN = 1024;
+  private static final int ITERS  = 11000;
+
+  private static double [] dinp;
+  private static long   [] lout;
+
+  public static void main(String args[]) {
+      TestFramework.runWithFlags("-XX:-TieredCompilation",
+                                  "-XX:UseAVX=3",
+                                  "-XX:CompileThresholdScaling=0.3");
+      System.out.println("PASSED");
+  }
+
+  @Test
+  @IR(applyIf = {"UseAVX", "3"}, counts = {"RoundVD" , " > 0 "})
+  public void test_round_double(long[] lout, double[] dinp) {
+      for (int i = 0; i < lout.length; i+=1) {
+          lout[i] = Math.round(dinp[i]);
+      }
+  }
+
+  @Run(test = {"test_round_double"}, mode = RunMode.STANDALONE)
+  public void kernel_test_round_double() {
+      dinp = new double[ARRLEN];
+      lout = new long[ARRLEN];
+      for(int i = 0 ; i < ARRLEN; i++) {
+          dinp[i] = (double)i*1.4;
+      }
+      for (int i = 0; i < ITERS; i++) {
+          test_round_double(lout , dinp);
+      }
+  }
+}
--- a/test/hotspot/jtreg/compiler/vectorization/TestRoundVectFloat.java
+++ b/test/hotspot/jtreg/compiler/vectorization/TestRoundVectFloat.java
@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8279508
+ * @summary Auto-vectorize Math.round API
+ * @requires vm.compiler2.enabled
+ * @requires vm.cpu.features ~= ".*avx.*"
+ * @requires os.simpleArch == "x64"
+ * @library /test/lib /
+ * @run driver compiler.vectorization.TestRoundVectFloat
+ */
+
+package compiler.vectorization;
+
+import compiler.lib.ir_framework.*;
+
+public class TestRoundVectFloat {
+  private static final int ARRLEN = 1024;
+  private static final int ITERS  = 11000;
+  private static float  [] finp;
+  private static int    [] iout;
+
+  public static void main(String args[]) {
+      TestFramework.runWithFlags("-XX:-TieredCompilation",
+                                 "-XX:UseAVX=1",
+                                 "-XX:CompileThresholdScaling=0.3");
+      System.out.println("PASSED");
+  }
+
+  @Test
+  @IR(applyIf = {"UseAVX", " > 1"}, counts = {"RoundVF" , " > 0 "})
+  public void test_round_float(int[] iout, float[] finp) {
+      for (int i = 0; i < finp.length; i+=1) {
+          iout[i] = Math.round(finp[i]);
+      }
+  }
+
+  @Run(test = {"test_round_float"}, mode = RunMode.STANDALONE)
+  public void kernel_test_round() {
+      finp = new float[ARRLEN];
+      iout = new int[ARRLEN];
+      for(int i = 0 ; i < ARRLEN; i++) {
+          finp[i] = (float)i*1.4f;
+      }
+      for (int i = 0; i < ITERS; i++) {
+          test_round_float(iout , finp);
+      }
+  }
+}
--- a/test/jdk/java/lang/Math/RoundTests.java
+++ b/test/jdk/java/lang/Math/RoundTests.java
@ -25,11 +25,12 @@
 * @test
 * @bug 6430675 8010430
 * @summary Check for correct implementation of {Math, StrictMath}.round
+ * @run main/othervm -XX:Tier3CompileThreshold=50 -XX:CompileThresholdScaling=0.01 -XX:+TieredCompilation RoundTests
 */
 public class RoundTests {
    public static void main(String... args) {
        int failures = 0;
-
+        for (int i = 0; i < 500; i++) {
            failures += testNearFloatHalfCases();
            failures += testNearDoubleHalfCases();
            failures += testUnityULPCases();
@ -41,6 +42,7 @@ public class RoundTests {
                throw new RuntimeException();
            }
        }
+    }

    private static int testNearDoubleHalfCases() {
        int failures = 0;
--- a/test/micro/org/openjdk/bench/java/math/FpRoundingBenchmark.java
+++ b/test/micro/org/openjdk/bench/java/math/FpRoundingBenchmark.java
@ -1,5 +1,5 @@
 //
-// Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@ -26,53 +26,97 @@ package org.openjdk.bench.java.math;
 import java.util.Random;
 import java.util.concurrent.TimeUnit;
 import org.openjdk.jmh.annotations.*;
-import org.openjdk.jmh.infra.Blackhole;

@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
 public class FpRoundingBenchmark {

-  @Param({"1024"})
+  @Param({"1024", "2048"})
  public int TESTSIZE;

  public double[] DargV1;
-
-  public double[] Res;
+  public double[] ResD;
+  public long[] ResL;
+  public float[] FargV1;
+  public float[] ResF;
+  public int[] ResI;

  public final double[] DspecialVals = {
-      0.0, -0.0, Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY};
+      0.0, -0.0, Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY,
+      Double.MAX_VALUE, -Double.MAX_VALUE, Double.MIN_VALUE, -Double.MIN_VALUE,
+      Double.MIN_NORMAL
+  };
+
+  public final float[] FspecialVals = {
+      0.0f, -0.0f, Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY,
+      Float.MAX_VALUE, -Float.MAX_VALUE, Float.MIN_VALUE, -Float.MIN_VALUE,
+      Float.MIN_NORMAL
+  };

  @Setup(Level.Trial)
  public void BmSetup() {
      int i = 0;
      Random r = new Random(1024);
+
      DargV1 = new double[TESTSIZE];
-    Res = new double[TESTSIZE];
+      ResD = new double[TESTSIZE];

      for (; i < DspecialVals.length; i++) {
          DargV1[i] = DspecialVals[i];
      }

      for (; i < TESTSIZE; i++) {
-      DargV1[i] = r.nextDouble()*TESTSIZE;
+          DargV1[i] = Double.longBitsToDouble(r.nextLong());
+      }
+
+      FargV1 = new float[TESTSIZE];
+      ResF = new float[TESTSIZE];
+
+      i = 0;
+      for (; i < FspecialVals.length; i++) {
+          FargV1[i] = FspecialVals[i];
+      }
+
+      for (; i < TESTSIZE; i++) {
+          FargV1[i] = Float.intBitsToFloat(r.nextInt());
+      }
+
+      ResI = new int[TESTSIZE];
+      ResL = new long[TESTSIZE];
+  }
+
+  @Benchmark
+  public void test_ceil() {
+      for (int i = 0; i < TESTSIZE; i++) {
+          ResD[i] = Math.ceil(DargV1[i]);
      }
  }

  @Benchmark
-  public void testceil(Blackhole bh) {
-    for (int i = 0; i < TESTSIZE; i++)
-      Res[i] = Math.ceil(DargV1[i]);
+  public void test_floor() {
+      for (int i = 0; i < TESTSIZE; i++) {
+          ResD[i] = Math.floor(DargV1[i]);
+      }
  }

  @Benchmark
-  public void testfloor(Blackhole bh) {
-    for (int i = 0; i < TESTSIZE; i++)
-      Res[i] = Math.floor(DargV1[i]);
+  public void test_rint() {
+      for (int i = 0; i < TESTSIZE; i++) {
+          ResD[i] = Math.rint(DargV1[i]);
+      }
  }

  @Benchmark
-  public void testrint(Blackhole bh) {
-    for (int i = 0; i < TESTSIZE; i++)
-      Res[i] = Math.rint(DargV1[i]);
+  public void test_round_double() {
+      for (int i = 0; i < TESTSIZE; i++) {
+          ResL[i] = Math.round(DargV1[i]);
+      }
+  }
+
+  @Benchmark
+  public void test_round_float() {
+      for (int i = 0; i < TESTSIZE; i++) {
+          ResI[i] = Math.round(FargV1[i]);
+      }
  }
 }