diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp
index 89e093f6d12..d1831aac96c 100644
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@@ -5008,6 +5008,40 @@ assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
   emit_int16(0x04, (0xC0 | encode));
 }
 
+void Assembler::evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
+  evpmadd52luq(dst, k0, src1, src2, false, vector_len);
+}
+
+void Assembler::evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
+  assert(VM_Version::supports_avx512ifma(), "requires AVX512IFMA");
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  attributes.set_embedded_opmask_register_specifier(mask);
+  if (merge) {
+    attributes.reset_is_clear_context();
+  }
+
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int16((unsigned char)0xB4, (0xC0 | encode));
+}
+
+void Assembler::evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
+  evpmadd52huq(dst, k0, src1, src2, false, vector_len);
+}
+
+void Assembler::evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
+  assert(VM_Version::supports_avx512ifma(), "requires AVX512IFMA");
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  attributes.set_embedded_opmask_register_specifier(mask);
+  if (merge) {
+    attributes.reset_is_clear_context();
+  }
+
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int16((unsigned char)0xB5, (0xC0 | encode));
+}
+
 void Assembler::evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   assert(VM_Version::supports_avx512_vnni(), "must support vnni");
@@ -5425,6 +5459,42 @@ void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
   emit_int16(0x6C, (0xC0 | encode));
 }
 
+void Assembler::evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
+  evpunpcklqdq(dst, k0, src1, src2, false, vector_len);
+}
+
+void Assembler::evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  attributes.set_embedded_opmask_register_specifier(mask);
+  if (merge) {
+    attributes.reset_is_clear_context();
+  }
+
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int16(0x6C, (0xC0 | encode));
+}
+
+void Assembler::evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
+  evpunpckhqdq(dst, k0, src1, src2, false, vector_len);
+}
+
+void Assembler::evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  attributes.set_embedded_opmask_register_specifier(mask);
+  if (merge) {
+    attributes.reset_is_clear_context();
+  }
+
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int16(0x6D, (0xC0 | encode));
+}
+
 void Assembler::push(int32_t imm32) {
   // in 64bits we push 64bits onto the stack but only
   // take a 32bit immediate
@@ -5869,6 +5939,18 @@ void Assembler::shrdl(Register dst, Register src, int8_t imm8) {
   emit_int32(0x0F, (unsigned char)0xAC, (0xC0 | encode), imm8);
 }
 
+#ifdef _LP64
+void Assembler::shldq(Register dst, Register src, int8_t imm8) {
+  int encode = prefixq_and_encode(src->encoding(), dst->encoding());
+  emit_int32(0x0F, (unsigned char)0xA4, (0xC0 | encode), imm8);
+}
+
+void Assembler::shrdq(Register dst, Register src, int8_t imm8) {
+  int encode = prefixq_and_encode(src->encoding(), dst->encoding());
+  emit_int32(0x0F, (unsigned char)0xAC, (0xC0 | encode), imm8);
+}
+#endif
+
 // copies a single word from [esi] to [edi]
 void Assembler::smovl() {
   emit_int8((unsigned char)0xA5);
@@ -7740,11 +7822,12 @@ void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_
   emit_operand(dst, src, 0);
 }
 
-void Assembler::vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
-  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
-  emit_int16((unsigned char)0xDB, (0xC0 | encode));
+void Assembler::evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  evpandq(dst, k0, nds, src, false, vector_len);
+}
+
+void Assembler::evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  evpandq(dst, k0, nds, src, false, vector_len);
 }
 
 //Variable Shift packed integers logically left.
@@ -7857,13 +7940,13 @@ void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_l
   emit_operand(dst, src, 0);
 }
 
-void Assembler::vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
-  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
-  emit_int16((unsigned char)0xEB, (0xC0 | encode));
+void Assembler::evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  evporq(dst, k0, nds, src, false, vector_len);
 }
 
+void Assembler::evporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  evporq(dst, k0, nds, src, false, vector_len);
+}
 
 void Assembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
   assert(VM_Version::supports_evex(), "");
@@ -8004,7 +8087,8 @@ void Assembler::evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, Addres
 }
 
 void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -8016,7 +8100,8 @@ void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMReg
 }
 
 void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
-  assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit);
@@ -8031,7 +8116,8 @@ void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Addres
 }
 
 void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -8043,7 +8129,8 @@ void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegi
 }
 
 void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
-  assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit);
@@ -8201,8 +8288,8 @@ void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address
 }
 
 void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len) {
-  assert(VM_Version::supports_evex(), "requires EVEX support");
-  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src3->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -8211,6 +8298,20 @@ void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegis
   emit_int8(imm8);
 }
 
+void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len) {
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
+  assert(dst != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  vex_prefix(src3, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x25);
+  emit_operand(dst, src3, 1);
+  emit_int8(imm8);
+}
+
 void Assembler::evexpandps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
@@ -13452,6 +13553,13 @@ void Assembler::vzeroupper() {
   emit_copy(code_section(), vzup_code, vzup_len);
 }
 
+void Assembler::vzeroall() {
+  assert(VM_Version::supports_avx(), "requires AVX");
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  (void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x77);
+}
+
 void Assembler::pushq(Address src) {
   InstructionMark im(this);
   emit_int16(get_prefixq(src), (unsigned char)0xFF);
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp
index c7316ae01fc..04dbb7907be 100644
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1891,6 +1891,10 @@ private:
   void pmaddwd(XMMRegister dst, XMMRegister src);
   void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
+  void evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
+  void evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
+  void evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
+  void evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
 
   // Multiply add accumulate
   void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -1990,6 +1994,11 @@ private:
   // Interleave Low Quadwords
   void punpcklqdq(XMMRegister dst, XMMRegister src);
 
+  void evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
+  void evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
+  void evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
+  void evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
+
   // Vector sum of absolute difference.
   void vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
@@ -2092,6 +2101,10 @@ private:
   void shldl(Register dst, Register src, int8_t imm8);
   void shrdl(Register dst, Register src);
   void shrdl(Register dst, Register src, int8_t imm8);
+#ifdef _LP64
+  void shldq(Register dst, Register src, int8_t imm8);
+  void shrdq(Register dst, Register src, int8_t imm8);
+#endif
 
   void shll(Register dst, int imm8);
   void shll(Register dst);
@@ -2616,7 +2629,8 @@ private:
   void pand(XMMRegister dst, XMMRegister src);
   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
-  void vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Andn packed integers
   void pandn(XMMRegister dst, XMMRegister src);
@@ -2626,7 +2640,8 @@ private:
   void por(XMMRegister dst, XMMRegister src);
   void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
-  void vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evporq(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
 
   // Xor packed integers
   void pxor(XMMRegister dst, XMMRegister src);
@@ -2640,6 +2655,7 @@ private:
   void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
   void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address     src3, int vector_len);
   void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
+  void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address     src3, int vector_len);
 
   // Vector compress/expand instructions.
   void evpcompressb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
@@ -2753,6 +2769,8 @@ private:
   // runtime code and native libraries.
   void vzeroupper();
 
+  void vzeroall();
+
   // Vector double compares
   void vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
   void evcmppd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
index ec03a1e4844..26c19ee6f1d 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -5255,7 +5255,7 @@ void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMReg
     // Get the reverse bit sequence of lower nibble of each byte.
     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
-    vpandq(dst, xtmp2, src, vec_enc);
+    evpandq(dst, xtmp2, src, vec_enc);
     vpshufb(dst, xtmp1, dst, vec_enc);
     vpsllq(dst, dst, 4, vec_enc);
 
@@ -5266,7 +5266,7 @@ void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMReg
 
     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
-    vporq(xtmp2, dst, xtmp2, vec_enc);
+    evporq(xtmp2, dst, xtmp2, vec_enc);
     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
 
   } else if(vec_enc == Assembler::AVX_512bit) {
@@ -5321,11 +5321,11 @@ void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, X
 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
-  vpandq(dst, xtmp1, src, vec_enc);
+  evpandq(dst, xtmp1, src, vec_enc);
   vpsllq(dst, dst, nbits, vec_enc);
   vpandn(xtmp1, xtmp1, src, vec_enc);
   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
-  vporq(dst, dst, xtmp1, vec_enc);
+  evporq(dst, dst, xtmp1, vec_enc);
 }
 
 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
index f013993b964..b55cc5fc9f6 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@@ -1217,6 +1217,19 @@ void MacroAssembler::andptr(Register dst, int32_t imm32) {
   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
 }
 
+#ifdef _LP64
+void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
+  assert(rscratch != noreg || always_reachable(src), "missing");
+
+  if (reachable(src)) {
+    andq(dst, as_Address(src));
+  } else {
+    lea(rscratch, src);
+    andq(dst, Address(rscratch, 0));
+  }
+}
+#endif
+
 void MacroAssembler::atomic_incl(Address counter_addr) {
   lock();
   incrementl(counter_addr);
@@ -9105,6 +9118,40 @@ void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMM
       fatal("Unexpected type argument %s", type2name(type)); break;
   }
 }
+
+void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
+  assert(rscratch != noreg || always_reachable(src), "missing");
+
+  if (reachable(src)) {
+    evpandq(dst, nds, as_Address(src), vector_len);
+  } else {
+    lea(rscratch, src);
+    evpandq(dst, nds, Address(rscratch, 0), vector_len);
+  }
+}
+
+void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
+  assert(rscratch != noreg || always_reachable(src), "missing");
+
+  if (reachable(src)) {
+    evporq(dst, nds, as_Address(src), vector_len);
+  } else {
+    lea(rscratch, src);
+    evporq(dst, nds, Address(rscratch, 0), vector_len);
+  }
+}
+
+void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
+  assert(rscratch != noreg || always_reachable(src3), "missing");
+
+  if (reachable(src3)) {
+    vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
+  } else {
+    lea(rscratch, src3);
+    vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
+  }
+}
+
 #if COMPILER2_OR_JVMCI
 
 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
index 3318c5669b4..5a0a3d8c9a1 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -730,6 +730,11 @@ public:
   void andptr(Register dst, int32_t src);
   void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; }
 
+#ifdef _LP64
+  using Assembler::andq;
+  void andq(Register dst, AddressLiteral src, Register rscratch = noreg);
+#endif
+
   void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg);
 
   // renamed to drag out the casting of address to int32_t/intptr_t
@@ -1754,6 +1759,15 @@ public:
   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
 
+  using Assembler::evpandq;
+  void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
+
+  using Assembler::evporq;
+  void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
+
+  using Assembler::vpternlogq;
+  void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg);
+
   void alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch);
   void anytrue(Register dst, uint masklen, KRegister src, KRegister kscratch);
 
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 77be6f9e871..c3e0b79dc46 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -2519,7 +2519,7 @@ address StubGenerator::generate_base64_decodeBlock() {
     // Decode all bytes within our merged input
     __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
     __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
-    __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
+    __ evporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
 
     // Check for error.  Compare (decoded | initial) to all invalid.
     // If any bytes have their high-order bit set, then we have an error.
@@ -3709,6 +3709,10 @@ void StubGenerator::generate_initial() {
     StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
   }
 
+  if (UsePolyIntrinsics) {
+    StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
+  }
+
   if (UseCRC32CIntrinsics) {
     bool supports_clmul = VM_Version::supports_clmul();
     StubRoutines::x86::generate_CRC32C_table(supports_clmul);
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
index 7d5e25de381..5e97e1e9a44 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
@@ -387,6 +387,24 @@ class StubGenerator: public StubCodeGenerator {
   // Ghash single and multi block operations using AVX instructions
   address generate_avx_ghash_processBlocks();
 
+  // Poly1305 multiblock using IFMA instructions
+  address generate_poly1305_processBlocks();
+  void poly1305_process_blocks_avx512(const Register input, const Register length,
+                                      const Register A0, const Register A1, const Register A2,
+                                      const Register R0, const Register R1, const Register C1);
+  void poly1305_multiply_scalar(const Register a0, const Register a1, const Register a2,
+                                const Register r0, const Register r1, const Register c1, bool only128,
+                                const Register t0, const Register t1, const Register t2,
+                                const Register mulql, const Register mulqh);
+  void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
+                                 const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P,
+                                 const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H,
+                                 const XMMRegister TMP, const Register rscratch);
+  void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, const Register t0, const Register t1);
+  void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs, const Register t0, const Register t1);
+  void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1,
+                             const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG,
+                             const XMMRegister TMP, const Register rscratch);
 
   // BASE64 stubs
 
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp
new file mode 100644
index 00000000000..97f9f6ccc47
--- /dev/null
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp
@@ -0,0 +1,1027 @@
+/*
+ * Copyright (c) 2022, Intel Corporation. All rights reserved.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "macroAssembler_x86.hpp"
+#include "stubGenerator_x86_64.hpp"
+
+#define __ _masm->
+
+// References:
+//  - (Normative) RFC7539 - ChaCha20 and Poly1305 for IETF Protocols
+//  - M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code"
+//  - "The design of Poly1305" https://loup-vaillant.fr/tutorials/poly1305-design
+
+// Explanation for the 'well known' modular arithmetic optimization, reduction by pseudo-Mersenne prime 2^130-5:
+//
+// Reduction by 2^130-5 can be expressed as follows:
+//    ( a×2^130 + b ) mod 2^130-5     // i.e. number split along the 130-bit boundary
+//                                 = ( a×2^130 - 5×a + 5×a + b ) mod 2^130-5
+//                                 = ( a×(2^130 - 5) + 5×a + b ) mod 2^130-5 // i.e. adding multiples of modulus is a noop
+//                                 = ( 5×a + b ) mod 2^130-5
+// QED: this shows mathematically the well-known algorithm of 'split the number down the middle, multiply the upper part by 5 and add it back'
+// This is particularly useful to understand when combining with 'odd-sized' limbs that might cause misalignment
+//
+
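+// A minimal scalar sketch of the same split-and-add reduction, shown on a toy
+// pseudo-Mersenne modulus 2^31-5 so it fits plain C integers (illustrative only;
+// the function name and toy modulus are this comment's, not the stub's):
+//
+//   static uint32_t reduce_toy(uint64_t x) {      // computes x mod 2^31-5
+//     const uint64_t m = (1ull << 31) - 5;
+//     while (x >= (1ull << 31)) {
+//       uint64_t a = x >> 31;                     // upper part, weight 2^31
+//       uint64_t b = x & ((1ull << 31) - 1);      // lower 31 bits
+//       x = 5*a + b;                              // a×2^31 ≡ 5×a (mod 2^31-5)
+//     }
+//     return (uint32_t)(x >= m ? x - m : x);      // final conditional subtract
+//   }
+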
+// Pseudocode for this file (in general):
+//    * used for poly1305_multiply_scalar
+//    × used for poly1305_multiply8_avx512
+//    lower-case variables are scalar numbers in 3×44-bit limbs (in gprs)
+//    upper-case variables are 8-element vector numbers in 3×44-bit limbs (in zmm registers)
+//    [ ] used to denote vector numbers (with their elements)
+
+// Constant Pool:
+ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_PAD_MSG[] = {
+  0x0000010000000000, 0x0000010000000000,
+  0x0000010000000000, 0x0000010000000000,
+  0x0000010000000000, 0x0000010000000000,
+  0x0000010000000000, 0x0000010000000000,
+};
+static address poly1305_pad_msg() {
+  return (address)POLY1305_PAD_MSG;
+}
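+// Note: each qword above is 2^40; relative to the third (42-bit) limb, which holds
+// block bits 88..129, bit 40 is message bit 128, i.e. the implicit padding bit.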
+
+ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_MASK42[] = {
+  0x000003ffffffffff, 0x000003ffffffffff,
+  0x000003ffffffffff, 0x000003ffffffffff,
+  0x000003ffffffffff, 0x000003ffffffffff,
+  0x000003ffffffffff, 0x000003ffffffffff
+};
+static address poly1305_mask42() {
+  return (address)POLY1305_MASK42;
+}
+
+ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_MASK44[] = {
+  0x00000fffffffffff, 0x00000fffffffffff,
+  0x00000fffffffffff, 0x00000fffffffffff,
+  0x00000fffffffffff, 0x00000fffffffffff,
+  0x00000fffffffffff, 0x00000fffffffffff,
+};
+static address poly1305_mask44() {
+  return (address)POLY1305_MASK44;
+}
+
+// Compute product for 8 16-byte message blocks,
+// i.e. for each block, compute [a2 a1 a0] = [a2 a1 a0] × [r2 r1 r0]
+//
+// Each block/number is represented by 3 44-bit limb digits; start with the multiplication:
+//
+//      a2       a1       a0
+// ×    r2       r1       r0
+// ----------------------------------
+//     a2×r0    a1×r0    a0×r0
+// +   a1×r1    a0×r1  5×a2×r1'     (r1' = r1<<2)
+// +   a0×r2  5×a2×r2' 5×a1×r2'     (r2' = r2<<2)
+// ----------------------------------
+//        p2       p1       p0
+//
+// Then, propagate the carry (bits after bit 44) from lower limbs into higher limbs.
+// Then, modular reduction from upper limb wrapped to lower limbs
+//
+// Math Note 1: 'carry propagation' from p2 to p0 involves multiplication by 5 (i.e. slightly modified modular reduction from above):
+//    ( p2×2^88 ) mod 2^130-5
+//                             = ( p2'×2^88 + p2''×2^130) mod 2^130-5 // Split on 130-bit boundary
+//                             = ( p2'×2^88 + p2''×2^130 - 5×p2'' + 5×p2'') mod 2^130-5
+//                             = ( p2'×2^88 + p2''×(2^130 - 5) + 5×p2'') mod 2^130-5 // i.e. adding multiples of modulus is a noop
+//                             = ( p2'×2^88 + 5×p2'') mod 2^130-5
+//
+// Math Note 2: R1P = 4*5*R1 and R2P = 4*5*R2; This precomputation allows simultaneous reduction and multiplication.
+// This is not the standard 'multiply-upper-by-5', here is why the factor is 4*5 instead of 5.
+// For example, partial product (a2×r2):
+//    (a2×2^88)×(r2×2^88) mod 2^130-5
+//                                    = (a2×r2 × 2^176) mod 2^130-5
+//                                    = (a2×r2 × 2^46×2^130) mod 2^130-5
+//                                    = (a2×r2×2^46 × 2^130 - 5×a2×r2×2^46 + 5×a2×r2×2^46) mod 2^130-5
+//                                    = (a2×r2×2^46 × (2^130 - 5) + 5×a2×r2×2^46) mod 2^130-5 // i.e. adding multiples of modulus is a noop
+//                                    = (5×a2×r2×2^46) mod 2^130-5
+//                                    = (a2×5×r2×2^2 × 2^44) mod 2^130-5 // Align to limb boundary
+//                                    = (a2×[5×r2×4] × 2^44) mod 2^130-5
+//                                    = (a2×R2P × 2^44) mod 2^130-5 // i.e. R2P = 4*5*R2
+//
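+// In scalar terms the precomputation above is simply R1P = 20×R1 and R2P = 20×R2,
+// built with two shifts and an add, exactly as the vector code below does
+// (a hedged scalar sketch, not part of the stub):
+//
+//   uint64_t r2p = (r2 << 2) + r2;   // 5×r2
+//   r2p <<= 2;                       // 4×5×r2 == 20×r2
+//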
+void StubGenerator::poly1305_multiply8_avx512(
+  const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
+  const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P,
+  const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H,
+  const XMMRegister TMP, const Register rscratch)
+{
+
+  // Reset partial sums
+  __ evpxorq(P0L, P0L, P0L, Assembler::AVX_512bit);
+  __ evpxorq(P0H, P0H, P0H, Assembler::AVX_512bit);
+  __ evpxorq(P1L, P1L, P1L, Assembler::AVX_512bit);
+  __ evpxorq(P1H, P1H, P1H, Assembler::AVX_512bit);
+  __ evpxorq(P2L, P2L, P2L, Assembler::AVX_512bit);
+  __ evpxorq(P2H, P2H, P2H, Assembler::AVX_512bit);
+
+  // Calculate partial products
+  // p0 = a2×r1'
+  // p1 = a2×r2'
+  // p2 = a2×r0
+  __ evpmadd52luq(P0L, A2, R1P, Assembler::AVX_512bit);
+  __ evpmadd52huq(P0H, A2, R1P, Assembler::AVX_512bit);
+  __ evpmadd52luq(P1L, A2, R2P, Assembler::AVX_512bit);
+  __ evpmadd52huq(P1H, A2, R2P, Assembler::AVX_512bit);
+  __ evpmadd52luq(P2L, A2, R0, Assembler::AVX_512bit);
+  __ evpmadd52huq(P2H, A2, R0, Assembler::AVX_512bit);
+
+  // p0 += a0×r0
+  // p1 += a0×r1
+  // p2 += a0×r2
+  __ evpmadd52luq(P1L, A0, R1, Assembler::AVX_512bit);
+  __ evpmadd52huq(P1H, A0, R1, Assembler::AVX_512bit);
+  __ evpmadd52luq(P2L, A0, R2, Assembler::AVX_512bit);
+  __ evpmadd52huq(P2H, A0, R2, Assembler::AVX_512bit);
+  __ evpmadd52luq(P0L, A0, R0, Assembler::AVX_512bit);
+  __ evpmadd52huq(P0H, A0, R0, Assembler::AVX_512bit);
+
+  // p0 += a1×r2'
+  // p1 += a1×r0
+  // p2 += a1×r1
+  __ evpmadd52luq(P0L, A1, R2P, Assembler::AVX_512bit);
+  __ evpmadd52huq(P0H, A1, R2P, Assembler::AVX_512bit);
+  __ evpmadd52luq(P1L, A1, R0, Assembler::AVX_512bit);
+  __ evpmadd52huq(P1H, A1, R0, Assembler::AVX_512bit);
+  __ evpmadd52luq(P2L, A1, R1, Assembler::AVX_512bit);
+  __ evpmadd52huq(P2H, A1, R1, Assembler::AVX_512bit);
+
+  // Carry propagation:
+  // (Not quite aligned)                         | More mathematically correct:
+  //         P2L   P1L   P0L                     |                 P2L×2^88 + P1L×2^44 + P0L×2^0
+  // + P2H   P1H   P0H                           |   + P2H×2^140 + P1H×2^96 + P0H×2^52
+  // ---------------------------                 |   -----------------------------------------------
+  // = P2H    A2    A1    A0                     |   = P2H×2^130 + A2×2^88 +   A1×2^44 +  A0×2^0
+  //
+  __ vpsrlq(TMP, P0L, 44, Assembler::AVX_512bit);
+  __ evpandq(A0, P0L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits
+
+  __ vpsllq(P0H, P0H, 8, Assembler::AVX_512bit);
+  __ vpaddq(P0H, P0H, TMP, Assembler::AVX_512bit);
+  __ vpaddq(P1L, P1L, P0H, Assembler::AVX_512bit);
+  __ evpandq(A1, P1L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits
+
+  __ vpsrlq(TMP, P1L, 44, Assembler::AVX_512bit);
+  __ vpsllq(P1H, P1H, 8, Assembler::AVX_512bit);
+  __ vpaddq(P1H, P1H, TMP, Assembler::AVX_512bit);
+  __ vpaddq(P2L, P2L, P1H, Assembler::AVX_512bit);
+  __ evpandq(A2, P2L, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits
+
+  __ vpsrlq(TMP, P2L, 42, Assembler::AVX_512bit);
+  __ vpsllq(P2H, P2H, 10, Assembler::AVX_512bit);
+  __ vpaddq(P2H, P2H, TMP, Assembler::AVX_512bit);
+
+  // Reduction: p2->a0->a1
+  // Multiply by 5 the highest bits (p2 is above 130 bits)
+  __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit);
+  __ vpsllq(P2H, P2H, 2, Assembler::AVX_512bit);
+  __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit);
+  __ vpsrlq(TMP, A0, 44, Assembler::AVX_512bit);
+  __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch);
+  __ vpaddq(A1, A1, TMP, Assembler::AVX_512bit);
+}
+
+// Compute product for a single 16-byte message block
+// - Assumes that r = [r1 r0] is only 128 bits (not 130)
+// - Input [a2 a1 a0]; when only128 is set, input is 128 bits (i.e. a2==0)
+// - Output [a2 a1 a0] is at least 130 bits (i.e. a2 is used regardless of only128)
+//
+// Note 1: a2 here is only two bits so anything above is subject to reduction.
+// Note 2: Constant c1 = 5×r1/4 = r1 + (r1 >> 2) simplifies the multiply with fewer operations (clamping guarantees r1 is divisible by 4)
+//
+// Flow of the code below is as follows:
+//
+//          a2        a1        a0
+//        ×           r1        r0
+//   -----------------------------
+//       a2×r0     a1×r0     a0×r0
+//   +             a0×r1
+//   +           5×a2×r1   5×a1×r1
+//   -----------------------------
+//     [0|L2L] [L1H|L1L] [L0H|L0L]
+//
+//   Registers:  t2:t1     t0:a0
+//
+// Completing the multiply and adding (with carry) the 3×128-bit partial products into
+// 192 bits again (3×64-bit limbs):
+// a0 = L0L
+// a1 = L0H + L1L
+// t2 = L1H + L2L
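+//
+// A compilable scalar model of this multiply (illustrative only: unsigned __int128
+// stands in for the mulq/adcq chains, and the clamping invariants above are assumed):
+//
+//   typedef unsigned __int128 u128;
+//   u128 d0 = (u128)a0*r0 + (u128)a1*c1;                      // limb at weight 2^0
+//   u128 d1 = (u128)a0*r1 + (u128)a1*r0 + (u128)a2*c1 + (uint64_t)(d0 >> 64);
+//   uint64_t d2 = a2*r0 + (uint64_t)(d1 >> 64);               // limb at weight 2^128
+//   a0 = (uint64_t)d0;
+//   a1 = (uint64_t)d1;
+//   a2 = d2 & 3;                                              // keep the low 2 bits
+//   uint64_t k = (d2 & ~3ull) + (d2 >> 2);                    // fold: 2^130 ≡ 5
+//   u128 s = (u128)a0 + k;              a0 = (uint64_t)s;
+//   s = (u128)a1 + (uint64_t)(s >> 64); a1 = (uint64_t)s;
+//   a2 += (uint64_t)(s >> 64);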
+void StubGenerator::poly1305_multiply_scalar(
+  const Register a0, const Register a1, const Register a2,
+  const Register r0, const Register r1, const Register c1, bool only128,
+  const Register t0, const Register t1, const Register t2,
+  const Register mulql, const Register mulqh)
+{
+  // mulq instruction requires/clobbers rax, rdx (mulql, mulqh)
+
+  // t2:t1 = (a0 * r1)
+  __ movq(rax, r1);
+  __ mulq(a0);
+  __ movq(t1, rax);
+  __ movq(t2, rdx);
+
+  // t0:a0 = (a0 * r0)
+  __ movq(rax, r0);
+  __ mulq(a0);
+  __ movq(a0, rax); // a0 not used in other operations
+  __ movq(t0, rdx);
+
+  // t2:t1 += (a1 * r0)
+  __ movq(rax, r0);
+  __ mulq(a1);
+  __ addq(t1, rax);
+  __ adcq(t2, rdx);
+
+  // t0:a0 += (a1 * r1x5)
+  __ movq(rax, c1);
+  __ mulq(a1);
+  __ addq(a0, rax);
+  __ adcq(t0, rdx);
+
+  // Note: a2 is clamped to 2-bits,
+  //       r1/r0 is clamped to 60-bits,
+  //       their product is less than 2^64.
+
+  if (only128) { // Accumulator only 128 bits, i.e. a2 == 0
+    // just move and add t0-t1 to a1
+    __ movq(a1, t0);
+    __ addq(a1, t1);
+    __ adcq(t2, 0);
+  } else {
+    // t2:t1 += (a2 * r1x5)
+    __ movq(a1, a2); // use a1 for a2
+    __ imulq(a1, c1);
+    __ addq(t1, a1);
+    __ adcq(t2, 0);
+
+    __ movq(a1, t0); // t0:a0 => a1:a0
+
+    // t2:a1 += (a2 * r0):t1
+    __ imulq(a2, r0);
+    __ addq(a1, t1);
+    __ adcq(t2, a2);
+  }
+
+  // At this point, 3 64-bit limbs are in t2:a1:a0
+  // t2 can span over more than 2 bits so final partial reduction step is needed.
+  //
+  // Partial reduction (just to fit into 130 bits)
+  //    a2 = t2 & 3
+  //    k = (t2 & ~3) + (t2 >> 2)
+  //         Y    x4  +  Y    x1
+  //    a2:a1:a0 += k
+  //
+  // Result will be in a2:a1:a0
+  __ movq(t0, t2);
+  __ movl(a2, t2); // DWORD
+  __ andq(t0, ~3);
+  __ shrq(t2, 2);
+  __ addq(t0, t2);
+  __ andl(a2, 3); // DWORD
+
+  // a2:a1:a0 += k (kept in t0)
+  __ addq(a0, t0);
+  __ adcq(a1, 0);
+  __ adcl(a2, 0); // DWORD
+}
+
+// Convert array of 128-bit numbers in quadwords (in D0:D1) into 128-bit numbers across 44-bit limbs (in L0:L1:L2)
+// Optionally pad all the numbers (i.e. add 2^128)
+//
+//         +-------------------------+-------------------------+
+//  D0:D1  | h0 h1 g0 g1 f0 f1 e0 e1 | d0 d1 c0 c1 b0 b1 a0 a1 |
+//         +-------------------------+-------------------------+
+//         +-------------------------+
+//  L2     | h2 d2 g2 c2 f2 b2 e2 a2 |
+//         +-------------------------+
+//         +-------------------------+
+//  L1     | h1 d1 g1 c1 f1 b1 e1 a1 |
+//         +-------------------------+
+//         +-------------------------+
+//  L0     | h0 d0 g0 c0 f0 b0 e0 a0 |
+//         +-------------------------+
+//
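+// Per block, the split below is, in scalar terms (illustrative only; lo/hi are the
+// first/second 8 bytes of one 16-byte block, MASK44 = poly1305_mask44 above):
+//
+//   uint64_t l0 = lo & MASK44;                          // bits 0..43
+//   uint64_t l1 = ((lo >> 44) | (hi << 20)) & MASK44;   // bits 44..87
+//   uint64_t l2 = hi >> 24;                             // bits 88..127 (pad adds bit 128)
+//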
+void StubGenerator::poly1305_limbs_avx512(
+    const XMMRegister D0, const XMMRegister D1,
+    const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG,
+    const XMMRegister TMP, const Register rscratch)
+{
+  // Interleave blocks of data
+  __ evpunpckhqdq(TMP, D0, D1, Assembler::AVX_512bit);
+  __ evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit);
+
+  // Highest 42-bit limbs of new blocks
+  __ vpsrlq(L2, TMP, 24, Assembler::AVX_512bit);
+  if (padMSG) {
+    __ evporq(L2, L2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_512bit, rscratch); // Add 2^128 to all 8 final qwords of the message
+  }
+
+  // Middle 44-bit limbs of new blocks
+  __ vpsrlq(L1, L0, 44, Assembler::AVX_512bit);
+  __ vpsllq(TMP, TMP, 20, Assembler::AVX_512bit);
+  __ vpternlogq(L1, 0xA8, TMP, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // (A OR B AND C)
+
+  // Lowest 44-bit limbs of new blocks
+  __ evpandq(L0, L0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch);
+}
+
+/**
+ * Copy 5×26-bit (unreduced) limbs stored at Register limbs into a2:a1:a0 (3×64-bit limbs)
+ *
+ * a2 is optional: when a2 == noreg, the limbs are expected to fit into 128 bits (i.e. a1:a0, such as a clamped R)
+ */
+void StubGenerator::poly1305_limbs(
+    const Register limbs, const Register a0, const Register a1, const Register a2,
+    const Register t0, const Register t1)
+{
+  __ movq(a0, Address(limbs, 0));
+  __ movq(t0, Address(limbs, 8));
+  __ shlq(t0, 26);
+  __ addq(a0, t0);
+  __ movq(t0, Address(limbs, 16));
+  __ movq(t1, Address(limbs, 24));
+  __ movq(a1, t0);
+  __ shlq(t0, 52);
+  __ shrq(a1, 12);
+  __ shlq(t1, 14);
+  __ addq(a0, t0);
+  __ adcq(a1, t1);
+  __ movq(t0, Address(limbs, 32));
+  if (a2 != noreg) {
+    __ movq(a2, t0);
+    __ shrq(a2, 24);
+  }
+  __ shlq(t0, 40);
+  __ addq(a1, t0);
+  if (a2 != noreg) {
+    __ adcq(a2, 0);
+
+    // One round of reduction
+    // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0
+    __ movq(t0, a2);
+    __ andq(t0, ~3);
+    __ andq(a2, 3);
+    __ movq(t1, t0);
+    __ shrq(t1, 2);
+    __ addq(t0, t1);
+
+    __ addq(a0, t0);
+    __ adcq(a1, 0);
+    __ adcq(a2, 0);
+  }
+}
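+
+// A scalar model of the 5×26-bit to 3×64-bit repacking above (illustrative only;
+// u128 stands in for the add/adc chains, l[] holds the unreduced 26-bit limbs):
+//
+//   typedef unsigned __int128 u128;
+//   u128 lo = (u128)l[0] + ((u128)l[1] << 26) + ((u128)l[2] << 52) + ((u128)l[3] << 78);
+//   uint64_t a0 = (uint64_t)lo;
+//   u128 hi = (lo >> 64) + ((u128)l[4] << 40);   // l[4] sits at bit 104 = 64+40
+//   uint64_t a1 = (uint64_t)hi;
+//   uint64_t a2 = (uint64_t)(hi >> 64);          // a few bits at most, reduced below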
+
+/**
+ * Break 3×64-bit a2:a1:a0 limbs into 5×26-bit limbs and store out into 5 quadwords at address `limbs`
+ */
+void StubGenerator::poly1305_limbs_out(
+    const Register a0, const Register a1, const Register a2,
+    const Register limbs,
+    const Register t0, const Register t1)
+{
+  // Extra round of reduction
+  // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0
+  __ movq(t0, a2);
+  __ andq(t0, ~3);
+  __ andq(a2, 3);
+  __ movq(t1, t0);
+  __ shrq(t1, 2);
+  __ addq(t0, t1);
+
+  __ addq(a0, t0);
+  __ adcq(a1, 0);
+  __ adcq(a2, 0);
+
+  // Chop a2:a1:a0 into 26-bit limbs
+  __ movl(t0, a0);
+  __ andl(t0, 0x3ffffff);
+  __ movq(Address(limbs, 0), t0);
+
+  __ shrq(a0, 26);
+  __ movl(t0, a0);
+  __ andl(t0, 0x3ffffff);
+  __ movq(Address(limbs, 8), t0);
+
+  __ shrq(a0, 26); // 12 bits left in a0, concatenate 14 from a1
+  __ movl(t0, a1);
+  __ shll(t0, 12);
+  __ addl(t0, a0);
+  __ andl(t0, 0x3ffffff);
+  __ movq(Address(limbs, 16), t0);
+
+  __ shrq(a1, 14); // already used up 14 bits
+  __ shlq(a2, 50); // a2 contains 2 bits when reduced, but $Element.limbs don't have to be fully reduced
+  __ addq(a1, a2); // put remaining bits into a1
+
+  __ movl(t0, a1);
+  __ andl(t0, 0x3ffffff);
+  __ movq(Address(limbs, 24), t0);
+
+  __ shrq(a1, 26);
+  __ movl(t0, a1);
+  // andl(t0, 0x3ffffff); doesn't have to be fully reduced, leave remaining bit(s)
+  __ movq(Address(limbs, 32), t0);
+}
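+
+// A scalar model of the 3×64-bit to 5×26-bit chop above (illustrative only; assumes
+// a2 holds only the few leftover bits remaining after the extra reduction round):
+//
+//   typedef unsigned __int128 u128;
+//   u128 n = ((u128)a1 << 64) | a0;               // low 128 bits of the value
+//   l[0] = (uint64_t)(n      ) & 0x3ffffff;
+//   l[1] = (uint64_t)(n >> 26) & 0x3ffffff;
+//   l[2] = (uint64_t)(n >> 52) & 0x3ffffff;
+//   l[3] = (uint64_t)(n >> 78) & 0x3ffffff;
+//   l[4] = (uint64_t)(n >> 104) | (a2 << 24);     // keep remaining high bit(s)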
+
+// This function consumes as many whole 16*16-byte blocks as available in input
+// After execution, input and length will point at remaining (unprocessed) data
+// and [a2 a1 a0] will contain the current accumulator value
+//
+// Math Note:
+//    Main loop in this function multiplies each message block by r^16; and some glue before and after.
+//    Proof (for brevity, split into 4 'rows' instead of 16):
+//
+//     hash = ((((m1*r + m2)*r + m3)*r ... + mn)*r
+//          = m1*r^n + m2*r^(n-1) + ... + mn_1*r^2 + mn*r  // Horner's rule
+//
+//          = m1*r^n     + m5*r^(n-4) + m9*r^(n-8)  ...    // split into 4 groups for brevity, same applies to 16 blocks
+//          + m2*r^(n-1) + m6*r^(n-5) + m10*r^(n-9) ...
+//          + m3*r^(n-2) + m7*r^(n-6) + m11*r^(n-10) ...
+//          + m4*r^(n-3) + m8*r^(n-7) + m12*r^(n-11) ...
+//
+//          = r^4 * (m1*r^(n-4) + m5*r^(n-8) + m9 *r^(n-12) ... + mn_3)   // factor out r^4..r; same applies to 16 but r^16..r factors
+//          + r^3 * (m2*r^(n-4) + m6*r^(n-8) + m10*r^(n-12) ... + mn_2)
+//          + r^2 * (m3*r^(n-4) + m7*r^(n-8) + m11*r^(n-12) ... + mn_1)
+//          + r^1 * (m4*r^(n-4) + m8*r^(n-8) + m12*r^(n-12) ... + mn_0)   // Note last column: message group has no multiplier
+//
+//          = (((m1*r^4 + m5)*r^4 + m9 )*r^4 ... + mn_3) * r^4   // reverse Horner's rule, for each group
+//          + (((m2*r^4 + m6)*r^4 + m10)*r^4 ... + mn_2) * r^3   // each column is multiplied by r^4, except last
+//          + (((m3*r^4 + m7)*r^4 + m11)*r^4 ... + mn_1) * r^2
+//          + (((m4*r^4 + m8)*r^4 + m12)*r^4 ... + mn_0) * r^1
+//
+// Also see M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code"
+//
+// Pseudocode:
+//  * used for poly1305_multiply_scalar
+//  × used for poly1305_multiply8_avx512
+//  lower-case variables are scalar numbers in 3×44-bit limbs (in gprs)
+//  upper-case variables are 8- or 16-element vector numbers in 3×44-bit limbs (in zmm registers)
+//
+//    CL = a       // [0 0 0 0 0 0 0 a]
+//    AL = poly1305_limbs_avx512(input)
+//    AH = poly1305_limbs_avx512(input+8)
+//    AL = AL + CL
+//    input+=16, length-=16
+//
+//    a = r
+//    a = a*r
+//  r^2 = a
+//    a = a*r
+//  r^3 = a
+//    a = a*r
+//  r^4 = a
+//
+//    T  = r^4 || r^3 || r^2 || r
+//    B  = limbs(T)           // [r^4  0  r^3  0  r^2  0  r^1  0 ]
+//    CL = B >> 1             // [ 0  r^4  0  r^3  0  r^2  0  r^1]
+//    R  = r^4 || r^4 || ..   // [r^4 r^4 r^4 r^4 r^4 r^4 r^4 r^4]
+//    B  = B×R                // [r^8  0  r^7  0  r^6  0  r^5  0 ]
+//    B  = B | CL             // [r^8 r^4 r^7 r^3 r^6 r^2 r^5 r^1]
+//    CL = B
+//    R  = r^8 || r^8 || ..   // [r^8 r^8 r^8 r^8 r^8 r^8 r^8 r^8]
+//    B  = B × R              // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9]
+//    CH = B
+//    R = r^16 || r^16 || ..  // [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16]
+//
+// for (;length>=16; input+=16, length-=16)
+//     BL = poly1305_limbs_avx512(input)
+//     BH = poly1305_limbs_avx512(input+8)
+//     AL = AL × R
+//     AH = AH × R
+//     AL = AL + BL
+//     AH = AH + BH
+//
+//  AL = AL × CL
+//  AH = AH × CH
+//  A = AL + AH // 16->8 blocks
+//  T = A >> 4  // 8 ->4 blocks
+//  A = A + T
+//  T = A >> 2  // 4 ->2 blocks
+//  A = A + T
+//  T = A >> 1  // 2 ->1 blocks
+//  A = A + T
+//  a = A
+//
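+// A quick numeric check of the regrouping identity (illustrative only; toy modulus
+// and data, 4 lanes standing in for the 16 used below):
+//
+//   const uint64_t P = 1000003, r = 12345;
+//   uint64_t m[8] = {11,22,33,44,55,66,77,88}, h = 0;
+//   for (int i = 0; i < 8; i++) h = (h + m[i]) % P * r % P;            // straight Horner
+//   uint64_t r2 = r*r % P, r4 = r2*r2 % P, w[4] = {r4, r2*r % P, r2, r}, v = 0;
+//   for (int j = 0; j < 4; j++) v = (v + (m[j]*r4 + m[j+4]) % P * w[j]) % P;
+//   // h == v
+//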
+// Register Map:
+// GPRs:
+//   input        = rdi
+//   length       = rbx
+//   accumulator  = rcx
+//   R   = r8
+//   a0  = rsi
+//   a1  = r9
+//   a2  = r10
+//   r0  = r11
+//   r1  = r12
+//   c1  = r8
+//   t0  = r13
+//   t1  = r14
+//   t2  = r15
+//   rscratch = r13
+//   stack(rsp, rbp)
+//   mulq(rax, rdx) in poly1305_multiply_scalar
+//
+// ZMMs:
+//   D: xmm0-1
+//   TMP: xmm2
+//   T: xmm3-8
+//   A: xmm9-14
+//   B: xmm15-20
+//   C: xmm21-26
+//   R: xmm27-31
+void StubGenerator::poly1305_process_blocks_avx512(
+    const Register input, const Register length,
+    const Register a0, const Register a1, const Register a2,
+    const Register r0, const Register r1, const Register c1)
+{
+  Label L_process256Loop, L_process256LoopDone;
+  const Register t0 = r13;
+  const Register t1 = r14;
+  const Register t2 = r15;
+  const Register rscratch = r13;
+  const Register mulql = rax;
+  const Register mulqh = rdx;
+
+  const XMMRegister D0 = xmm0;
+  const XMMRegister D1 = xmm1;
+  const XMMRegister TMP = xmm2;
+
+  const XMMRegister T0 = xmm3;
+  const XMMRegister T1 = xmm4;
+  const XMMRegister T2 = xmm5;
+  const XMMRegister T3 = xmm6;
+  const XMMRegister T4 = xmm7;
+  const XMMRegister T5 = xmm8;
+
+  const XMMRegister A0 = xmm9;
+  const XMMRegister A1 = xmm10;
+  const XMMRegister A2 = xmm11;
+  const XMMRegister A3 = xmm12;
+  const XMMRegister A4 = xmm13;
+  const XMMRegister A5 = xmm14;
+
+  const XMMRegister B0 = xmm15;
+  const XMMRegister B1 = xmm16;
+  const XMMRegister B2 = xmm17;
+  const XMMRegister B3 = xmm18;
+  const XMMRegister B4 = xmm19;
+  const XMMRegister B5 = xmm20;
+
+  const XMMRegister C0 = xmm21;
+  const XMMRegister C1 = xmm22;
+  const XMMRegister C2 = xmm23;
+  const XMMRegister C3 = xmm24;
+  const XMMRegister C4 = xmm25;
+  const XMMRegister C5 = xmm26;
+
+  const XMMRegister R0 = xmm27;
+  const XMMRegister R1 = xmm28;
+  const XMMRegister R2 = xmm29;
+  const XMMRegister R1P = xmm30;
+  const XMMRegister R2P = xmm31;
+
+  // Spread accumulator into 44-bit limbs in quadwords C0,C1,C2
+  __ movq(t0, a0);
+  __ andq(t0, ExternalAddress(poly1305_mask44()), rscratch); // First limb (Acc[43:0])
+  __ movq(C0, t0);
+
+  __ movq(t0, a1);
+  __ shrdq(a0, t0, 44);
+  __ andq(a0, ExternalAddress(poly1305_mask44()), rscratch); // Second limb (Acc[87:44])
+  __ movq(C1, a0);
+
+  __ shrdq(a1, a2, 24);
+  __ andq(a1, ExternalAddress(poly1305_mask42()), rscratch); // Third limb (Acc[129:88])
+  __ movq(C2, a1);
+
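+  // In scalar terms, the shrdq/andq sequence above computes (illustrative only):
+  //   c0 = a0 & MASK44;
+  //   c1 = ((a0 >> 44) | (a1 << 20)) & MASK44;   // shrdq(a0, a1, 44)
+  //   c2 = ((a1 >> 24) | (a2 << 40)) & MASK42;   // shrdq(a1, a2, 24)
+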
+  // To add accumulator, we must unroll first loop iteration
+
+  // Load first block of data (128 bytes) and pad
+  // A0 to have bits 0-43 of all 8 blocks in 8 qwords
+  // A1 to have bits 87-44 of all 8 blocks in 8 qwords
+  // A2 to have bits 127-88 of all 8 blocks in 8 qwords
+  __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit);
+  __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit);
+  poly1305_limbs_avx512(D0, D1, A0, A1, A2, true, TMP, rscratch);
+
+  // Add accumulator to the first message block
+  __ vpaddq(A0, A0, C0, Assembler::AVX_512bit);
+  __ vpaddq(A1, A1, C1, Assembler::AVX_512bit);
+  __ vpaddq(A2, A2, C2, Assembler::AVX_512bit);
+
+  // Load next blocks of data (128 bytes) and pad
+  // A3 to have bits 0-43 of all 8 blocks in 8 qwords
+  // A4 to have bits 87-44 of all 8 blocks in 8 qwords
+  // A5 to have bits 127-88 of all 8 blocks in 8 qwords
+  __ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit);
+  __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit);
+  poly1305_limbs_avx512(D0, D1, A3, A4, A5, true, TMP, rscratch);
+
+  __ subl(length, 16*16);
+  __ lea(input, Address(input,16*16));
+
+  // Compute the powers of R^1..R^4 and form 44-bit limbs of each
+  // T0 to have bits 0-127 in 4 quadword pairs
+  // T1 to have bits 128-129 in alternating 8 qwords
+  __ vpxorq(T1, T1, T1, Assembler::AVX_512bit);
+  __ movq(T2, r0);
+  __ vpinsrq(T2, T2, r1, 1);
+  __ vinserti32x4(T0, T0, T2, 3);
+
+  // Calculate R^2
+  __ movq(a0, r0);
+  __ movq(a1, r1);
+  // "Clever": a2 not set because poly1305_multiply_scalar has a flag to indicate 128-bit accumulator
+  poly1305_multiply_scalar(a0, a1, a2,
+                           r0, r1, c1, true,
+                           t0, t1, t2, mulql, mulqh);
+
+  __ movq(T2, a0);
+  __ vpinsrq(T2, T2, a1, 1);
+  __ vinserti32x4(T0, T0, T2, 2);
+  __ movq(T2, a2);
+  __ vinserti32x4(T1, T1, T2, 2);
+
+  // Calculate R^3
+  poly1305_multiply_scalar(a0, a1, a2,
+                           r0, r1, c1, false,
+                           t0, t1, t2, mulql, mulqh);
+
+  __ movq(T2, a0);
+  __ vpinsrq(T2, T2, a1, 1);
+  __ vinserti32x4(T0, T0, T2, 1);
+  __ movq(T2, a2);
+  __ vinserti32x4(T1, T1, T2, 1);
+
+  // Calculate R^4
+  poly1305_multiply_scalar(a0, a1, a2,
+                           r0, r1, c1, false,
+                           t0, t1, t2, mulql, mulqh);
+
+  __ movq(T2, a0);
+  __ vpinsrq(T2, T2, a1, 1);
+  __ vinserti32x4(T0, T0, T2, 0);
+  __ movq(T2, a2);
+  __ vinserti32x4(T1, T1, T2, 0);
+
+  // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty)
+  // B0 to have bits 0-43 of all 4 blocks in alternating 8 qwords
+  // B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords
+  // B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords
+  __ vpxorq(T2, T2, T2, Assembler::AVX_512bit);
+  poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, TMP, rscratch);
+
+  // T1 contains the 2 highest bits of the powers of R
+  __ vpsllq(T1, T1, 40, Assembler::AVX_512bit);
+  __ evporq(B2, B2, T1, Assembler::AVX_512bit);
+
+  // Broadcast 44-bit limbs of R^4 into R0,R1,R2
+  __ mov(t0, a0);
+  __ andq(t0, ExternalAddress(poly1305_mask44()), rscratch); // First limb (R^4[43:0])
+  __ evpbroadcastq(R0, t0, Assembler::AVX_512bit);
+
+  __ movq(t0, a1);
+  __ shrdq(a0, t0, 44);
+  __ andq(a0, ExternalAddress(poly1305_mask44()), rscratch); // Second limb (R^4[87:44])
+  __ evpbroadcastq(R1, a0, Assembler::AVX_512bit);
+
+  __ shrdq(a1, a2, 24);
+  __ andq(a1, ExternalAddress(poly1305_mask42()), rscratch); // Third limb (R^4[129:88])
+  __ evpbroadcastq(R2, a1, Assembler::AVX_512bit);
+
+  // Generate 4*5*R^4 into {R2P,R1P}
+  // Used as multiplier in poly1305_multiply8_avx512 so can
+  // ignore bottom limb and carry propagation
+  __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit);    // 4*R^4
+  __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
+  __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^4
+  __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
+  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^4
+  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
+
+  // Move R^4..R^1 one element over
+  __ vpslldq(C0, B0, 8, Assembler::AVX_512bit);
+  __ vpslldq(C1, B1, 8, Assembler::AVX_512bit);
+  __ vpslldq(C2, B2, 8, Assembler::AVX_512bit);
+
+  // Calculate R^8-R^5
+  poly1305_multiply8_avx512(B0, B1, B2,             // ACC=R^4..R^1
+                            R0, R1, R2, R1P, R2P,   // R^4..R^4, 4*5*R^4
+                            T0, T1, T2, T3, T4, T5, TMP, rscratch);
+
+  // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R
+  __ evporq(B0, B0, C0, Assembler::AVX_512bit);
+  __ evporq(B1, B1, C1, Assembler::AVX_512bit);
+  __ evporq(B2, B2, C2, Assembler::AVX_512bit);
+
+  // Store R^8-R for later use
+  __ evmovdquq(C0, B0, Assembler::AVX_512bit);
+  __ evmovdquq(C1, B1, Assembler::AVX_512bit);
+  __ evmovdquq(C2, B2, Assembler::AVX_512bit);
+
+  // Broadcast R^8
+  __ vpbroadcastq(R0, B0, Assembler::AVX_512bit);
+  __ vpbroadcastq(R1, B1, Assembler::AVX_512bit);
+  __ vpbroadcastq(R2, B2, Assembler::AVX_512bit);
+
+  // Generate 4*5*R^8
+  __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit);
+  __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
+  __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);    // 5*R^8
+  __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
+  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);     // 4*5*R^8
+  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
+
+  // Calculate R^16-R^9
+  poly1305_multiply8_avx512(B0, B1, B2,            // ACC=R^8..R^1
+                            R0, R1, R2, R1P, R2P,  // R^8..R^8, 4*5*R^8
+                            T0, T1, T2, T3, T4, T5, TMP, rscratch);
+
+  // Store R^16-R^9 for later use
+  __ evmovdquq(C3, B0, Assembler::AVX_512bit);
+  __ evmovdquq(C4, B1, Assembler::AVX_512bit);
+  __ evmovdquq(C5, B2, Assembler::AVX_512bit);
+
+  // Broadcast R^16
+  __ vpbroadcastq(R0, B0, Assembler::AVX_512bit);
+  __ vpbroadcastq(R1, B1, Assembler::AVX_512bit);
+  __ vpbroadcastq(R2, B2, Assembler::AVX_512bit);
+
+  // Generate 4*5*R^16
+  __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit);
+  __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
+  __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^16
+  __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
+  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^16
+  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
+
+  // VECTOR LOOP: process 16 * 16-byte message blocks at a time
+  __ bind(L_process256Loop);
+  __ cmpl(length, 16*16);
+  __ jcc(Assembler::less, L_process256LoopDone);
+
+  // Load and interleave next block of data (128 bytes)
+  __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit);
+  __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit);
+  poly1305_limbs_avx512(D0, D1, B0, B1, B2, true, TMP, rscratch);
+
+  // Load and interleave next block of data (128 bytes)
+  __ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit);
+  __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit);
+  poly1305_limbs_avx512(D0, D1, B3, B4, B5, true, TMP, rscratch);
+
+  poly1305_multiply8_avx512(A0, A1, A2,            // MSG/ACC 16 blocks
+                            R0, R1, R2, R1P, R2P,  // R^16..R^16, 4*5*R^16
+                            T0, T1, T2, T3, T4, T5, TMP, rscratch);
+  poly1305_multiply8_avx512(A3, A4, A5,            // MSG/ACC 16 blocks
+                            R0, R1, R2, R1P, R2P,  // R^16..R^16, 4*5*R^16
+                            T0, T1, T2, T3, T4, T5, TMP, rscratch);
+
+  __ vpaddq(A0, A0, B0, Assembler::AVX_512bit); // Add lowest 44-bit limbs from new blocks to accumulator
+  __ vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator
+  __ vpaddq(A2, A2, B2, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator
+  __ vpaddq(A3, A3, B3, Assembler::AVX_512bit); // Add lowest 44-bit limbs from new blocks to accumulator
+  __ vpaddq(A4, A4, B4, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator
+  __ vpaddq(A5, A5, B5, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator
+
+  __ subl(length, 16*16);
+  __ lea(input, Address(input, 16*16));
+  __ jmp(L_process256Loop);
+
+  __ bind(L_process256LoopDone);
+
+  // Tail processing: multiply ACC by R^16..R^1, then fold everything into one scalar value
+  // Generate 4*5*[R^16..R^9] (ignore lowest limb)
+  // Use D0 in place of R1P and D1 in place of R2P to hold the higher powers' constants
+  __ vpsllq(R1P, C4, 2, Assembler::AVX_512bit);
+  __ vpsllq(R2P, C5, 2, Assembler::AVX_512bit);
+  __ vpaddq(R1P, R1P, C4, Assembler::AVX_512bit);    // 5*[R^16..R^9]
+  __ vpaddq(R2P, R2P, C5, Assembler::AVX_512bit);
+  __ vpsllq(D0, R1P, 2, Assembler::AVX_512bit);      // 4*5*[R^16..R^9]
+  __ vpsllq(D1, R2P, 2, Assembler::AVX_512bit);
+
+  // Generate 4*5*[R^8..R^1] (ignore lowest limb)
+  __ vpsllq(R1P, C1, 2, Assembler::AVX_512bit);
+  __ vpsllq(R2P, C2, 2, Assembler::AVX_512bit);
+  __ vpaddq(R1P, R1P, C1, Assembler::AVX_512bit);    // 5*[R^8..R^1]
+  __ vpaddq(R2P, R2P, C2, Assembler::AVX_512bit);
+  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);     // 4*5*[R^8..R^1]
+  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
+
+  poly1305_multiply8_avx512(A0, A1, A2,            // MSG/ACC blocks 0..7
+                            C3, C4, C5, D0, D1,    // R^16..R^9, 4*5 constants in D0, D1
+                            T0, T1, T2, T3, T4, T5, TMP, rscratch);
+  poly1305_multiply8_avx512(A3, A4, A5,            // MSG/ACC blocks 8..15
+                            C0, C1, C2, R1P, R2P,  // R^8..R^1, 4*5 constants in R1P, R2P
+                            T0, T1, T2, T3, T4, T5, TMP, rscratch);
+
+  // Add all blocks (horizontally)
+  // 16->8 blocks
+  __ vpaddq(A0, A0, A3, Assembler::AVX_512bit);
+  __ vpaddq(A1, A1, A4, Assembler::AVX_512bit);
+  __ vpaddq(A2, A2, A5, Assembler::AVX_512bit);
+
+  // 8 -> 4 blocks
+  __ vextracti64x4(T0, A0, 1);
+  __ vextracti64x4(T1, A1, 1);
+  __ vextracti64x4(T2, A2, 1);
+  __ vpaddq(A0, A0, T0, Assembler::AVX_256bit);
+  __ vpaddq(A1, A1, T1, Assembler::AVX_256bit);
+  __ vpaddq(A2, A2, T2, Assembler::AVX_256bit);
+
+  // 4 -> 2 blocks
+  __ vextracti32x4(T0, A0, 1);
+  __ vextracti32x4(T1, A1, 1);
+  __ vextracti32x4(T2, A2, 1);
+  __ vpaddq(A0, A0, T0, Assembler::AVX_128bit);
+  __ vpaddq(A1, A1, T1, Assembler::AVX_128bit);
+  __ vpaddq(A2, A2, T2, Assembler::AVX_128bit);
+
+  // 2 -> 1 blocks
+  __ vpsrldq(T0, A0, 8, Assembler::AVX_128bit);
+  __ vpsrldq(T1, A1, 8, Assembler::AVX_128bit);
+  __ vpsrldq(T2, A2, 8, Assembler::AVX_128bit);
+
+  // Finish folding and clear second qword
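+  // k1 = 0xfd = 0b11111101: lane 1 has its mask bit clear, so zeroing-masking
+  // (merge == false) clears it while lane 0 receives the final 2 -> 1 fold.
+  // Each of A0..A2 then holds its fully folded limb in qword 0.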
+  __ mov64(t0, 0xfd);
+  __ kmovql(k1, t0);
+  __ evpaddq(A0, k1, A0, T0, false, Assembler::AVX_512bit);
+  __ evpaddq(A1, k1, A1, T1, false, Assembler::AVX_512bit);
+  __ evpaddq(A2, k1, A2, T2, false, Assembler::AVX_512bit);
+
+  // Carry propagation
+  __ vpsrlq(D0, A0, 44, Assembler::AVX_512bit);
+  __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits
+  __ vpaddq(A1, A1, D0, Assembler::AVX_512bit);
+  __ vpsrlq(D0, A1, 44, Assembler::AVX_512bit);
+  __ evpandq(A1, A1, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits
+  __ vpaddq(A2, A2, D0, Assembler::AVX_512bit);
+  __ vpsrlq(D0, A2, 42, Assembler::AVX_512bit);
+  __ evpandq(A2, A2, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits
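+  // Fold the carry (the bits at 2^130 and above) back in as carry * 5,
+  // computed as carry + carry * 4, again using 2^130 == 5 (mod 2^130 - 5)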
+  __ vpsllq(D1, D0, 2, Assembler::AVX_512bit);
+  __ vpaddq(D0, D0, D1, Assembler::AVX_512bit);
+  __ vpaddq(A0, A0, D0, Assembler::AVX_512bit);
+
+  // Put together A (accumulator): repack the 44/44/42-bit limbs into the
+  // 130-bit value a2:a1:a0
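+  // a0 = A0[43:0]  | A1[19:0] << 44
+  // a1 = A1[43:20] | A2[39:0] << 24  (plus carry from a0)
+  // a2 = A2[41:40]                   (plus carry from a1)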
+  __ movq(a0, A0);
+
+  __ movq(t0, A1);
+  __ movq(t1, t0);
+  __ shlq(t1, 44);
+  __ shrq(t0, 20);
+
+  __ movq(a2, A2);
+  __ movq(a1, a2);
+  __ shlq(a1, 24);
+  __ shrq(a2, 40);
+
+  __ addq(a0, t1);
+  __ adcq(a1, t0);
+  __ adcq(a2, 0);
+
+  // Cleanup
+  // Zero out zmm0-zmm31.
+  __ vzeroall();
+  for (XMMRegister rxmm = xmm16; rxmm->is_valid(); rxmm = rxmm->successor()) {
+    __ vpxorq(rxmm, rxmm, rxmm, Assembler::AVX_512bit);
+  }
+}
+
+// This function consumes as many whole 16-byte blocks as are available in the input.
+// On return, input and length describe the remaining (unprocessed) data, and the
+// accumulator array holds the updated accumulator value.
+address StubGenerator::generate_poly1305_processBlocks() {
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
+  address start = __ pc();
+  __ enter();
+
+  // Save all 'SOE' (save-on-entry, i.e. callee-saved) registers
+  __ push(rbx);
+  #ifdef _WIN64
+  __ push(rsi);
+  __ push(rdi);
+  #endif
+  __ push(r12);
+  __ push(r13);
+  __ push(r14);
+  __ push(r15);
+
+  // Register Map
+  const Register input        = rdi;
+  const Register length       = rbx;
+  const Register accumulator  = rcx;
+  const Register R            = r8;
+
+  const Register a0 = rsi;  // [in/out] accumulator bits 63..0
+  const Register a1 = r9;   // [in/out] accumulator bits 127..64
+  const Register a2 = r10;  // [in/out] accumulator bits 191..128
+  const Register r0 = r11;  // R constant bits 63..0
+  const Register r1 = r12;  // R constant bits 127..64
+  const Register c1 = r8;   // 5*R (upper limb only); aliases R, which is dead once the limbs are loaded
+  const Register t0 = r13;
+  const Register t1 = r14;
+  const Register t2 = r15;
+  const Register mulql = rax;
+  const Register mulqh = rdx;
+
+  // Normalize input arguments
+  // pseudo-signature: void poly1305_processBlocks(byte[] input, int length, long[5] accumulator, long[5] R)
+  // The input, accumulator, and R pointers point at the first array element;
+  // the Java array headers are bypassed in LibraryCallKit::inline_poly1305_processBlocks
+  #ifdef _WIN64
+  // c_rarg0 - rcx
+  // c_rarg1 - rdx
+  // c_rarg2 - r8
+  // c_rarg3 - r9
+  __ mov(input, c_rarg0);
+  __ mov(length, c_rarg1);
+  __ mov(accumulator, c_rarg2);
+  __ mov(R, c_rarg3);
+  #else
+  // c_rarg0 - rdi
+  // c_rarg1 - rsi
+  // c_rarg2 - rdx
+  // c_rarg3 - rcx
+  // input is already in rdi (c_rarg0); copy the remaining args out of order
+  // so that R (c_rarg3 == rcx) is read before accumulator overwrites rcx
+  __ mov(length, c_rarg1);
+  __ mov(R, c_rarg3);
+  __ mov(accumulator, c_rarg2);
+  #endif
+
+  Label L_process16Loop, L_process16LoopDone;
+
+  // Load R into r1:r0
+  poly1305_limbs(R, r0, r1, noreg, t0, t1);
+
+  // Compute 5*R (upper limb only)
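+  // Clamping in Poly1305 (setRSVals) zeroes the low two bits of r1, so
+  // r1 + (r1 >> 2) == 5 * (r1 >> 2) == (5*r1)/4 exactly: the constant the
+  // scalar multiply uses to fold partial products at 2^128 and above back
+  // into the 130-bit range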
+  __ movq(c1, r1);
+  __ shrq(c1, 2);
+  __ addq(c1, r1); // c1 = r1 + (r1 >> 2)
+
+  // Load accumulator into a2:a1:a0
+  poly1305_limbs(accumulator, a0, a1, a2, t0, t1);
+
+  // VECTOR LOOP: Minimum of 256 bytes to run vectorized code
+  __ cmpl(length, 16*16);
+  __ jcc(Assembler::less, L_process16Loop);
+
+  poly1305_process_blocks_avx512(input, length,
+                                  a0, a1, a2,
+                                  r0, r1, c1);
+
+  // SCALAR LOOP: process one 16-byte message block at a time
+  __ bind(L_process16Loop);
+  __ cmpl(length, 16);
+  __ jcc(Assembler::less, L_process16LoopDone);
+
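+  // Add the next 16-byte message block to the accumulator, together with the
+  // 2^128 pad bit that Poly1305 appends to every full block (RFC 7539, Section 2.5)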
+  __ addq(a0, Address(input, 0));
+  __ adcq(a1, Address(input, 8));
+  __ adcq(a2, 1);
+  poly1305_multiply_scalar(a0, a1, a2,
+                           r0, r1, c1, false,
+                           t0, t1, t2, mulql, mulqh);
+
+  __ subl(length, 16);
+  __ lea(input, Address(input, 16));
+  __ jmp(L_process16Loop);
+  __ bind(L_process16LoopDone);
+
+  // Write output
+  poly1305_limbs_out(a0, a1, a2, accumulator, t0, t1);
+
+  __ pop(r15);
+  __ pop(r14);
+  __ pop(r13);
+  __ pop(r12);
+  #ifdef _WIN64
+  __ pop(rdi);
+  __ pop(rsi);
+  #endif
+  __ pop(rbx);
+
+  __ leave();
+  __ ret(0);
+  return start;
+}
diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp
index 989536da2a5..bb98fcf46cd 100644
--- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp
+++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp
@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_
 
 enum platform_dependent_constants {
   code_size1 = 20000 LP64_ONLY(+10000),                    // simply increase if too small (assembler will crash if too small)
-  code_size2 = 35300 LP64_ONLY(+35000) WINDOWS_ONLY(+2048) // simply increase if too small (assembler will crash if too small)
+  code_size2 = 35300 LP64_ONLY(+45000) WINDOWS_ONLY(+2048) // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp
index d665339ee9e..e981498f7d6 100644
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp
@@ -947,6 +947,7 @@ void VM_Version::get_processor_features() {
     _features &= ~CPU_AVX512_VBMI;
     _features &= ~CPU_AVX512_VBMI2;
     _features &= ~CPU_AVX512_BITALG;
+    _features &= ~CPU_AVX512_IFMA;
   }
 
   if (UseAVX < 2)
@@ -978,6 +979,7 @@ void VM_Version::get_processor_features() {
       _features &= ~CPU_FLUSHOPT;
       _features &= ~CPU_GFNI;
       _features &= ~CPU_AVX512_BITALG;
+      _features &= ~CPU_AVX512_IFMA;
     }
   }
 
@@ -1330,6 +1332,18 @@ void VM_Version::get_processor_features() {
   }
 #endif // COMPILER2 && ASSERT
 
+#ifdef _LP64
+  if (supports_avx512ifma() && supports_avx512vlbw() && MaxVectorSize >= 64) {
+    if (FLAG_IS_DEFAULT(UsePolyIntrinsics)) {
+      FLAG_SET_DEFAULT(UsePolyIntrinsics, true);
+    }
+  } else
+#endif
+  if (UsePolyIntrinsics) {
+    warning("Intrinsics for Poly1305 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UsePolyIntrinsics, false);
+  }
+
 #ifdef _LP64
   if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     UseMultiplyToLenIntrinsic = true;
@@ -2894,6 +2908,8 @@ uint64_t VM_Version::feature_flags() {
         result |= CPU_AVX512CD;
       if (_cpuid_info.sef_cpuid7_ebx.bits.avx512dq != 0)
         result |= CPU_AVX512DQ;
+      if (_cpuid_info.sef_cpuid7_ebx.bits.avx512ifma != 0)
+        result |= CPU_AVX512_IFMA;
       if (_cpuid_info.sef_cpuid7_ebx.bits.avx512pf != 0)
         result |= CPU_AVX512PF;
       if (_cpuid_info.sef_cpuid7_ebx.bits.avx512er != 0)
diff --git a/src/hotspot/cpu/x86/vm_version_x86.hpp b/src/hotspot/cpu/x86/vm_version_x86.hpp
index 1f4cfd39e8d..9213d42bc57 100644
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp
@@ -223,7 +223,9 @@ class VM_Version : public Abstract_VM_Version {
                avx512dq : 1,
                         : 1,
                     adx : 1,
-                        : 3,
+                        : 1,
+             avx512ifma : 1,
+                        : 1,
              clflushopt : 1,
                    clwb : 1,
                         : 1,
@@ -387,7 +389,8 @@ protected:
     decl(PKU,               "pku",               54) /* Protection keys for user-mode pages */ \
     decl(OSPKE,             "ospke",             55) /* OS enables protection keys */ \
     decl(CET_IBT,           "cet_ibt",           56) /* Control Flow Enforcement - Indirect Branch Tracking */ \
-    decl(CET_SS,            "cet_ss",            57) /* Control Flow Enforcement - Shadow Stack */
+    decl(CET_SS,            "cet_ss",            57) /* Control Flow Enforcement - Shadow Stack */ \
+    decl(AVX512_IFMA,       "avx512_ifma",       58) /* Integer Vector FMA instructions*/
 
 #define DECLARE_CPU_FEATURE_FLAG(id, name, bit) CPU_##id = (1ULL << bit),
     CPU_FEATURE_FLAGS(DECLARE_CPU_FEATURE_FLAG)
@@ -667,6 +670,7 @@ public:
   static bool supports_adx()          { return (_features & CPU_ADX) != 0; }
   static bool supports_evex()         { return (_features & CPU_AVX512F) != 0; }
   static bool supports_avx512dq()     { return (_features & CPU_AVX512DQ) != 0; }
+  static bool supports_avx512ifma()   { return (_features & CPU_AVX512_IFMA) != 0; }
   static bool supports_avx512pf()     { return (_features & CPU_AVX512PF) != 0; }
   static bool supports_avx512er()     { return (_features & CPU_AVX512ER) != 0; }
   static bool supports_avx512cd()     { return (_features & CPU_AVX512CD) != 0; }
diff --git a/src/hotspot/share/classfile/vmIntrinsics.cpp b/src/hotspot/share/classfile/vmIntrinsics.cpp
index bd7d9d2d9cb..0f8ba1e808d 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.cpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.cpp
@@ -479,6 +479,9 @@ bool vmIntrinsics::disabled_by_jvm_flags(vmIntrinsics::ID id) {
   case vmIntrinsics::_base64_decodeBlock:
     if (!UseBASE64Intrinsics) return true;
     break;
+  case vmIntrinsics::_poly1305_processBlocks:
+    if (!UsePolyIntrinsics) return true;
+    break;
   case vmIntrinsics::_updateBytesCRC32C:
   case vmIntrinsics::_updateDirectByteBufferCRC32C:
     if (!UseCRC32CIntrinsics) return true;
diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index 03c47b7fbdc..9b0cd3f366f 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -519,7 +519,7 @@ class methodHandle;
   do_class(java_util_Base64_Decoder, "java/util/Base64$Decoder")                                                        \
   do_intrinsic(_base64_decodeBlock, java_util_Base64_Decoder, decodeBlock_name, decodeBlock_signature, F_R)             \
    do_name(decodeBlock_name, "decodeBlock")                                                                             \
-   do_signature(decodeBlock_signature, "([BII[BIZZ)I")                                                                   \
+   do_signature(decodeBlock_signature, "([BII[BIZZ)I")                                                                  \
                                                                                                                         \
   /* support for com.sun.crypto.provider.GHASH */                                                                       \
   do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH")                                              \
@@ -527,6 +527,11 @@ class methodHandle;
    do_name(processBlocks_name, "processBlocks")                                                                         \
    do_signature(ghash_processBlocks_signature, "([BII[J[J)V")                                                           \
                                                                                                                         \
+  /* support for com.sun.crypto.provider.Poly1305 */                                                                    \
+  do_class(com_sun_crypto_provider_Poly1305, "com/sun/crypto/provider/Poly1305")                                        \
+  do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, ghash_processBlocks_signature, F_R) \
+   do_name(processMultipleBlocks_name, "processMultipleBlocks")                                                         \
+                                                                                                                        \
   /* support for java.util.zip */                                                                                       \
   do_class(java_util_zip_CRC32,           "java/util/zip/CRC32")                                                        \
   do_intrinsic(_updateCRC32,               java_util_zip_CRC32,   update_name, int2_int_signature,               F_SN)  \
diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp
index 7aeedff86f9..a683b259b92 100644
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@@ -739,6 +739,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
   case vmIntrinsics::_ghash_processBlocks:
   case vmIntrinsics::_base64_encodeBlock:
   case vmIntrinsics::_base64_decodeBlock:
+  case vmIntrinsics::_poly1305_processBlocks:
   case vmIntrinsics::_updateCRC32:
   case vmIntrinsics::_updateBytesCRC32:
   case vmIntrinsics::_updateByteBufferCRC32:
diff --git a/src/hotspot/share/opto/escape.cpp b/src/hotspot/share/opto/escape.cpp
index 11d8c4cf142..b221ca932e3 100644
--- a/src/hotspot/share/opto/escape.cpp
+++ b/src/hotspot/share/opto/escape.cpp
@@ -1166,6 +1166,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                   strcmp(call->as_CallLeaf()->_name, "electronicCodeBook_decryptAESCrypt") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "galoisCounterMode_AESCrypt") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "poly1305_processBlocks") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "encodeBlock") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "decodeBlock") == 0 ||
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 2bba12a05e6..a59923ed21a 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -612,6 +612,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
     return inline_base64_encodeBlock();
   case vmIntrinsics::_base64_decodeBlock:
     return inline_base64_decodeBlock();
+  case vmIntrinsics::_poly1305_processBlocks:
+    return inline_poly1305_processBlocks();
 
   case vmIntrinsics::_encodeISOArray:
   case vmIntrinsics::_encodeByteISOArray:
@@ -6962,6 +6964,42 @@ bool LibraryCallKit::inline_base64_decodeBlock() {
   return true;
 }
 
+bool LibraryCallKit::inline_poly1305_processBlocks() {
+  assert(UsePolyIntrinsics, "need Poly intrinsics support");
+  assert(callee()->signature()->size() == 5, "poly1305_processBlocks has %d parameters", callee()->signature()->size());
+
+  address stubAddr = StubRoutines::poly1305_processBlocks();
+  const char* stubName = "poly1305_processBlocks";
+  if (stubAddr == NULL) return false;
+
+  null_check_receiver();  // null-check receiver
+  if (stopped())  return true;
+
+  Node* input = argument(1);
+  Node* input_offset = argument(2);
+  Node* len = argument(3);
+  Node* alimbs = argument(4);
+  Node* rlimbs = argument(5);
+
+  input = must_be_not_null(input, true);
+  alimbs = must_be_not_null(alimbs, true);
+  rlimbs = must_be_not_null(rlimbs, true);
+
+  Node* input_start = array_element_address(input, input_offset, T_BYTE);
+  assert(input_start, "input array is NULL");
+  Node* acc_start = array_element_address(alimbs, intcon(0), T_LONG);
+  assert(acc_start, "acc array is NULL");
+  Node* r_start = array_element_address(rlimbs, intcon(0), T_LONG);
+  assert(r_start, "r array is NULL");
+
+  Node* call = make_runtime_call(RC_LEAF | RC_NO_FP,
+                                 OptoRuntime::poly1305_processBlocks_Type(),
+                                 stubAddr, stubName, TypePtr::BOTTOM,
+                                 input_start, len, acc_start, r_start);
+  return true;
+}
+
 //------------------------------inline_digestBase_implCompress-----------------------
 //
 // Calculate MD5 for single-block byte[] array.
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp
index 1f660f7d987..35c699b73bf 100644
--- a/src/hotspot/share/opto/library_call.hpp
+++ b/src/hotspot/share/opto/library_call.hpp
@@ -293,6 +293,7 @@ class LibraryCallKit : public GraphKit {
   bool inline_ghash_processBlocks();
   bool inline_base64_encodeBlock();
   bool inline_base64_decodeBlock();
+  bool inline_poly1305_processBlocks();
   bool inline_digestBase_implCompress(vmIntrinsics::ID id);
   bool inline_digestBase_implCompressMB(int predicate);
   bool inline_digestBase_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass,
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index b5326838194..769bbd191ff 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -1266,6 +1266,26 @@ const TypeFunc* OptoRuntime::base64_decodeBlock_Type() {
   return TypeFunc::make(domain, range);
 }
 
+// Poly1305 processMultipleBlocks function
+const TypeFunc* OptoRuntime::poly1305_processBlocks_Type() {
+  int argcnt = 4;
+
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // input array
+  fields[argp++] = TypeInt::INT;        // input length
+  fields[argp++] = TypePtr::NOTNULL;    // accumulator array
+  fields[argp++] = TypePtr::NOTNULL;    // r array
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // no result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = NULL; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
 //------------- Interpreter state access for on stack replacement
 const TypeFunc* OptoRuntime::osr_end_Type() {
   // create input type (domain)
diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp
index 43e4cff5228..1de8ffb18fb 100644
--- a/src/hotspot/share/opto/runtime.hpp
+++ b/src/hotspot/share/opto/runtime.hpp
@@ -280,6 +280,7 @@ private:
   static const TypeFunc* ghash_processBlocks_Type();
   static const TypeFunc* base64_encodeBlock_Type();
   static const TypeFunc* base64_decodeBlock_Type();
+  static const TypeFunc* poly1305_processBlocks_Type();
 
   static const TypeFunc* updateBytesCRC32_Type();
   static const TypeFunc* updateBytesCRC32C_Type();
diff --git a/src/hotspot/share/runtime/globals.hpp b/src/hotspot/share/runtime/globals.hpp
index f76ae8d8d5e..c5750d6d68e 100644
--- a/src/hotspot/share/runtime/globals.hpp
+++ b/src/hotspot/share/runtime/globals.hpp
@@ -238,6 +238,9 @@ const int ObjectAlignmentInBytes = 8;
   product(bool, UseBASE64Intrinsics, false,                                 \
           "Use intrinsics for java.util.Base64")                            \
                                                                             \
+  product(bool, UsePolyIntrinsics, false, DIAGNOSTIC,                       \
+          "Use intrinsics for com.sun.crypto.provider.Poly1305")            \
+                                                                            \
   product(size_t, LargePageSizeInBytes, 0,                                  \
           "Maximum large page size used (0 will use the default large "     \
           "page size for the environment as the maximum)")                  \
diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp
index 93927ad0f89..9418b758387 100644
--- a/src/hotspot/share/runtime/stubRoutines.cpp
+++ b/src/hotspot/share/runtime/stubRoutines.cpp
@@ -130,6 +130,7 @@ address StubRoutines::_galoisCounterMode_AESCrypt          = NULL;
 address StubRoutines::_ghash_processBlocks                 = NULL;
 address StubRoutines::_base64_encodeBlock                  = NULL;
 address StubRoutines::_base64_decodeBlock                  = NULL;
+address StubRoutines::_poly1305_processBlocks              = NULL;
 
 address StubRoutines::_md5_implCompress      = NULL;
 address StubRoutines::_md5_implCompressMB    = NULL;
diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp
index 30f58519ea9..f4cec54aa7f 100644
--- a/src/hotspot/share/runtime/stubRoutines.hpp
+++ b/src/hotspot/share/runtime/stubRoutines.hpp
@@ -211,6 +211,7 @@ class StubRoutines: AllStatic {
   static address _ghash_processBlocks;
   static address _base64_encodeBlock;
   static address _base64_decodeBlock;
+  static address _poly1305_processBlocks;
 
   static address _md5_implCompress;
   static address _md5_implCompressMB;
@@ -384,6 +385,7 @@ class StubRoutines: AllStatic {
   static address cipherBlockChaining_decryptAESCrypt()  { return _cipherBlockChaining_decryptAESCrypt; }
   static address electronicCodeBook_encryptAESCrypt()   { return _electronicCodeBook_encryptAESCrypt; }
   static address electronicCodeBook_decryptAESCrypt()   { return _electronicCodeBook_decryptAESCrypt; }
+  static address poly1305_processBlocks()               { return _poly1305_processBlocks; }
   static address counterMode_AESCrypt()  { return _counterMode_AESCrypt; }
   static address ghash_processBlocks()   { return _ghash_processBlocks; }
   static address base64_encodeBlock()    { return _base64_encodeBlock; }
diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp
index d339b0a1ee1..bd06ed29010 100644
--- a/src/hotspot/share/runtime/vmStructs.cpp
+++ b/src/hotspot/share/runtime/vmStructs.cpp
@@ -544,6 +544,7 @@
      static_field(StubRoutines,                _ghash_processBlocks,                          address)                               \
      static_field(StubRoutines,                _base64_encodeBlock,                           address)                               \
      static_field(StubRoutines,                _base64_decodeBlock,                           address)                               \
+     static_field(StubRoutines,                _poly1305_processBlocks,                       address)                               \
      static_field(StubRoutines,                _updateBytesCRC32,                             address)                               \
      static_field(StubRoutines,                _crc_table_adr,                                address)                               \
      static_field(StubRoutines,                _crc32c_table_addr,                            address)                               \
diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java
index cd78df84bed..d24b29cedbf 100644
--- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java
+++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java
@@ -34,6 +34,8 @@ import java.util.Objects;
 
 import sun.security.util.math.*;
 import sun.security.util.math.intpoly.*;
+import jdk.internal.vm.annotation.IntrinsicCandidate;
+import jdk.internal.vm.annotation.ForceInline;
 
 /**
  * This class represents the Poly1305 function defined in RFC 7539.
@@ -59,8 +61,10 @@ final class Poly1305 {
     private IntegerModuloP s;
     private MutableIntegerModuloP a;
     private final MutableIntegerModuloP n = ipl1305.get1().mutable();
+    private final boolean checkWeakKey;
 
-    Poly1305() { }
+    Poly1305() { this(true); }
+    Poly1305(boolean checkKey) { checkWeakKey = checkKey; }
 
     /**
      * Initialize the Poly1305 object
@@ -165,11 +169,15 @@ final class Poly1305 {
                 blockOffset = 0;
             }
         }
-        while (len >= BLOCK_LENGTH) {
-            processBlock(input, offset, BLOCK_LENGTH);
-            offset += BLOCK_LENGTH;
-            len -= BLOCK_LENGTH;
-        }
+
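+        // BLOCK_LENGTH (16) is a power of two, so this mask rounds len down
+        // to a whole number of blocks; the sub-block remainder (if any) is
+        // buffered below for the next update.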
+        int blockMultipleLength = len & ~(BLOCK_LENGTH - 1);
+        long[] aLimbs = a.getLimbs();
+        long[] rLimbs = r.getLimbs();
+        processMultipleBlocksCheck(input, offset, blockMultipleLength, aLimbs, rLimbs);
+        processMultipleBlocks(input, offset, blockMultipleLength, aLimbs, rLimbs);
+        offset += blockMultipleLength;
+        len -= blockMultipleLength;
+
         if (len > 0) { // and len < BLOCK_LENGTH
             System.arraycopy(input, offset, block, 0, len);
             blockOffset = len;
@@ -235,12 +243,35 @@ final class Poly1305 {
         a.setProduct(r);                // a = (a * r) % p
     }
 
+    // This method is intrinsified. The parameters aLimbs and rLimbs are unused by
+    // the Java fallback below, but the intrinsic reads and writes them directly;
+    // they correspond to this.a and this.r respectively.
+    @ForceInline
+    @IntrinsicCandidate
+    private void processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) {
+        while (length >= BLOCK_LENGTH) {
+            processBlock(input, offset, BLOCK_LENGTH);
+            offset += BLOCK_LENGTH;
+            length -= BLOCK_LENGTH;
+        }
+    }
+
+    private static void processMultipleBlocksCheck(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) {
+        Objects.checkFromIndexSize(offset, length, input.length);
+        final int numLimbs = 5; // Intrinsic expects exactly 5 limbs
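+        // ipl1305 (IntegerPolynomial1305) represents values mod 2^130 - 5 as
+        // 5 limbs of 26 bits each; the intrinsic repacks these into its own
+        // limb format, so it requires exactly this layout.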
+        if (aLimbs.length != numLimbs) {
+            throw new RuntimeException("invalid accumulator length: " + aLimbs.length);
+        }
+        if (rLimbs.length != numLimbs) {
+            throw new RuntimeException("invalid R length: " + rLimbs.length);
+        }
+    }
+
     /**
      * Partition the authentication key into the R and S components, clamp
      * the R value, and instantiate IntegerModuloP objects to R and S's
      * numeric values.
      */
-    private void setRSVals() {
+    private void setRSVals() throws InvalidKeyException {
         // Clamp the bytes in the "r" half of the key.
         keyBytes[3] &= 15;
         keyBytes[7] &= 15;
@@ -250,6 +281,24 @@ final class Poly1305 {
         keyBytes[8] &= (byte)252;
         keyBytes[12] &= (byte)252;
 
+        if (checkWeakKey) {
+            byte keyIsZero = 0;
+            for (int i = 0; i < RS_LENGTH; i++) {
+                keyIsZero |= keyBytes[i];
+            }
+            if (keyIsZero == 0) {
+                throw new InvalidKeyException("R is set to zero");
+            }
+
+            keyIsZero = 0;
+            for (int i = RS_LENGTH; i < 2*RS_LENGTH; i++) {
+                keyIsZero |= keyBytes[i];
+            }
+            if (keyIsZero == 0) {
+                throw new InvalidKeyException("S is set to zero");
+            }
+        }
+
         // Create IntegerModuloP elements from the r and s values
         r = ipl1305.getElement(keyBytes, 0, RS_LENGTH, (byte)0);
         s = ipl1305.getElement(keyBytes, RS_LENGTH, RS_LENGTH, (byte)0);
diff --git a/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java b/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java
index 7084776bfa1..2373bf903fc 100644
--- a/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java
+++ b/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java
@@ -153,6 +153,11 @@ public interface IntegerModuloP {
      */
     void asByteArray(byte[] result);
 
+    /**
+     * Returns the underlying limb array without copying. This breaks
+     * encapsulation and is intended only for use by intrinsified callers
+     * (see com.sun.crypto.provider.Poly1305).
+     */
+    long[] getLimbs();
+
     /**
      * Compute the multiplicative inverse of this field element.
      *
diff --git a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java
index 810c3fb3b86..693d88bcc76 100644
--- a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java
+++ b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java
@@ -626,6 +626,10 @@ public abstract sealed class IntegerPolynomial implements IntegerFieldModuloP
             }
             limbsToByteArray(limbs, result);
         }
+
+        public long[] getLimbs() {
+            return limbs;
+        }
     }
 
     protected class MutableElement extends Element
diff --git a/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java b/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java
index 272bb670418..2165b2cc030 100644
--- a/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java
+++ b/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java
@@ -231,6 +231,7 @@ public class AMD64 extends Architecture {
         OSPKE,
         CET_IBT,
         CET_SS,
+        AVX512_IFMA,
     }
 
     private final EnumSet<CPUFeature> features;
diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java
index df80d7d9de8..34bb118155f 100644
--- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java
+++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,3 +28,41 @@
  * @run main java.base/com.sun.crypto.provider.Poly1305UnitTest
  * @summary Unit test for com.sun.crypto.provider.Poly1305.
  */
+
+/*
+ * @test
+ * @key randomness
+ * @modules java.base/com.sun.crypto.provider
+ * @run main java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest
+ * @summary Unit test for com.sun.crypto.provider.Poly1305.
+ */
+
+/*
+ * @test
+ * @modules java.base/com.sun.crypto.provider
+ * @run main java.base/com.sun.crypto.provider.Poly1305KAT
+ * @summary Unit test for com.sun.crypto.provider.Poly1305.
+ */
+
+/*
+ * @test
+ * @key randomness
+ * @modules java.base/com.sun.crypto.provider
+ * @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305.
+ * @run main/othervm -Xcomp -XX:-TieredCompilation java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest
+ */
+
+/*
+ * @test
+ * @modules java.base/com.sun.crypto.provider
+ * @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305.
+ * @run main/othervm -Xcomp -XX:-TieredCompilation java.base/com.sun.crypto.provider.Poly1305KAT
+ */
+
+package com.sun.crypto.provider.Cipher.ChaCha20;
+
+public class Poly1305UnitTestDriver {
+    static public void main(String[] args) {
+        System.out.println("Passed");
+    }
+}
diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java
new file mode 100644
index 00000000000..3e7ecbad62e
--- /dev/null
+++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2022, Intel Corporation. All rights reserved.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package com.sun.crypto.provider;
+
+import java.util.Arrays;
+
+import javax.crypto.spec.SecretKeySpec;
+
+// This test relies on the fact that the single-byte Poly1305.engineUpdate(byte) has no intrinsic,
+// which lets us check that the intrinsic and the pure Java implementation produce the same result.
+// The test is NOT entirely deterministic: it seeds its pseudo-random number generator with a random
+// value. If a failure occurs, hardcode the reported seed to make the test deterministic.
+public class Poly1305IntrinsicFuzzTest {
+        public static void main(String[] args) throws Exception {
+                // Note: it may be useful to increase this count while developing new Poly1305 intrinsics
+                final int repeat = 100;
+                for (int i = 0; i < repeat; i++) {
+                        run();
+                }
+                System.out.println("Fuzz Success");
+        }
+
+        public static void run() throws Exception {
+                java.util.Random rnd = new java.util.Random();
+                long seed = rnd.nextLong();
+                rnd.setSeed(seed);
+
+                byte[] key = new byte[32];
+                rnd.nextBytes(key);
+                int msgLen = rnd.nextInt(128, 4096); // the vectorized x86_64 path needs at least 256 bytes; shorter lengths exercise the scalar path
+                byte[] message = new byte[msgLen];
+
+                Poly1305 authenticator = new Poly1305();
+                Poly1305 authenticatorSlow = new Poly1305();
+                if (authenticator.engineGetMacLength() != 16) {
+                        throw new RuntimeException("The length of Poly1305 MAC must be 16-bytes.");
+                }
+
+                authenticator.engineInit(new SecretKeySpec(key, 0, 32, "Poly1305"), null);
+                authenticatorSlow.engineInit(new SecretKeySpec(key, 0, 32, "Poly1305"), null);
+
+                if (rnd.nextBoolean()) {
+                        // Prime just the buffer and/or accumulator (buffer can keep at most 16 bytes from previous engineUpdate)
+                        int initDataLen = rnd.nextInt(8, 24);
+                        authenticator.engineUpdate(message, 0, initDataLen);
+                        slowUpdate(authenticatorSlow, message, 0, initDataLen);
+                }
+
+                if (rnd.nextBoolean()) {
+                        // Multiple calls to engineUpdate
+                        authenticator.engineUpdate(message, 0, message.length);
+                        slowUpdate(authenticatorSlow, message, 0, message.length);
+                }
+
+                authenticator.engineUpdate(message, 0, message.length);
+                slowUpdate(authenticatorSlow, message, 0, message.length);
+
+                byte[] tag = authenticator.engineDoFinal();
+                byte[] tagSlow = authenticatorSlow.engineDoFinal();
+
+                if (!Arrays.equals(tag, tagSlow)) {
+                        throw new RuntimeException("[Seed "+seed+"] Tag mismatch: " + Arrays.toString(tag) + " != " + Arrays.toString(tagSlow));
+                }
+        }
+
+        static void slowUpdate(Poly1305 authenticator, byte[] message, int offset, int len) {
+                int end = Math.min(message.length, offset + len);
+                for (int i = offset; i < end; i++) {
+                        authenticator.engineUpdate(message[i]);
+                }
+        }
+}
diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java
new file mode 100644
index 00000000000..649d1888c70
--- /dev/null
+++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2022, Intel Corporation. All rights reserved.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package com.sun.crypto.provider;
+
+import java.util.*;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+import javax.crypto.spec.SecretKeySpec;
+
+public class Poly1305KAT {
+    public static class TestData {
+        public TestData(String name, String keyStr, String inputStr, String outStr) {
+            HexFormat hex = HexFormat.of();
+            testName = Objects.requireNonNull(name);
+            key = hex.parseHex(Objects.requireNonNull(keyStr));
+            input = hex.parseHex(Objects.requireNonNull(inputStr));
+            expOutput = hex.parseHex(Objects.requireNonNull(outStr));
+        }
+
+        public final String testName;
+        public final byte[] key;
+        public final byte[] input;
+        public final byte[] expOutput;
+    }
+
+    public static final List<TestData> testList = new LinkedList<TestData>() {{
+        add(new TestData("RFC 7539 A.3 Test Vector #1",
+            "0000000000000000000000000000000000000000000000000000000000000000",
+            "0000000000000000000000000000000000000000000000000000000000000000" +
+            "0000000000000000000000000000000000000000000000000000000000000000",
+            "00000000000000000000000000000000"));
+        add(new TestData("RFC 7539 A.3 Test Vector #2",
+            "0000000000000000000000000000000036e5f6b5c5e06070f0efca96227a863e",
+            "416e79207375626d697373696f6e20746f20746865204945544620696e74656e" +
+            "6465642062792074686520436f6e7472696275746f7220666f72207075626c69" +
+            "636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" +
+            "20496e7465726e65742d4472616674206f722052464320616e6420616e792073" +
+            "746174656d656e74206d6164652077697468696e2074686520636f6e74657874" +
+            "206f6620616e204945544620616374697669747920697320636f6e7369646572" +
+            "656420616e20224945544620436f6e747269627574696f6e222e205375636820" +
+            "73746174656d656e747320696e636c756465206f72616c2073746174656d656e" +
+            "747320696e20494554462073657373696f6e732c2061732077656c6c20617320" +
+            "7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" +
+            "74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" +
+            "207768696368206172652061646472657373656420746f",
+            "36e5f6b5c5e06070f0efca96227a863e"));
+        add(new TestData("RFC 7539 A.3 Test Vector #3",
+            "36e5f6b5c5e06070f0efca96227a863e00000000000000000000000000000000",
+             "416e79207375626d697373696f6e20746f20746865204945544620696e74656e" +
+             "6465642062792074686520436f6e7472696275746f7220666f72207075626c69" +
+             "636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" +
+             "20496e7465726e65742d4472616674206f722052464320616e6420616e792073" +
+             "746174656d656e74206d6164652077697468696e2074686520636f6e74657874" +
+             "206f6620616e204945544620616374697669747920697320636f6e7369646572" +
+             "656420616e20224945544620436f6e747269627574696f6e222e205375636820" +
+             "73746174656d656e747320696e636c756465206f72616c2073746174656d656e" +
+             "747320696e20494554462073657373696f6e732c2061732077656c6c20617320" +
+             "7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" +
+             "74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" +
+             "207768696368206172652061646472657373656420746f",
+             "f3477e7cd95417af89a6b8794c310cf0"));
+        add(new TestData("RFC 7539 A.3 Test Vector #4",
+            "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0",
+            "2754776173206272696c6c69672c20616e642074686520736c6974687920746f" +
+            "7665730a446964206779726520616e642067696d626c6520696e207468652077" +
+            "6162653a0a416c6c206d696d737920776572652074686520626f726f676f7665" +
+            "732c0a416e6420746865206d6f6d65207261746873206f757467726162652e",
+            "4541669a7eaaee61e708dc7cbcc5eb62"));
+        add(new TestData("RFC 7539 A.3 Test Vector #5: If one uses 130-bit partial reduction, does the code handle the case where partially reducedfinal result is not fully reduced?",
+            "0200000000000000000000000000000000000000000000000000000000000000",
+            "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
+            "03000000000000000000000000000000"));
+        add(new TestData("RFC 7539 A.3 Test Vector #6: What happens if addition of s overflows modulo 2^128?",
+            "02000000000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
+            "02000000000000000000000000000000",
+            "03000000000000000000000000000000"));
+        add(new TestData("RFC 7539 A.3 Test Vector #7: What happens if data limb is all ones and there is carry from lower limb?",
+            "0100000000000000000000000000000000000000000000000000000000000000",
+            "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF" +
+            "11000000000000000000000000000000",
+             "05000000000000000000000000000000"));
+        add(new TestData("RFC 7539 A.3 Test Vector #8: What happens if final result from polynomial part is exactly 2^130-5?",
+            "0100000000000000000000000000000000000000000000000000000000000000",
+            "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFEFEFEFEFEFEFEFEFEFEFEFEFEFEFE" +
+            "01010101010101010101010101010101",
+            "00000000000000000000000000000000"));
+        add(new TestData("RFC 7539 A.3 Test Vector #9: What happens if final result from polynomial part is exactly 2^130-6?",
+            "0200000000000000000000000000000000000000000000000000000000000000",
+            "FDFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
+            "FAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"));
+        add(new TestData("RFC 7539 A.3 Test Vector #10: What happens if 5*H+L-type reduction produces 131-bit intermediate result?",
+            "0100000000000000040000000000000000000000000000000000000000000000",
+            "E33594D7505E43B900000000000000003394D7505E4379CD0100000000000000" +
+            "0000000000000000000000000000000001000000000000000000000000000000",
+            "14000000000000005500000000000000"));
+        add(new TestData("RFC 7539 A.3 Test Vector #11: What happens if 5*H+L-type reduction produces 131-bit final result?",
+            "0100000000000000040000000000000000000000000000000000000000000000",
+            "E33594D7505E43B900000000000000003394D7505E4379CD0100000000000000" +
+            "00000000000000000000000000000000",
+            "13000000000000000000000000000000"));
+    }};
+
+    public static void main(String args[]) throws Exception {
+        int testsPassed = 0;
+        int testNumber = 0;
+
+        for (TestData test : testList) {
+            System.out.println("*** Test " + ++testNumber + ": " +
+                    test.testName);
+            if (runSingleTest(test)) {
+                testsPassed++;
+            }
+        }
+        System.out.println();
+
+        if (testsPassed != testNumber) {
+            throw new RuntimeException("One or more tests failed.  " +
+                    "Check output for details");
+        }
+    }
+
+    private static boolean runSingleTest(TestData testData) throws Exception {
+        Poly1305 authenticator = new Poly1305(false);
+        authenticator.engineInit(new SecretKeySpec(testData.key, 0, testData.key.length, "Poly1305"), null);
+        authenticator.engineUpdate(testData.input, 0, testData.input.length);
+        byte[] tag = authenticator.engineDoFinal();
+        if (!Arrays.equals(tag, testData.expOutput)) {
+                System.out.println("ERROR - Output Mismatch!");
+                System.out.println("Expected:\n" +
+                        dumpHexBytes(testData.expOutput, testData.expOutput.length, "\n", " "));
+                System.out.println("Actual:\n" +
+                        dumpHexBytes(tag, tag.length, "\n", " "));
+                System.out.println();
+                return false;
+        }
+        return true;
+    }
+
+    /**
+     * Dump the hex bytes of a buffer into string form.
+     *
+     * @param data The array of bytes to dump to stdout.
+     * @param itemsPerLine The number of bytes to display per line
+     *      if the {@code lineDelim} character is blank then all bytes
+     *      will be printed on a single line.
+     * @param lineDelim The delimiter between lines
+     * @param itemDelim The delimiter between bytes
+     *
+     * @return The hexdump of the byte array
+     */
+    private static String dumpHexBytes(byte[] data, int itemsPerLine,
+            String lineDelim, String itemDelim) {
+        return dumpHexBytes(ByteBuffer.wrap(data), itemsPerLine, lineDelim,
+                itemDelim);
+    }
+
+    private static String dumpHexBytes(ByteBuffer data, int itemsPerLine,
+            String lineDelim, String itemDelim) {
+        StringBuilder sb = new StringBuilder();
+        if (data != null) {
+            data.mark();
+            int i = 0;
+            while (data.remaining() > 0) {
+                if (i % itemsPerLine == 0 && i != 0) {
+                    sb.append(lineDelim);
+                }
+                sb.append(String.format("%02X", data.get())).append(itemDelim);
+                i++;
+            }
+            data.reset();
+        }
+
+        return sb.toString();
+    }
+}
diff --git a/test/lib-test/jdk/test/whitebox/CPUInfoTest.java b/test/lib-test/jdk/test/whitebox/CPUInfoTest.java
index b5b8274b2a4..b41329d126d 100644
--- a/test/lib-test/jdk/test/whitebox/CPUInfoTest.java
+++ b/test/lib-test/jdk/test/whitebox/CPUInfoTest.java
@@ -65,7 +65,7 @@ public class CPUInfoTest {
                     "avx512_vbmi2", "avx512_vbmi",      "rdtscp",            "rdpid",
                     "hv",           "fsrm",             "avx512_bitalg",     "gfni",
                     "f16c",         "pku",              "ospke",             "cet_ibt",
-                    "cet_ss"
+                    "cet_ss",       "avx512_ifma"
                     );
             // @formatter:on
             // Checkstyle: resume
diff --git a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java
new file mode 100644
index 00000000000..aa45aa2e398
--- /dev/null
+++ b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022, Intel Corporation. All rights reserved.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.javax.crypto.full;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Setup;
+
+import java.lang.invoke.MethodHandle;
+import java.lang.invoke.MethodHandles;
+import java.lang.reflect.Method;
+import java.lang.reflect.Constructor;
+import java.security.Key;
+import java.security.spec.AlgorithmParameterSpec;
+import javax.crypto.spec.SecretKeySpec;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.annotations.Measurement;
+
+@Measurement(iterations = 3, time = 10)
+@Warmup(iterations = 3, time = 10)
+@Fork(value = 1, jvmArgsAppend = {"--add-opens", "java.base/com.sun.crypto.provider=ALL-UNNAMED"})
+public class Poly1305DigestBench extends CryptoBase {
+    public static final int SET_SIZE = 128;
+
+    @Param({"64", "256", "1024", "" + 16*1024, "" + 1024*1024})
+    int dataSize;
+
+    private byte[][] data;
+    int index = 0;
+    private static MethodHandle polyEngineInit, polyEngineUpdate, polyEngineFinal;
+    private static Object polyObj;
+
+    static {
+        try {
+            MethodHandles.Lookup lookup = MethodHandles.lookup();
+            Class<?> polyClazz = Class.forName("com.sun.crypto.provider.Poly1305");
+            Constructor<?> constructor = polyClazz.getDeclaredConstructor();
+            constructor.setAccessible(true);
+            polyObj = constructor.newInstance();
+
+            Method m = polyClazz.getDeclaredMethod("engineInit", Key.class, AlgorithmParameterSpec.class);
+            m.setAccessible(true);
+            polyEngineInit = lookup.unreflect(m);
+
+            m = polyClazz.getDeclaredMethod("engineUpdate", byte[].class, int.class, int.class);
+            m.setAccessible(true);
+            polyEngineUpdate = lookup.unreflect(m);
+
+            m = polyClazz.getDeclaredMethod("engineDoFinal");
+            m.setAccessible(true);
+            polyEngineFinal = lookup.unreflect(m);
+        } catch (Throwable ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+
+    @Setup
+    public void setup() {
+        setupProvider();
+        data = fillRandom(new byte[SET_SIZE][dataSize]);
+    }
+
+    @Benchmark
+    public byte[] digest() {
+        try {
+            byte[] d = data[index];
+            index = (index + 1) % SET_SIZE;
+            polyEngineInit.invoke(polyObj, new SecretKeySpec(d, 0, 32, "Poly1305"), null);
+            polyEngineUpdate.invoke(polyObj, d, 0, d.length);
+            return (byte[])polyEngineFinal.invoke(polyObj);
+        } catch (Throwable ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+}