From cd6bebbf34215723fad1d6bfe070a409351920c1 Mon Sep 17 00:00:00 2001
From: Jamil Nimeh <jnimeh@openjdk.org>
Date: Tue, 29 Nov 2022 14:40:20 +0000
Subject: [PATCH] 8247645: ChaCha20 intrinsics

Reviewed-by: sviswanathan, ngasson, vlivanov, ascarpino
---
 src/hotspot/cpu/aarch64/assembler_aarch64.hpp |  95 +++
 .../cpu/aarch64/macroAssembler_aarch64.hpp    |   7 +
 .../aarch64/macroAssembler_aarch64_chacha.cpp |  90 +++
 .../cpu/aarch64/stubGenerator_aarch64.cpp     | 130 ++++
 .../cpu/aarch64/vm_version_aarch64.cpp        |  11 +
 src/hotspot/cpu/x86/assembler_x86.cpp         |  20 +
 src/hotspot/cpu/x86/assembler_x86.hpp         |   2 +
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  |   2 +
 src/hotspot/cpu/x86/stubGenerator_x86_64.hpp  |  12 +
 .../cpu/x86/stubGenerator_x86_64_chacha.cpp   | 582 ++++++++++++++++++
 src/hotspot/cpu/x86/vm_version_x86.cpp        |  16 +
 src/hotspot/share/classfile/vmIntrinsics.cpp  |   3 +
 src/hotspot/share/classfile/vmIntrinsics.hpp  |   6 +
 src/hotspot/share/jvmci/vmStructs_jvmci.cpp   |   1 +
 src/hotspot/share/opto/c2compiler.cpp         |   1 +
 src/hotspot/share/opto/escape.cpp             |   1 +
 src/hotspot/share/opto/library_call.cpp       |  32 +
 src/hotspot/share/opto/library_call.hpp       |   1 +
 src/hotspot/share/opto/runtime.cpp            |  20 +
 src/hotspot/share/opto/runtime.hpp            |   1 +
 src/hotspot/share/runtime/globals.hpp         |   3 +
 src/hotspot/share/runtime/stubRoutines.cpp    |   1 +
 src/hotspot/share/runtime/stubRoutines.hpp    |   2 +
 src/hotspot/share/runtime/vmStructs.cpp       |   1 +
 .../sun/crypto/provider/ChaCha20Cipher.java   |  96 +--
 .../intrinsics/chacha/ExerciseChaCha20.java   | 317 ++++++++++
 .../intrinsics/chacha/TestChaCha20.java       | 171 +++++
 .../bench/javax/crypto/full/CipherBench.java  |   4 +-
 28 files changed, 1590 insertions(+), 38 deletions(-)
 create mode 100644 src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp
 create mode 100644 src/hotspot/cpu/x86/stubGenerator_x86_64_chacha.cpp
 create mode 100644 test/hotspot/jtreg/compiler/intrinsics/chacha/ExerciseChaCha20.java
 create mode 100644 test/hotspot/jtreg/compiler/intrinsics/chacha/TestChaCha20.java

diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
index 80bd9d7cbfe..56c34e85ce7 100644
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -2322,6 +2322,40 @@ public:
     }
   }
 
+  // Single-structure load/store method (all addressing variants)
+  void ld_st(FloatRegister Vt, SIMD_RegVariant T, int index, Address a,
+             int op1, int op2, int regs) {
+    int expectedImmediate = (regVariant_to_elemBits(T) >> 3) * regs;
+    int sVal = (T < D) ? (index >> (2 - T)) & 0x01 : 0;
+    int opcode = (T < D) ? (T << 2) : ((T & 0x02) << 2);
+    int size = (T < D) ? (index & (0x3 << T)) : 1;  // only care about low 2b
+    Register Xn = a.base();
+    int Rm;
+
+    switch (a.getMode()) {
+    case Address::base_plus_offset:
+      guarantee(a.offset() == 0, "no offset allowed here");
+      Rm = 0;
+      break;
+    case Address::post:
+      guarantee(a.offset() == expectedImmediate, "bad offset");
+      op1 |= 0b100;
+      Rm = 0b11111;
+      break;
+    case Address::post_reg:
+      op1 |= 0b100;
+      Rm = a.index()->encoding();
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+
+    starti;
+    f(0,31), f((index >> (3 - T)), 30);
+    f(op1, 29, 21), f(Rm, 20, 16), f(op2 | opcode | sVal, 15, 12);
+    f(size, 11, 10), srf(Xn, 5), rf(Vt, 0);
+  }
+
  public:
 
 #define INSN1(NAME, op1, op2)                                           \
@@ -2379,6 +2413,66 @@ public:
 #undef INSN3
 #undef INSN4
 
+// Handle common single-structure ld/st parameter sanity checks
+// for all variations (1 to 4) of SIMD register inputs.  This
+// method will call the routine that generates the opcode.
+template<typename R, typename... Rx>
+  void ldst_sstr(SIMD_RegVariant T, int index, const Address &a,
+            int op1, int op2, R firstReg, Rx... otherRegs) {
+    const FloatRegister vtSet[] = { firstReg, otherRegs... };
+    const int regCount = sizeof...(otherRegs) + 1;
+    assert(index >= 0 && (T <= D) && ((T == B && index <= 15) ||
+              (T == H && index <= 7) || (T == S && index <= 3) ||
+              (T == D && index <= 1)), "invalid index");
+    assert(regCount >= 1 && regCount <= 4, "illegal register count");
+
+    // Check to make sure when multiple SIMD registers are used
+    // that they are in successive order.
+    for (int i = 0; i < regCount - 1; i++) {
+      assert(vtSet[i]->successor() == vtSet[i + 1],
+             "Registers must be ordered");
+    }
+
+    ld_st(firstReg, T, index, a, op1, op2, regCount);
+  }
+
+// Define a set of INSN1/2/3/4 macros to handle single-structure
+// load/store instructions.
+#define INSN1(NAME, op1, op2)                                           \
+  void NAME(FloatRegister Vt, SIMD_RegVariant T, int index,             \
+            const Address &a) {                                         \
+    ldst_sstr(T, index, a, op1, op2, Vt);                               \
+ }
+
+#define INSN2(NAME, op1, op2)                                           \
+  void NAME(FloatRegister Vt, FloatRegister Vt2, SIMD_RegVariant T,     \
+            int index, const Address &a) {                              \
+    ldst_sstr(T, index, a, op1, op2, Vt, Vt2);                          \
+  }
+
+#define INSN3(NAME, op1, op2)                                           \
+  void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,     \
+            SIMD_RegVariant T, int index, const Address &a) {           \
+    ldst_sstr(T, index, a, op1, op2, Vt, Vt2, Vt3);                     \
+  }
+
+#define INSN4(NAME, op1, op2)                                           \
+  void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,     \
+            FloatRegister Vt4, SIMD_RegVariant T, int index,            \
+            const Address &a) {                                         \
+    ldst_sstr(T, index, a, op1, op2, Vt, Vt2, Vt3, Vt4);                \
+  }
+
+  INSN1(st1, 0b001101000, 0b0000);
+  INSN2(st2, 0b001101001, 0b0000);
+  INSN3(st3, 0b001101000, 0b0010);
+  INSN4(st4, 0b001101001, 0b0010);
+
+#undef INSN1
+#undef INSN2
+#undef INSN3
+#undef INSN4
+
 #define INSN(NAME, opc)                                                                 \
   void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
     starti;                                                                             \
@@ -2749,6 +2843,7 @@ public:
   INSN(ushr, 1, 0b000001, /* isSHR = */ true);
   INSN(usra, 1, 0b000101, /* isSHR = */ true);
   INSN(ssra, 0, 0b000101, /* isSHR = */ true);
+  INSN(sli,  1, 0b010101, /* isSHR = */ false);
 
 #undef INSN
 
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
index a50a5890e5e..f98d08b663b 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -1450,6 +1450,13 @@ public:
   void aesecb_decrypt(Register from, Register to, Register key, Register keylen);
   void aes_round(FloatRegister input, FloatRegister subkey);
 
+  // ChaCha20 functions support block
+  void cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
+          FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
+          FloatRegister tbl);
+  void cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
+          FloatRegister dVec, bool colToDiag);
+
   // Place an ISB after code may have been modified due to a safepoint.
   void safepoint_isb();
 
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp
new file mode 100644
index 00000000000..9e53258730e
--- /dev/null
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "macroAssembler_aarch64.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/stubRoutines.hpp"
+
+/**
+ * Perform the quarter round calculations on values contained within
+ * four SIMD registers.
+ *
+ * @param aVec the SIMD register containing only the "a" values
+ * @param bVec the SIMD register containing only the "b" values
+ * @param cVec the SIMD register containing only the "c" values
+ * @param dVec the SIMD register containing only the "d" values
+ * @param scratch scratch SIMD register used for 12 and 7 bit left rotations
+ * @param table the SIMD register used as a table for 8 bit left rotations
+ */
+void MacroAssembler::cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
+    FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
+     FloatRegister table) {
+
+  // a += b, d ^= a, d <<<= 16
+  addv(aVec, T4S, aVec, bVec);
+  eor(dVec, T16B, dVec, aVec);
+  rev32(dVec, T8H, dVec);
+
+  // c += d, b ^= c, b <<<= 12
+  addv(cVec, T4S, cVec, dVec);
+  eor(scratch, T16B, bVec, cVec);
+  ushr(bVec, T4S, scratch, 20);
+  sli(bVec, T4S, scratch, 12);
+
+  // a += b, d ^= a, d <<<= 8
+  addv(aVec, T4S, aVec, bVec);
+  eor(dVec, T16B, dVec, aVec);
+  tbl(dVec, T16B, dVec,  1, table);
+
+  // c += d, b ^= c, b <<<= 7
+  addv(cVec, T4S, cVec, dVec);
+  eor(scratch, T16B, bVec, cVec);
+  ushr(bVec, T4S, scratch, 25);
+  sli(bVec, T4S, scratch, 7);
+}
+
+/**
+ * Shift the b, c, and d vectors between columnar and diagonal representations.
+ * Note that the "a" vector does not shift.
+ *
+ * @param bVec the SIMD register containing only the "b" values
+ * @param cVec the SIMD register containing only the "c" values
+ * @param dVec the SIMD register containing only the "d" values
+ * @param colToDiag true if moving columnar to diagonal, false if
+ *                  moving diagonal back to columnar.
+ */
+void MacroAssembler::cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
+    FloatRegister dVec, bool colToDiag) {
+  int bShift = colToDiag ? 4 : 12;
+  int cShift = 8;
+  int dShift = colToDiag ? 12 : 4;
+
+  ext(bVec, T16B, bVec, bVec, bShift);
+  ext(cVec, T16B, cVec, cVec, cShift);
+  ext(dVec, T16B, dVec, dVec, dShift);
+}
diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
index 74a908edc75..484df788026 100644
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@@ -4081,6 +4081,132 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  // ChaCha20 block function.  This version parallelizes by loading
+  // individual 32-bit state elements into vectors for four blocks
+  // (e.g. all four blocks' worth of state[0] in one register, etc.)
+  //
+  // state (int[16]) = c_rarg0
+  // keystream (byte[1024]) = c_rarg1
+  // return - number of bytes of keystream (always 256)
+  address generate_chacha20Block_blockpar() {
+    Label L_twoRounds, L_cc20_const;
+    // The constant data is broken into two 128-bit segments to be loaded
+    // onto FloatRegisters.  The first 128 bits are a counter add overlay
+    // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
+    // The second 128-bits is a table constant used for 8-bit left rotations.
+    __ BIND(L_cc20_const);
+    __ emit_int64(0x0000000100000000UL);
+    __ emit_int64(0x0000000300000002UL);
+    __ emit_int64(0x0605040702010003UL);
+    __ emit_int64(0x0E0D0C0F0A09080BUL);
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "chacha20Block");
+    address start = __ pc();
+    __ enter();
+
+    int i, j;
+    const Register state = c_rarg0;
+    const Register keystream = c_rarg1;
+    const Register loopCtr = r10;
+    const Register tmpAddr = r11;
+
+    const FloatRegister stateFirst = v0;
+    const FloatRegister stateSecond = v1;
+    const FloatRegister stateThird = v2;
+    const FloatRegister stateFourth = v3;
+    const FloatRegister origCtrState = v28;
+    const FloatRegister scratch = v29;
+    const FloatRegister lrot8Tbl = v30;
+
+    // Organize SIMD registers in an array that facilitates
+    // putting repetitive opcodes into loop structures.  It is
+    // important that each grouping of 4 registers is monotonically
+    // increasing to support the requirements of multi-register
+    // instructions (e.g. ld4r, st4, etc.)
+    const FloatRegister workSt[16] = {
+         v4,  v5,  v6,  v7, v16, v17, v18, v19,
+        v20, v21, v22, v23, v24, v25, v26, v27
+    };
+
+    // Load from memory and interlace across 16 SIMD registers,
+    // with each word from memory being broadcast to all lanes of
+    // each successive SIMD register.
+    //      Addr(0) -> All lanes in workSt[i]
+    //      Addr(4) -> All lanes in workSt[i + 1], etc.
+    __ mov(tmpAddr, state);
+    for (i = 0; i < 16; i += 4) {
+      __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
+          __ post(tmpAddr, 16));
+    }
+
+    // Pull in constant data.  The first 16 bytes are the add overlay
+    // which is applied to the vector holding the counter (state[12]).
+    // The second 16 bytes is the index register for the 8-bit left
+    // rotation tbl instruction.
+    __ adr(tmpAddr, L_cc20_const);
+    __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
+    __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
+
+    // Set up the 10 iteration loop and perform all 8 quarter round ops
+    __ mov(loopCtr, 10);
+    __ BIND(L_twoRounds);
+
+    __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
+        scratch, lrot8Tbl);
+    __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
+        scratch, lrot8Tbl);
+    __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
+        scratch, lrot8Tbl);
+    __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
+        scratch, lrot8Tbl);
+
+    __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
+        scratch, lrot8Tbl);
+    __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
+        scratch, lrot8Tbl);
+    __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
+        scratch, lrot8Tbl);
+    __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
+        scratch, lrot8Tbl);
+
+    // Decrement and iterate
+    __ sub(loopCtr, loopCtr, 1);
+    __ cbnz(loopCtr, L_twoRounds);
+
+    __ mov(tmpAddr, state);
+
+    // Add the starting state back to the post-loop keystream
+    // state.  We read/interlace the state array from memory into
+    // 4 registers similar to what we did in the beginning.  Then
+    // add the counter overlay onto workSt[12] at the end.
+    for (i = 0; i < 16; i += 4) {
+      __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
+          __ post(tmpAddr, 16));
+      __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
+      __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
+      __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
+      __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
+    }
+    __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
+
+    // Write to key stream, storing the same element out of workSt[0..15]
+    // to consecutive 4-byte offsets in the key stream buffer, then repeating
+    // for the next element position.
+    for (i = 0; i < 4; i++) {
+      for (j = 0; j < 16; j += 4) {
+        __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
+            __ post(keystream, 16));
+      }
+    }
+
+    __ mov(r0, 256);             // Return length of output keystream
+    __ leave();
+    __ ret(lr);
+
+    return start;
+  }
+
   /**
    *  Arguments:
    *
@@ -7919,6 +8045,10 @@ class StubGenerator: public StubCodeGenerator {
     }
 #endif // COMPILER2
 
+    if (UseChaCha20Intrinsics) {
+      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
+    }
+
     if (UseBASE64Intrinsics) {
         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
index 7fc82ef2358..25853cc38b1 100644
--- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
@@ -366,6 +366,17 @@ void VM_Version::initialize() {
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
+  if (_features & CPU_ASIMD) {
+      if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
+          UseChaCha20Intrinsics = true;
+      }
+  } else if (UseChaCha20Intrinsics) {
+      if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
+          warning("ChaCha20 intrinsic requires ASIMD instructions");
+      }
+      FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) {
     UseBASE64Intrinsics = true;
   }
diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp
index d1831aac96c..ad7543cfb78 100644
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@@ -5269,6 +5269,16 @@ void Assembler::pshufhw(XMMRegister dst, XMMRegister src, int mode) {
   emit_int24(0x70, (0xC0 | encode), mode & 0xFF);
 }
 
+void Assembler::vpshufhw(XMMRegister dst, XMMRegister src, int mode, int vector_len) {
+    assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
+            (vector_len == AVX_256bit ? VM_Version::supports_avx2() :
+            (vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : false)), "");
+    NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+    InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+    int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+    emit_int24(0x70, (0xC0 | encode), mode & 0xFF);
+}
+
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -5290,6 +5300,16 @@ void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
   emit_int8(mode & 0xFF);
 }
 
+void Assembler::vpshuflw(XMMRegister dst, XMMRegister src, int mode, int vector_len) {
+    assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
+            (vector_len == AVX_256bit ? VM_Version::supports_avx2() :
+            (vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : false)), "");
+    NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+    InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+    int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+    emit_int24(0x70, (0xC0 | encode), mode & 0xFF);
+}
+
 void Assembler::evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
   assert(VM_Version::supports_evex(), "requires EVEX support");
   assert(vector_len == Assembler::AVX_256bit || vector_len == Assembler::AVX_512bit, "");
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp
index 04dbb7907be..7b4288e775e 100644
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1946,6 +1946,8 @@ private:
   void pshufhw(XMMRegister dst, XMMRegister src, int mode);
   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
   void pshuflw(XMMRegister dst, Address src,     int mode);
+  void vpshufhw(XMMRegister dst, XMMRegister src, int mode, int vector_len);
+  void vpshuflw(XMMRegister dst, XMMRegister src, int mode, int vector_len);
 
   //shuffle floats and doubles
   void shufps(XMMRegister, XMMRegister, int);
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index c3e0b79dc46..95dc03200eb 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -3809,6 +3809,8 @@ void StubGenerator::generate_all() {
 
   generate_ghash_stubs();
 
+  generate_chacha_stubs();
+
   if (UseMD5Intrinsics) {
     StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
     StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
index 5e97e1e9a44..d9680c75542 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
@@ -387,6 +387,18 @@ class StubGenerator: public StubCodeGenerator {
   // Ghash single and multi block operations using AVX instructions
   address generate_avx_ghash_processBlocks();
 
+  // ChaCha20 stubs and helper functions
+  void generate_chacha_stubs();
+  address generate_chacha20Block_avx();
+  address generate_chacha20Block_avx512();
+  void cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec,
+    XMMRegister cVec, XMMRegister dVec, XMMRegister scratch,
+    XMMRegister lrot8, XMMRegister lrot16, int vector_len);
+  void cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec,
+    XMMRegister dVec, int vector_len, bool colToDiag);
+  void cc20_keystream_collate_avx512(XMMRegister aVec, XMMRegister bVec,
+    XMMRegister cVec, XMMRegister dVec, Register baseAddr, int baseOffset);
+
   // Poly1305 multiblock using IFMA instructions
   address generate_poly1305_processBlocks();
   void poly1305_process_blocks_avx512(const Register input, const Register length,
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_chacha.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_chacha.cpp
new file mode 100644
index 00000000000..190b11efca0
--- /dev/null
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_chacha.cpp
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "macroAssembler_x86.hpp"
+#include "stubGenerator_x86_64.hpp"
+
+#define __ _masm->
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#else
+#define BLOCK_COMMENT(str) __ block_comment(str)
+#endif // PRODUCT
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+
+// Constants
+
+/**
+ * This AVX/AVX2 add mask generation can be used for multiple duties:
+ *      1.) Provide +0/+1 counter increments by loading 256 bits
+ *          at offset 0
+ *      2.) Provide +2/+2 counter increments for the second set
+ *          of 4 AVX2 registers at offset 32 (256-bit load)
+ *      3.) Provide a +1 increment for the second set of 4 AVX
+ *          registers at offset 16 (128-bit load)
+ */
+ATTRIBUTE_ALIGNED(64) uint64_t CC20_COUNTER_ADD_AVX[] = {
+    0x0000000000000000UL, 0x0000000000000000UL,
+    0x0000000000000001UL, 0x0000000000000000UL,
+    0x0000000000000002UL, 0x0000000000000000UL,
+    0x0000000000000002UL, 0x0000000000000000UL,
+};
+static address chacha20_ctradd_avx() {
+  return (address)CC20_COUNTER_ADD_AVX;
+}
+
+/**
+ * Add masks for 4-block ChaCha20 Block calculations
+ * The first 512 bits creates a +0/+1/+2/+3 add overlay.
+ * The second 512 bits is a +4/+4/+4/+4 add overlay.  This
+ * can be used to increment the counter fields for the next 4 blocks.
+ */
+ATTRIBUTE_ALIGNED(64) uint64_t CC20_COUNTER_ADD_AVX512[] = {
+    0x0000000000000000UL, 0x0000000000000000UL,
+    0x0000000000000001UL, 0x0000000000000000UL,
+    0x0000000000000002UL, 0x0000000000000000UL,
+    0x0000000000000003UL, 0x0000000000000000UL,
+
+    0x0000000000000004UL, 0x0000000000000000UL,
+    0x0000000000000004UL, 0x0000000000000000UL,
+    0x0000000000000004UL, 0x0000000000000000UL,
+    0x0000000000000004UL, 0x0000000000000000UL
+};
+static address chacha20_ctradd_avx512() {
+  return (address)CC20_COUNTER_ADD_AVX512;
+}
+
+/**
+ * The first 256 bits represents a byte-wise permutation
+ * for an 8-bit left-rotation on 32-bit lanes.
+ * The second 256 bits is a 16-bit rotation on 32-bit lanes.
+ */
+ATTRIBUTE_ALIGNED(64) uint64_t CC20_LROT_CONSTS[] = {
+    0x0605040702010003UL, 0x0E0D0C0F0A09080BUL,
+    0x0605040702010003UL, 0x0E0D0C0F0A09080BUL,
+
+    0x0504070601000302UL, 0x0D0C0F0E09080B0AUL,
+    0x0504070601000302UL, 0x0D0C0F0E09080B0AUL
+};
+static address chacha20_lrot_consts() {
+  return (address)CC20_LROT_CONSTS;
+}
+
+
+
+void StubGenerator::generate_chacha_stubs() {
+  // Generate ChaCha20 intrinsics code
+  if (UseChaCha20Intrinsics) {
+    if (VM_Version::supports_evex()) {
+      StubRoutines::_chacha20Block = generate_chacha20Block_avx512();
+    } else {    // Either AVX or AVX2 is supported
+      assert(VM_Version::supports_avx() == true, "Must at least support AVX instructions");
+      StubRoutines::_chacha20Block = generate_chacha20Block_avx();
+    }
+  }
+}
+
+/* The 2-block AVX/AVX2-enabled ChaCha20 block function implementation */
+address StubGenerator::generate_chacha20Block_avx() {
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, "StubRoutines", "chacha20Block");
+  address start = __ pc();
+
+  Label L_twoRounds;
+  const Register state        = c_rarg0;
+  const Register result       = c_rarg1;
+  const Register loopCounter  = r8;
+  const Register rotAddr      = r9;
+
+  const XMMRegister aState = xmm0;
+  const XMMRegister bState = xmm1;
+  const XMMRegister cState = xmm2;
+  const XMMRegister dState = xmm3;
+  const XMMRegister a1Vec = xmm4;
+  const XMMRegister b1Vec = xmm5;
+  const XMMRegister c1Vec = xmm6;
+  const XMMRegister d1Vec = xmm7;
+  const XMMRegister a2Vec = xmm8;
+  const XMMRegister b2Vec = xmm9;
+  const XMMRegister c2Vec = xmm10;
+  const XMMRegister d2Vec = xmm11;
+  const XMMRegister scratch = xmm12;
+  const XMMRegister d2State = xmm13;
+  const XMMRegister lrot8 = xmm14;
+  const XMMRegister lrot16 = xmm15;
+
+  int vector_len;
+  int outlen;
+
+  // This function will only be called if AVX2 or AVX are supported
+  // AVX512 uses a different function.
+  if (VM_Version::supports_avx2()) {
+    vector_len = Assembler::AVX_256bit;
+    outlen = 256;
+  } else if (VM_Version::supports_avx()) {
+    vector_len = Assembler::AVX_128bit;
+    outlen = 128;
+  }
+
+  __ enter();
+
+  // Load the initial state in columnar orientation and then copy
+  // that starting state to the working register set.
+  // Also load the address of the add mask for later use in handling
+  // multi-block counter increments.
+  __ lea(rotAddr, ExternalAddress(chacha20_lrot_consts()));
+  __ lea(rax, ExternalAddress(chacha20_ctradd_avx()));
+  if (vector_len == Assembler::AVX_128bit) {
+    __ movdqu(aState, Address(state, 0));       // Bytes 0 - 15 -> a1Vec
+    __ movdqu(bState, Address(state, 16));      // Bytes 16 - 31 -> b1Vec
+    __ movdqu(cState, Address(state, 32));      // Bytes 32 - 47 -> c1Vec
+    __ movdqu(dState, Address(state, 48));      // Bytes 48 - 63 -> d1Vec
+
+    __ movdqu(a1Vec, aState);
+    __ movdqu(b1Vec, bState);
+    __ movdqu(c1Vec, cState);
+    __ movdqu(d1Vec, dState);
+
+    __ movdqu(a2Vec, aState);
+    __ movdqu(b2Vec, bState);
+    __ movdqu(c2Vec, cState);
+    __ vpaddd(d2State, dState, Address(rax, 16), vector_len);
+    __ movdqu(d2Vec, d2State);
+    __ movdqu(lrot8, Address(rotAddr, 0));      // Load 8-bit lrot const
+    __ movdqu(lrot16, Address(rotAddr, 32));    // Load 16-bit lrot const
+  } else {
+    // We will broadcast each 128-bit segment of the state array into
+    // the high and low halves of ymm state registers.  Then apply the add
+    // mask to the dState register.  These will then be copied into the
+    // a/b/c/d1Vec working registers.
+    __ vbroadcastf128(aState, Address(state, 0), vector_len);
+    __ vbroadcastf128(bState, Address(state, 16), vector_len);
+    __ vbroadcastf128(cState, Address(state, 32), vector_len);
+    __ vbroadcastf128(dState, Address(state, 48), vector_len);
+    __ vpaddd(dState, dState, Address(rax, 0), vector_len);
+    __ vpaddd(d2State, dState, Address(rax, 32), vector_len);
+
+    __ vmovdqu(a1Vec, aState);
+    __ vmovdqu(b1Vec, bState);
+    __ vmovdqu(c1Vec, cState);
+    __ vmovdqu(d1Vec, dState);
+
+    __ vmovdqu(a2Vec, aState);
+    __ vmovdqu(b2Vec, bState);
+    __ vmovdqu(c2Vec, cState);
+    __ vmovdqu(d2Vec, d2State);
+    __ vmovdqu(lrot8, Address(rotAddr, 0));      // Load 8-bit lrot const
+    __ vmovdqu(lrot16, Address(rotAddr, 32));    // Load 16-bit lrot const
+  }
+
+  __ movl(loopCounter, 10);                   // Set 10 2-round iterations
+  __ BIND(L_twoRounds);
+
+  // The first quarter round macro call covers the first 4 QR operations:
+  //  Qround(state, 0, 4, 8,12)
+  //  Qround(state, 1, 5, 9,13)
+  //  Qround(state, 2, 6,10,14)
+  //  Qround(state, 3, 7,11,15)
+  cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
+      lrot8, lrot16, vector_len);
+  cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
+      lrot8, lrot16, vector_len);
+
+  // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors
+  // to diagonals.  The a1Vec does not need to change orientation.
+  cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, true);
+  cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, true);
+
+  // The second set of operations on the vectors covers the second 4 quarter
+  // round operations, now acting on the diagonals:
+  //  Qround(state, 0, 5,10,15)
+  //  Qround(state, 1, 6,11,12)
+  //  Qround(state, 2, 7, 8,13)
+  //  Qround(state, 3, 4, 9,14)
+  cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
+      lrot8, lrot16, vector_len);
+  cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
+      lrot8, lrot16, vector_len);
+
+  // Before we start the next iteration, we need to perform shuffles
+  // on the b/c/d vectors to move them back to columnar organizations
+  // from their current diagonal orientation.
+  cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, false);
+  cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, false);
+
+  __ decrement(loopCounter);
+  __ jcc(Assembler::notZero, L_twoRounds);
+
+  // Add the original start state back into the current state.
+  __ vpaddd(a1Vec, a1Vec, aState, vector_len);
+  __ vpaddd(b1Vec, b1Vec, bState, vector_len);
+  __ vpaddd(c1Vec, c1Vec, cState, vector_len);
+  __ vpaddd(d1Vec, d1Vec, dState, vector_len);
+
+  __ vpaddd(a2Vec, a2Vec, aState, vector_len);
+  __ vpaddd(b2Vec, b2Vec, bState, vector_len);
+  __ vpaddd(c2Vec, c2Vec, cState, vector_len);
+  __ vpaddd(d2Vec, d2Vec, d2State, vector_len);
+
+  // Write the data to the keystream array
+  if (vector_len == Assembler::AVX_128bit) {
+    __ movdqu(Address(result, 0), a1Vec);
+    __ movdqu(Address(result, 16), b1Vec);
+    __ movdqu(Address(result, 32), c1Vec);
+    __ movdqu(Address(result, 48), d1Vec);
+    __ movdqu(Address(result, 64), a2Vec);
+    __ movdqu(Address(result, 80), b2Vec);
+    __ movdqu(Address(result, 96), c2Vec);
+    __ movdqu(Address(result, 112), d2Vec);
+  } else {
+    // Each half of the YMM has to be written 64 bytes apart from
+    // each other in memory so the final keystream buffer holds
+    // two consecutive keystream blocks.
+    __ vextracti128(Address(result, 0), a1Vec, 0);
+    __ vextracti128(Address(result, 64), a1Vec, 1);
+    __ vextracti128(Address(result, 16), b1Vec, 0);
+    __ vextracti128(Address(result, 80), b1Vec, 1);
+    __ vextracti128(Address(result, 32), c1Vec, 0);
+    __ vextracti128(Address(result, 96), c1Vec, 1);
+    __ vextracti128(Address(result, 48), d1Vec, 0);
+    __ vextracti128(Address(result, 112), d1Vec, 1);
+
+    __ vextracti128(Address(result, 128), a2Vec, 0);
+    __ vextracti128(Address(result, 192), a2Vec, 1);
+    __ vextracti128(Address(result, 144), b2Vec, 0);
+    __ vextracti128(Address(result, 208), b2Vec, 1);
+    __ vextracti128(Address(result, 160), c2Vec, 0);
+    __ vextracti128(Address(result, 224), c2Vec, 1);
+    __ vextracti128(Address(result, 176), d2Vec, 0);
+    __ vextracti128(Address(result, 240), d2Vec, 1);
+  }
+
+  // This function will always write 128 or 256 bytes into the
+  // key stream buffer, depending on the length of the SIMD
+  // registers.  That length should be returned through %rax.
+  __ mov64(rax, outlen);
+
+  __ leave();
+  __ ret(0);
+  return start;
+}
+
+/* The AVX512-enabled ChaCha20 block function, generating 16 keystream blocks (1024 bytes) per call */
+address StubGenerator::generate_chacha20Block_avx512() {
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, "StubRoutines", "chacha20Block");
+  address start = __ pc();
+
+  Label L_twoRounds;
+  const Register state        = c_rarg0;
+  const Register result       = c_rarg1;
+  const Register loopCounter  = r8;
+
+  const XMMRegister aState = xmm0;
+  const XMMRegister bState = xmm1;
+  const XMMRegister cState = xmm2;
+  const XMMRegister dState = xmm3;
+  const XMMRegister a1Vec = xmm4;
+  const XMMRegister b1Vec = xmm5;
+  const XMMRegister c1Vec = xmm6;
+  const XMMRegister d1Vec = xmm7;
+  const XMMRegister a2Vec = xmm8;
+  const XMMRegister b2Vec = xmm9;
+  const XMMRegister c2Vec = xmm10;
+  const XMMRegister d2Vec = xmm11;
+  const XMMRegister a3Vec = xmm12;
+  const XMMRegister b3Vec = xmm13;
+  const XMMRegister c3Vec = xmm14;
+  const XMMRegister d3Vec = xmm15;
+  const XMMRegister a4Vec = xmm16;
+  const XMMRegister b4Vec = xmm17;
+  const XMMRegister c4Vec = xmm18;
+  const XMMRegister d4Vec = xmm19;
+  const XMMRegister d2State = xmm20;
+  const XMMRegister d3State = xmm21;
+  const XMMRegister d4State = xmm22;
+  const XMMRegister scratch = xmm23;
+
+  __ enter();
+
+  // Load the initial state in columnar orientation.
+  // We will broadcast each 128-bit segment of the state array into
+  // all four double-quadword slots on ZMM State registers.  They will
+  // be copied into the working ZMM registers and then added back in
+  // at the very end of the block function.  The add mask should be
+  // applied to the dState register so it does not need to be fetched
+  // when adding the start state back into the final working state.
+  __ lea(rax, ExternalAddress(chacha20_ctradd_avx512()));
+  __ evbroadcasti32x4(aState, Address(state, 0), Assembler::AVX_512bit);
+  __ evbroadcasti32x4(bState, Address(state, 16), Assembler::AVX_512bit);
+  __ evbroadcasti32x4(cState, Address(state, 32), Assembler::AVX_512bit);
+  __ evbroadcasti32x4(dState, Address(state, 48), Assembler::AVX_512bit);
+  __ vpaddd(dState, dState, Address(rax, 0), Assembler::AVX_512bit);
+  __ evmovdqul(scratch, Address(rax, 64), Assembler::AVX_512bit);
+  __ vpaddd(d2State, dState, scratch, Assembler::AVX_512bit);
+  __ vpaddd(d3State, d2State, scratch, Assembler::AVX_512bit);
+  __ vpaddd(d4State, d3State, scratch, Assembler::AVX_512bit);
+
+  __ evmovdqul(a1Vec, aState, Assembler::AVX_512bit);
+  __ evmovdqul(b1Vec, bState, Assembler::AVX_512bit);
+  __ evmovdqul(c1Vec, cState, Assembler::AVX_512bit);
+  __ evmovdqul(d1Vec, dState, Assembler::AVX_512bit);
+
+  __ evmovdqul(a2Vec, aState, Assembler::AVX_512bit);
+  __ evmovdqul(b2Vec, bState, Assembler::AVX_512bit);
+  __ evmovdqul(c2Vec, cState, Assembler::AVX_512bit);
+  __ evmovdqul(d2Vec, d2State, Assembler::AVX_512bit);
+
+  __ evmovdqul(a3Vec, aState, Assembler::AVX_512bit);
+  __ evmovdqul(b3Vec, bState, Assembler::AVX_512bit);
+  __ evmovdqul(c3Vec, cState, Assembler::AVX_512bit);
+  __ evmovdqul(d3Vec, d3State, Assembler::AVX_512bit);
+
+  __ evmovdqul(a4Vec, aState, Assembler::AVX_512bit);
+  __ evmovdqul(b4Vec, bState, Assembler::AVX_512bit);
+  __ evmovdqul(c4Vec, cState, Assembler::AVX_512bit);
+  __ evmovdqul(d4Vec, d4State, Assembler::AVX_512bit);
+
+  __ movl(loopCounter, 10);                       // Set 10 2-round iterations
+  __ BIND(L_twoRounds);
+
+  // The first set of operations on the vectors covers the first 4 quarter
+  // round operations:
+  //  Qround(state, 0, 4, 8,12)
+  //  Qround(state, 1, 5, 9,13)
+  //  Qround(state, 2, 6,10,14)
+  //  Qround(state, 3, 7,11,15)
+  cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
+      xnoreg, xnoreg, Assembler::AVX_512bit);
+  cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
+      xnoreg, xnoreg, Assembler::AVX_512bit);
+  cc20_quarter_round_avx(a3Vec, b3Vec, c3Vec, d3Vec, scratch,
+      xnoreg, xnoreg, Assembler::AVX_512bit);
+  cc20_quarter_round_avx(a4Vec, b4Vec, c4Vec, d4Vec, scratch,
+      xnoreg, xnoreg, Assembler::AVX_512bit);
+
+  // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors
+  // to diagonals.  The a1Vec does not need to change orientation.
+  cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, Assembler::AVX_512bit, true);
+  cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, Assembler::AVX_512bit, true);
+  cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, Assembler::AVX_512bit, true);
+  cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, Assembler::AVX_512bit, true);
+
+  // The second set of operations on the vectors covers the second 4 quarter
+  // round operations, now acting on the diagonals:
+  //  Qround(state, 0, 5,10,15)
+  //  Qround(state, 1, 6,11,12)
+  //  Qround(state, 2, 7, 8,13)
+  //  Qround(state, 3, 4, 9,14)
+  cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
+      xnoreg, xnoreg, Assembler::AVX_512bit);
+  cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
+      xnoreg, xnoreg, Assembler::AVX_512bit);
+  cc20_quarter_round_avx(a3Vec, b3Vec, c3Vec, d3Vec, scratch,
+      xnoreg, xnoreg, Assembler::AVX_512bit);
+  cc20_quarter_round_avx(a4Vec, b4Vec, c4Vec, d4Vec, scratch,
+      xnoreg, xnoreg, Assembler::AVX_512bit);
+
+  // Before we start the next iteration, we need to perform shuffles
+  // on the b/c/d vectors to move them back to columnar organizations
+  // from their current diagonal orientation.
+  cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, Assembler::AVX_512bit, false);
+  cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, Assembler::AVX_512bit, false);
+  cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, Assembler::AVX_512bit, false);
+  cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, Assembler::AVX_512bit, false);
+
+  __ decrement(loopCounter);
+  __ jcc(Assembler::notZero, L_twoRounds);
+
+  // Add the initial state now held on the a/b/c/dState registers to the
+  // final working register values.  The d-lane start states (dState and
+  // d2/d3/d4State) already have the per-lane counter add masks applied.
+  __ vpaddd(a1Vec, a1Vec, aState, Assembler::AVX_512bit);
+  __ vpaddd(b1Vec, b1Vec, bState, Assembler::AVX_512bit);
+  __ vpaddd(c1Vec, c1Vec, cState, Assembler::AVX_512bit);
+  __ vpaddd(d1Vec, d1Vec, dState, Assembler::AVX_512bit);
+
+  __ vpaddd(a2Vec, a2Vec, aState, Assembler::AVX_512bit);
+  __ vpaddd(b2Vec, b2Vec, bState, Assembler::AVX_512bit);
+  __ vpaddd(c2Vec, c2Vec, cState, Assembler::AVX_512bit);
+  __ vpaddd(d2Vec, d2Vec, d2State, Assembler::AVX_512bit);
+
+  __ vpaddd(a3Vec, a3Vec, aState, Assembler::AVX_512bit);
+  __ vpaddd(b3Vec, b3Vec, bState, Assembler::AVX_512bit);
+  __ vpaddd(c3Vec, c3Vec, cState, Assembler::AVX_512bit);
+  __ vpaddd(d3Vec, d3Vec, d3State, Assembler::AVX_512bit);
+
+  __ vpaddd(a4Vec, a4Vec, aState, Assembler::AVX_512bit);
+  __ vpaddd(b4Vec, b4Vec, bState, Assembler::AVX_512bit);
+  __ vpaddd(c4Vec, c4Vec, cState, Assembler::AVX_512bit);
+  __ vpaddd(d4Vec, d4Vec, d4State, Assembler::AVX_512bit);
+
+  // Write the ZMM state registers out to the key stream buffer
+  // Each ZMM is divided into 4 128-bit segments.  Each segment
+  // is written to memory at 64-byte displacements from one
+  // another.  The result is that all 4 blocks will be in their
+  // proper order when serialized.
+  cc20_keystream_collate_avx512(a1Vec, b1Vec, c1Vec, d1Vec, result, 0);
+  cc20_keystream_collate_avx512(a2Vec, b2Vec, c2Vec, d2Vec, result, 256);
+  cc20_keystream_collate_avx512(a3Vec, b3Vec, c3Vec, d3Vec, result, 512);
+  cc20_keystream_collate_avx512(a4Vec, b4Vec, c4Vec, d4Vec, result, 768);
+
+  // This function will always write 1024 bytes into the key stream buffer
+  // and that length should be returned through %rax.
+  __ mov64(rax, 1024);
+
+  __ leave();
+  __ ret(0);
+  return start;
+}
+
+/**
+ * Provide a function that implements the ChaCha20 quarter round function.
+ *
+ * @param aVec the SIMD register containing only the "a" values
+ * @param bVec the SIMD register containing only the "b" values
+ * @param cVec the SIMD register containing only the "c" values
+ * @param dVec the SIMD register containing only the "d" values
+ * @param scratch SIMD register used for non-byte-aligned left rotations
+ * @param lrot8 shuffle control mask for an 8-bit left rotation (32-bit lane)
+ * @param lrot16 shuffle control mask for a 16-bit left rotation (32-bit lane)
+ * @param vector_len the length of the vector
+ */
+void StubGenerator::cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec,
+    XMMRegister cVec, XMMRegister dVec, XMMRegister scratch,
+    XMMRegister lrot8, XMMRegister lrot16, int vector_len) {
+
+  // a += b; d ^= a; d <<<= 16
+  __ vpaddd(aVec, aVec, bVec, vector_len);
+  __ vpxor(dVec, dVec, aVec, vector_len);
+  if (vector_len == Assembler::AVX_512bit) {
+    __ evprold(dVec, dVec, 16, vector_len);
+  } else {
+    __ vpshufb(dVec, dVec, lrot16, vector_len);
+  }
+
+  // c += d; b ^= c; b <<<= 12 (b << 12 | scratch >>> 20)
+  __ vpaddd(cVec, cVec, dVec, vector_len);
+  __ vpxor(bVec, bVec, cVec, vector_len);
+  if (vector_len == Assembler::AVX_512bit) {
+    __ evprold(bVec, bVec, 12, vector_len);
+  } else {
+    __ vpsrld(scratch, bVec, 20, vector_len);
+    __ vpslld(bVec, bVec, 12, vector_len);
+    __ vpor(bVec, bVec, scratch, vector_len);
+  }
+
+  // a += b; d ^= a; d <<<= 8 (d << 8 | scratch >>> 24)
+  __ vpaddd(aVec, aVec, bVec, vector_len);
+  __ vpxor(dVec, dVec, aVec, vector_len);
+  if (vector_len == Assembler::AVX_512bit) {
+    __ evprold(dVec, dVec, 8, vector_len);
+  } else {
+    __ vpshufb(dVec, dVec, lrot8, vector_len);
+  }
+
+  // c += d; b ^= c; b <<<= 7 (b << 7 | scratch >>> 25)
+  __ vpaddd(cVec, cVec, dVec, vector_len);
+  __ vpxor(bVec, bVec, cVec, vector_len);
+  if (vector_len == Assembler::AVX_512bit) {
+    __ evprold(bVec, bVec, 7, vector_len);
+  } else {
+    __ vpsrld(scratch, bVec, 25, vector_len);
+    __ vpslld(bVec, bVec, 7, vector_len);
+    __ vpor(bVec, bVec, scratch, vector_len);
+  }
+}
+
+/**
+ * Shift the b, c, and d vectors between columnar and diagonal representations.
+ * Note that the "a" vector does not shift.
+ *
+ * @param bVec the SIMD register containing only the "b" values
+ * @param cVec the SIMD register containing only the "c" values
+ * @param dVec the SIMD register containing only the "d" values
+ * @param vector_len the size of the SIMD register to operate upon
+ * @param colToDiag true if moving columnar to diagonal, false if
+ *                  moving diagonal back to columnar.
+ */
+void StubGenerator::cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec,
+    XMMRegister dVec, int vector_len, bool colToDiag) {
+  int bShift = colToDiag ? 0x39 : 0x93;
+  int cShift = 0x4E;
+  int dShift = colToDiag ? 0x93 : 0x39;
+
+  __ vpshufd(bVec, bVec, bShift, vector_len);
+  __ vpshufd(cVec, cVec, cShift, vector_len);
+  __ vpshufd(dVec, dVec, dShift, vector_len);
+}
+
+/**
+ * Write 256 bytes of keystream output held in 4 AVX512 SIMD registers
+ * in a quarter round parallel organization.
+ *
+ * @param aVec the SIMD register containing only the "a" values
+ * @param bVec the SIMD register containing only the "b" values
+ * @param cVec the SIMD register containing only the "c" values
+ * @param dVec the SIMD register containing only the "d" values
+ * @param baseAddr the register holding the base output address
+ * @param baseOffset the offset from baseAddr for writes
+ */
+void StubGenerator::cc20_keystream_collate_avx512(XMMRegister aVec, XMMRegister
+bVec,
+    XMMRegister cVec, XMMRegister dVec, Register baseAddr, int baseOffset) {
+  __ vextracti32x4(Address(baseAddr, baseOffset + 0), aVec, 0);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 64), aVec, 1);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 128), aVec, 2);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 192), aVec, 3);
+
+  __ vextracti32x4(Address(baseAddr, baseOffset + 16), bVec, 0);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 80), bVec, 1);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 144), bVec, 2);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 208), bVec, 3);
+
+  __ vextracti32x4(Address(baseAddr, baseOffset + 32), cVec, 0);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 96), cVec, 1);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 160), cVec, 2);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 224), cVec, 3);
+
+  __ vextracti32x4(Address(baseAddr, baseOffset + 48), dVec, 0);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 112), dVec, 1);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 176), dVec, 2);
+  __ vextracti32x4(Address(baseAddr, baseOffset + 240), dVec, 3);
+}
+
+#undef __
diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp
index 5a45c29307e..5a0a085401b 100644
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp
@@ -1122,6 +1122,22 @@ void VM_Version::get_processor_features() {
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
+  // ChaCha20 Intrinsics
+  // As long as the system supports AVX as a baseline we can do a
+  // SIMD-enabled block function.  StubGenerator makes the determination
+  // based on the VM capabilities whether to use an AVX2 or AVX512-enabled
+  // version.
+  if (UseAVX >= 1) {
+      if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
+          UseChaCha20Intrinsics = true;
+      }
+  } else if (UseChaCha20Intrinsics) {
+      if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
+          warning("ChaCha20 intrinsic requires AVX instructions");
+      }
+      FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
+  }
+
   // Base64 Intrinsics (Check the condition for which the intrinsic will be active)
   if ((UseAVX > 2) && supports_avx512vl() && supports_avx512bw()) {
     if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) {
diff --git a/src/hotspot/share/classfile/vmIntrinsics.cpp b/src/hotspot/share/classfile/vmIntrinsics.cpp
index 0f8ba1e808d..0c3364a8c94 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.cpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.cpp
@@ -475,6 +475,9 @@ bool vmIntrinsics::disabled_by_jvm_flags(vmIntrinsics::ID id) {
   case vmIntrinsics::_ghash_processBlocks:
     if (!UseGHASHIntrinsics) return true;
     break;
+  case vmIntrinsics::_chacha20Block:
+    if (!UseChaCha20Intrinsics) return true;
+    break;
   case vmIntrinsics::_base64_encodeBlock:
   case vmIntrinsics::_base64_decodeBlock:
     if (!UseBASE64Intrinsics) return true;
diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index 9b0cd3f366f..70604efe1dc 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -532,6 +532,12 @@ class methodHandle;
   do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, ghash_processBlocks_signature, F_R) \
    do_name(processMultipleBlocks_name, "processMultipleBlocks")                                                         \
                                                                                                                         \
+  /* support for com.sun.crypto.provider.ChaCha20Cipher */                                                              \
+  do_class(com_sun_crypto_provider_chacha20cipher,      "com/sun/crypto/provider/ChaCha20Cipher")                       \
+  do_intrinsic(_chacha20Block, com_sun_crypto_provider_chacha20cipher, chacha20Block_name, chacha20Block_signature, F_S) \
+   do_name(chacha20Block_name,                                 "implChaCha20Block")                                         \
+   do_signature(chacha20Block_signature, "([I[B)I")                                                                    \
+                                                                                                                        \
   /* support for java.util.zip */                                                                                       \
   do_class(java_util_zip_CRC32,           "java/util/zip/CRC32")                                                        \
   do_intrinsic(_updateCRC32,               java_util_zip_CRC32,   update_name, int2_int_signature,               F_SN)  \
diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
index 4b583acc987..74d65514bc1 100644
--- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
+++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
@@ -318,6 +318,7 @@
   static_field(StubRoutines,                _ghash_processBlocks,                             address)                               \
   static_field(StubRoutines,                _md5_implCompress,                                address)                               \
   static_field(StubRoutines,                _md5_implCompressMB,                              address)                               \
+  static_field(StubRoutines,                _chacha20Block,                                   address)                               \
   static_field(StubRoutines,                _sha1_implCompress,                               address)                               \
   static_field(StubRoutines,                _sha1_implCompressMB,                             address)                               \
   static_field(StubRoutines,                _sha256_implCompress,                             address)                               \
diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp
index a683b259b92..a4e997513f4 100644
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@@ -737,6 +737,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
   case vmIntrinsics::_bigIntegerLeftShiftWorker:
   case vmIntrinsics::_vectorizedMismatch:
   case vmIntrinsics::_ghash_processBlocks:
+  case vmIntrinsics::_chacha20Block:
   case vmIntrinsics::_base64_encodeBlock:
   case vmIntrinsics::_base64_decodeBlock:
   case vmIntrinsics::_poly1305_processBlocks:
diff --git a/src/hotspot/share/opto/escape.cpp b/src/hotspot/share/opto/escape.cpp
index b221ca932e3..308b59ab799 100644
--- a/src/hotspot/share/opto/escape.cpp
+++ b/src/hotspot/share/opto/escape.cpp
@@ -1168,6 +1168,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                   strcmp(call->as_CallLeaf()->_name, "galoisCounterMode_AESCrypt") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "poly1305_processBlocks") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "chacha20Block") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "encodeBlock") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "decodeBlock") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "md5_implCompress") == 0 ||
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index a59923ed21a..872570a7fca 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -608,6 +608,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
 
   case vmIntrinsics::_ghash_processBlocks:
     return inline_ghash_processBlocks();
+  case vmIntrinsics::_chacha20Block:
+    return inline_chacha20Block();
   case vmIntrinsics::_base64_encodeBlock:
     return inline_base64_encodeBlock();
   case vmIntrinsics::_base64_decodeBlock:
@@ -6897,6 +6899,36 @@ bool LibraryCallKit::inline_ghash_processBlocks() {
   return true;
 }
 
+//------------------------------inline_chacha20Block
+bool LibraryCallKit::inline_chacha20Block() {
+  address stubAddr;
+  const char *stubName;
+  assert(UseChaCha20Intrinsics, "need ChaCha20 intrinsics support");
+
+  stubAddr = StubRoutines::chacha20Block();
+  stubName = "chacha20Block";
+
+  Node* state          = argument(0);
+  Node* result         = argument(1);
+
+  state = must_be_not_null(state, true);
+  result = must_be_not_null(result, true);
+
+  Node* state_start  = array_element_address(state, intcon(0), T_INT);
+  assert(state_start, "state is NULL");
+  Node* result_start  = array_element_address(result, intcon(0), T_BYTE);
+  assert(result_start, "result is NULL");
+
+  Node* cc20Blk = make_runtime_call(RC_LEAF|RC_NO_FP,
+                                  OptoRuntime::chacha20Block_Type(),
+                                  stubAddr, stubName, TypePtr::BOTTOM,
+                                  state_start, result_start);
+  // return key stream length (int)
+  Node* retvalue = _gvn.transform(new ProjNode(cc20Blk, TypeFunc::Parms));
+  set_result(retvalue);
+  return true;
+}
+
 bool LibraryCallKit::inline_base64_encodeBlock() {
   address stubAddr;
   const char *stubName;
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp
index 35c699b73bf..405a5bfd01c 100644
--- a/src/hotspot/share/opto/library_call.hpp
+++ b/src/hotspot/share/opto/library_call.hpp
@@ -291,6 +291,7 @@ class LibraryCallKit : public GraphKit {
   Node* inline_counterMode_AESCrypt_predicate();
   Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
   bool inline_ghash_processBlocks();
+  bool inline_chacha20Block();
   bool inline_base64_encodeBlock();
   bool inline_base64_decodeBlock();
   bool inline_poly1305_processBlocks();
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index 769bbd191ff..aed280fd4e5 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -1222,6 +1222,26 @@ const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
     const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
     return TypeFunc::make(domain, range);
 }
+
+// ChaCha20 Block function
+const TypeFunc* OptoRuntime::chacha20Block_Type() {
+    int argcnt = 2;
+
+    const Type** fields = TypeTuple::fields(argcnt);
+    int argp = TypeFunc::Parms;
+    fields[argp++] = TypePtr::NOTNULL;      // state
+    fields[argp++] = TypePtr::NOTNULL;      // result
+
+    assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+    const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
+
+    // result type needed
+    fields = TypeTuple::fields(1);
+    fields[TypeFunc::Parms + 0] = TypeInt::INT;     // key stream outlen as int
+    const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
+    return TypeFunc::make(domain, range);
+}
+
 // Base64 encode function
 const TypeFunc* OptoRuntime::base64_encodeBlock_Type() {
   int argcnt = 6;
diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp
index 1de8ffb18fb..6ddf50a7b6d 100644
--- a/src/hotspot/share/opto/runtime.hpp
+++ b/src/hotspot/share/opto/runtime.hpp
@@ -278,6 +278,7 @@ private:
   static const TypeFunc* vectorizedMismatch_Type();
 
   static const TypeFunc* ghash_processBlocks_Type();
+  static const TypeFunc* chacha20Block_Type();
   static const TypeFunc* base64_encodeBlock_Type();
   static const TypeFunc* base64_decodeBlock_Type();
   static const TypeFunc* poly1305_processBlocks_Type();
diff --git a/src/hotspot/share/runtime/globals.hpp b/src/hotspot/share/runtime/globals.hpp
index c5750d6d68e..f54f59494e8 100644
--- a/src/hotspot/share/runtime/globals.hpp
+++ b/src/hotspot/share/runtime/globals.hpp
@@ -323,6 +323,9 @@ const int ObjectAlignmentInBytes = 8;
   product(bool, UseAESCTRIntrinsics, false, DIAGNOSTIC,                     \
           "Use intrinsics for the paralleled version of AES/CTR crypto")    \
                                                                             \
+  product(bool, UseChaCha20Intrinsics, false, DIAGNOSTIC,                   \
+          "Use intrinsics for the vectorized version of ChaCha20")          \
+                                                                            \
   product(bool, UseMD5Intrinsics, false, DIAGNOSTIC,                        \
           "Use intrinsics for MD5 crypto hash function")                    \
                                                                             \
diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp
index 9418b758387..091ff90e470 100644
--- a/src/hotspot/share/runtime/stubRoutines.cpp
+++ b/src/hotspot/share/runtime/stubRoutines.cpp
@@ -128,6 +128,7 @@ address StubRoutines::_electronicCodeBook_decryptAESCrypt  = NULL;
 address StubRoutines::_counterMode_AESCrypt                = NULL;
 address StubRoutines::_galoisCounterMode_AESCrypt          = NULL;
 address StubRoutines::_ghash_processBlocks                 = NULL;
+address StubRoutines::_chacha20Block                       = NULL;
 address StubRoutines::_base64_encodeBlock                  = NULL;
 address StubRoutines::_base64_decodeBlock                  = NULL;
 address StubRoutines::_poly1305_processBlocks              = NULL;
diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp
index f4cec54aa7f..767a2a098c0 100644
--- a/src/hotspot/share/runtime/stubRoutines.hpp
+++ b/src/hotspot/share/runtime/stubRoutines.hpp
@@ -209,6 +209,7 @@ class StubRoutines: AllStatic {
   static address _counterMode_AESCrypt;
   static address _galoisCounterMode_AESCrypt;
   static address _ghash_processBlocks;
+  static address _chacha20Block;
   static address _base64_encodeBlock;
   static address _base64_decodeBlock;
   static address _poly1305_processBlocks;
@@ -388,6 +389,7 @@ class StubRoutines: AllStatic {
   static address poly1305_processBlocks()               { return _poly1305_processBlocks; }
   static address counterMode_AESCrypt()  { return _counterMode_AESCrypt; }
   static address ghash_processBlocks()   { return _ghash_processBlocks; }
+  static address chacha20Block()         { return _chacha20Block; }
   static address base64_encodeBlock()    { return _base64_encodeBlock; }
   static address base64_decodeBlock()    { return _base64_decodeBlock; }
   static address md5_implCompress()      { return _md5_implCompress; }
diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp
index 48f3895b2f2..2ee5a251a65 100644
--- a/src/hotspot/share/runtime/vmStructs.cpp
+++ b/src/hotspot/share/runtime/vmStructs.cpp
@@ -541,6 +541,7 @@
      static_field(StubRoutines,                _counterMode_AESCrypt,                         address)                               \
      static_field(StubRoutines,                _galoisCounterMode_AESCrypt,                   address)                               \
      static_field(StubRoutines,                _ghash_processBlocks,                          address)                               \
+     static_field(StubRoutines,                _chacha20Block,                                address)                               \
      static_field(StubRoutines,                _base64_encodeBlock,                           address)                               \
      static_field(StubRoutines,                _base64_decodeBlock,                           address)                               \
      static_field(StubRoutines,                _poly1305_processBlocks,                       address)                               \
diff --git a/src/java.base/share/classes/com/sun/crypto/provider/ChaCha20Cipher.java b/src/java.base/share/classes/com/sun/crypto/provider/ChaCha20Cipher.java
index 5230c3653cf..84bd6521345 100644
--- a/src/java.base/share/classes/com/sun/crypto/provider/ChaCha20Cipher.java
+++ b/src/java.base/share/classes/com/sun/crypto/provider/ChaCha20Cipher.java
@@ -39,6 +39,9 @@ import javax.crypto.*;
 import javax.crypto.spec.ChaCha20ParameterSpec;
 import javax.crypto.spec.IvParameterSpec;
 import javax.crypto.spec.SecretKeySpec;
+
+import jdk.internal.vm.annotation.ForceInline;
+import jdk.internal.vm.annotation.IntrinsicCandidate;
 import sun.security.util.DerValue;
 
 /**
@@ -58,8 +61,9 @@ abstract class ChaCha20Cipher extends CipherSpi {
     private static final int STATE_CONST_3 = 0x6b206574;
 
     // The keystream block size in bytes and as integers
-    private static final int KEYSTREAM_SIZE = 64;
-    private static final int KS_SIZE_INTS = KEYSTREAM_SIZE / Integer.BYTES;
+    private static final int KS_MAX_LEN = 1024;
+    private static final int KS_BLK_SIZE = 64;
+    private static final int KS_SIZE_INTS = KS_BLK_SIZE / Integer.BYTES;
     private static final int CIPHERBUF_BASE = 1024;
 
     // The initialization state of the cipher
@@ -85,14 +89,18 @@ abstract class ChaCha20Cipher extends CipherSpi {
     private long finalCounterValue;
     private long counter;
 
-    // Two arrays, both implemented as 16-element integer arrays:
-    // The base state, created at initialization time, and a working
-    // state which is a clone of the start state, and is then modified
-    // with the counter and the ChaCha20 block function.
+    // The base state is created at initialization time as a 16-int array
+    // and then is copied into either local variables for computations (Java) or
+    // into SIMD registers (intrinsics).
     private final int[] startState = new int[KS_SIZE_INTS];
-    private final byte[] keyStream = new byte[KEYSTREAM_SIZE];
 
-    // The offset into the current keystream
+    // The output keystream array is sized to hold keystream output from the
+    // implChaCha20Block method.  This can range from a single block at a time
+    // (Java software) up to 16 blocks on x86_64 with AVX512 support.
+    private final byte[] keyStream = new byte[KS_MAX_LEN];
+
+    // The keystream buffer limit and offset
+    private int keyStrLimit;
     private int keyStrOffset;
 
     // AEAD-related fields and constants
@@ -561,12 +569,14 @@ abstract class ChaCha20Cipher extends CipherSpi {
             }
         }
 
-        // We can also get one block's worth of keystream created
+        // We can also generate the first block (or blocks if intrinsics
+        // are capable of doing multiple blocks at a time) of keystream.
         finalCounterValue = counter + MAX_UINT32;
-        generateKeystream();
+        this.keyStrLimit = chaCha20Block(startState, counter, keyStream);
+        this.keyStrOffset = 0;
+        this.counter += (keyStrLimit / KS_BLK_SIZE);
         direction = opmode;
         aadDone = false;
-        this.keyStrOffset = 0;
         initialized = true;
     }
 
@@ -831,31 +841,34 @@ abstract class ChaCha20Cipher extends CipherSpi {
         }
     }
 
-    /**
-     * Using the current state and counter create the next set of keystream
-     * bytes.  This method will generate the next 512 bits of keystream and
-     * return it in the {@code keyStream} parameter.  Following the
-     * block function the counter will be incremented.
-     */
-    private void generateKeystream() {
-        chaCha20Block(startState, counter, keyStream);
-        counter++;
+    @ForceInline
+    private static int chaCha20Block(int[] initState, long counter,
+            byte[] result) {
+        if (initState.length != KS_SIZE_INTS || result.length != KS_MAX_LEN) {
+            throw new IllegalArgumentException(
+                    "Illegal state or keystream buffer length");
+        }
+
+        // Set the counter value before sending into the underlying
+        // private block method
+        initState[12] = (int)counter;
+        return implChaCha20Block(initState, result);
     }
 
     /**
      * Perform a full 20-round ChaCha20 transform on the initial state.
      *
-     * @param initState the starting state, not including the counter
-     *      value.
-     * @param counter the counter value to apply
+     * @param initState the starting state using the current counter value.
      * @param result  the array that will hold the result of the ChaCha20
      *      block function.
      *
-     * @note it is the caller's responsibility to ensure that the workState
-     * is sized the same as the initState, no checking is performed internally.
+     * @return the number of keystream bytes generated.  In a pure Java method
+     *      this will always be 64 bytes, but intrinsics that make use of
+     *      AVX2 or AVX512 registers may generate multiple blocks of keystream
+     *      in a single call and therefore may be a larger multiple of 64.
      */
-    private static void chaCha20Block(int[] initState, long counter,
-                                      byte[] result) {
+    @IntrinsicCandidate
+    private static int implChaCha20Block(int[] initState, byte[] result) {
         // Create an initial state and clone a working copy
         int ws00 = STATE_CONST_0;
         int ws01 = STATE_CONST_1;
@@ -869,7 +882,7 @@ abstract class ChaCha20Cipher extends CipherSpi {
         int ws09 = initState[9];
         int ws10 = initState[10];
         int ws11 = initState[11];
-        int ws12 = (int)counter;
+        int ws12 = initState[12];
         int ws13 = initState[13];
         int ws14 = initState[14];
         int ws15 = initState[15];
@@ -986,11 +999,12 @@ abstract class ChaCha20Cipher extends CipherSpi {
         asIntLittleEndian.set(result, 36, ws09 + initState[9]);
         asIntLittleEndian.set(result, 40, ws10 + initState[10]);
         asIntLittleEndian.set(result, 44, ws11 + initState[11]);
-        // Add the counter back into workState[12]
-        asIntLittleEndian.set(result, 48, ws12 + (int)counter);
+        asIntLittleEndian.set(result, 48, ws12 + initState[12]);
         asIntLittleEndian.set(result, 52, ws13 + initState[13]);
         asIntLittleEndian.set(result, 56, ws14 + initState[14]);
         asIntLittleEndian.set(result, 60, ws15 + initState[15]);
+
+        return KS_BLK_SIZE;
     }
 
     /**
@@ -1009,12 +1023,21 @@ abstract class ChaCha20Cipher extends CipherSpi {
         int remainingData = inLen;
 
         while (remainingData > 0) {
-            int ksRemain = keyStream.length - keyStrOffset;
+            int ksRemain = keyStrLimit - keyStrOffset;
             if (ksRemain <= 0) {
                 if (counter <= finalCounterValue) {
-                    generateKeystream();
+                    // Intrinsics can do multiple blocks at once and may overrun
+                    // the counter; trim the limit to keystream generated from
+                    // unique counter values only (prevents keystream reuse).
+                    // NOTE(review): check the trim below for an off-by-one block.
+                    keyStrLimit = chaCha20Block(startState, counter, keyStream);
+                    counter += (keyStrLimit / KS_BLK_SIZE);
+                    if (counter > finalCounterValue) {
+                        keyStrLimit -=
+                                (int)(counter - finalCounterValue) * KS_BLK_SIZE;
+                    }
                     keyStrOffset = 0;
-                    ksRemain = keyStream.length;
+                    ksRemain = keyStrLimit;
                 } else {
                     throw new KeyException("Counter exhausted.  " +
                             "Reinitialize with new key and/or nonce");
@@ -1060,9 +1083,10 @@ abstract class ChaCha20Cipher extends CipherSpi {
     private void initAuthenticator() throws InvalidKeyException {
         authenticator = new Poly1305();
 
-        // Derive the Poly1305 key from the starting state
-        byte[] serializedKey = new byte[KEYSTREAM_SIZE];
-        chaCha20Block(startState, 0, serializedKey);
+        // Derive the Poly1305 key from the starting state with the counter
+        // value forced to zero.
+        byte[] serializedKey = new byte[KS_MAX_LEN];
+        chaCha20Block(startState, 0L, serializedKey);
 
         authenticator.engineInit(new SecretKeySpec(serializedKey, 0, 32,
                 authAlgName), null);
diff --git a/test/hotspot/jtreg/compiler/intrinsics/chacha/ExerciseChaCha20.java b/test/hotspot/jtreg/compiler/intrinsics/chacha/ExerciseChaCha20.java
new file mode 100644
index 00000000000..4214c689484
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/intrinsics/chacha/ExerciseChaCha20.java
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package compiler.intrinsics.chacha;
+
+import javax.crypto.Cipher;
+import javax.crypto.spec.ChaCha20ParameterSpec;
+import javax.crypto.spec.SecretKeySpec;
+import java.security.GeneralSecurityException;
+import java.util.*;
+
+public class ExerciseChaCha20 {
+
+    private static final int WARMUP_CYCLES = 200000;
+
+    // Use the test vectors from RFC 7539 to exercise the ChaCha20 block
+    // intrinsic
+    public static final List<TestData> testList = List.of(
+        new TestData("RFC 7539 Sample Test Vector",
+            "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f",
+            "000000000000004a00000000",
+            1, Cipher.ENCRYPT_MODE,
+            "4c616469657320616e642047656e746c656d656e206f662074686520636c6173" +
+            "73206f66202739393a204966204920636f756c64206f6666657220796f75206f" +
+            "6e6c79206f6e652074697020666f7220746865206675747572652c2073756e73" +
+            "637265656e20776f756c642062652069742e",
+            null,
+            "6e2e359a2568f98041ba0728dd0d6981e97e7aec1d4360c20a27afccfd9fae0b" +
+            "f91b65c5524733ab8f593dabcd62b3571639d624e65152ab8f530c359f0861d8" +
+            "07ca0dbf500d6a6156a38e088a22b65e52bc514d16ccf806818ce91ab7793736" +
+            "5af90bbf74a35be6b40b8eedf2785e42874d"),
+        new TestData("RFC 7539 Test Vector 1 (all zeroes)",
+            "0000000000000000000000000000000000000000000000000000000000000000",
+            "000000000000000000000000",
+            0, Cipher.ENCRYPT_MODE,
+            "0000000000000000000000000000000000000000000000000000000000000000" +
+            "0000000000000000000000000000000000000000000000000000000000000000",
+            null,
+            "76b8e0ada0f13d90405d6ae55386bd28bdd219b8a08ded1aa836efcc8b770dc7" +
+            "da41597c5157488d7724e03fb8d84a376a43b8f41518a11cc387b669b2ee6586"),
+        new TestData("RFC 7539 Test Vector 2",
+            "0000000000000000000000000000000000000000000000000000000000000001",
+            "000000000000000000000002",
+            1, Cipher.ENCRYPT_MODE,
+            "416e79207375626d697373696f6e20746f20746865204945544620696e74656e" +
+            "6465642062792074686520436f6e7472696275746f7220666f72207075626c69" +
+            "636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" +
+            "20496e7465726e65742d4472616674206f722052464320616e6420616e792073" +
+            "746174656d656e74206d6164652077697468696e2074686520636f6e74657874" +
+            "206f6620616e204945544620616374697669747920697320636f6e7369646572" +
+            "656420616e20224945544620436f6e747269627574696f6e222e205375636820" +
+            "73746174656d656e747320696e636c756465206f72616c2073746174656d656e" +
+            "747320696e20494554462073657373696f6e732c2061732077656c6c20617320" +
+            "7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" +
+            "74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" +
+            "207768696368206172652061646472657373656420746f",
+            null,
+            "a3fbf07df3fa2fde4f376ca23e82737041605d9f4f4f57bd8cff2c1d4b7955ec" +
+            "2a97948bd3722915c8f3d337f7d370050e9e96d647b7c39f56e031ca5eb6250d" +
+            "4042e02785ececfa4b4bb5e8ead0440e20b6e8db09d881a7c6132f420e527950" +
+            "42bdfa7773d8a9051447b3291ce1411c680465552aa6c405b7764d5e87bea85a" +
+            "d00f8449ed8f72d0d662ab052691ca66424bc86d2df80ea41f43abf937d3259d" +
+            "c4b2d0dfb48a6c9139ddd7f76966e928e635553ba76c5c879d7b35d49eb2e62b" +
+            "0871cdac638939e25e8a1e0ef9d5280fa8ca328b351c3c765989cbcf3daa8b6c" +
+            "cc3aaf9f3979c92b3720fc88dc95ed84a1be059c6499b9fda236e7e818b04b0b" +
+            "c39c1e876b193bfe5569753f88128cc08aaa9b63d1a16f80ef2554d7189c411f" +
+            "5869ca52c5b83fa36ff216b9c1d30062bebcfd2dc5bce0911934fda79a86f6e6" +
+            "98ced759c3ff9b6477338f3da4f9cd8514ea9982ccafb341b2384dd902f3d1ab" +
+            "7ac61dd29c6f21ba5b862f3730e37cfdc4fd806c22f221"),
+        new TestData("RFC 7539 Test Vector 3",
+            "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0",
+            "000000000000000000000002",
+            42, Cipher.ENCRYPT_MODE,
+            "2754776173206272696c6c69672c20616e642074686520736c6974687920746f" +
+            "7665730a446964206779726520616e642067696d626c6520696e207468652077" +
+            "6162653a0a416c6c206d696d737920776572652074686520626f726f676f7665" +
+            "732c0a416e6420746865206d6f6d65207261746873206f757467726162652e",
+            null,
+            "62e6347f95ed87a45ffae7426f27a1df5fb69110044c0d73118effa95b01e5cf" +
+            "166d3df2d721caf9b21e5fb14c616871fd84c54f9d65b283196c7fe4f60553eb" +
+            "f39c6402c42234e32a356b3e764312a61a5532055716ead6962568f87d3f3f77" +
+            "04c6a8d1bcd1bf4d50d6154b6da731b187b58dfd728afa36757a797ac188d1")
+    );
+
+    public static class TestData {
+        public TestData(String name, String keyStr, String nonceStr, int ctr,
+                        int dir, String inputStr, String aadStr, String outStr) {
+            testName = Objects.requireNonNull(name);
+            HexFormat hex = HexFormat.of();
+            key = hex.parseHex(Objects.requireNonNull(keyStr));
+            nonce = hex.parseHex(Objects.requireNonNull(nonceStr));
+            if ((counter = ctr) < 0) {
+                throw new IllegalArgumentException(
+                        "counter must be 0 or greater");
+            }
+            direction = dir;
+            if ((direction != Cipher.ENCRYPT_MODE) &&
+                    (direction != Cipher.DECRYPT_MODE)) {
+                throw new IllegalArgumentException(
+                        "Direction must be ENCRYPT_MODE or DECRYPT_MODE");
+            }
+            input = hex.parseHex(Objects.requireNonNull(inputStr));
+            aad = (aadStr != null) ? hex.parseHex(aadStr) : null;
+            expOutput = hex.parseHex(Objects.requireNonNull(outStr));
+        }
+
+        public final String testName;
+        public final byte[] key;
+        public final byte[] nonce;
+        public final int counter;
+        public final int direction;
+        public final byte[] input;
+        public final byte[] aad;
+        public final byte[] expOutput;
+    }
+
+    public static void main(String[] args) throws Exception {
+        int testsPassed = 0;
+        int testNumber = 0;
+
+        // Use the first test vector to warm up the JVM and activate
+        // the intrinsics.
+        System.out.println("Running " + WARMUP_CYCLES + " warm up cycles");
+        for (int i = 0; i < WARMUP_CYCLES; i++) {
+            runSinglePartTest(testList.get(0));
+        }
+
+        System.out.println("----- Single-part Tests -----");
+        for (TestData test : testList) {
+            System.out.println("*** Test " + ++testNumber + ": " +
+                    test.testName);
+            if (runSinglePartTest(test)) {
+                testsPassed++;
+            }
+        }
+        System.out.println();
+
+        System.out.println("----- Multi-part Tests -----");
+        for (TestData test : testList) {
+            System.out.println("*** Test " + ++testNumber + ": " +
+                    test.testName);
+            if (runMultiPartTest(test)) {
+                testsPassed++;
+            }
+        }
+        System.out.println();
+
+        System.out.println("Total tests: " + testNumber +
+                ", Passed: " + testsPassed + ", Failed: " +
+                (testNumber - testsPassed));
+        if (testsPassed != testNumber) {
+            throw new RuntimeException("One or more tests failed.  " +
+                    "Check output for details");
+        }
+    }
+
+    private static boolean runSinglePartTest(TestData testData)
+            throws GeneralSecurityException {
+        boolean encRes = false;
+        boolean decRes = false;
+        byte[] encryptedResult;
+        byte[] decryptedResult;
+
+        // Get a Cipher instance and set up the parameters
+        Cipher mambo = Cipher.getInstance("ChaCha20");
+        SecretKeySpec mamboKey = new SecretKeySpec(testData.key, "ChaCha20");
+        ChaCha20ParameterSpec mamboSpec = new ChaCha20ParameterSpec(
+                testData.nonce, testData.counter);
+
+        // Encrypt our input
+        mambo.init(Cipher.ENCRYPT_MODE, mamboKey, mamboSpec);
+        encryptedResult = mambo.doFinal(testData.input);
+
+        if (!Arrays.equals(encryptedResult, testData.expOutput)) {
+            System.out.println("ERROR - Output Mismatch!");
+            System.out.println("Expected:\n" +
+                    dumpHexBytes(testData.expOutput, 16, "\n", " "));
+            System.out.println("Actual:\n" +
+                    dumpHexBytes(encryptedResult, 16, "\n", " "));
+            System.out.println();
+        } else {
+            encRes = true;
+        }
+
+        // Decrypt the result of the encryption operation
+        mambo = Cipher.getInstance("ChaCha20");
+        mambo.init(Cipher.DECRYPT_MODE, mamboKey, mamboSpec);
+        decryptedResult = mambo.doFinal(encryptedResult);
+
+        if (!Arrays.equals(decryptedResult, testData.input)) {
+            System.out.println("ERROR - Output Mismatch!");
+            System.out.println("Expected:\n" +
+                    dumpHexBytes(testData.input, 16, "\n", " "));
+            System.out.println("Actual:\n" +
+                    dumpHexBytes(decryptedResult, 16, "\n", " "));
+            System.out.println();
+        } else {
+            decRes = true;
+        }
+
+        return (encRes && decRes);
+    }
+
+    private static boolean runMultiPartTest(TestData testData)
+            throws GeneralSecurityException {
+        boolean encRes = false;
+        boolean decRes = false;
+
+        // Get a cipher instance and initialize it
+        Cipher mambo = Cipher.getInstance("ChaCha20");
+        SecretKeySpec mamboKey = new SecretKeySpec(testData.key, "ChaCha20");
+        ChaCha20ParameterSpec mamboSpec = new ChaCha20ParameterSpec(
+                testData.nonce, testData.counter);
+
+        byte[] encryptedResult = new byte[testData.input.length];
+        mambo.init(Cipher.ENCRYPT_MODE, mamboKey, mamboSpec);
+        System.out.print("Encrypt - ");
+        doMulti(mambo, testData.input, encryptedResult);
+
+        if (!Arrays.equals(encryptedResult, testData.expOutput)) {
+            System.out.println("ERROR - Output Mismatch!");
+            System.out.println("Expected:\n" +
+                    dumpHexBytes(testData.expOutput));
+            System.out.println("Actual:\n" +
+                    dumpHexBytes(encryptedResult));
+            System.out.println();
+        } else {
+            encRes = true;
+        }
+
+        // Decrypt the result of the encryption operation
+        mambo = Cipher.getInstance("ChaCha20");
+        byte[] decryptedResult = new byte[encryptedResult.length];
+        mambo.init(Cipher.DECRYPT_MODE, mamboKey, mamboSpec);
+        System.out.print("Decrypt - ");
+        doMulti(mambo, encryptedResult, decryptedResult);
+
+        if (!Arrays.equals(decryptedResult, testData.input)) {
+            System.out.println("ERROR - Output Mismatch!");
+            System.out.println("Expected:\n" + dumpHexBytes(testData.input));
+            System.out.println("Actual:\n" + dumpHexBytes(decryptedResult));
+            System.out.println();
+        } else {
+            decRes = true;
+        }
+
+        return (encRes && decRes);
+    }
+
+    private static void doMulti(Cipher c, byte[] input, byte[] output)
+            throws GeneralSecurityException {
+        int offset = 0;
+        boolean done = false;
+        Random randIn = new Random(System.currentTimeMillis());
+
+        // Send small updates between 1 - 8 bytes in length until we get
+        // 8 or less bytes from the end of the input, then finalize.
+        System.out.println("Input length: " + input.length);
+        System.out.print("Multipart (bytes in/out): ");
+        while (!done) {
+            int mPartLen = randIn.nextInt(8) + 1;
+            int bytesLeft = input.length - offset;
+            int processed;
+            if (mPartLen < bytesLeft) {
+                System.out.print(mPartLen + "/");
+                processed = c.update(input, offset, mPartLen,
+                        output, offset);
+                offset += processed;
+                System.out.print(processed + " ");
+            } else {
+                processed = c.doFinal(input, offset, bytesLeft,
+                        output, offset);
+                System.out.print(bytesLeft + "/" + processed + " ");
+                done = true;
+            }
+        }
+        System.out.println();
+    }
+
+    private static String dumpHexBytes(byte[] data) {
+        return dumpHexBytes(data, 16, "\n", " ");
+    }
+
+    private static String dumpHexBytes(byte[] data, int itemsPerLine,
+           String lineDelim, String itemDelim) {
+        StringBuilder sb = new StringBuilder();
+        if (data != null) {
+            for (int i = 0; i < data.length; i++) {
+                if (i % itemsPerLine == 0 && i != 0) {
+                    sb.append(lineDelim);
+                }
+                sb.append(String.format("%02X", data[i])).append(itemDelim);
+            }
+        }
+        return sb.toString();
+    }
+}
diff --git a/test/hotspot/jtreg/compiler/intrinsics/chacha/TestChaCha20.java b/test/hotspot/jtreg/compiler/intrinsics/chacha/TestChaCha20.java
new file mode 100644
index 00000000000..a3b708fab81
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/intrinsics/chacha/TestChaCha20.java
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2021, Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package compiler.intrinsics.chacha;
+
+import java.util.ArrayList;
+import java.util.List;
+import jdk.test.lib.Platform;
+import jdk.test.lib.process.OutputAnalyzer;
+import jdk.test.lib.process.ProcessTools;
+import jdk.test.whitebox.cpuinfo.CPUInfo;
+
+/**
+ * @test
+ * @bug 8247645
+ * @summary ChaCha20 Intrinsics
+ * @library /test/lib
+ * @build   compiler.intrinsics.chacha.ExerciseChaCha20
+ *          jdk.test.whitebox.WhiteBox
+ * @run driver jdk.test.lib.helpers.ClassFileInstaller jdk.test.whitebox.WhiteBox
+ * @run main/othervm/timeout=7200
+ *      -Xbootclasspath/a:. -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI
+ *      compiler.intrinsics.chacha.TestChaCha20
+ */
+public class TestChaCha20 {
+
+    // Default to 1/4 of the CPUs, and allow users to override.
+    static final int MAX_PARALLELISM = Integer.getInteger("maxParallelism",
+        Math.max(1, Runtime.getRuntime().availableProcessors() / 4));
+
+    private static List<String> mix(List<String> o, String... mix) {
+        List<String> n = new ArrayList<>(o);
+        for (String m : mix) {
+            n.add(m);
+        }
+        return n;
+    }
+
+    private static boolean containsFuzzy(List<String> list, String sub) {
+        for (String s : list) {
+            if (s.contains(sub)) return true;
+        }
+        return false;
+    }
+
+    public static void main(String... args) throws Exception {
+        List<List<String>> configs = new ArrayList<>();
+        List<String> cpuFeatures = CPUInfo.getFeatures();
+
+        System.out.print("CPU Features: ");
+        cpuFeatures.forEach(f -> System.out.print(f + " "));
+        System.out.println();
+
+        if (Platform.isX64()) {
+            // If CPU features were not found, provide a default config.
+            if (cpuFeatures.isEmpty()) {
+                configs.add(new ArrayList<>());
+            }
+
+            // Otherwise, select the tests that make sense on current platform.
+            if (containsFuzzy(cpuFeatures, "avx512")) {
+                System.out.println("Setting up AVX512 worker");
+                configs.add(List.of("-XX:UseAVX=3"));
+            }
+            if (containsFuzzy(cpuFeatures, "avx2")) {
+                System.out.println("Setting up AVX2 worker");
+                configs.add(List.of("-XX:UseAVX=2"));
+            }
+            if (containsFuzzy(cpuFeatures, "avx")) {
+                System.out.println("Setting up AVX worker");
+                configs.add(List.of("-XX:UseAVX=1"));
+            }
+        } else if (Platform.isAArch64()) {
+            // AArch64 intrinsics require the advanced simd instructions
+            if (containsFuzzy(cpuFeatures, "simd")) {
+                System.out.println("Setting up ASIMD worker");
+                configs.add(new ArrayList<>());
+            }
+        } else {
+            // We only have ChaCha20 intrinsics on x64 and aarch64
+            // currently.  If the platform is neither of these then
+            // the ChaCha20 known answer tests in
+            // com/sun/crypto/provider/Cipher are sufficient.
+            return;
+        }
+
+        // If by this point we have no configs, it means we are running
+        // on a platform that intrinsics have been written for, but does
+        // not possess the necessary instruction sets for that processor.
+        // We can exit out if that is the case.
+        if (configs.isEmpty()) {
+            System.out.println("No intrinsics-capable configurations found");
+            return;
+        }
+
+        // We can expand this array later to include other tests if new
+        // ChaCha20 intrinsics are developed.
+        String[] classNames = {
+            "compiler.intrinsics.chacha.ExerciseChaCha20"
+        };
+
+        ArrayList<Fork> forks = new ArrayList<>();
+        int jobs = 0;
+
+        for (List<String> c : configs) {
+            for (String className : classNames) {
+                // Start a new job
+                {
+                    ProcessBuilder pb = ProcessTools.createTestJvm(
+                            mix(c, "-Xmx256m", className));
+                    Process p = pb.start();
+                    OutputAnalyzer oa = new OutputAnalyzer(p);
+                    forks.add(new Fork(p, oa));
+                    jobs++;
+                }
+
+                // Wait for the completion of other jobs
+                while (jobs >= MAX_PARALLELISM) {
+                    Fork f = findDone(forks);
+                    if (f != null) {
+                        OutputAnalyzer oa = f.oa();
+                        oa.shouldHaveExitValue(0);
+                        forks.remove(f);
+                        jobs--;
+                    } else {
+                        // Nothing is done, wait a little.
+                        Thread.sleep(200);
+                    }
+                }
+            }
+        }
+
+        // Drain the rest
+        for (Fork f : forks) {
+            OutputAnalyzer oa = f.oa();
+            oa.shouldHaveExitValue(0);
+        }
+    }
+
+    private static Fork findDone(List<Fork> forks) {
+        for (Fork f : forks) {
+            if (!f.p().isAlive()) {
+                return f;
+            }
+        }
+        return null;
+    }
+
+    private static record Fork(Process p, OutputAnalyzer oa) {}
+}
diff --git a/test/micro/org/openjdk/bench/javax/crypto/full/CipherBench.java b/test/micro/org/openjdk/bench/javax/crypto/full/CipherBench.java
index 5f4d22a3252..b6fe1a1af71 100644
--- a/test/micro/org/openjdk/bench/javax/crypto/full/CipherBench.java
+++ b/test/micro/org/openjdk/bench/javax/crypto/full/CipherBench.java
@@ -198,7 +198,7 @@ public abstract class CipherBench extends CryptoBase {
         @Param({"256"})
         private int keyLength;
 
-        @Param({"1024", "" + 16 * 1024})
+        @Param({"256", "1024", "4096", "16384"})
         private int dataSize;
 
         protected int ivLength() {
@@ -223,7 +223,7 @@ public abstract class CipherBench extends CryptoBase {
         @Param({"256"})
         private int keyLength;
 
-        @Param({"1024", "" + 16 * 1024})
+        @Param({"256", "1024", "4096", "16384"})
         private int dataSize;
 
         protected int ivLength() {