8271567: AArch64: AES Galois CounterMode (GCM) interleaved implementation using vector instructions

Reviewed-by: ngasson, adinn, xliu
Andrew Haley 2021-09-23 09:00:46 +00:00
parent 8799856528
commit 4f3b626a36
7 changed files with 1168 additions and 225 deletions

src/hotspot/cpu/aarch64/assembler_aarch64.hpp

@ -647,22 +647,22 @@ typedef enum {
class Assembler : public AbstractAssembler {
public:
#ifndef PRODUCT
static const uintptr_t asm_bp;
- void emit_long(jint x) {
+ void emit_int32(jint x) {
if ((uintptr_t)pc() == asm_bp)
NOP();
AbstractAssembler::emit_int32(x);
}
#else
- void emit_long(jint x) {
+ void emit_int32(jint x) {
AbstractAssembler::emit_int32(x);
}
#endif
public:
enum { instruction_size = 4 };
//---< calculate length of instruction >---

src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp

@ -1296,11 +1296,37 @@ public:
void kernel_crc32c_using_crc32c(Register crc, Register buf,
Register len, Register tmp0, Register tmp1, Register tmp2,
Register tmp3);
void ghash_modmul (FloatRegister result,
FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
FloatRegister t1, FloatRegister t2, FloatRegister t3);
void ghash_load_wide(int index, Register data, FloatRegister result, FloatRegister state);
public:
void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
Register zlen, Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6, Register tmp7);
void mul_add(Register out, Register in, Register offs, Register len, Register k);
void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3);
void ghash_multiply_wide(int index,
FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3);
void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister z, FloatRegister t1);
void ghash_reduce_wide(int index, FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister z, FloatRegister t1);
void ghash_processBlocks_wide(address p, Register state, Register subkeyH,
Register data, Register blocks, int unrolls);
void aesenc_loadkeys(Register key, Register keylen);
void aesecb_encrypt(Register from, Register to, Register keylen,
FloatRegister data = v0, int unrolls = 1);
void aesecb_decrypt(Register from, Register to, Register key, Register keylen);
void aes_round(FloatRegister input, FloatRegister subkey);
// Place an ISB after code may have been modified due to a safepoint.
void safepoint_isb();

src/hotspot/cpu/aarch64/macroAssembler_aarch64_aes.cpp (new file)

@ -0,0 +1,680 @@
/*
* Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "macroAssembler_aarch64.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/stubRoutines.hpp"
void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) {
Label L_doLast;
ld1(v0, T16B, from); // get 16 bytes of input
ld1(v5, T16B, post(key, 16));
rev32(v5, T16B, v5);
ld1(v1, v2, v3, v4, T16B, post(key, 64));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
rev32(v3, T16B, v3);
rev32(v4, T16B, v4);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
aesd(v0, v3);
aesimc(v0, v0);
aesd(v0, v4);
aesimc(v0, v0);
ld1(v1, v2, v3, v4, T16B, post(key, 64));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
rev32(v3, T16B, v3);
rev32(v4, T16B, v4);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
aesd(v0, v3);
aesimc(v0, v0);
aesd(v0, v4);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
cmpw(keylen, 44);
br(Assembler::EQ, L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
cmpw(keylen, 52);
br(Assembler::EQ, L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
bind(L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
eor(v0, T16B, v0, v5);
st1(v0, T16B, to);
// Preserve the address of the start of the key
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
// Load expanded key into v17..v31
void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) {
Label L_loadkeys_44, L_loadkeys_52;
cmpw(keylen, 52);
br(Assembler::LO, L_loadkeys_44);
br(Assembler::EQ, L_loadkeys_52);
ld1(v17, v18, T16B, post(key, 32));
rev32(v17, T16B, v17);
rev32(v18, T16B, v18);
bind(L_loadkeys_52);
ld1(v19, v20, T16B, post(key, 32));
rev32(v19, T16B, v19);
rev32(v20, T16B, v20);
bind(L_loadkeys_44);
ld1(v21, v22, v23, v24, T16B, post(key, 64));
rev32(v21, T16B, v21);
rev32(v22, T16B, v22);
rev32(v23, T16B, v23);
rev32(v24, T16B, v24);
ld1(v25, v26, v27, v28, T16B, post(key, 64));
rev32(v25, T16B, v25);
rev32(v26, T16B, v26);
rev32(v27, T16B, v27);
rev32(v28, T16B, v28);
ld1(v29, v30, v31, T16B, post(key, 48));
rev32(v29, T16B, v29);
rev32(v30, T16B, v30);
rev32(v31, T16B, v31);
// Preserve the address of the start of the key
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
// Neoverse(TM) N1 Software Optimization Guide:
// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC
// instruction pairs will exhibit the performance characteristics
// described in Section 4.6.
void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) {
aese(input, subkey); aesmc(input, input);
}
// KernelGenerator
//
// The abstract base class of an unrolled function generator.
// Subclasses override generate(), length(), and next() to generate
// unrolled and interleaved functions.
//
// The core idea is that a subclass defines a method which generates
// the base case of a function and a method to generate a clone of it,
// shifted to a different set of registers. KernelGenerator will then
// generate several interleaved copies of the function, with each one
// using a different set of registers.
// The subclass must implement three methods: length(), which is the
// number of instruction bundles in the intrinsic; generate(int n),
// which emits the nth instruction bundle in the intrinsic; and
// next(), which returns a copy of the generator shifted to a new set
// of registers.
class KernelGenerator: public MacroAssembler {
protected:
const int _unrolls;
public:
KernelGenerator(Assembler *as, int unrolls)
: MacroAssembler(as->code()), _unrolls(unrolls) { }
virtual void generate(int index) = 0;
virtual int length() = 0;
virtual KernelGenerator *next() = 0;
int unrolls() { return _unrolls; }
void unroll();
};
void KernelGenerator::unroll() {
ResourceMark rm;
KernelGenerator **generators
= NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls());
generators[0] = this;
for (int i = 1; i < unrolls(); i++) {
generators[i] = generators[i-1]->next();
}
for (int j = 0; j < length(); j++) {
for (int i = 0; i < unrolls(); i++) {
generators[i]->generate(j);
}
}
}
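To make the contract concrete, here is a toy subclass (a hypothetical illustration, not part of this commit) with two instruction bundles. With unrolls == 2, unroll() emits bundle 0 for every clone, then bundle 1 for every clone, so the copies interleave:

// A minimal sketch, assuming the KernelGenerator definition above.
// Each clone owns two registers, starting at _reg.
class ToyKernelGenerator : public KernelGenerator {
  FloatRegister _reg;
public:
  ToyKernelGenerator(Assembler *as, int unrolls, FloatRegister reg)
    : KernelGenerator(as, unrolls), _reg(reg) { }
  virtual void generate(int index) {
    switch (index) {
    case 0: eor(_reg, T16B, _reg, _reg);     break; // bundle 0
    case 1: addv(_reg, T4S, _reg, _reg + 1); break; // bundle 1
    default: ShouldNotReachHere();
    }
  }
  virtual int length() { return 2; }
  virtual KernelGenerator *next() {
    return new ToyKernelGenerator(this, _unrolls, _reg + 2);
  }
};
// ToyKernelGenerator(masm, 2, v0).unroll() emits:
//   eor v0, ...; eor v2, ...; addv v0, ...; addv v2, ...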
// An unrolled and interleaved generator for AES encryption.
class AESKernelGenerator: public KernelGenerator {
Register _from, _to;
const Register _keylen;
FloatRegister _data;
const FloatRegister _subkeys;
bool _once;
Label _rounds_44, _rounds_52;
public:
AESKernelGenerator(Assembler *as, int unrolls,
Register from, Register to, Register keylen, FloatRegister data,
FloatRegister subkeys, bool once = true)
: KernelGenerator(as, unrolls),
_from(from), _to(to), _keylen(keylen), _data(data),
_subkeys(subkeys), _once(once) {
}
virtual void generate(int index) {
switch (index) {
case 0:
if (_from != noreg) {
ld1(_data, T16B, _from); // get 16 bytes of input
}
break;
case 1:
if (_once) {
cmpw(_keylen, 52);
br(Assembler::LO, _rounds_44);
br(Assembler::EQ, _rounds_52);
}
break;
case 2: aes_round(_data, _subkeys + 0); break;
case 3: aes_round(_data, _subkeys + 1); break;
case 4:
if (_once) bind(_rounds_52);
break;
case 5: aes_round(_data, _subkeys + 2); break;
case 6: aes_round(_data, _subkeys + 3); break;
case 7:
if (_once) bind(_rounds_44);
break;
case 8: aes_round(_data, _subkeys + 4); break;
case 9: aes_round(_data, _subkeys + 5); break;
case 10: aes_round(_data, _subkeys + 6); break;
case 11: aes_round(_data, _subkeys + 7); break;
case 12: aes_round(_data, _subkeys + 8); break;
case 13: aes_round(_data, _subkeys + 9); break;
case 14: aes_round(_data, _subkeys + 10); break;
case 15: aes_round(_data, _subkeys + 11); break;
case 16: aes_round(_data, _subkeys + 12); break;
case 17: aese(_data, _subkeys + 13); break;
case 18: eor(_data, T16B, _data, _subkeys + 14); break;
case 19:
if (_to != noreg) {
st1(_data, T16B, _to);
}
break;
default: ShouldNotReachHere();
}
}
virtual KernelGenerator *next() {
return new AESKernelGenerator(this, _unrolls,
_from, _to, _keylen,
_data + 1, _subkeys, /*once*/false);
}
virtual int length() { return 20; }
};
// Uses expanded key in v17..v31
// Returns encrypted values in inputs.
// If to != noreg, store value at to; likewise from
// Preserves key, keylen
// Increments from, to
// Input data in v0, v1, ...
// unrolls controls the number of times to unroll the generated function
void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
FloatRegister data, int unrolls) {
AESKernelGenerator(this, unrolls, from, to, keylen, data, v17) .unroll();
}
// ghash_multiply and ghash_reduce are the non-unrolled versions of
// the GHASH function generators.
void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
// Karatsuba multiplication performs a 128*128 -> 256-bit
// multiplication in three 128-bit multiplications and a few
// additions.
//
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
//
// Inputs:
//
// A0 in a.d[0] (subkey)
// A1 in a.d[1]
// (A1+A0) in a1_xor_a0.d[0]
//
// B0 in b.d[0] (state)
// B1 in b.d[1]
ext(tmp1, T16B, b, b, 0x08);
pmull2(result_hi, T1Q, b, a, T2D); // A1*B1
eor(tmp1, T16B, tmp1, b); // (B1+B0)
pmull(result_lo, T1Q, b, a, T1D); // A0*B0
pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0)
ext(tmp1, T16B, result_lo, result_hi, 0x08);
eor(tmp3, T16B, result_hi, result_lo); // A1*B1+A0*B0
eor(tmp2, T16B, tmp2, tmp1);
eor(tmp2, T16B, tmp2, tmp3);
// Register pair <result_hi:result_lo> holds the result of carry-less multiplication
ins(result_hi, D, tmp2, 0, 1);
ins(result_lo, D, tmp2, 1, 0);
}
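The Karatsuba identity in the comment above is easy to verify at small scale. Here is a hedged, self-contained C++ sketch (mine, not from this commit) that multiplies two 16-bit polynomials carry-lessly from their 8-bit halves with three multiplications, mirroring the three-PMULL scheme:

#include <cassert>
#include <cstdint>

// Carry-less (GF(2)[z]) multiply of two 8-bit polynomials.
static uint32_t clmul8(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  for (int i = 0; i < 8; i++)
    if (b & (1u << i)) r ^= a << i;
  return r;
}

int main() {
  uint32_t A = 0xabcd, B = 0x1234;        // A1:A0 and B1:B0
  uint32_t a0 = A & 0xff, a1 = A >> 8;
  uint32_t b0 = B & 0xff, b1 = B >> 8;

  uint32_t c = clmul8(a1, b1);            // C = A1*B1
  uint32_t d = clmul8(a0, b0);            // D = A0*B0
  uint32_t e = clmul8(a0 ^ a1, b0 ^ b1);  // E = (A0+A1)(B0+B1)

  // A*B = C*z^16 + (C+D+E)*z^8 + D: the middle term lands at offset 8.
  uint32_t karatsuba = (c << 16) ^ ((c ^ d ^ e) << 8) ^ d;

  // Schoolbook reference, four multiplications.
  uint32_t schoolbook = (c << 16) ^ (clmul8(a1, b0) << 8)
                      ^ (clmul8(a0, b1) << 8) ^ d;
  assert(karatsuba == schoolbook);
  return 0;
}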
void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister vzr, FloatRegister t1) {
const FloatRegister t0 = result;
// The GCM field polynomial f is z^128 + p(z), where p =
// z^7+z^2+z+1.
//
// z^128 === -p(z) (mod (z^128 + p(z)))
//
// so, given that the product we're reducing is
// a == lo + hi * z^128
// substituting,
// === lo - hi * p(z) (mod (z^128 + p(z)))
//
// we reduce by multiplying hi by p(z) and subtracting the result
// from (i.e. XORing it with) lo. Because p has no nonzero high
// bits we can do this with two 64-bit multiplications, lo*p and
// hi*p.
pmull2(t0, T1Q, hi, p, T2D);
ext(t1, T16B, t0, vzr, 8);
eor(hi, T16B, hi, t1);
ext(t1, T16B, vzr, t0, 8);
eor(lo, T16B, lo, t1);
pmull(t0, T1Q, hi, p, T1D);
eor(result, T16B, lo, t0);
}
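For intuition, the same two-step folding can be modelled in scalar code. This hedged sketch (mine, not the commit's) reduces a carry-less product in GF(2^8), whose field polynomial z^8 + z^4 + z^3 + z + 1 plays the role of z^128 + p(z); the first fold can itself overflow the field width, which is exactly why ghash_reduce needs two PMULLs:

#include <cassert>
#include <cstdint>

static uint32_t clmul(uint32_t a, uint32_t b) { // carry-less multiply
  uint32_t r = 0;
  for (int i = 0; b >> i; i++)
    if (b & (1u << i)) r ^= a << i;
  return r;
}

// Reduce lo + hi*z^8 modulo f = z^8 + p with p = z^4+z^3+z+1 (0x1b).
// Since z^8 == p (mod f), fold hi*p into lo; the product hi*p can
// exceed 8 bits, so its own high byte is folded down a second time.
static uint8_t reduce(uint32_t prod) {
  const uint32_t p = 0x1b;
  uint32_t lo = prod & 0xff;
  uint32_t t = clmul(prod >> 8, p);   // first fold: hi * p
  lo ^= t & 0xff;
  lo ^= clmul(t >> 8, p);             // second fold fits in 8 bits
  return (uint8_t)lo;
}

int main() {
  // 0x53 and 0xca are multiplicative inverses in AES's GF(2^8).
  assert(reduce(clmul(0x53, 0xca)) == 0x01);
  return 0;
}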
class GHASHMultiplyGenerator: public KernelGenerator {
FloatRegister _result_lo, _result_hi, _b,
_a, _vzr, _a1_xor_a0, _p,
_tmp1, _tmp2, _tmp3;
public:
GHASHMultiplyGenerator(Assembler *as, int unrolls,
/* offsetted registers */
FloatRegister result_lo, FloatRegister result_hi,
FloatRegister b,
/* non-offsetted (shared) registers */
FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
/* offsetted (temp) registers */
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
: KernelGenerator(as, unrolls),
_result_lo(result_lo), _result_hi(result_hi), _b(b),
_a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
_tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }
int register_stride = 7;
virtual void generate(int index) {
// Karatsuba multiplication performs a 128*128 -> 256-bit
// multiplication in three 128-bit multiplications and a few
// additions.
//
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
//
// Inputs:
//
// A0 in a.d[0] (subkey)
// A1 in a.d[1]
// (A1+A0) in a1_xor_a0.d[0]
//
// B0 in b.d[0] (state)
// B1 in b.d[1]
switch (index) {
case 0: ext(_tmp1, T16B, _b, _b, 0x08); break;
case 1: pmull2(_result_hi, T1Q, _b, _a, T2D); // A1*B1
break;
case 2: eor(_tmp1, T16B, _tmp1, _b); // (B1+B0)
break;
case 3: pmull(_result_lo, T1Q, _b, _a, T1D); // A0*B0
break;
case 4: pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0)
break;
case 5: ext(_tmp1, T16B, _result_lo, _result_hi, 0x08); break;
case 6: eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0
break;
case 7: eor(_tmp2, T16B, _tmp2, _tmp1); break;
case 8: eor(_tmp2, T16B, _tmp2, _tmp3); break;
// Register pair <_result_hi:_result_lo> holds the _result of carry-less multiplication
case 9: ins(_result_hi, D, _tmp2, 0, 1); break;
case 10: ins(_result_lo, D, _tmp2, 1, 0); break;
default: ShouldNotReachHere();
}
}
virtual KernelGenerator *next() {
GHASHMultiplyGenerator *result = new GHASHMultiplyGenerator(*this);
result->_result_lo += register_stride;
result->_result_hi += register_stride;
result->_b += register_stride;
result->_tmp1 += register_stride;
result->_tmp2 += register_stride;
result->_tmp3 += register_stride;
return result;
}
virtual int length() { return 11; }
};
// Reduce the 128-bit product in hi:lo by the GCM field polynomial.
// The FloatRegister argument called data is optional: if it is a
// valid register, we interleave LD1 instructions with the
// reduction. This is to reduce latency next time around the loop.
class GHASHReduceGenerator: public KernelGenerator {
FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
int _once;
public:
GHASHReduceGenerator(Assembler *as, int unrolls,
/* offsetted registers */
FloatRegister result, FloatRegister lo, FloatRegister hi,
/* non-offsetted (shared) registers */
FloatRegister p, FloatRegister vzr, FloatRegister data,
/* offsetted (temp) registers */
FloatRegister t1)
: KernelGenerator(as, unrolls),
_result(result), _lo(lo), _hi(hi),
_p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
int register_stride = 7;
virtual void generate(int index) {
const FloatRegister t0 = _result;
switch (index) {
// The GCM field polynomial f is z^128 + p(z), where p =
// z^7+z^2+z+1.
//
// z^128 === -p(z) (mod (z^128 + p(z)))
//
// so, given that the product we're reducing is
// a == lo + hi * z^128
// substituting,
// === lo - hi * p(z) (mod (z^128 + p(z)))
//
// we reduce by multiplying hi by p(z) and subtracting the _result
// from (i.e. XORing it with) lo. Because p has no nonzero high
// bits we can do this with two 64-bit multiplications, lo*p and
// hi*p.
case 0: pmull2(t0, T1Q, _hi, _p, T2D); break;
case 1: ext(_t1, T16B, t0, _vzr, 8); break;
case 2: eor(_hi, T16B, _hi, _t1); break;
case 3: ext(_t1, T16B, _vzr, t0, 8); break;
case 4: eor(_lo, T16B, _lo, _t1); break;
case 5: pmull(t0, T1Q, _hi, _p, T1D); break;
case 6: eor(_result, T16B, _lo, t0); break;
default: ShouldNotReachHere();
}
// Sprinkle load instructions into the generated instructions
if (_data->is_valid() && _once) {
assert(length() >= unrolls(), "not enough room for interleaved loads");
if (index < unrolls()) {
ld1((_data + index*register_stride), T16B, post(r2, 0x10));
}
}
}
virtual KernelGenerator *next() {
GHASHReduceGenerator *result = new GHASHReduceGenerator(*this);
result->_result += register_stride;
result->_hi += register_stride;
result->_lo += register_stride;
result->_t1 += register_stride;
result->_once = false;
return result;
}
int length() { return 7; }
};
// Perform a GHASH multiply/reduce on a single FloatRegister.
void MacroAssembler::ghash_modmul(FloatRegister result,
FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
FloatRegister t1, FloatRegister t2, FloatRegister t3) {
ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);
ghash_reduce(result, result_lo, result_hi, p, vzr, t1);
}
// Interleaved GHASH processing.
//
// Clobbers all vector registers.
//
void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
Register subkeyH,
Register data, Register blocks, int unrolls) {
int register_stride = 7;
// Bafflingly, GCM uses little-endian for the byte order, but
// big-endian for the bit order. For example, the polynomial 1 is
// represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
//
// So, we must either reverse the bytes in each word and do
// everything big-endian or reverse the bits in each byte and do
// it little-endian. On AArch64 it's more idiomatic to reverse
// the bits in each byte (we have an instruction, RBIT, to do
that) and keep the data in little-endian bit order throughout the
// calculation, bit-reversing the inputs and outputs.
assert(unrolls * register_stride < 32, "out of registers");
FloatRegister a1_xor_a0 = v28;
FloatRegister Hprime = v29;
FloatRegister vzr = v30;
FloatRegister p = v31;
eor(vzr, T16B, vzr, vzr); // zero register
ldrq(p, field_polynomial); // The field polynomial
ldrq(v0, Address(state));
ldrq(Hprime, Address(subkeyH));
rev64(v0, T16B, v0); // Bit-reverse words in state and subkeyH
rbit(v0, T16B, v0);
rev64(Hprime, T16B, Hprime);
rbit(Hprime, T16B, Hprime);
// Powers of H -> Hprime
Label already_calculated, done;
{
// The first time around we'll have to calculate H**2, H**3, etc.
// Look at the largest power of H in the subkeyH array to see if
// it's already been calculated.
ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
orr(rscratch1, rscratch1, rscratch2);
cbnz(rscratch1, already_calculated);
orr(v6, T16B, Hprime, Hprime); // Start with H in v6 and Hprime
for (int i = 1; i < unrolls; i++) {
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6,
Hprime, vzr, a1_xor_a0, p,
/*temps*/v1, v3, v2);
rev64(v1, T16B, v6);
rbit(v1, T16B, v1);
strq(v1, Address(subkeyH, 16 * i));
}
b(done);
}
{
bind(already_calculated);
// Load the largest power of H we need into v6.
ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
rev64(v6, T16B, v6);
rbit(v6, T16B, v6);
}
bind(done);
orr(Hprime, T16B, v6, v6); // Move H ** unrolls into Hprime
// The subkeyH table now holds (H ** 1, H ** 2, ..., H ** unrolls).
// v0 contains the initial state. Clear the others.
for (int i = 1; i < unrolls; i++) {
int ofs = register_stride * i;
eor(ofs+v0, T16B, ofs+v0, ofs+v0); // zero each state register
}
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
// Load #unrolls blocks of data
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
ld1(v2+ofs, T16B, post(data, 0x10));
}
// Register assignments, replicated across 4 clones, v0 ... v27
//
// v0: input / output: current state, result of multiply/reduce
// v1: temp
// v2: input: one block of data (the ciphertext)
// also used as a temp once the data has been consumed
// v3: temp
// v4: output: high part of product
// v5: output: low part ...
// v6: unused
//
// Not replicated:
//
// v28: High part of H xor low part of H'
// v29: H' (hash subkey)
// v30: zero
// v31: Reduction polynomial of the Galois field
// Inner loop.
// Do the whole load/add/multiply/reduce over all our data except
// the last few blocks.
{
Label L_ghash_loop;
bind(L_ghash_loop);
// Prefetching doesn't help here. In fact, on Neoverse N1 it's worse.
// prfm(Address(data, 128), PLDL1KEEP);
// Xor data into current state
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
rbit((v2+ofs), T16B, (v2+ofs));
eor((v2+ofs), T16B, v0+ofs, (v2+ofs)); // bit-swapped data ^ bit-swapped state
}
// Generate fully-unrolled multiply-reduce in two stages.
GHASHMultiplyGenerator(this, unrolls,
/*result_lo*/v5, /*result_hi*/v4, /*data*/v2,
Hprime, a1_xor_a0, p, vzr,
/*temps*/v1, v3, /* reuse b*/v2) .unroll();
// NB: GHASHReduceGenerator also loads the next #unrolls blocks of
// data into v2, v2+ofs, ready for the next iteration.
GHASHReduceGenerator (this, unrolls,
/*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr,
/*data*/v2, /*temp*/v3) .unroll();
sub(blocks, blocks, unrolls);
cmp(blocks, (unsigned char)(unrolls * 2));
br(GE, L_ghash_loop);
}
// Merge the #unrolls states. Note that the data for the next
// iteration has already been loaded into v2, v2+ofs, etc...
// First, we multiply/reduce each clone by the appropriate power of H.
for (int i = 0; i < unrolls; i++) {
int ofs = register_stride * i;
ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1)));
rbit(v2+ofs, T16B, v2+ofs);
eor(v2+ofs, T16B, ofs+v0, v2+ofs); // bit-swapped data ^ bit-swapped state
rev64(Hprime, T16B, Hprime);
rbit(Hprime, T16B, Hprime);
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
ghash_modmul(/*result*/v0+ofs, /*result_lo*/v5+ofs, /*result_hi*/v4+ofs, /*b*/v2+ofs,
Hprime, vzr, a1_xor_a0, p,
/*temps*/v1+ofs, v3+ofs, /* reuse b*/v2+ofs);
}
// Then we sum the results.
for (int i = 0; i < unrolls - 1; i++) {
int ofs = register_stride * i;
eor(v0, T16B, v0, v0 + register_stride + ofs);
}
sub(blocks, blocks, (unsigned char)unrolls);
// And finally bit-reverse the state back to big endian.
rev64(v0, T16B, v0);
rbit(v0, T16B, v0);
st1(v0, T16B, state);
}
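The lane bookkeeping above is the subtle part, so here is a hedged, self-contained C++ model (mine, not from the commit) that checks the wide schedule against serial Horner evaluation, using GF(2^8) as a stand-in for the GF(2^128) field: every main-loop iteration multiplies each lane by H**n, and the merge multiplies lane j by H**(n-j) before summing:

#include <cassert>
#include <cstdint>

static uint32_t clmul(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  for (int i = 0; b >> i; i++)
    if (b & (1u << i)) r ^= a << i;
  return r;
}
static uint8_t gmul(uint8_t a, uint8_t b) { // multiply in GF(2^8)
  uint32_t t = clmul(a, b);
  uint32_t lo = t & 0xff;
  t = clmul(t >> 8, 0x1b);                  // fold mod z^8+z^4+z^3+z+1
  lo ^= t & 0xff;
  lo ^= clmul(t >> 8, 0x1b);
  return (uint8_t)lo;
}

int main() {
  const int n = 4, q = 5, M = n * q;           // n lanes, M blocks
  uint8_t H = 0x57, state0 = 0x42, blocks[M];  // arbitrary test data
  for (int k = 0; k < M; k++) blocks[k] = (uint8_t)(k * 37 + 1);

  uint8_t serial = state0;                     // Horner's rule
  for (int k = 0; k < M; k++) serial = gmul(serial ^ blocks[k], H);

  uint8_t Hpow[5];                             // Hpow[i] = H ** i
  Hpow[1] = H;
  for (int i = 2; i <= n; i++) Hpow[i] = gmul(Hpow[i-1], H);

  uint8_t lane[n] = { state0 };                // lane 0 carries the state
  for (int t = 0; t < q - 1; t++)              // main loop: * H ** n
    for (int j = 0; j < n; j++)
      lane[j] = gmul(lane[j] ^ blocks[t*n + j], Hpow[n]);
  uint8_t wide = 0;                            // merge: lane j * H ** (n-j)
  for (int j = 0; j < n; j++)
    wide ^= gmul(lane[j] ^ blocks[(q-1)*n + j], Hpow[n - j]);
  assert(wide == serial);
  return 0;
}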

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

@ -2560,8 +2560,6 @@ class StubGenerator: public StubCodeGenerator {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
- Label L_doLast;
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
@ -2572,75 +2570,8 @@ class StubGenerator: public StubCodeGenerator {
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
- __ ld1(v0, __ T16B, from); // get 16 bytes of input
- __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ rev32(v3, __ T16B, v3);
- __ rev32(v4, __ T16B, v4);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ aesmc(v0, v0);
- __ aese(v0, v3);
- __ aesmc(v0, v0);
- __ aese(v0, v4);
- __ aesmc(v0, v0);
- __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ rev32(v3, __ T16B, v3);
- __ rev32(v4, __ T16B, v4);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ aesmc(v0, v0);
- __ aese(v0, v3);
- __ aesmc(v0, v0);
- __ aese(v0, v4);
- __ aesmc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ cmpw(keylen, 44);
- __ br(Assembler::EQ, L_doLast);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ aesmc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ cmpw(keylen, 52);
- __ br(Assembler::EQ, L_doLast);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ aesmc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ BIND(L_doLast);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ ld1(v1, __ T16B, key);
- __ rev32(v1, __ T16B, v1);
- __ eor(v0, __ T16B, v0, v1);
- __ st1(v0, __ T16B, to);
+ __ aesenc_loadkeys(key, keylen);
+ __ aesecb_encrypt(from, to, keylen);
__ mov(r0, 0);
@ -2673,76 +2604,7 @@ class StubGenerator: public StubCodeGenerator {
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
- __ ld1(v0, __ T16B, from); // get 16 bytes of input
- __ ld1(v5, __ T16B, __ post(key, 16));
- __ rev32(v5, __ T16B, v5);
- __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ rev32(v3, __ T16B, v3);
- __ rev32(v4, __ T16B, v4);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ aesimc(v0, v0);
- __ aesd(v0, v3);
- __ aesimc(v0, v0);
- __ aesd(v0, v4);
- __ aesimc(v0, v0);
- __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ rev32(v3, __ T16B, v3);
- __ rev32(v4, __ T16B, v4);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ aesimc(v0, v0);
- __ aesd(v0, v3);
- __ aesimc(v0, v0);
- __ aesd(v0, v4);
- __ aesimc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ cmpw(keylen, 44);
- __ br(Assembler::EQ, L_doLast);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ aesimc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ cmpw(keylen, 52);
- __ br(Assembler::EQ, L_doLast);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ aesimc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ BIND(L_doLast);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ eor(v0, __ T16B, v0, v5);
- __ st1(v0, __ T16B, to);
+ __ aesecb_decrypt(from, to, key, keylen);
__ mov(r0, 0);
@ -2964,6 +2826,385 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// CTR AES crypt.
// Arguments:
//
// Inputs:
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - K (key) in little endian int array
// c_rarg3 - counter vector byte array address
// c_rarg4 - input length
// c_rarg5 - saved encryptedCounter start
// c_rarg6 - saved used length
//
// Output:
// r0 - input length
//
address generate_counterMode_AESCrypt() {
const Register in = c_rarg0;
const Register out = c_rarg1;
const Register key = c_rarg2;
const Register counter = c_rarg3;
const Register saved_len = c_rarg4, len = r10;
const Register saved_encrypted_ctr = c_rarg5;
const Register used_ptr = c_rarg6, used = r12;
const Register offset = r7;
const Register keylen = r11;
const unsigned char block_size = 16;
const int bulk_width = 4;
// NB: bulk_width can be 4 or 8. 8 gives slightly faster
// performance with larger data sizes, but it also means that the
// fast path isn't used until you have at least 8 blocks, and up
// to 127 bytes of data will be executed on the slow path. For
// that reason, and also so as not to blow away too much icache, 4
// blocks seems like a sensible compromise.
// Algorithm:
//
// if (len == 0) {
// goto DONE;
// }
// int result = len;
// do {
// if (used >= blockSize) {
// if (len >= bulk_width * blockSize) {
// CTR_large_block();
// if (len == 0)
// goto DONE;
// }
// for (;;) {
// 16ByteVector v0 = counter;
// embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
// used = 0;
// if (len < blockSize)
// break; /* goto NEXT */
// 16ByteVector v1 = load16Bytes(in, offset);
// v1 = v1 ^ encryptedCounter;
// store16Bytes(v1, out, offset);
// used = blockSize;
// offset += blockSize;
// len -= blockSize;
// if (len == 0)
// goto DONE;
// }
// }
// NEXT:
// out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
// len--;
// } while (len != 0);
// DONE:
// return result;
//
// CTR_large_block()
// Wide bulk encryption of whole blocks.
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
const address start = __ pc();
__ enter();
Label DONE, CTR_large_block, large_block_return;
__ ldrw(used, Address(used_ptr));
__ cbzw(saved_len, DONE);
__ mov(len, saved_len);
__ mov(offset, 0);
// Compute #rounds for AES based on the length of the key array
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ aesenc_loadkeys(key, keylen);
{
Label L_CTR_loop, NEXT;
__ bind(L_CTR_loop);
__ cmp(used, block_size);
__ br(__ LO, NEXT);
// Maybe we have a lot of data
__ subsw(rscratch1, len, bulk_width * block_size);
__ br(__ HS, CTR_large_block);
__ BIND(large_block_return);
__ cbzw(len, DONE);
// Setup the counter
__ movi(v4, __ T4S, 0);
__ movi(v5, __ T4S, 1);
__ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
__ ld1(v0, __ T16B, counter); // Load the counter into v0
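// Increment the 32-bit big-endian counter word: rev32 byte-swaps each
// 32-bit lane into little-endian order, the vector add bumps the last
// lane by one, and the second rev32 swaps the result back.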
__ rev32(v16, __ T16B, v0);
__ addv(v16, __ T4S, v16, v4);
__ rev32(v16, __ T16B, v16);
__ st1(v16, __ T16B, counter); // Save the incremented counter back
{
// We have fewer than bulk_width blocks of data left. Encrypt
// them one by one until there is less than a full block
// remaining, being careful to save both the encrypted counter
// and the counter.
Label inner_loop;
__ bind(inner_loop);
// Counter to encrypt is in v0
__ aesecb_encrypt(noreg, noreg, keylen);
__ st1(v0, __ T16B, saved_encrypted_ctr);
// Do we have a remaining full block?
__ mov(used, 0);
__ cmp(len, block_size);
__ br(__ LO, NEXT);
// Yes, we have a full block
__ ldrq(v1, Address(in, offset));
__ eor(v1, __ T16B, v1, v0);
__ strq(v1, Address(out, offset));
__ mov(used, block_size);
__ add(offset, offset, block_size);
__ subw(len, len, block_size);
__ cbzw(len, DONE);
// Increment the counter, store it back
__ orr(v0, __ T16B, v16, v16);
__ rev32(v16, __ T16B, v16);
__ addv(v16, __ T4S, v16, v4);
__ rev32(v16, __ T16B, v16);
__ st1(v16, __ T16B, counter); // Save the incremented counter back
__ b(inner_loop);
}
__ BIND(NEXT);
// Encrypt a single byte, and loop.
// We expect this to be a rare event.
__ ldrb(rscratch1, Address(in, offset));
__ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
__ eor(rscratch1, rscratch1, rscratch2);
__ strb(rscratch1, Address(out, offset));
__ add(offset, offset, 1);
__ add(used, used, 1);
__ subw(len, len, 1);
__ cbnzw(len, L_CTR_loop);
}
__ bind(DONE);
__ strw(used, Address(used_ptr));
__ mov(r0, saved_len);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
// Bulk encryption
__ BIND (CTR_large_block);
assert(bulk_width == 4 || bulk_width == 8, "must be");
if (bulk_width == 8) {
__ sub(sp, sp, 4 * 16);
__ st1(v12, v13, v14, v15, __ T16B, Address(sp));
}
__ sub(sp, sp, 4 * 16);
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
RegSet saved_regs = (RegSet::of(in, out, offset)
+ RegSet::of(saved_encrypted_ctr, used_ptr, len));
__ push(saved_regs, sp);
__ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
__ add(in, in, offset);
__ add(out, out, offset);
// Keys should already be loaded into the correct registers
__ ld1(v0, __ T16B, counter); // v0 contains the first counter
__ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
// AES/CTR loop
{
Label L_CTR_loop;
__ BIND(L_CTR_loop);
// Setup the counters
__ movi(v8, __ T4S, 0);
__ movi(v9, __ T4S, 1);
__ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
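// Materialize bulk_width consecutive counter values in v0 ... v(bulk_width-1),
// stepping the byte-reversed counter in v16 once per block.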
for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
__ rev32(f, __ T16B, v16);
__ addv(v16, __ T4S, v16, v8);
}
__ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
// Encrypt the counters
__ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
if (bulk_width == 8) {
__ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
}
// XOR the encrypted counters with the inputs
for (int i = 0; i < bulk_width; i++) {
__ eor(v0 + i, __ T16B, v0 + i, v8 + i);
}
// Write the encrypted data
__ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
if (bulk_width == 8) {
__ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
}
__ subw(len, len, 16 * bulk_width);
__ cbnzw(len, L_CTR_loop);
}
// Save the counter back where it goes
__ rev32(v16, __ T16B, v16);
__ st1(v16, __ T16B, counter);
__ pop(saved_regs, sp);
__ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
if (bulk_width == 8) {
__ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
}
__ andr(rscratch1, len, -16 * bulk_width);
__ sub(len, len, rscratch1);
__ add(offset, offset, rscratch1);
__ mov(used, 16);
__ strw(used, Address(used_ptr));
__ b(large_block_return);
return start;
}
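The pseudocode at the top of generate_counterMode_AESCrypt compresses to a small reference model. A hedged C++ sketch of the byte-level semantics (mine, not HotSpot code; the stub's single-block and bulk paths are optimizations with the same effect, and encryptBlock stands in for one AES block encryption):

#include <cstddef>
#include <cstdint>

typedef void (*BlockCipher)(const uint8_t in[16], uint8_t out[16]);

static void increment_counter(uint8_t ctr[16]) {  // big-endian ++
  for (int i = 15; i >= 0 && ++ctr[i] == 0; i--) { }
}

// 'used' counts bytes already consumed from the last keystream block,
// so encryption can resume mid-block across calls.
size_t ctr_crypt(BlockCipher encryptBlock, uint8_t counter[16],
                 uint8_t encryptedCounter[16], unsigned *used,
                 const uint8_t *in, uint8_t *out, size_t len) {
  size_t result = len;
  for (size_t offset = 0; offset < len; offset++) {
    if (*used >= 16) {                            // need fresh keystream
      encryptBlock(counter, encryptedCounter);
      increment_counter(counter);
      *used = 0;
    }
    out[offset] = in[offset] ^ encryptedCounter[(*used)++];
  }
  return result;
}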
// Vector AES Galois Counter Mode implementation. Parameters:
//
// in = c_rarg0
// len = c_rarg1
// ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
// out = c_rarg3
// key = c_rarg4
// state = c_rarg5 - GHASH.state
// subkeyHtbl = c_rarg6 - powers of H
// counter = c_rarg7 - pointer to 16 bytes of CTR
// return - number of processed bytes
address generate_galoisCounterMode_AESCrypt() {
address ghash_polynomial = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
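// (0x87 == 0b10000111: bits 7, 2, 1 and 0 set, i.e. z^7+z^2+z+1.)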
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
address start = __ pc();
const Register in = c_rarg0;
const Register len = c_rarg1;
const Register ct = c_rarg2;
const Register out = c_rarg3;
const Register key = c_rarg4;
const Register state = c_rarg5;
const Register subkeyHtbl = c_rarg6;
const Register counter = c_rarg7; // pointer to 16 bytes of CTR,
// updated with the incremented counter at the end
const Register keylen = r10;
__ enter();
// Save state before entering routine
__ sub(sp, sp, 4 * 16);
__ st1(v12, v13, v14, v15, __ T16B, Address(sp));
__ sub(sp, sp, 4 * 16);
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
// __ andr(len, len, -512);
__ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
__ str(len, __ pre(sp, -2 * wordSize));
Label DONE;
__ cbz(len, DONE);
// Compute #rounds for AES based on the length of the key array
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ aesenc_loadkeys(key, keylen);
__ ld1(v0, __ T16B, counter); // v0 contains the first counter
__ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
// AES/CTR loop
{
Label L_CTR_loop;
__ BIND(L_CTR_loop);
// Setup the counters
__ movi(v8, __ T4S, 0);
__ movi(v9, __ T4S, 1);
__ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
for (FloatRegister f = v0; f < v8; f++) {
__ rev32(f, __ T16B, v16);
__ addv(v16, __ T4S, v16, v8);
}
__ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
// Encrypt the counters
__ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
__ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
// XOR the encrypted counters with the inputs
for (int i = 0; i < 8; i++) {
__ eor(v0 + i, __ T16B, v0 + i, v8 + i);
}
__ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
__ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
__ subw(len, len, 16 * 8);
__ cbnzw(len, L_CTR_loop);
}
__ rev32(v16, __ T16B, v16);
__ st1(v16, __ T16B, counter);
__ ldr(len, Address(sp));
__ lsr(len, len, exact_log2(16)); // We want the count of blocks
// GHASH/CTR loop
__ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
len, /*unrolls*/4);
#ifdef ASSERT
{ Label L;
__ cmp(len, (unsigned char)0);
__ br(Assembler::EQ, L);
__ stop("stubGenerator: abort");
__ bind(L);
}
#endif
__ bind(DONE);
// Return the number of bytes processed
__ ldr(r0, __ post(sp, 2 * wordSize));
__ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
__ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
return start;
}
// Arguments:
//
// Inputs:
@ -4227,69 +4468,6 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
- void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
- FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
- FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
- // Karatsuba multiplication performs a 128*128 -> 256-bit
- // multiplication in three 128-bit multiplications and a few
- // additions.
- //
- // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
- // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
- //
- // Inputs:
- //
- // A0 in a.d[0] (subkey)
- // A1 in a.d[1]
- // (A1+A0) in a1_xor_a0.d[0]
- //
- // B0 in b.d[0] (state)
- // B1 in b.d[1]
- __ ext(tmp1, __ T16B, b, b, 0x08);
- __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
- __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
- __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
- __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
- __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
- __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
- __ eor(tmp2, __ T16B, tmp2, tmp4);
- __ eor(tmp2, __ T16B, tmp2, tmp3);
- // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
- __ ins(result_hi, __ D, tmp2, 0, 1);
- __ ins(result_lo, __ D, tmp2, 1, 0);
- }
- void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
- FloatRegister p, FloatRegister z, FloatRegister t1) {
- const FloatRegister t0 = result;
- // The GCM field polynomial f is z^128 + p(z), where p =
- // z^7+z^2+z+1.
- //
- // z^128 === -p(z) (mod (z^128 + p(z)))
- //
- // so, given that the product we're reducing is
- // a == lo + hi * z^128
- // substituting,
- // === lo - hi * p(z) (mod (z^128 + p(z)))
- //
- // we reduce by multiplying hi by p(z) and subtracting the result
- // from (i.e. XORing it with) lo. Because p has no nonzero high
- // bits we can do this with two 64-bit multiplications, lo*p and
- // hi*p.
- __ pmull2(t0, __ T1Q, hi, p, __ T2D);
- __ ext(t1, __ T16B, t0, z, 8);
- __ eor(hi, __ T16B, hi, t1);
- __ ext(t1, __ T16B, z, t0, 8);
- __ eor(lo, __ T16B, lo, t1);
- __ pmull(t0, __ T1Q, hi, p, __ T1D);
- __ eor(result, __ T16B, lo, t0);
- }
address generate_has_negatives(address &has_negatives_long) {
const u1 large_loop_size = 64;
const uint64_t UPPER_BIT_MASK=0x8080808080808080;
@ -5387,6 +5565,8 @@ class StubGenerator: public StubCodeGenerator {
FloatRegister vzr = v30;
__ eor(vzr, __ T16B, vzr, vzr); // zero register
__ ldrq(v24, p); // The field polynomial
__ ldrq(v0, Address(state));
__ ldrq(v1, Address(subkeyH));
@ -5395,10 +5575,8 @@ class StubGenerator: public StubCodeGenerator {
__ rev64(v1, __ T16B, v1);
__ rbit(v1, __ T16B, v1);
- __ ldrq(v26, p);
- __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
- __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+ __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
+ __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
{
Label L_ghash_loop;
@ -5410,21 +5588,70 @@ class StubGenerator: public StubCodeGenerator {
__ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
// Multiply state in v2 by subkey in v1
- ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
- /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
- /*temps*/v6, v20, v18, v21);
+ __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
+ /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
+ /*temps*/v6, v3, /*reuse/clobber b*/v2);
// Reduce v7:v5 by the field polynomial
- ghash_reduce(v0, v5, v7, v26, vzr, v20);
+ __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
__ sub(blocks, blocks, 1);
__ cbnz(blocks, L_ghash_loop);
}
// The bit-reversed result is at this point in v0
- __ rev64(v1, __ T16B, v0);
- __ rbit(v1, __ T16B, v1);
+ __ rev64(v0, __ T16B, v0);
+ __ rbit(v0, __ T16B, v0);
__ st1(v0, __ T16B, state);
__ ret(lr);
return start;
}
address generate_ghash_processBlocks_wide() {
address small = generate_ghash_processBlocks();
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
__ align(wordSize * 2);
address p = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
__ align(CodeEntryAlignment);
address start = __ pc();
Register state = c_rarg0;
Register subkeyH = c_rarg1;
Register data = c_rarg2;
Register blocks = c_rarg3;
const int unroll = 4;
__ cmp(blocks, (unsigned char)(unroll * 2));
__ br(__ LT, small);
if (unroll > 1) {
// Save state before entering routine
__ sub(sp, sp, 4 * 16);
__ st1(v12, v13, v14, v15, __ T16B, Address(sp));
__ sub(sp, sp, 4 * 16);
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
}
__ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
if (unroll > 1) {
// And restore state
__ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
__ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
}
__ cmp(blocks, (unsigned char)0);
__ br(__ GT, small);
__ st1(v1, __ T16B, state);
__ ret(lr);
return start;
@ -7131,7 +7358,8 @@ class StubGenerator: public StubCodeGenerator {
// generate GHASH intrinsics code
if (UseGHASHIntrinsics) {
- StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
}
if (UseBASE64Intrinsics) {
@ -7148,6 +7376,8 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
}
if (UseSHA1Intrinsics) {

src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp

@ -36,7 +36,7 @@ static bool returns_to_call_stub(address return_pc) {
enum platform_dependent_constants {
code_size1 = 19000, // simply increase if too small (assembler will crash if too small)
- code_size2 = 28000 // simply increase if too small (assembler will crash if too small)
+ code_size2 = 38000 // simply increase if too small (assembler will crash if too small)
};
class aarch64 {

src/hotspot/cpu/aarch64/vm_version_aarch64.cpp

@ -237,6 +237,9 @@ void VM_Version::initialize() {
warning("UseAESIntrinsics enabled, but UseAES not, enabling");
UseAES = true;
}
if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
FLAG_SET_DEFAULT(UseAESCTRIntrinsics, true);
}
} else {
if (UseAES) {
warning("AES instructions are not available on this CPU");
@ -246,12 +249,12 @@ void VM_Version::initialize() {
warning("AES intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ if (UseAESCTRIntrinsics) {
+ warning("AES/CTR intrinsics are not available on this CPU");
+ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+ }
}
- if (UseAESCTRIntrinsics) {
- warning("AES/CTR intrinsics are not available on this CPU");
- FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
- }
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
UseCRC32Intrinsics = true;

src/hotspot/os_cpu/bsd_aarch64/vm_version_bsd_aarch64.cpp

@ -60,6 +60,9 @@ void VM_Version::get_os_cpu_info() {
assert(cpu_has("hw.optional.neon"), "should be");
_features = CPU_FP | CPU_ASIMD;
// All Apple-darwin Arm processors have AES.
_features |= CPU_AES;
// Only a few features are available via sysctl, see line 614
// https://opensource.apple.com/source/xnu/xnu-6153.141.1/bsd/kern/kern_mib.c.auto.html
if (cpu_has("hw.optional.armv8_crc32")) _features |= CPU_CRC32;
@ -88,6 +91,7 @@ void VM_Version::get_os_cpu_info() {
if (sysctlbyname("hw.cpufamily", &family, &sysctllen, NULL, 0)) {
family = 0;
}
_model = family;
_cpu = CPU_APPLE;
}