8271567: AArch64: AES Galois CounterMode (GCM) interleaved implementation using vector instructions

Reviewed-by: ngasson, adinn, xliu
Andrew Haley 2021-09-23 09:00:46 +00:00
parent 8799856528
commit 4f3b626a36
7 changed files with 1168 additions and 225 deletions

src/hotspot/cpu/aarch64/assembler_aarch64.hpp

@ -647,22 +647,22 @@ typedef enum {
class Assembler : public AbstractAssembler {
public:
#ifndef PRODUCT
static const uintptr_t asm_bp;
- void emit_long(jint x) {
+ void emit_int32(jint x) {
if ((uintptr_t)pc() == asm_bp)
NOP();
AbstractAssembler::emit_int32(x);
}
#else
- void emit_long(jint x) {
+ void emit_int32(jint x) {
AbstractAssembler::emit_int32(x);
}
#endif
public:
enum { instruction_size = 4 };
//---< calculate length of instruction >---

src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp

@ -1296,11 +1296,37 @@ public:
void kernel_crc32c_using_crc32c(Register crc, Register buf,
Register len, Register tmp0, Register tmp1, Register tmp2,
Register tmp3);
void ghash_modmul (FloatRegister result,
FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
FloatRegister t1, FloatRegister t2, FloatRegister t3);
void ghash_load_wide(int index, Register data, FloatRegister result, FloatRegister state);
public:
void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
Register zlen, Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6, Register tmp7);
void mul_add(Register out, Register in, Register offs, Register len, Register k);
void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3);
void ghash_multiply_wide(int index,
FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3);
void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister z, FloatRegister t1);
void ghash_reduce_wide(int index, FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister z, FloatRegister t1);
void ghash_processBlocks_wide(address p, Register state, Register subkeyH,
Register data, Register blocks, int unrolls);
void aesenc_loadkeys(Register key, Register keylen);
void aesecb_encrypt(Register from, Register to, Register keylen,
FloatRegister data = v0, int unrolls = 1);
void aesecb_decrypt(Register from, Register to, Register key, Register keylen);
void aes_round(FloatRegister input, FloatRegister subkey);
// Place an ISB after code may have been modified due to a safepoint.
void safepoint_isb();

src/hotspot/cpu/aarch64/macroAssembler_aarch64_aes.cpp (new file)

@ -0,0 +1,680 @@
/*
* Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "macroAssembler_aarch64.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/stubRoutines.hpp"
void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) {
Label L_doLast;
ld1(v0, T16B, from); // get 16 bytes of input
ld1(v5, T16B, post(key, 16));
rev32(v5, T16B, v5);
ld1(v1, v2, v3, v4, T16B, post(key, 64));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
rev32(v3, T16B, v3);
rev32(v4, T16B, v4);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
aesd(v0, v3);
aesimc(v0, v0);
aesd(v0, v4);
aesimc(v0, v0);
ld1(v1, v2, v3, v4, T16B, post(key, 64));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
rev32(v3, T16B, v3);
rev32(v4, T16B, v4);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
aesd(v0, v3);
aesimc(v0, v0);
aesd(v0, v4);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
cmpw(keylen, 44);
br(Assembler::EQ, L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
cmpw(keylen, 52);
br(Assembler::EQ, L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
bind(L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
eor(v0, T16B, v0, v5);
st1(v0, T16B, to);
// Preserve the address of the start of the key
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
// Load expanded key into v17..v31
void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) {
Label L_loadkeys_44, L_loadkeys_52;
cmpw(keylen, 52);
br(Assembler::LO, L_loadkeys_44);
br(Assembler::EQ, L_loadkeys_52);
ld1(v17, v18, T16B, post(key, 32));
rev32(v17, T16B, v17);
rev32(v18, T16B, v18);
bind(L_loadkeys_52);
ld1(v19, v20, T16B, post(key, 32));
rev32(v19, T16B, v19);
rev32(v20, T16B, v20);
bind(L_loadkeys_44);
ld1(v21, v22, v23, v24, T16B, post(key, 64));
rev32(v21, T16B, v21);
rev32(v22, T16B, v22);
rev32(v23, T16B, v23);
rev32(v24, T16B, v24);
ld1(v25, v26, v27, v28, T16B, post(key, 64));
rev32(v25, T16B, v25);
rev32(v26, T16B, v26);
rev32(v27, T16B, v27);
rev32(v28, T16B, v28);
ld1(v29, v30, v31, T16B, post(key, 48));
rev32(v29, T16B, v29);
rev32(v30, T16B, v30);
rev32(v31, T16B, v31);
// Preserve the address of the start of the key
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
// Neoverse(TM) N1 Software Optimization Guide:
// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC
// instruction pairs will exhibit the performance characteristics
// described in Section 4.6.
void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) {
aese(input, subkey); aesmc(input, input);
}
// KernelGenerator
//
// The abstract base class of an unrolled function generator.
// Subclasses override generate(), length(), and next() to generate
// unrolled and interleaved functions.
//
// The core idea is that a subclass defines a method which generates
// the base case of a function and a method to generate a clone of it,
// shifted to a different set of registers. KernelGenerator will then
// generate several interleaved copies of the function, with each one
// using a different set of registers.
// The subclass must implement three methods: length(), which is the
// number of instruction bundles in the intrinsic; generate(int n),
// which emits the nth instruction bundle in the intrinsic; and
// next(), which returns a copy of the generator shifted to a new set
// of registers.
class KernelGenerator: public MacroAssembler {
protected:
const int _unrolls;
public:
KernelGenerator(Assembler *as, int unrolls)
: MacroAssembler(as->code()), _unrolls(unrolls) { }
virtual void generate(int index) = 0;
virtual int length() = 0;
virtual KernelGenerator *next() = 0;
int unrolls() { return _unrolls; }
void unroll();
};
void KernelGenerator::unroll() {
ResourceMark rm;
KernelGenerator **generators
= NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls());
generators[0] = this;
for (int i = 1; i < unrolls(); i++) {
generators[i] = generators[i-1]->next();
}
for (int j = 0; j < length(); j++) {
for (int i = 0; i < unrolls(); i++) {
generators[i]->generate(j);
}
}
}
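To make the contract concrete, here is a toy subclass (a hypothetical illustration, not part of this commit) with two instruction bundles. With unrolls == 2, unroll() emits bundle 0 for every clone, then bundle 1 for every clone, so the copies interleave:

// A minimal sketch, assuming the KernelGenerator definition above.
// Each clone owns two registers, starting at _reg.
class ToyKernelGenerator : public KernelGenerator {
  FloatRegister _reg;
public:
  ToyKernelGenerator(Assembler *as, int unrolls, FloatRegister reg)
    : KernelGenerator(as, unrolls), _reg(reg) { }
  virtual void generate(int index) {
    switch (index) {
    case 0: eor(_reg, T16B, _reg, _reg);     break; // bundle 0
    case 1: addv(_reg, T4S, _reg, _reg + 1); break; // bundle 1
    default: ShouldNotReachHere();
    }
  }
  virtual int length() { return 2; }
  virtual KernelGenerator *next() {
    return new ToyKernelGenerator(this, _unrolls, _reg + 2);
  }
};
// ToyKernelGenerator(masm, 2, v0).unroll() emits:
//   eor v0, ...; eor v2, ...; addv v0, ...; addv v2, ...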
// An unrolled and interleaved generator for AES encryption.
class AESKernelGenerator: public KernelGenerator {
Register _from, _to;
const Register _keylen;
FloatRegister _data;
const FloatRegister _subkeys;
bool _once;
Label _rounds_44, _rounds_52;
public:
AESKernelGenerator(Assembler *as, int unrolls,
Register from, Register to, Register keylen, FloatRegister data,
FloatRegister subkeys, bool once = true)
: KernelGenerator(as, unrolls),
_from(from), _to(to), _keylen(keylen), _data(data),
_subkeys(subkeys), _once(once) {
}
virtual void generate(int index) {
switch (index) {
case 0:
if (_from != noreg) {
ld1(_data, T16B, _from); // get 16 bytes of input
}
break;
case 1:
if (_once) {
cmpw(_keylen, 52);
br(Assembler::LO, _rounds_44);
br(Assembler::EQ, _rounds_52);
}
break;
case 2: aes_round(_data, _subkeys + 0); break;
case 3: aes_round(_data, _subkeys + 1); break;
case 4:
if (_once) bind(_rounds_52);
break;
case 5: aes_round(_data, _subkeys + 2); break;
case 6: aes_round(_data, _subkeys + 3); break;
case 7:
if (_once) bind(_rounds_44);
break;
case 8: aes_round(_data, _subkeys + 4); break;
case 9: aes_round(_data, _subkeys + 5); break;
case 10: aes_round(_data, _subkeys + 6); break;
case 11: aes_round(_data, _subkeys + 7); break;
case 12: aes_round(_data, _subkeys + 8); break;
case 13: aes_round(_data, _subkeys + 9); break;
case 14: aes_round(_data, _subkeys + 10); break;
case 15: aes_round(_data, _subkeys + 11); break;
case 16: aes_round(_data, _subkeys + 12); break;
case 17: aese(_data, _subkeys + 13); break;
case 18: eor(_data, T16B, _data, _subkeys + 14); break;
case 19:
if (_to != noreg) {
st1(_data, T16B, _to);
}
break;
default: ShouldNotReachHere();
}
}
virtual KernelGenerator *next() {
return new AESKernelGenerator(this, _unrolls,
_from, _to, _keylen,
_data + 1, _subkeys, /*once*/false);
}
virtual int length() { return 20; }
};
// Uses expanded key in v17..v31
// Returns encrypted values in inputs.
// If to != noreg, store value at to; likewise from
// Preserves key, keylen
// Increments from, to
// Input data in v0, v1, ...
// unrolls controls the number of times to unroll the generated function
void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
FloatRegister data, int unrolls) {
AESKernelGenerator(this, unrolls, from, to, keylen, data, v17) .unroll();
}
// ghash_multiply and ghash_reduce are the non-unrolled versions of
// the GHASH function generators.
void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
// Karatsuba multiplication performs a 128*128 -> 256-bit
// multiplication in three 128-bit multiplications and a few
// additions.
//
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
//
// Inputs:
//
// A0 in a.d[0] (subkey)
// A1 in a.d[1]
// (A1+A0) in a1_xor_a0.d[0]
//
// B0 in b.d[0] (state)
// B1 in b.d[1]
ext(tmp1, T16B, b, b, 0x08);
pmull2(result_hi, T1Q, b, a, T2D); // A1*B1
eor(tmp1, T16B, tmp1, b); // (B1+B0)
pmull(result_lo, T1Q, b, a, T1D); // A0*B0
pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0)
ext(tmp1, T16B, result_lo, result_hi, 0x08);
eor(tmp3, T16B, result_hi, result_lo); // A1*B1+A0*B0
eor(tmp2, T16B, tmp2, tmp1);
eor(tmp2, T16B, tmp2, tmp3);
// Register pair <result_hi:result_lo> holds the result of carry-less multiplication
ins(result_hi, D, tmp2, 0, 1);
ins(result_lo, D, tmp2, 1, 0);
}
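The Karatsuba identity in the comment above is easy to verify at small scale. Here is a hedged, self-contained C++ sketch (mine, not from this commit) that multiplies two 16-bit polynomials carry-lessly from their 8-bit halves with three multiplications, mirroring the three-PMULL scheme:

#include <cassert>
#include <cstdint>

// Carry-less (GF(2)[z]) multiply of two 8-bit polynomials.
static uint32_t clmul8(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  for (int i = 0; i < 8; i++)
    if (b & (1u << i)) r ^= a << i;
  return r;
}

int main() {
  uint32_t A = 0xabcd, B = 0x1234;        // A1:A0 and B1:B0
  uint32_t a0 = A & 0xff, a1 = A >> 8;
  uint32_t b0 = B & 0xff, b1 = B >> 8;

  uint32_t c = clmul8(a1, b1);            // C = A1*B1
  uint32_t d = clmul8(a0, b0);            // D = A0*B0
  uint32_t e = clmul8(a0 ^ a1, b0 ^ b1);  // E = (A0+A1)(B0+B1)

  // A*B = C*z^16 + (C+D+E)*z^8 + D: the middle term lands at offset 8.
  uint32_t karatsuba = (c << 16) ^ ((c ^ d ^ e) << 8) ^ d;

  // Schoolbook reference, four multiplications.
  uint32_t schoolbook = (c << 16) ^ (clmul8(a1, b0) << 8)
                      ^ (clmul8(a0, b1) << 8) ^ d;
  assert(karatsuba == schoolbook);
  return 0;
}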
void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister vzr, FloatRegister t1) {
const FloatRegister t0 = result;
// The GCM field polynomial f is z^128 + p(z), where p =
// z^7+z^2+z+1.
//
// z^128 === -p(z) (mod (z^128 + p(z)))
//
// so, given that the product we're reducing is
// a == lo + hi * z^128
// substituting,
// === lo - hi * p(z) (mod (z^128 + p(z)))
//
// we reduce by multiplying hi by p(z) and subtracting the result
// from (i.e. XORing it with) lo. Because p has no nonzero high
// bits we can do this with two 64-bit multiplications, lo*p and
// hi*p.
pmull2(t0, T1Q, hi, p, T2D);
ext(t1, T16B, t0, vzr, 8);
eor(hi, T16B, hi, t1);
ext(t1, T16B, vzr, t0, 8);
eor(lo, T16B, lo, t1);
pmull(t0, T1Q, hi, p, T1D);
eor(result, T16B, lo, t0);
}
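For intuition, the same two-step folding can be modelled in scalar code. This hedged sketch (mine, not the commit's) reduces a carry-less product in GF(2^8), whose field polynomial z^8 + z^4 + z^3 + z + 1 plays the role of z^128 + p(z); the first fold can itself overflow the field width, which is exactly why ghash_reduce needs two PMULLs:

#include <cassert>
#include <cstdint>

static uint32_t clmul(uint32_t a, uint32_t b) { // carry-less multiply
  uint32_t r = 0;
  for (int i = 0; b >> i; i++)
    if (b & (1u << i)) r ^= a << i;
  return r;
}

// Reduce lo + hi*z^8 modulo f = z^8 + p with p = z^4+z^3+z+1 (0x1b).
// Since z^8 == p (mod f), fold hi*p into lo; the product hi*p can
// exceed 8 bits, so its own high byte is folded down a second time.
static uint8_t reduce(uint32_t prod) {
  const uint32_t p = 0x1b;
  uint32_t lo = prod & 0xff;
  uint32_t t = clmul(prod >> 8, p);   // first fold: hi * p
  lo ^= t & 0xff;
  lo ^= clmul(t >> 8, p);             // second fold fits in 8 bits
  return (uint8_t)lo;
}

int main() {
  // 0x53 and 0xca are multiplicative inverses in AES's GF(2^8).
  assert(reduce(clmul(0x53, 0xca)) == 0x01);
  return 0;
}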
class GHASHMultiplyGenerator: public KernelGenerator {
FloatRegister _result_lo, _result_hi, _b,
_a, _vzr, _a1_xor_a0, _p,
_tmp1, _tmp2, _tmp3;
public:
GHASHMultiplyGenerator(Assembler *as, int unrolls,
/* offsetted registers */
FloatRegister result_lo, FloatRegister result_hi,
FloatRegister b,
/* non-offsetted (shared) registers */
FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
/* offsetted (temp) registers */
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
: KernelGenerator(as, unrolls),
_result_lo(result_lo), _result_hi(result_hi), _b(b),
_a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
_tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }
int register_stride = 7;
virtual void generate(int index) {
// Karatsuba multiplication performs a 128*128 -> 256-bit
// multiplication in three 128-bit multiplications and a few
// additions.
//
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
//
// Inputs:
//
// A0 in a.d[0] (subkey)
// A1 in a.d[1]
// (A1+A0) in a1_xor_a0.d[0]
//
// B0 in b.d[0] (state)
// B1 in b.d[1]
switch (index) {
case 0: ext(_tmp1, T16B, _b, _b, 0x08); break;
case 1: pmull2(_result_hi, T1Q, _b, _a, T2D); // A1*B1
break;
case 2: eor(_tmp1, T16B, _tmp1, _b); // (B1+B0)
break;
case 3: pmull(_result_lo, T1Q, _b, _a, T1D); // A0*B0
break;
case 4: pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0)
break;
case 5: ext(_tmp1, T16B, _result_lo, _result_hi, 0x08); break;
case 6: eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0
break;
case 7: eor(_tmp2, T16B, _tmp2, _tmp1); break;
case 8: eor(_tmp2, T16B, _tmp2, _tmp3); break;
// Register pair <_result_hi:_result_lo> holds the _result of carry-less multiplication
case 9: ins(_result_hi, D, _tmp2, 0, 1); break;
case 10: ins(_result_lo, D, _tmp2, 1, 0); break;
default: ShouldNotReachHere();
}
}
virtual KernelGenerator *next() {
GHASHMultiplyGenerator *result = new GHASHMultiplyGenerator(*this);
result->_result_lo += register_stride;
result->_result_hi += register_stride;
result->_b += register_stride;
result->_tmp1 += register_stride;
result->_tmp2 += register_stride;
result->_tmp3 += register_stride;
return result;
}
virtual int length() { return 11; }
};
// Reduce the 128-bit product in hi:lo by the GCM field polynomial.
// The FloatRegister argument called data is optional: if it is a
// valid register, we interleave LD1 instructions with the
// reduction. This is to reduce latency next time around the loop.
class GHASHReduceGenerator: public KernelGenerator {
FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
int _once;
public:
GHASHReduceGenerator(Assembler *as, int unrolls,
/* offsetted registers */
FloatRegister result, FloatRegister lo, FloatRegister hi,
/* non-offsetted (shared) registers */
FloatRegister p, FloatRegister vzr, FloatRegister data,
/* offsetted (temp) registers */
FloatRegister t1)
: KernelGenerator(as, unrolls),
_result(result), _lo(lo), _hi(hi),
_p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
int register_stride = 7;
virtual void generate(int index) {
const FloatRegister t0 = _result;
switch (index) {
// The GCM field polynomial f is z^128 + p(z), where p =
// z^7+z^2+z+1.
//
// z^128 === -p(z) (mod (z^128 + p(z)))
//
// so, given that the product we're reducing is
// a == lo + hi * z^128
// substituting,
// === lo - hi * p(z) (mod (z^128 + p(z)))
//
// we reduce by multiplying hi by p(z) and subtracting the _result
// from (i.e. XORing it with) lo. Because p has no nonzero high
// bits we can do this with two 64-bit multiplications, lo*p and
// hi*p.
case 0: pmull2(t0, T1Q, _hi, _p, T2D); break;
case 1: ext(_t1, T16B, t0, _vzr, 8); break;
case 2: eor(_hi, T16B, _hi, _t1); break;
case 3: ext(_t1, T16B, _vzr, t0, 8); break;
case 4: eor(_lo, T16B, _lo, _t1); break;
case 5: pmull(t0, T1Q, _hi, _p, T1D); break;
case 6: eor(_result, T16B, _lo, t0); break;
default: ShouldNotReachHere();
}
// Sprinkle load instructions into the generated instructions
if (_data->is_valid() && _once) {
assert(length() >= unrolls(), "not enough room for interleaved loads");
if (index < unrolls()) {
ld1((_data + index*register_stride), T16B, post(r2, 0x10));
}
}
}
virtual KernelGenerator *next() {
GHASHReduceGenerator *result = new GHASHReduceGenerator(*this);
result->_result += register_stride;
result->_hi += register_stride;
result->_lo += register_stride;
result->_t1 += register_stride;
result->_once = false;
return result;
}
int length() { return 7; }
};
// Perform a GHASH multiply/reduce on a single FloatRegister.
void MacroAssembler::ghash_modmul(FloatRegister result,
FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
FloatRegister t1, FloatRegister t2, FloatRegister t3) {
ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);
ghash_reduce(result, result_lo, result_hi, p, vzr, t1);
}
// Interleaved GHASH processing.
//
// Clobbers all vector registers.
//
void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
Register subkeyH,
Register data, Register blocks, int unrolls) {
int register_stride = 7;
// Bafflingly, GCM uses little-endian for the byte order, but
// big-endian for the bit order. For example, the polynomial 1 is
// represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
//
// So, we must either reverse the bytes in each word and do
// everything big-endian or reverse the bits in each byte and do
// it little-endian. On AArch64 it's more idiomatic to reverse
// the bits in each byte (we have an instruction, RBIT, to do
that) and keep the data in little-endian bit order throughout the
// calculation, bit-reversing the inputs and outputs.
assert(unrolls * register_stride < 32, "out of registers");
FloatRegister a1_xor_a0 = v28;
FloatRegister Hprime = v29;
FloatRegister vzr = v30;
FloatRegister p = v31;
eor(vzr, T16B, vzr, vzr); // zero register
ldrq(p, field_polynomial); // The field polynomial
ldrq(v0, Address(state));
ldrq(Hprime, Address(subkeyH));
rev64(v0, T16B, v0); // Bit-reverse words in state and subkeyH
rbit(v0, T16B, v0);
rev64(Hprime, T16B, Hprime);
rbit(Hprime, T16B, Hprime);
// Powers of H -> Hprime
Label already_calculated, done;
{
// The first time around we'll have to calculate H**2, H**3, etc.
// Look at the largest power of H in the subkeyH array to see if
// it's already been calculated.
ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
orr(rscratch1, rscratch1, rscratch2);
cbnz(rscratch1, already_calculated);
orr(v6, T16B, Hprime, Hprime); // Start with H in v6 and Hprime
for (int i = 1; i < unrolls; i++) {
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6,
Hprime, vzr, a1_xor_a0, p,
/*temps*/v1, v3, v2);
rev64(v1, T16B, v6);
rbit(v1, T16B, v1);
strq(v1, Address(subkeyH, 16 * i));
}
b(done);
}
{
bind(already_calculated);
// Load the largest power of H we need into v6.
ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
rev64(v6, T16B, v6);
rbit(v6, T16B, v6);
}
bind(done);
orr(Hprime, T16B, v6, v6); // Move H ** unrolls into Hprime
// The subkeyH table now holds (H ** 1, H ** 2, ..., H ** unrolls).
// v0 contains the initial state. Clear the others.
for (int i = 1; i < unrolls; i++) {
int ofs = register_stride * i;
eor(ofs+v0, T16B, ofs+v0, ofs+v0); // zero each state register
}
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
// Load #unrolls blocks of data
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
ld1(v2+ofs, T16B, post(data, 0x10));
}
// Register assignments, replicated across 4 clones, v0 ... v27
//
// v0: input / output: current state, result of multiply/reduce
// v1: temp
// v2: input: one block of data (the ciphertext)
// also used as a temp once the data has been consumed
// v3: temp
// v4: output: high part of product
// v5: output: low part ...
// v6: unused
//
// Not replicated:
//
// v28: High part of H xor low part of H'
// v29: H' (hash subkey)
// v30: zero
// v31: Reduction polynomial of the Galois field
// Inner loop.
// Do the whole load/add/multiply/reduce over all our data except
// the last few blocks.
{
Label L_ghash_loop;
bind(L_ghash_loop);
// Prefetching doesn't help here. In fact, on Neoverse N1 it's worse.
// prfm(Address(data, 128), PLDL1KEEP);
// Xor data into current state
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
rbit((v2+ofs), T16B, (v2+ofs));
eor((v2+ofs), T16B, v0+ofs, (v2+ofs)); // bit-swapped data ^ bit-swapped state
}
// Generate fully-unrolled multiply-reduce in two stages.
GHASHMultiplyGenerator(this, unrolls,
/*result_lo*/v5, /*result_hi*/v4, /*data*/v2,
Hprime, a1_xor_a0, p, vzr,
/*temps*/v1, v3, /* reuse b*/v2) .unroll();
// NB: GHASHReduceGenerator also loads the next #unrolls blocks of
// data into v2, v2+ofs, ready for the next iteration.
GHASHReduceGenerator (this, unrolls,
/*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr,
/*data*/v2, /*temp*/v3) .unroll();
sub(blocks, blocks, unrolls);
cmp(blocks, (unsigned char)(unrolls * 2));
br(GE, L_ghash_loop);
}
// Merge the #unrolls states. Note that the data for the next
// iteration has already been loaded into v2, v2+ofs, etc...
// First, we multiply/reduce each clone by the appropriate power of H.
for (int i = 0; i < unrolls; i++) {
int ofs = register_stride * i;
ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1)));
rbit(v2+ofs, T16B, v2+ofs);
eor(v2+ofs, T16B, ofs+v0, v2+ofs); // bit-swapped data ^ bit-swapped state
rev64(Hprime, T16B, Hprime);
rbit(Hprime, T16B, Hprime);
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
ghash_modmul(/*result*/v0+ofs, /*result_lo*/v5+ofs, /*result_hi*/v4+ofs, /*b*/v2+ofs,
Hprime, vzr, a1_xor_a0, p,
/*temps*/v1+ofs, v3+ofs, /* reuse b*/v2+ofs);
}
// Then we sum the results.
for (int i = 0; i < unrolls - 1; i++) {
int ofs = register_stride * i;
eor(v0, T16B, v0, v0 + register_stride + ofs);
}
sub(blocks, blocks, (unsigned char)unrolls);
// And finally bit-reverse the state back to big endian.
rev64(v0, T16B, v0);
rbit(v0, T16B, v0);
st1(v0, T16B, state);
}
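The lane bookkeeping above is the subtle part, so here is a hedged, self-contained C++ model (mine, not from the commit) that checks the wide schedule against serial Horner evaluation, using GF(2^8) as a stand-in for the GF(2^128) field: every main-loop iteration multiplies each lane by H**n, and the merge multiplies lane j by H**(n-j) before summing:

#include <cassert>
#include <cstdint>

static uint32_t clmul(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  for (int i = 0; b >> i; i++)
    if (b & (1u << i)) r ^= a << i;
  return r;
}
static uint8_t gmul(uint8_t a, uint8_t b) { // multiply in GF(2^8)
  uint32_t t = clmul(a, b);
  uint32_t lo = t & 0xff;
  t = clmul(t >> 8, 0x1b);                  // fold mod z^8+z^4+z^3+z+1
  lo ^= t & 0xff;
  lo ^= clmul(t >> 8, 0x1b);
  return (uint8_t)lo;
}

int main() {
  const int n = 4, q = 5, M = n * q;           // n lanes, M blocks
  uint8_t H = 0x57, state0 = 0x42, blocks[M];  // arbitrary test data
  for (int k = 0; k < M; k++) blocks[k] = (uint8_t)(k * 37 + 1);

  uint8_t serial = state0;                     // Horner's rule
  for (int k = 0; k < M; k++) serial = gmul(serial ^ blocks[k], H);

  uint8_t Hpow[5];                             // Hpow[i] = H ** i
  Hpow[1] = H;
  for (int i = 2; i <= n; i++) Hpow[i] = gmul(Hpow[i-1], H);

  uint8_t lane[n] = { state0 };                // lane 0 carries the state
  for (int t = 0; t < q - 1; t++)              // main loop: * H ** n
    for (int j = 0; j < n; j++)
      lane[j] = gmul(lane[j] ^ blocks[t*n + j], Hpow[n]);
  uint8_t wide = 0;                            // merge: lane j * H ** (n-j)
  for (int j = 0; j < n; j++)
    wide ^= gmul(lane[j] ^ blocks[(q-1)*n + j], Hpow[n - j]);
  assert(wide == serial);
  return 0;
}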

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

@ -2560,8 +2560,6 @@ class StubGenerator: public StubCodeGenerator {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
- Label L_doLast;
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
@ -2572,75 +2570,8 @@ class StubGenerator: public StubCodeGenerator {
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
- __ ld1(v0, __ T16B, from); // get 16 bytes of input
- __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ rev32(v3, __ T16B, v3);
- __ rev32(v4, __ T16B, v4);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ aesmc(v0, v0);
- __ aese(v0, v3);
- __ aesmc(v0, v0);
- __ aese(v0, v4);
- __ aesmc(v0, v0);
- __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ rev32(v3, __ T16B, v3);
- __ rev32(v4, __ T16B, v4);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ aesmc(v0, v0);
- __ aese(v0, v3);
- __ aesmc(v0, v0);
- __ aese(v0, v4);
- __ aesmc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ cmpw(keylen, 44);
- __ br(Assembler::EQ, L_doLast);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ aesmc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ cmpw(keylen, 52);
- __ br(Assembler::EQ, L_doLast);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ aesmc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ BIND(L_doLast);
- __ aese(v0, v1);
- __ aesmc(v0, v0);
- __ aese(v0, v2);
- __ ld1(v1, __ T16B, key);
- __ rev32(v1, __ T16B, v1);
- __ eor(v0, __ T16B, v0, v1);
- __ st1(v0, __ T16B, to);
+ __ aesenc_loadkeys(key, keylen);
+ __ aesecb_encrypt(from, to, keylen);
__ mov(r0, 0);
@ -2673,76 +2604,7 @@ class StubGenerator: public StubCodeGenerator {
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
- __ ld1(v0, __ T16B, from); // get 16 bytes of input
- __ ld1(v5, __ T16B, __ post(key, 16));
- __ rev32(v5, __ T16B, v5);
- __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ rev32(v3, __ T16B, v3);
- __ rev32(v4, __ T16B, v4);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ aesimc(v0, v0);
- __ aesd(v0, v3);
- __ aesimc(v0, v0);
- __ aesd(v0, v4);
- __ aesimc(v0, v0);
- __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ rev32(v3, __ T16B, v3);
- __ rev32(v4, __ T16B, v4);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ aesimc(v0, v0);
- __ aesd(v0, v3);
- __ aesimc(v0, v0);
- __ aesd(v0, v4);
- __ aesimc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ cmpw(keylen, 44);
- __ br(Assembler::EQ, L_doLast);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ aesimc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ cmpw(keylen, 52);
- __ br(Assembler::EQ, L_doLast);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ aesimc(v0, v0);
- __ ld1(v1, v2, __ T16B, __ post(key, 32));
- __ rev32(v1, __ T16B, v1);
- __ rev32(v2, __ T16B, v2);
- __ BIND(L_doLast);
- __ aesd(v0, v1);
- __ aesimc(v0, v0);
- __ aesd(v0, v2);
- __ eor(v0, __ T16B, v0, v5);
- __ st1(v0, __ T16B, to);
+ __ aesecb_decrypt(from, to, key, keylen);
__ mov(r0, 0);
@ -2964,6 +2826,385 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// CTR AES crypt.
// Arguments:
//
// Inputs:
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - K (key) in little endian int array
// c_rarg3 - counter vector byte array address
// c_rarg4 - input length
// c_rarg5 - saved encryptedCounter start
// c_rarg6 - saved used length
//
// Output:
// r0 - input length
//
address generate_counterMode_AESCrypt() {
const Register in = c_rarg0;
const Register out = c_rarg1;
const Register key = c_rarg2;
const Register counter = c_rarg3;
const Register saved_len = c_rarg4, len = r10;
const Register saved_encrypted_ctr = c_rarg5;
const Register used_ptr = c_rarg6, used = r12;
const Register offset = r7;
const Register keylen = r11;
const unsigned char block_size = 16;
const int bulk_width = 4;
// NB: bulk_width can be 4 or 8. 8 gives slightly faster
// performance with larger data sizes, but it also means that the
// fast path isn't used until you have at least 8 blocks, and up
// to 127 bytes of data will be executed on the slow path. For
// that reason, and also so as not to blow away too much icache, 4
// blocks seems like a sensible compromise.
// Algorithm:
//
// if (len == 0) {
// goto DONE;
// }
// int result = len;
// do {
// if (used >= blockSize) {
// if (len >= bulk_width * blockSize) {
// CTR_large_block();
// if (len == 0)
// goto DONE;
// }
// for (;;) {
// 16ByteVector v0 = counter;
// embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
// used = 0;
// if (len < blockSize)
// break; /* goto NEXT */
// 16ByteVector v1 = load16Bytes(in, offset);
// v1 = v1 ^ encryptedCounter;
// store16Bytes(v1, out, offset);
// used = blockSize;
// offset += blockSize;
// len -= blockSize;
// if (len == 0)
// goto DONE;
// }
// }
// NEXT:
// out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
// len--;
// } while (len != 0);
// DONE:
// return result;
//
// CTR_large_block()
// Wide bulk encryption of whole blocks.
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
const address start = __ pc();
__ enter();
Label DONE, CTR_large_block, large_block_return;
__ ldrw(used, Address(used_ptr));
__ cbzw(saved_len, DONE);
__ mov(len, saved_len);
__ mov(offset, 0);
// Compute #rounds for AES based on the length of the key array
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ aesenc_loadkeys(key, keylen);
{
Label L_CTR_loop, NEXT;
__ bind(L_CTR_loop);
__ cmp(used, block_size);
__ br(__ LO, NEXT);
// Maybe we have a lot of data
__ subsw(rscratch1, len, bulk_width * block_size);
__ br(__ HS, CTR_large_block);
__ BIND(large_block_return);
__ cbzw(len, DONE);
// Setup the counter
__ movi(v4, __ T4S, 0);
__ movi(v5, __ T4S, 1);
__ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
__ ld1(v0, __ T16B, counter); // Load the counter into v0
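// Increment the 32-bit big-endian counter word: rev32 byte-swaps each
// 32-bit lane into little-endian order, the vector add bumps the last
// lane by one, and the second rev32 swaps the result back.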
__ rev32(v16, __ T16B, v0);
__ addv(v16, __ T4S, v16, v4);
__ rev32(v16, __ T16B, v16);
__ st1(v16, __ T16B, counter); // Save the incremented counter back
{
// We have fewer than bulk_width blocks of data left. Encrypt
// them one by one until there is less than a full block
// remaining, being careful to save both the encrypted counter
// and the counter.
Label inner_loop;
__ bind(inner_loop);
// Counter to encrypt is in v0
__ aesecb_encrypt(noreg, noreg, keylen);
__ st1(v0, __ T16B, saved_encrypted_ctr);
// Do we have a remaining full block?
__ mov(used, 0);
__ cmp(len, block_size);
__ br(__ LO, NEXT);
// Yes, we have a full block
__ ldrq(v1, Address(in, offset));
__ eor(v1, __ T16B, v1, v0);
__ strq(v1, Address(out, offset));
__ mov(used, block_size);
__ add(offset, offset, block_size);
__ subw(len, len, block_size);
__ cbzw(len, DONE);
// Increment the counter, store it back
__ orr(v0, __ T16B, v16, v16);
__ rev32(v16, __ T16B, v16);
__ addv(v16, __ T4S, v16, v4);
__ rev32(v16, __ T16B, v16);
__ st1(v16, __ T16B, counter); // Save the incremented counter back
__ b(inner_loop);
}
__ BIND(NEXT);
// Encrypt a single byte, and loop.
// We expect this to be a rare event.
__ ldrb(rscratch1, Address(in, offset));
__ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
__ eor(rscratch1, rscratch1, rscratch2);
__ strb(rscratch1, Address(out, offset));
__ add(offset, offset, 1);
__ add(used, used, 1);
__ subw(len, len, 1);
__ cbnzw(len, L_CTR_loop);
}
__ bind(DONE);
__ strw(used, Address(used_ptr));
__ mov(r0, saved_len);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
// Bulk encryption
__ BIND (CTR_large_block);
assert(bulk_width == 4 || bulk_width == 8, "must be");
if (bulk_width == 8) {
__ sub(sp, sp, 4 * 16);
__ st1(v12, v13, v14, v15, __ T16B, Address(sp));
}
__ sub(sp, sp, 4 * 16);
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
RegSet saved_regs = (RegSet::of(in, out, offset)
+ RegSet::of(saved_encrypted_ctr, used_ptr, len));
__ push(saved_regs, sp);
__ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
__ add(in, in, offset);
__ add(out, out, offset);
// Keys should already be loaded into the correct registers
__ ld1(v0, __ T16B, counter); // v0 contains the first counter
__ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
// AES/CTR loop
{
Label L_CTR_loop;
__ BIND(L_CTR_loop);
// Setup the counters
__ movi(v8, __ T4S, 0);
__ movi(v9, __ T4S, 1);
__ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
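// Materialize bulk_width consecutive counter values in v0 ... v(bulk_width-1),
// stepping the byte-reversed counter in v16 once per block.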
for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
__ rev32(f, __ T16B, v16);
__ addv(v16, __ T4S, v16, v8);
}
__ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
// Encrypt the counters
__ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
if (bulk_width == 8) {
__ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
}
// XOR the encrypted counters with the inputs
for (int i = 0; i < bulk_width; i++) {
__ eor(v0 + i, __ T16B, v0 + i, v8 + i);
}
// Write the encrypted data
__ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
if (bulk_width == 8) {
__ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
}
__ subw(len, len, 16 * bulk_width);
__ cbnzw(len, L_CTR_loop);
}
// Save the counter back where it goes
__ rev32(v16, __ T16B, v16);
__ st1(v16, __ T16B, counter);
__ pop(saved_regs, sp);
__ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
if (bulk_width == 8) {
__ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
}
__ andr(rscratch1, len, -16 * bulk_width);
__ sub(len, len, rscratch1);
__ add(offset, offset, rscratch1);
__ mov(used, 16);
__ strw(used, Address(used_ptr));
__ b(large_block_return);
return start;
}
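The pseudocode at the top of generate_counterMode_AESCrypt compresses to a small reference model. A hedged C++ sketch of the byte-level semantics (mine, not HotSpot code; the stub's single-block and bulk paths are optimizations with the same effect, and encryptBlock stands in for one AES block encryption):

#include <cstddef>
#include <cstdint>

typedef void (*BlockCipher)(const uint8_t in[16], uint8_t out[16]);

static void increment_counter(uint8_t ctr[16]) {  // big-endian ++
  for (int i = 15; i >= 0 && ++ctr[i] == 0; i--) { }
}

// 'used' counts bytes already consumed from the last keystream block,
// so encryption can resume mid-block across calls.
size_t ctr_crypt(BlockCipher encryptBlock, uint8_t counter[16],
                 uint8_t encryptedCounter[16], unsigned *used,
                 const uint8_t *in, uint8_t *out, size_t len) {
  size_t result = len;
  for (size_t offset = 0; offset < len; offset++) {
    if (*used >= 16) {                            // need fresh keystream
      encryptBlock(counter, encryptedCounter);
      increment_counter(counter);
      *used = 0;
    }
    out[offset] = in[offset] ^ encryptedCounter[(*used)++];
  }
  return result;
}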
// Vector AES Galois Counter Mode implementation. Parameters:
//
// in = c_rarg0
// len = c_rarg1
// ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
// out = c_rarg3
// key = c_rarg4
// state = c_rarg5 - GHASH.state
// subkeyHtbl = c_rarg6 - powers of H
// counter = c_rarg7 - pointer to 16 bytes of CTR
// return - number of processed bytes
address generate_galoisCounterMode_AESCrypt() {
address ghash_polynomial = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
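// (0x87 == 0b10000111: bits 7, 2, 1 and 0 set, i.e. z^7+z^2+z+1.)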
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
address start = __ pc();
const Register in = c_rarg0;
const Register len = c_rarg1;
const Register ct = c_rarg2;
const Register out = c_rarg3;
const Register key = c_rarg4;
const Register state = c_rarg5;
const Register subkeyHtbl = c_rarg6;
const Register counter = c_rarg7; // pointer to 16 bytes of CTR,
// updated with the incremented counter at the end
const Register keylen = r10;
__ enter();
// Save state before entering routine
__ sub(sp, sp, 4 * 16);
__ st1(v12, v13, v14, v15, __ T16B, Address(sp));
__ sub(sp, sp, 4 * 16);
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
// __ andr(len, len, -512);
__ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
__ str(len, __ pre(sp, -2 * wordSize));
Label DONE;
__ cbz(len, DONE);
// Compute #rounds for AES based on the length of the key array
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ aesenc_loadkeys(key, keylen);
__ ld1(v0, __ T16B, counter); // v0 contains the first counter
__ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
// AES/CTR loop
{
Label L_CTR_loop;
__ BIND(L_CTR_loop);
// Setup the counters
__ movi(v8, __ T4S, 0);
__ movi(v9, __ T4S, 1);
__ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
for (FloatRegister f = v0; f < v8; f++) {
__ rev32(f, __ T16B, v16);
__ addv(v16, __ T4S, v16, v8);
}
__ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
// Encrypt the counters
__ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
__ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
// XOR the encrypted counters with the inputs
for (int i = 0; i < 8; i++) {
__ eor(v0 + i, __ T16B, v0 + i, v8 + i);
}
__ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
__ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
__ subw(len, len, 16 * 8);
__ cbnzw(len, L_CTR_loop);
}
__ rev32(v16, __ T16B, v16);
__ st1(v16, __ T16B, counter);
__ ldr(len, Address(sp));
__ lsr(len, len, exact_log2(16)); // We want the count of blocks
// GHASH/CTR loop
__ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
len, /*unrolls*/4);
#ifdef ASSERT
{ Label L;
__ cmp(len, (unsigned char)0);
__ br(Assembler::EQ, L);
__ stop("stubGenerator: abort");
__ bind(L);
}
#endif
__ bind(DONE);
// Return the number of bytes processed
__ ldr(r0, __ post(sp, 2 * wordSize));
__ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
__ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
return start;
}
// Arguments:
//
// Inputs:
@ -4227,69 +4468,6 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
- void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
- FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
- FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
- // Karatsuba multiplication performs a 128*128 -> 256-bit
- // multiplication in three 128-bit multiplications and a few
- // additions.
- //
- // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
- // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
- //
- // Inputs:
- //
- // A0 in a.d[0] (subkey)
- // A1 in a.d[1]
- // (A1+A0) in a1_xor_a0.d[0]
- //
- // B0 in b.d[0] (state)
- // B1 in b.d[1]
- __ ext(tmp1, __ T16B, b, b, 0x08);
- __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
- __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
- __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
- __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
- __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
- __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
- __ eor(tmp2, __ T16B, tmp2, tmp4);
- __ eor(tmp2, __ T16B, tmp2, tmp3);
- // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
- __ ins(result_hi, __ D, tmp2, 0, 1);
- __ ins(result_lo, __ D, tmp2, 1, 0);
- }
- void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
- FloatRegister p, FloatRegister z, FloatRegister t1) {
- const FloatRegister t0 = result;
- // The GCM field polynomial f is z^128 + p(z), where p =
- // z^7+z^2+z+1.
- //
- // z^128 === -p(z) (mod (z^128 + p(z)))
- //
- // so, given that the product we're reducing is
- // a == lo + hi * z^128
- // substituting,
- // === lo - hi * p(z) (mod (z^128 + p(z)))
- //
- // we reduce by multiplying hi by p(z) and subtracting the result
- // from (i.e. XORing it with) lo. Because p has no nonzero high
- // bits we can do this with two 64-bit multiplications, lo*p and
- // hi*p.
- __ pmull2(t0, __ T1Q, hi, p, __ T2D);
- __ ext(t1, __ T16B, t0, z, 8);
- __ eor(hi, __ T16B, hi, t1);
- __ ext(t1, __ T16B, z, t0, 8);
- __ eor(lo, __ T16B, lo, t1);
- __ pmull(t0, __ T1Q, hi, p, __ T1D);
- __ eor(result, __ T16B, lo, t0);
- }
address generate_has_negatives(address &has_negatives_long) {
const u1 large_loop_size = 64;
const uint64_t UPPER_BIT_MASK=0x8080808080808080;
@ -5387,6 +5565,8 @@ class StubGenerator: public StubCodeGenerator {
FloatRegister vzr = v30;
__ eor(vzr, __ T16B, vzr, vzr); // zero register
__ ldrq(v24, p); // The field polynomial
__ ldrq(v0, Address(state));
__ ldrq(v1, Address(subkeyH));
@ -5395,10 +5575,8 @@ class StubGenerator: public StubCodeGenerator {
__ rev64(v1, __ T16B, v1);
__ rbit(v1, __ T16B, v1);
- __ ldrq(v26, p);
- __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
- __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+ __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
+ __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
{
Label L_ghash_loop;
@ -5410,21 +5588,70 @@ class StubGenerator: public StubCodeGenerator {
__ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
// Multiply state in v2 by subkey in v1
- ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
- /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
- /*temps*/v6, v20, v18, v21);
+ __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
+ /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
+ /*temps*/v6, v3, /*reuse/clobber b*/v2);
// Reduce v7:v5 by the field polynomial
- ghash_reduce(v0, v5, v7, v26, vzr, v20);
+ __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
__ sub(blocks, blocks, 1);
__ cbnz(blocks, L_ghash_loop);
}
// The bit-reversed result is at this point in v0
- __ rev64(v1, __ T16B, v0);
- __ rbit(v1, __ T16B, v1);
+ __ rev64(v0, __ T16B, v0);
+ __ rbit(v0, __ T16B, v0);
__ st1(v0, __ T16B, state);
__ ret(lr);
return start;
}
address generate_ghash_processBlocks_wide() {
address small = generate_ghash_processBlocks();
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
__ align(wordSize * 2);
address p = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
__ align(CodeEntryAlignment);
address start = __ pc();
Register state = c_rarg0;
Register subkeyH = c_rarg1;
Register data = c_rarg2;
Register blocks = c_rarg3;
const int unroll = 4;
__ cmp(blocks, (unsigned char)(unroll * 2));
__ br(__ LT, small);
if (unroll > 1) {
// Save state before entering routine
__ sub(sp, sp, 4 * 16);
__ st1(v12, v13, v14, v15, __ T16B, Address(sp));
__ sub(sp, sp, 4 * 16);
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
}
__ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
if (unroll > 1) {
// And restore state
__ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
__ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
}
__ cmp(blocks, (unsigned char)0);
__ br(__ GT, small);
__ st1(v1, __ T16B, state);
__ ret(lr);
return start;
@ -7131,7 +7358,8 @@ class StubGenerator: public StubCodeGenerator {
// generate GHASH intrinsics code
if (UseGHASHIntrinsics) {
- StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
}
if (UseBASE64Intrinsics) {
@ -7148,6 +7376,8 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
}
if (UseSHA1Intrinsics) {

src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp

@ -36,7 +36,7 @@ static bool returns_to_call_stub(address return_pc) {
enum platform_dependent_constants {
code_size1 = 19000, // simply increase if too small (assembler will crash if too small)
- code_size2 = 28000 // simply increase if too small (assembler will crash if too small)
+ code_size2 = 38000 // simply increase if too small (assembler will crash if too small)
};
class aarch64 {

src/hotspot/cpu/aarch64/vm_version_aarch64.cpp

@ -237,6 +237,9 @@ void VM_Version::initialize() {
warning("UseAESIntrinsics enabled, but UseAES not, enabling");
UseAES = true;
}
if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
FLAG_SET_DEFAULT(UseAESCTRIntrinsics, true);
}
} else {
if (UseAES) {
warning("AES instructions are not available on this CPU");
@ -246,12 +249,12 @@ void VM_Version::initialize() {
warning("AES intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ if (UseAESCTRIntrinsics) {
+ warning("AES/CTR intrinsics are not available on this CPU");
+ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+ }
}
- if (UseAESCTRIntrinsics) {
- warning("AES/CTR intrinsics are not available on this CPU");
- FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
- }
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
UseCRC32Intrinsics = true;

src/hotspot/os_cpu/bsd_aarch64/vm_version_bsd_aarch64.cpp

@ -60,6 +60,9 @@ void VM_Version::get_os_cpu_info() {
assert(cpu_has("hw.optional.neon"), "should be");
_features = CPU_FP | CPU_ASIMD;
// All Apple-darwin Arm processors have AES.
_features |= CPU_AES;
// Only a few features are available via sysctl, see line 614
// https://opensource.apple.com/source/xnu/xnu-6153.141.1/bsd/kern/kern_mib.c.auto.html
if (cpu_has("hw.optional.armv8_crc32")) _features |= CPU_CRC32;
@ -88,6 +91,7 @@ void VM_Version::get_os_cpu_info() {
if (sysctlbyname("hw.cpufamily", &family, &sysctllen, NULL, 0)) {
family = 0;
}
_model = family;
_cpu = CPU_APPLE;
}