8131062: aarch64: add support for GHASH acceleration
Add support for GHASH using pmull Reviewed-by: kvn, goetz, aph
This commit is contained in:
parent
b6cfe54a64
commit
f3d31d3866
@ -1896,7 +1896,7 @@ public:
|
||||
public:
|
||||
|
||||
// SIMD arrangement specifiers passed to the vector instruction encoders.
// Enumerators take consecutive values starting at 0, and the encoders in this
// listing rely on that: (T & 1) is written to bit 30 and (T >> 1) is used as a
// two-bit size field.
// NOTE(review): two enumerator lists appear below; this looks like the earlier
// list followed by its replacement, which appends T1Q -- confirm which is live.
enum SIMD_Arrangement {
|
||||
T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D
|
||||
T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D, T1Q
|
||||
};
|
||||
|
||||
enum SIMD_RegVariant {
|
||||
@ -2225,14 +2225,16 @@ public:
|
||||
f(0b001111, 15, 10), rf(Vn, 5), rf(Xd, 0);
|
||||
}
|
||||
|
||||
// Polynomial multiply long. (Tb & 1) is written to bit 30; Vm, Vn and Vd are
// placed at bit offsets 16, 5 and 0.
// NOTE(review): two assert/encoding groups appear in this body. The first accepts
// only Ta == T8H (the remark "We do not handle the 1Q arrangement" described that
// form). The second also accepts Ta == T1Q with Tb == T1D or T2D and writes 0b11
// into the size field (bits 23:22) in that case, 0b00 otherwise. Presumably the
// second group replaces the first -- confirm against the checked-in file.
|
||||
void pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
|
||||
starti;
|
||||
assert(Ta == T8H && (Tb == T8B || Tb == T16B), "Invalid Size specifier");
|
||||
f(0, 31), f(Tb & 1, 30), f(0b001110001, 29, 21), rf(Vm, 16), f(0b111000, 15, 10);
|
||||
rf(Vn, 5), rf(Vd, 0);
|
||||
assert((Ta == T1Q && (Tb == T1D || Tb == T2D)) ||
|
||||
(Ta == T8H && (Tb == T8B || Tb == T16B)), "Invalid Size specifier");
|
||||
int size = (Ta == T1Q) ? 0b11 : 0b00;
|
||||
f(0, 31), f(Tb & 1, 30), f(0b001110, 29, 24), f(size, 23, 22);
|
||||
f(1, 21), rf(Vm, 16), f(0b111000, 15, 10), rf(Vn, 5), rf(Vd, 0);
|
||||
}
|
||||
// pmull2: checks that Tb is T2D or T16B and then emits through pmull() with the
// same operands. Both permitted Tb values are odd enumerators of
// SIMD_Arrangement, so pmull()'s f(Tb & 1, 30) sets bit 30; that bit is what
// distinguishes this form from the T1D / T8B forms.
void pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
|
||||
assert(Tb == T2D || Tb == T16B, "pmull2 assumes T2D or T16B as the second size specifier");
|
||||
pmull(Vd, Ta, Vn, Vm, Tb);
|
||||
}
|
||||
|
||||
@ -2245,15 +2247,6 @@ public:
|
||||
f(0b100001010010, 21, 10), rf(Vn, 5), rf(Vd, 0);
|
||||
}
|
||||
|
||||
// Emits rev32 for a byte or halfword arrangement (T8B, T16B, T4H or T8H).
// (T & 1) is written to bit 30; the size field (bits 23:22) is 0b00 for the
// byte arrangements and 0b01 for the halfword ones.
// NOTE(review): an INSN(rev32, 1, 0b00000) instantiation with the same signature
// appears later in this listing; presumably it supersedes this hand-written
// definition -- confirm.
void rev32(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn)
|
||||
{
|
||||
starti;
|
||||
assert(T <= T8H, "must be one of T8B, T16B, T4H, T8H");
|
||||
f(0, 31), f((int)T & 1, 30), f(0b101110, 29, 24);
|
||||
f(T <= T16B ? 0b00 : 0b01, 23, 22), f(0b100000000010, 21, 10);
|
||||
rf(Vn, 5), rf(Vd, 0);
|
||||
}
|
||||
|
||||
void dup(FloatRegister Vd, SIMD_Arrangement T, Register Xs)
|
||||
{
|
||||
starti;
|
||||
@ -2290,6 +2283,57 @@ public:
|
||||
|
||||
#undef INSN
|
||||
|
||||
// Table vector lookup
// INSN(NAME, op) defines NAME(Vd, T, Vn, registers, Vm):
//   T         - must be T8B or T16B; (T & 1) is written to bit 30
//   Vn        - placed at bit offset 5
//   registers - must be 1..4; encoded as (registers - 1) in bits 14:13
//               (presumably the length of the table register list that
//               starts at Vn -- confirm)
//   Vm        - placed at bit offset 16
//   op        - written to bit 12: 0 for tbl, 1 for tbx
|
||||
#define INSN(NAME, op) \
|
||||
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, unsigned registers, FloatRegister Vm) { \
|
||||
starti; \
|
||||
assert(T == T8B || T == T16B, "invalid arrangement"); \
|
||||
assert(0 < registers && registers <= 4, "invalid number of registers"); \
|
||||
f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21), rf(Vm, 16), f(0, 15); \
|
||||
f(registers - 1, 14, 13), f(op, 12),f(0b00, 11, 10), rf(Vn, 5), rf(Vd, 0); \
|
||||
}
|
||||
|
||||
INSN(tbl, 0);
|
||||
INSN(tbx, 1);
|
||||
|
||||
#undef INSN
|
||||
|
||||
// Element-reversal instructions (rev64, rev32, rev16).
// INSN(NAME, U, opcode) defines NAME(Vd, T, Vn):
//   (T & 1) -> bit 30, U -> bit 29, (T >> 1) -> size field (bits 23:22),
//   opcode -> bits 16:12, Vn at bit offset 5, Vd at bit offset 0.
// The generated assert refers to ASSERTION and MSG, which are NOT macro
// parameters: both must be #defined at the point of each instantiation.
// ASSERTION is redefined before every INSN(...) to list the arrangements that
// particular instruction accepts; MSG is shared by all three.
#define INSN(NAME, U, opcode) \
|
||||
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) { \
|
||||
starti; \
|
||||
assert((ASSERTION), MSG); \
|
||||
f(0, 31), f((int)T & 1, 30), f(U, 29), f(0b01110, 28, 24); \
|
||||
f((int)(T >> 1), 23, 22), f(0b10000, 21, 17), f(opcode, 16, 12); \
|
||||
f(0b10, 11, 10), rf(Vn, 5), rf(Vd, 0); \
|
||||
}
|
||||
|
||||
#define MSG "invalid arrangement"
|
||||
|
||||
// rev64: byte, halfword and word arrangements.
#define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H || T == T2S || T == T4S)
|
||||
INSN(rev64, 0, 0b00000);
|
||||
#undef ASSERTION
|
||||
|
||||
// rev32: byte and halfword arrangements.
#define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H)
|
||||
INSN(rev32, 1, 0b00000);
|
||||
#undef ASSERTION
|
||||
|
||||
// rev16: byte arrangements only.
#define ASSERTION (T == T8B || T == T16B)
|
||||
INSN(rev16, 0, 0b00001);
|
||||
#undef ASSERTION
|
||||
|
||||
#undef MSG
|
||||
|
||||
#undef INSN
|
||||
|
||||
// Emits ext (vector extract) for arrangement T8B or T16B.
//   index - written to bits 14:11; must be <= 7 for T8B and <= 15 for T16B
//   Vm, Vn, Vd - placed at bit offsets 16, 5 and 0
// (T & 1) is written to bit 30.
// NOTE(review): only the upper bound of index is asserted here; a negative
// index is presumably rejected inside f() -- confirm.
void ext(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index)
|
||||
{
|
||||
starti;
|
||||
assert(T == T8B || T == T16B, "invalid arrangement");
|
||||
assert((T == T8B && index <= 0b0111) || (T == T16B && index <= 0b1111), "Invalid index value");
|
||||
f(0, 31), f((int)T & 1, 30), f(0b101110000, 29, 21);
|
||||
rf(Vm, 16), f(0, 15), f(index, 14, 11);
|
||||
f(0, 10), rf(Vn, 5), rf(Vd, 0);
|
||||
}
|
||||
|
||||
/* Simulator extensions to the ISA
|
||||
|
||||
|
@ -2435,6 +2435,137 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
* Generates the ghash_processBlocks stub. Each loop iteration loads one 16-byte
* block from the data address (post-incrementing it by 0x10), xors it into the
* running state, multiplies by the H key with pmull/pmull2 and folds the
* product back to 128 bits with a sequence of shifts and xors.
*
* NOTE(review): the loop body executes before the block count is tested, so
* the count is assumed to be >= 1 -- confirm that every caller guarantees this.
|
||||
* Arguments:
|
||||
*
|
||||
* Input:
|
||||
* c_rarg0 - current state address
|
||||
* c_rarg1 - H key address
|
||||
* c_rarg2 - data address
|
||||
* c_rarg3 - number of blocks
|
||||
*
|
||||
* Output:
|
||||
* Updated state at c_rarg0
|
||||
*/
|
||||
address generate_ghash_processBlocks() {
|
||||
__ align(CodeEntryAlignment);
|
||||
// NOTE(review): L_exit is declared but never bound or branched to in this stub.
Label L_ghash_loop, L_exit;
|
||||
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
|
||||
address start = __ pc();
|
||||
|
||||
Register state = c_rarg0;
|
||||
Register subkeyH = c_rarg1;
|
||||
Register data = c_rarg2;
|
||||
Register blocks = c_rarg3;
|
||||
|
||||
FloatRegister vzr = v30;
|
||||
__ eor(vzr, __ T16B, vzr, vzr); // zero register
|
||||
|
||||
// Shift counts (1, 63, 62, 57) replicated into every byte of v26..v29; these
// registers are the count operands of the ushl instructions below.
__ mov(v26, __ T16B, 1);
|
||||
__ mov(v27, __ T16B, 63);
|
||||
__ mov(v28, __ T16B, 62);
|
||||
__ mov(v29, __ T16B, 57);
|
||||
|
||||
__ ldrq(v6, Address(state));
|
||||
__ ldrq(v16, Address(subkeyH));
|
||||
|
||||
// ext #8 of a register with itself exchanges its two 64-bit halves:
// v0 = state with halves exchanged, v1 = H with halves exchanged.
// v16 = H ^ v1 is the (B1 + B0) operand of the third multiply in the loop.
__ ext(v0, __ T16B, v6, v6, 0x08);
|
||||
__ ext(v1, __ T16B, v16, v16, 0x08);
|
||||
__ eor(v16, __ T16B, v16, v1);
|
||||
|
||||
__ bind(L_ghash_loop);
|
||||
|
||||
__ ldrq(v2, Address(__ post(data, 0x10)));
|
||||
__ rev64(v2, __ T16B, v2); // swap data
|
||||
|
||||
__ ext(v6, __ T16B, v0, v0, 0x08);
|
||||
__ eor(v6, __ T16B, v6, v2);
|
||||
__ ext(v2, __ T16B, v6, v6, 0x08);
|
||||
|
||||
__ pmull2(v7, __ T1Q, v2, v1, __ T2D); // A1*B1
|
||||
__ eor(v6, __ T16B, v6, v2);
|
||||
__ pmull(v5, __ T1Q, v2, v1, __ T1D); // A0*B0
|
||||
__ pmull(v20, __ T1Q, v6, v16, __ T1D); // (A1 + A0)(B1 + B0)
|
||||
|
||||
__ ext(v21, __ T16B, v5, v7, 0x08);
|
||||
__ eor(v18, __ T16B, v7, v5); // A1*B1 xor A0*B0
|
||||
__ eor(v20, __ T16B, v20, v21);
|
||||
__ eor(v20, __ T16B, v20, v18);
|
||||
|
||||
// Registers pair <v7:v5> holds the result of carry-less multiplication
|
||||
__ ins(v7, __ D, v20, 0, 1);
|
||||
__ ins(v5, __ D, v20, 1, 0);
|
||||
|
||||
// Result of the multiplication is shifted by one bit position
|
||||
// [X3:X2:X1:X0] = [X3:X2:X1:X0] << 1
|
||||
// NOTE(review): the ushr immediates in this stub are written as (-N & 63).
// Judging by the ">> 1", ">> 2" and ">> 7" comments further down, this stands
// for a logical right shift by N, presumably because ushr() takes the encoded
// immediate rather than the shift amount -- confirm against ushr().
__ ushr(v18, __ T2D, v5, -63 & 63);
|
||||
__ ins(v25, __ D, v18, 1, 0);
|
||||
__ ins(v25, __ D, vzr, 0, 0);
|
||||
__ ushl(v5, __ T2D, v5, v26);
|
||||
__ orr(v5, __ T16B, v5, v25);
|
||||
|
||||
__ ushr(v19, __ T2D, v7, -63 & 63);
|
||||
__ ins(v19, __ D, v19, 1, 0);
|
||||
__ ins(v19, __ D, v18, 0, 1);
|
||||
__ ushl(v7, __ T2D, v7, v26);
|
||||
__ orr(v6, __ T16B, v7, v19);
|
||||
|
||||
__ ins(v24, __ D, v5, 0, 1);
|
||||
|
||||
// A = X0 << 63
|
||||
__ ushl(v21, __ T2D, v5, v27);
|
||||
|
||||
// B = X0 << 62
|
||||
__ ushl(v22, __ T2D, v5, v28);
|
||||
|
||||
// C = X0 << 57
|
||||
__ ushl(v23, __ T2D, v5, v29);
|
||||
|
||||
// D = X1^A^B^C
|
||||
__ eor(v21, __ T16B, v21, v22);
|
||||
__ eor(v21, __ T16B, v21, v23);
|
||||
__ eor(v21, __ T16B, v21, v24);
|
||||
__ ins(v5, __ D, v21, 1, 0);
|
||||
|
||||
// [E1:E0] = [D:X0] >> 1
|
||||
__ ushr(v20, __ T2D, v5, -1 & 63);
|
||||
__ ushl(v18, __ T2D, v5, v27);
|
||||
__ ext(v25, __ T16B, v18, vzr, 0x08);
|
||||
__ orr(v19, __ T16B, v20, v25);
|
||||
|
||||
__ eor(v7, __ T16B, v5, v19);
|
||||
|
||||
// [F1:F0] = [D:X0] >> 2
|
||||
__ ushr(v20, __ T2D, v5, -2 & 63);
|
||||
__ ushl(v18, __ T2D, v5, v28);
|
||||
__ ins(v25, __ D, v18, 0, 1);
|
||||
__ orr(v19, __ T16B, v20, v25);
|
||||
|
||||
__ eor(v7, __ T16B, v7, v19);
|
||||
|
||||
// [G1:G0] = [D:X0] >> 7
|
||||
__ ushr(v20, __ T2D, v5, -7 & 63);
|
||||
__ ushl(v18, __ T2D, v5, v29);
|
||||
__ ins(v25, __ D, v18, 0, 1);
|
||||
__ orr(v19, __ T16B, v20, v25);
|
||||
|
||||
// [H1:H0] = [D^E1^F1^G1:X0^E0^F0^G0]
|
||||
__ eor(v7, __ T16B, v7, v19);
|
||||
|
||||
// Result = [H1:H0]^[X3:X2]
|
||||
__ eor(v0, __ T16B, v7, v6);
|
||||
|
||||
// Count down and loop while blocks remain. The loop exit is decided by
// cbnz on the register value.
__ subs(blocks, blocks, 1);
|
||||
__ cbnz(blocks, L_ghash_loop);
|
||||
|
||||
// Undo the half exchange applied to the state on entry and store it back
// to the state address.
__ ext(v1, __ T16B, v0, v0, 0x08);
|
||||
__ st1(v1, __ T16B, state);
|
||||
__ ret(lr);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
// Continuation point for throwing of implicit exceptions that are
|
||||
// not handled in the current activation. Fabricates an exception
|
||||
// oop and initiates normal exception dispatching in this
|
||||
@ -3438,6 +3569,11 @@ class StubGenerator: public StubCodeGenerator {
|
||||
}
|
||||
|
||||
#ifndef BUILTIN_SIM
|
||||
// generate GHASH intrinsics code
|
||||
if (UseGHASHIntrinsics) {
|
||||
StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
|
||||
}
|
||||
|
||||
if (UseAESIntrinsics) {
|
||||
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
|
||||
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
|
||||
|
@ -45,6 +45,10 @@
|
||||
#define HWCAP_AES (1<<3)
|
||||
#endif
|
||||
|
||||
// Fallback definition for build environments whose headers do not provide
// HWCAP_PMULL. VM_Version::get_processor_features() tests this bit against
// auxv to decide whether the GHASH intrinsics can be enabled.
#ifndef HWCAP_PMULL
|
||||
#define HWCAP_PMULL (1<<4)
|
||||
#endif
|
||||
|
||||
#ifndef HWCAP_SHA1
|
||||
#define HWCAP_SHA1 (1<<5)
|
||||
#endif
|
||||
@ -190,11 +194,6 @@ void VM_Version::get_processor_features() {
|
||||
}
|
||||
}
|
||||
|
||||
if (UseGHASHIntrinsics) {
|
||||
warning("GHASH intrinsics are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
|
||||
UseCRC32Intrinsics = true;
|
||||
}
|
||||
@ -244,6 +243,15 @@ void VM_Version::get_processor_features() {
|
||||
FLAG_SET_DEFAULT(UseSHA, false);
|
||||
}
|
||||
|
||||
if (auxv & HWCAP_PMULL) {
|
||||
if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
|
||||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, true);
|
||||
}
|
||||
} else if (UseGHASHIntrinsics) {
|
||||
warning("GHASH intrinsics are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
// This machine allows unaligned memory accesses
|
||||
if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) {
|
||||
FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
|
||||
|
Loading…
x
Reference in New Issue
Block a user