8247645: ChaCha20 intrinsics
Reviewed-by: sviswanathan, ngasson, vlivanov, ascarpino
This commit is contained in:
parent
33587ffd35
commit
cd6bebbf34
@ -2322,6 +2322,40 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// Single-structure load/store method (all addressing variants)
|
||||
void ld_st(FloatRegister Vt, SIMD_RegVariant T, int index, Address a,
|
||||
int op1, int op2, int regs) {
|
||||
int expectedImmediate = (regVariant_to_elemBits(T) >> 3) * regs;
|
||||
int sVal = (T < D) ? (index >> (2 - T)) & 0x01 : 0;
|
||||
int opcode = (T < D) ? (T << 2) : ((T & 0x02) << 2);
|
||||
int size = (T < D) ? (index & (0x3 << T)) : 1; // only care about low 2b
|
||||
Register Xn = a.base();
|
||||
int Rm;
|
||||
|
||||
switch (a.getMode()) {
|
||||
case Address::base_plus_offset:
|
||||
guarantee(a.offset() == 0, "no offset allowed here");
|
||||
Rm = 0;
|
||||
break;
|
||||
case Address::post:
|
||||
guarantee(a.offset() == expectedImmediate, "bad offset");
|
||||
op1 |= 0b100;
|
||||
Rm = 0b11111;
|
||||
break;
|
||||
case Address::post_reg:
|
||||
op1 |= 0b100;
|
||||
Rm = a.index()->encoding();
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
|
||||
starti;
|
||||
f(0,31), f((index >> (3 - T)), 30);
|
||||
f(op1, 29, 21), f(Rm, 20, 16), f(op2 | opcode | sVal, 15, 12);
|
||||
f(size, 11, 10), srf(Xn, 5), rf(Vt, 0);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
#define INSN1(NAME, op1, op2) \
|
||||
@ -2379,6 +2413,66 @@ public:
|
||||
#undef INSN3
|
||||
#undef INSN4
|
||||
|
||||
// Handle common single-structure ld/st parameter sanity checks
|
||||
// for all variations (1 to 4) of SIMD reigster inputs. This
|
||||
// method will call the routine that generates the opcode.
|
||||
template<typename R, typename... Rx>
|
||||
void ldst_sstr(SIMD_RegVariant T, int index, const Address &a,
|
||||
int op1, int op2, R firstReg, Rx... otherRegs) {
|
||||
const FloatRegister vtSet[] = { firstReg, otherRegs... };
|
||||
const int regCount = sizeof...(otherRegs) + 1;
|
||||
assert(index >= 0 && (T <= D) && ((T == B && index <= 15) ||
|
||||
(T == H && index <= 7) || (T == S && index <= 3) ||
|
||||
(T == D && index <= 1)), "invalid index");
|
||||
assert(regCount >= 1 && regCount <= 4, "illegal register count");
|
||||
|
||||
// Check to make sure when multiple SIMD registers are used
|
||||
// that they are in successive order.
|
||||
for (int i = 0; i < regCount - 1; i++) {
|
||||
assert(vtSet[i]->successor() == vtSet[i + 1],
|
||||
"Registers must be ordered");
|
||||
}
|
||||
|
||||
ld_st(firstReg, T, index, a, op1, op2, regCount);
|
||||
}
|
||||
|
||||
// Define a set of INSN1/2/3/4 macros to handle single-structure
|
||||
// load/store instructions.
|
||||
#define INSN1(NAME, op1, op2) \
|
||||
void NAME(FloatRegister Vt, SIMD_RegVariant T, int index, \
|
||||
const Address &a) { \
|
||||
ldst_sstr(T, index, a, op1, op2, Vt); \
|
||||
}
|
||||
|
||||
#define INSN2(NAME, op1, op2) \
|
||||
void NAME(FloatRegister Vt, FloatRegister Vt2, SIMD_RegVariant T, \
|
||||
int index, const Address &a) { \
|
||||
ldst_sstr(T, index, a, op1, op2, Vt, Vt2); \
|
||||
}
|
||||
|
||||
#define INSN3(NAME, op1, op2) \
|
||||
void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, \
|
||||
SIMD_RegVariant T, int index, const Address &a) { \
|
||||
ldst_sstr(T, index, a, op1, op2, Vt, Vt2, Vt3); \
|
||||
}
|
||||
|
||||
#define INSN4(NAME, op1, op2) \
|
||||
void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, \
|
||||
FloatRegister Vt4, SIMD_RegVariant T, int index, \
|
||||
const Address &a) { \
|
||||
ldst_sstr(T, index, a, op1, op2, Vt, Vt2, Vt3, Vt4); \
|
||||
}
|
||||
|
||||
INSN1(st1, 0b001101000, 0b0000);
|
||||
INSN2(st2, 0b001101001, 0b0000);
|
||||
INSN3(st3, 0b001101000, 0b0010);
|
||||
INSN4(st4, 0b001101001, 0b0010);
|
||||
|
||||
#undef INSN1
|
||||
#undef INSN2
|
||||
#undef INSN3
|
||||
#undef INSN4
|
||||
|
||||
#define INSN(NAME, opc) \
|
||||
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
|
||||
starti; \
|
||||
@ -2749,6 +2843,7 @@ public:
|
||||
INSN(ushr, 1, 0b000001, /* isSHR = */ true);
|
||||
INSN(usra, 1, 0b000101, /* isSHR = */ true);
|
||||
INSN(ssra, 0, 0b000101, /* isSHR = */ true);
|
||||
INSN(sli, 1, 0b010101, /* isSHR = */ false);
|
||||
|
||||
#undef INSN
|
||||
|
||||
|
@ -1450,6 +1450,13 @@ public:
|
||||
void aesecb_decrypt(Register from, Register to, Register key, Register keylen);
|
||||
void aes_round(FloatRegister input, FloatRegister subkey);
|
||||
|
||||
// ChaCha20 functions support block
|
||||
void cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
|
||||
FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
|
||||
FloatRegister tbl);
|
||||
void cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
|
||||
FloatRegister dVec, bool colToDiag);
|
||||
|
||||
// Place an ISB after code may have been modified due to a safepoint.
|
||||
void safepoint_isb();
|
||||
|
||||
|
90
src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp
Normal file
90
src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp
Normal file
@ -0,0 +1,90 @@
|
||||
/*
|
||||
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "precompiled.hpp"
|
||||
|
||||
#include "asm/assembler.hpp"
|
||||
#include "asm/assembler.inline.hpp"
|
||||
#include "macroAssembler_aarch64.hpp"
|
||||
#include "memory/resourceArea.hpp"
|
||||
#include "runtime/stubRoutines.hpp"
|
||||
|
||||
/**
|
||||
* Perform the quarter round calculations on values contained within
|
||||
* four SIMD registers.
|
||||
*
|
||||
* @param aVec the SIMD register containing only the "a" values
|
||||
* @param bVec the SIMD register containing only the "b" values
|
||||
* @param cVec the SIMD register containing only the "c" values
|
||||
* @param dVec the SIMD register containing only the "d" values
|
||||
* @param scratch scratch SIMD register used for 12 and 7 bit left rotations
|
||||
* @param table the SIMD register used as a table for 8 bit left rotations
|
||||
*/
|
||||
void MacroAssembler::cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
|
||||
FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
|
||||
FloatRegister table) {
|
||||
|
||||
// a += b, d ^= a, d <<<= 16
|
||||
addv(aVec, T4S, aVec, bVec);
|
||||
eor(dVec, T16B, dVec, aVec);
|
||||
rev32(dVec, T8H, dVec);
|
||||
|
||||
// c += d, b ^= c, b <<<= 12
|
||||
addv(cVec, T4S, cVec, dVec);
|
||||
eor(scratch, T16B, bVec, cVec);
|
||||
ushr(bVec, T4S, scratch, 20);
|
||||
sli(bVec, T4S, scratch, 12);
|
||||
|
||||
// a += b, d ^= a, d <<<= 8
|
||||
addv(aVec, T4S, aVec, bVec);
|
||||
eor(dVec, T16B, dVec, aVec);
|
||||
tbl(dVec, T16B, dVec, 1, table);
|
||||
|
||||
// c += d, b ^= c, b <<<= 7
|
||||
addv(cVec, T4S, cVec, dVec);
|
||||
eor(scratch, T16B, bVec, cVec);
|
||||
ushr(bVec, T4S, scratch, 25);
|
||||
sli(bVec, T4S, scratch, 7);
|
||||
}
|
||||
|
||||
/**
|
||||
* Shift the b, c, and d vectors between columnar and diagonal representations.
|
||||
* Note that the "a" vector does not shift.
|
||||
*
|
||||
* @param bVec the SIMD register containing only the "b" values
|
||||
* @param cVec the SIMD register containing only the "c" values
|
||||
* @param dVec the SIMD register containing only the "d" values
|
||||
* @param colToDiag true if moving columnar to diagonal, false if
|
||||
* moving diagonal back to columnar.
|
||||
*/
|
||||
void MacroAssembler::cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
|
||||
FloatRegister dVec, bool colToDiag) {
|
||||
int bShift = colToDiag ? 4 : 12;
|
||||
int cShift = 8;
|
||||
int dShift = colToDiag ? 12 : 4;
|
||||
|
||||
ext(bVec, T16B, bVec, bVec, bShift);
|
||||
ext(cVec, T16B, cVec, cVec, cShift);
|
||||
ext(dVec, T16B, dVec, dVec, dShift);
|
||||
}
|
@ -4081,6 +4081,132 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
// ChaCha20 block function. This version parallelizes by loading
|
||||
// individual 32-bit state elements into vectors for four blocks
|
||||
// (e.g. all four blocks' worth of state[0] in one register, etc.)
|
||||
//
|
||||
// state (int[16]) = c_rarg0
|
||||
// keystream (byte[1024]) = c_rarg1
|
||||
// return - number of bytes of keystream (always 256)
|
||||
address generate_chacha20Block_blockpar() {
|
||||
Label L_twoRounds, L_cc20_const;
|
||||
// The constant data is broken into two 128-bit segments to be loaded
|
||||
// onto FloatRegisters. The first 128 bits are a counter add overlay
|
||||
// that adds +0/+1/+2/+3 to the vector holding replicated state[12].
|
||||
// The second 128-bits is a table constant used for 8-bit left rotations.
|
||||
__ BIND(L_cc20_const);
|
||||
__ emit_int64(0x0000000100000000UL);
|
||||
__ emit_int64(0x0000000300000002UL);
|
||||
__ emit_int64(0x0605040702010003UL);
|
||||
__ emit_int64(0x0E0D0C0F0A09080BUL);
|
||||
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "chacha20Block");
|
||||
address start = __ pc();
|
||||
__ enter();
|
||||
|
||||
int i, j;
|
||||
const Register state = c_rarg0;
|
||||
const Register keystream = c_rarg1;
|
||||
const Register loopCtr = r10;
|
||||
const Register tmpAddr = r11;
|
||||
|
||||
const FloatRegister stateFirst = v0;
|
||||
const FloatRegister stateSecond = v1;
|
||||
const FloatRegister stateThird = v2;
|
||||
const FloatRegister stateFourth = v3;
|
||||
const FloatRegister origCtrState = v28;
|
||||
const FloatRegister scratch = v29;
|
||||
const FloatRegister lrot8Tbl = v30;
|
||||
|
||||
// Organize SIMD registers in an array that facilitates
|
||||
// putting repetitive opcodes into loop structures. It is
|
||||
// important that each grouping of 4 registers is monotonically
|
||||
// increasing to support the requirements of multi-register
|
||||
// instructions (e.g. ld4r, st4, etc.)
|
||||
const FloatRegister workSt[16] = {
|
||||
v4, v5, v6, v7, v16, v17, v18, v19,
|
||||
v20, v21, v22, v23, v24, v25, v26, v27
|
||||
};
|
||||
|
||||
// Load from memory and interlace across 16 SIMD registers,
|
||||
// With each word from memory being broadcast to all lanes of
|
||||
// each successive SIMD register.
|
||||
// Addr(0) -> All lanes in workSt[i]
|
||||
// Addr(4) -> All lanes workSt[i + 1], etc.
|
||||
__ mov(tmpAddr, state);
|
||||
for (i = 0; i < 16; i += 4) {
|
||||
__ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
|
||||
__ post(tmpAddr, 16));
|
||||
}
|
||||
|
||||
// Pull in constant data. The first 16 bytes are the add overlay
|
||||
// which is applied to the vector holding the counter (state[12]).
|
||||
// The second 16 bytes is the index register for the 8-bit left
|
||||
// rotation tbl instruction.
|
||||
__ adr(tmpAddr, L_cc20_const);
|
||||
__ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
|
||||
__ addv(workSt[12], __ T4S, workSt[12], origCtrState);
|
||||
|
||||
// Set up the 10 iteration loop and perform all 8 quarter round ops
|
||||
__ mov(loopCtr, 10);
|
||||
__ BIND(L_twoRounds);
|
||||
|
||||
__ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
|
||||
scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
|
||||
scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
|
||||
scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
|
||||
scratch, lrot8Tbl);
|
||||
|
||||
__ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
|
||||
scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
|
||||
scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
|
||||
scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
|
||||
scratch, lrot8Tbl);
|
||||
|
||||
// Decrement and iterate
|
||||
__ sub(loopCtr, loopCtr, 1);
|
||||
__ cbnz(loopCtr, L_twoRounds);
|
||||
|
||||
__ mov(tmpAddr, state);
|
||||
|
||||
// Add the starting state back to the post-loop keystream
|
||||
// state. We read/interlace the state array from memory into
|
||||
// 4 registers similar to what we did in the beginning. Then
|
||||
// add the counter overlay onto workSt[12] at the end.
|
||||
for (i = 0; i < 16; i += 4) {
|
||||
__ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
|
||||
__ post(tmpAddr, 16));
|
||||
__ addv(workSt[i], __ T4S, workSt[i], stateFirst);
|
||||
__ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
|
||||
__ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
|
||||
__ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
|
||||
}
|
||||
__ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask
|
||||
|
||||
// Write to key stream, storing the same element out of workSt[0..15]
|
||||
// to consecutive 4-byte offsets in the key stream buffer, then repeating
|
||||
// for the next element position.
|
||||
for (i = 0; i < 4; i++) {
|
||||
for (j = 0; j < 16; j += 4) {
|
||||
__ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
|
||||
__ post(keystream, 16));
|
||||
}
|
||||
}
|
||||
|
||||
__ mov(r0, 256); // Return length of output keystream
|
||||
__ leave();
|
||||
__ ret(lr);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
@ -7919,6 +8045,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
}
|
||||
#endif // COMPILER2
|
||||
|
||||
if (UseChaCha20Intrinsics) {
|
||||
StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
|
||||
}
|
||||
|
||||
if (UseBASE64Intrinsics) {
|
||||
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
|
||||
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
|
||||
|
@ -366,6 +366,17 @@ void VM_Version::initialize() {
|
||||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
if (_features & CPU_ASIMD) {
|
||||
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
|
||||
UseChaCha20Intrinsics = true;
|
||||
}
|
||||
} else if (UseChaCha20Intrinsics) {
|
||||
if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
|
||||
warning("ChaCha20 intrinsic requires ASIMD instructions");
|
||||
}
|
||||
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
|
||||
}
|
||||
|
||||
if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) {
|
||||
UseBASE64Intrinsics = true;
|
||||
}
|
||||
|
@ -5269,6 +5269,16 @@ void Assembler::pshufhw(XMMRegister dst, XMMRegister src, int mode) {
|
||||
emit_int24(0x70, (0xC0 | encode), mode & 0xFF);
|
||||
}
|
||||
|
||||
// VEX/EVEX-encoded form of pshufhw (F3 0F 70 /r ib), register source only.
// Only the low eight bits of mode are emitted as the immediate.
void Assembler::vpshufhw(XMMRegister dst, XMMRegister src, int mode, int vector_len) {
  // The CPU feature that must be present depends on the vector length.
  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
         (vector_len == AVX_256bit ? VM_Version::supports_avx2() :
         (vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : false)), "");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));

  InstructionAttr attrs(vector_len,
                        /* rex_w */ false,
                        /* legacy_mode */ _legacy_mode_bw,
                        /* no_mask_reg */ true,
                        /* uses_vl */ true);
  const int enc = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attrs);
  emit_int24(0x70, (0xC0 | enc), mode & 0xFF);
}
|
||||
|
||||
void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
|
||||
assert(isByte(mode), "invalid value");
|
||||
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
|
||||
@ -5290,6 +5300,16 @@ void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
|
||||
emit_int8(mode & 0xFF);
|
||||
}
|
||||
|
||||
// VEX/EVEX-encoded form of pshuflw (F2 0F 70 /r ib), register source only.
// Only the low eight bits of mode are emitted as the immediate.
void Assembler::vpshuflw(XMMRegister dst, XMMRegister src, int mode, int vector_len) {
  // The CPU feature that must be present depends on the vector length.
  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
         (vector_len == AVX_256bit ? VM_Version::supports_avx2() :
         (vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : false)), "");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));

  InstructionAttr attrs(vector_len,
                        /* rex_w */ false,
                        /* legacy_mode */ _legacy_mode_bw,
                        /* no_mask_reg */ true,
                        /* uses_vl */ true);
  const int enc = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attrs);
  emit_int24(0x70, (0xC0 | enc), mode & 0xFF);
}
|
||||
|
||||
void Assembler::evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
|
||||
assert(VM_Version::supports_evex(), "requires EVEX support");
|
||||
assert(vector_len == Assembler::AVX_256bit || vector_len == Assembler::AVX_512bit, "");
|
||||
|
@ -1946,6 +1946,8 @@ private:
|
||||
void pshufhw(XMMRegister dst, XMMRegister src, int mode);
|
||||
void pshuflw(XMMRegister dst, XMMRegister src, int mode);
|
||||
void pshuflw(XMMRegister dst, Address src, int mode);
|
||||
void vpshufhw(XMMRegister dst, XMMRegister src, int mode, int vector_len);
|
||||
void vpshuflw(XMMRegister dst, XMMRegister src, int mode, int vector_len);
|
||||
|
||||
//shuffle floats and doubles
|
||||
void shufps(XMMRegister, XMMRegister, int);
|
||||
|
@ -3809,6 +3809,8 @@ void StubGenerator::generate_all() {
|
||||
|
||||
generate_ghash_stubs();
|
||||
|
||||
generate_chacha_stubs();
|
||||
|
||||
if (UseMD5Intrinsics) {
|
||||
StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
|
||||
StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
|
||||
|
@ -387,6 +387,18 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// Ghash single and multi block operations using AVX instructions
|
||||
address generate_avx_ghash_processBlocks();
|
||||
|
||||
// ChaCha20 stubs and helper functions
|
||||
void generate_chacha_stubs();
|
||||
address generate_chacha20Block_avx();
|
||||
address generate_chacha20Block_avx512();
|
||||
void cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec,
|
||||
XMMRegister cVec, XMMRegister dVec, XMMRegister scratch,
|
||||
XMMRegister lrot8, XMMRegister lrot16, int vector_len);
|
||||
void cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec,
|
||||
XMMRegister dVec, int vector_len, bool colToDiag);
|
||||
void cc20_keystream_collate_avx512(XMMRegister aVec, XMMRegister bVec,
|
||||
XMMRegister cVec, XMMRegister dVec, Register baseAddr, int baseOffset);
|
||||
|
||||
// Poly1305 multiblock using IFMA instructions
|
||||
address generate_poly1305_processBlocks();
|
||||
void poly1305_process_blocks_avx512(const Register input, const Register length,
|
||||
|
582
src/hotspot/cpu/x86/stubGenerator_x86_64_chacha.cpp
Normal file
582
src/hotspot/cpu/x86/stubGenerator_x86_64_chacha.cpp
Normal file
@ -0,0 +1,582 @@
|
||||
/*
|
||||
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "precompiled.hpp"
|
||||
#include "asm/assembler.hpp"
|
||||
#include "asm/assembler.inline.hpp"
|
||||
#include "runtime/stubRoutines.hpp"
|
||||
#include "macroAssembler_x86.hpp"
|
||||
#include "stubGenerator_x86_64.hpp"
|
||||
|
||||
#define __ _masm->
|
||||
|
||||
#ifdef PRODUCT
|
||||
#define BLOCK_COMMENT(str) /* nothing */
|
||||
#else
|
||||
#define BLOCK_COMMENT(str) __ block_comment(str)
|
||||
#endif // PRODUCT
|
||||
|
||||
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
|
||||
|
||||
// Constants
|
||||
|
||||
/**
|
||||
* This AVX/AVX2 add mask generation can be used for multiple duties:
|
||||
* 1.) Provide +0/+1 counter increments by loading 256 bits
|
||||
* at offset 0
|
||||
* 2.) Provide +2/+2 counter increments for the second set
|
||||
* of 4 AVX2 registers at offset 32 (256-bit load)
|
||||
* 3.) Provide a +1 increment for the second set of 4 AVX
|
||||
* registers at offset 16 (128-bit load)
|
||||
*/
|
||||
ATTRIBUTE_ALIGNED(64) uint64_t CC20_COUNTER_ADD_AVX[] = {
|
||||
0x0000000000000000UL, 0x0000000000000000UL,
|
||||
0x0000000000000001UL, 0x0000000000000000UL,
|
||||
0x0000000000000002UL, 0x0000000000000000UL,
|
||||
0x0000000000000002UL, 0x0000000000000000UL,
|
||||
};
|
||||
static address chacha20_ctradd_avx() {
|
||||
return (address)CC20_COUNTER_ADD_AVX;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add masks for 4-block ChaCha20 Block calculations
|
||||
* The first 512 bits creates a +0/+1/+2/+3 add overlay.
|
||||
* The second 512 bits is a +4/+4/+4/+4 add overlay. This
|
||||
* can be used to increment the counter fields for the next 4 blocks.
|
||||
*/
|
||||
ATTRIBUTE_ALIGNED(64) uint64_t CC20_COUNTER_ADD_AVX512[] = {
|
||||
0x0000000000000000UL, 0x0000000000000000UL,
|
||||
0x0000000000000001UL, 0x0000000000000000UL,
|
||||
0x0000000000000002UL, 0x0000000000000000UL,
|
||||
0x0000000000000003UL, 0x0000000000000000UL,
|
||||
|
||||
0x0000000000000004UL, 0x0000000000000000UL,
|
||||
0x0000000000000004UL, 0x0000000000000000UL,
|
||||
0x0000000000000004UL, 0x0000000000000000UL,
|
||||
0x0000000000000004UL, 0x0000000000000000UL
|
||||
};
|
||||
static address chacha20_ctradd_avx512() {
|
||||
return (address)CC20_COUNTER_ADD_AVX512;
|
||||
}
|
||||
|
||||
/**
|
||||
* The first 256 bits represents a byte-wise permutation
|
||||
* for an 8-bit left-rotation on 32-bit lanes.
|
||||
* The second 256 bits is a 16-bit rotation on 32-bit lanes.
|
||||
*/
|
||||
ATTRIBUTE_ALIGNED(64) uint64_t CC20_LROT_CONSTS[] = {
|
||||
0x0605040702010003UL, 0x0E0D0C0F0A09080BUL,
|
||||
0x0605040702010003UL, 0x0E0D0C0F0A09080BUL,
|
||||
|
||||
0x0504070601000302UL, 0x0D0C0F0E09080B0AUL,
|
||||
0x0504070601000302UL, 0x0D0C0F0E09080B0AUL
|
||||
};
|
||||
static address chacha20_lrot_consts() {
|
||||
return (address)CC20_LROT_CONSTS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void StubGenerator::generate_chacha_stubs() {
|
||||
// Generate ChaCha20 intrinsics code
|
||||
if (UseChaCha20Intrinsics) {
|
||||
if (VM_Version::supports_evex()) {
|
||||
StubRoutines::_chacha20Block = generate_chacha20Block_avx512();
|
||||
} else { // Either AVX or AVX2 is supported
|
||||
assert(VM_Version::supports_avx() == true, "Must at least support AVX instructions");
|
||||
StubRoutines::_chacha20Block = generate_chacha20Block_avx();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* The 2-block AVX/AVX2-enabled ChaCha20 block function implementation */
|
||||
address StubGenerator::generate_chacha20Block_avx() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "chacha20Block");
|
||||
address start = __ pc();
|
||||
|
||||
Label L_twoRounds;
|
||||
const Register state = c_rarg0;
|
||||
const Register result = c_rarg1;
|
||||
const Register loopCounter = r8;
|
||||
const Register rotAddr = r9;
|
||||
|
||||
const XMMRegister aState = xmm0;
|
||||
const XMMRegister bState = xmm1;
|
||||
const XMMRegister cState = xmm2;
|
||||
const XMMRegister dState = xmm3;
|
||||
const XMMRegister a1Vec = xmm4;
|
||||
const XMMRegister b1Vec = xmm5;
|
||||
const XMMRegister c1Vec = xmm6;
|
||||
const XMMRegister d1Vec = xmm7;
|
||||
const XMMRegister a2Vec = xmm8;
|
||||
const XMMRegister b2Vec = xmm9;
|
||||
const XMMRegister c2Vec = xmm10;
|
||||
const XMMRegister d2Vec = xmm11;
|
||||
const XMMRegister scratch = xmm12;
|
||||
const XMMRegister d2State = xmm13;
|
||||
const XMMRegister lrot8 = xmm14;
|
||||
const XMMRegister lrot16 = xmm15;
|
||||
|
||||
int vector_len;
|
||||
int outlen;
|
||||
|
||||
// This function will only be called if AVX2 or AVX are supported
|
||||
// AVX512 uses a different function.
|
||||
if (VM_Version::supports_avx2()) {
|
||||
vector_len = Assembler::AVX_256bit;
|
||||
outlen = 256;
|
||||
} else if (VM_Version::supports_avx()) {
|
||||
vector_len = Assembler::AVX_128bit;
|
||||
outlen = 128;
|
||||
}
|
||||
|
||||
__ enter();
|
||||
|
||||
// Load the initial state in columnar orientation and then copy
|
||||
// that starting state to the working register set.
|
||||
// Also load the address of the add mask for later use in handling
|
||||
// multi-block counter increments.
|
||||
__ lea(rotAddr, ExternalAddress(chacha20_lrot_consts()));
|
||||
__ lea(rax, ExternalAddress(chacha20_ctradd_avx()));
|
||||
if (vector_len == Assembler::AVX_128bit) {
|
||||
__ movdqu(aState, Address(state, 0)); // Bytes 0 - 15 -> a1Vec
|
||||
__ movdqu(bState, Address(state, 16)); // Bytes 16 - 31 -> b1Vec
|
||||
__ movdqu(cState, Address(state, 32)); // Bytes 32 - 47 -> c1Vec
|
||||
__ movdqu(dState, Address(state, 48)); // Bytes 48 - 63 -> d1Vec
|
||||
|
||||
__ movdqu(a1Vec, aState);
|
||||
__ movdqu(b1Vec, bState);
|
||||
__ movdqu(c1Vec, cState);
|
||||
__ movdqu(d1Vec, dState);
|
||||
|
||||
__ movdqu(a2Vec, aState);
|
||||
__ movdqu(b2Vec, bState);
|
||||
__ movdqu(c2Vec, cState);
|
||||
__ vpaddd(d2State, dState, Address(rax, 16), vector_len);
|
||||
__ movdqu(d2Vec, d2State);
|
||||
__ movdqu(lrot8, Address(rotAddr, 0)); // Load 8-bit lrot const
|
||||
__ movdqu(lrot16, Address(rotAddr, 32)); // Load 16-bit lrot const
|
||||
} else {
|
||||
// We will broadcast each 128-bit segment of the state array into
|
||||
// the high and low halves of ymm state registers. Then apply the add
|
||||
// mask to the dState register. These will then be copied into the
|
||||
// a/b/c/d1Vec working registers.
|
||||
__ vbroadcastf128(aState, Address(state, 0), vector_len);
|
||||
__ vbroadcastf128(bState, Address(state, 16), vector_len);
|
||||
__ vbroadcastf128(cState, Address(state, 32), vector_len);
|
||||
__ vbroadcastf128(dState, Address(state, 48), vector_len);
|
||||
__ vpaddd(dState, dState, Address(rax, 0), vector_len);
|
||||
__ vpaddd(d2State, dState, Address(rax, 32), vector_len);
|
||||
|
||||
__ vmovdqu(a1Vec, aState);
|
||||
__ vmovdqu(b1Vec, bState);
|
||||
__ vmovdqu(c1Vec, cState);
|
||||
__ vmovdqu(d1Vec, dState);
|
||||
|
||||
__ vmovdqu(a2Vec, aState);
|
||||
__ vmovdqu(b2Vec, bState);
|
||||
__ vmovdqu(c2Vec, cState);
|
||||
__ vmovdqu(d2Vec, d2State);
|
||||
__ vmovdqu(lrot8, Address(rotAddr, 0)); // Load 8-bit lrot const
|
||||
__ vmovdqu(lrot16, Address(rotAddr, 32)); // Load 16-bit lrot const
|
||||
}
|
||||
|
||||
__ movl(loopCounter, 10); // Set 10 2-round iterations
|
||||
__ BIND(L_twoRounds);
|
||||
|
||||
// The first quarter round macro call covers the first 4 QR operations:
|
||||
// Qround(state, 0, 4, 8,12)
|
||||
// Qround(state, 1, 5, 9,13)
|
||||
// Qround(state, 2, 6,10,14)
|
||||
// Qround(state, 3, 7,11,15)
|
||||
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
|
||||
lrot8, lrot16, vector_len);
|
||||
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
|
||||
lrot8, lrot16, vector_len);
|
||||
|
||||
// Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors
|
||||
// to diagonals. The a1Vec does not need to change orientation.
|
||||
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, true);
|
||||
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, true);
|
||||
|
||||
// The second set of operations on the vectors covers the second 4 quarter
|
||||
// round operations, now acting on the diagonals:
|
||||
// Qround(state, 0, 5,10,15)
|
||||
// Qround(state, 1, 6,11,12)
|
||||
// Qround(state, 2, 7, 8,13)
|
||||
// Qround(state, 3, 4, 9,14)
|
||||
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
|
||||
lrot8, lrot16, vector_len);
|
||||
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
|
||||
lrot8, lrot16, vector_len);
|
||||
|
||||
// Before we start the next iteration, we need to perform shuffles
|
||||
// on the b/c/d vectors to move them back to columnar organizations
|
||||
// from their current diagonal orientation.
|
||||
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, false);
|
||||
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, false);
|
||||
|
||||
__ decrement(loopCounter);
|
||||
__ jcc(Assembler::notZero, L_twoRounds);
|
||||
|
||||
// Add the original start state back into the current state.
|
||||
__ vpaddd(a1Vec, a1Vec, aState, vector_len);
|
||||
__ vpaddd(b1Vec, b1Vec, bState, vector_len);
|
||||
__ vpaddd(c1Vec, c1Vec, cState, vector_len);
|
||||
__ vpaddd(d1Vec, d1Vec, dState, vector_len);
|
||||
|
||||
__ vpaddd(a2Vec, a2Vec, aState, vector_len);
|
||||
__ vpaddd(b2Vec, b2Vec, bState, vector_len);
|
||||
__ vpaddd(c2Vec, c2Vec, cState, vector_len);
|
||||
__ vpaddd(d2Vec, d2Vec, d2State, vector_len);
|
||||
|
||||
// Write the data to the keystream array
|
||||
if (vector_len == Assembler::AVX_128bit) {
|
||||
__ movdqu(Address(result, 0), a1Vec);
|
||||
__ movdqu(Address(result, 16), b1Vec);
|
||||
__ movdqu(Address(result, 32), c1Vec);
|
||||
__ movdqu(Address(result, 48), d1Vec);
|
||||
__ movdqu(Address(result, 64), a2Vec);
|
||||
__ movdqu(Address(result, 80), b2Vec);
|
||||
__ movdqu(Address(result, 96), c2Vec);
|
||||
__ movdqu(Address(result, 112), d2Vec);
|
||||
} else {
|
||||
// Each half of the YMM has to be written 64 bytes apart from
|
||||
// each other in memory so the final keystream buffer holds
|
||||
// two consecutive keystream blocks.
|
||||
__ vextracti128(Address(result, 0), a1Vec, 0);
|
||||
__ vextracti128(Address(result, 64), a1Vec, 1);
|
||||
__ vextracti128(Address(result, 16), b1Vec, 0);
|
||||
__ vextracti128(Address(result, 80), b1Vec, 1);
|
||||
__ vextracti128(Address(result, 32), c1Vec, 0);
|
||||
__ vextracti128(Address(result, 96), c1Vec, 1);
|
||||
__ vextracti128(Address(result, 48), d1Vec, 0);
|
||||
__ vextracti128(Address(result, 112), d1Vec, 1);
|
||||
|
||||
__ vextracti128(Address(result, 128), a2Vec, 0);
|
||||
__ vextracti128(Address(result, 192), a2Vec, 1);
|
||||
__ vextracti128(Address(result, 144), b2Vec, 0);
|
||||
__ vextracti128(Address(result, 208), b2Vec, 1);
|
||||
__ vextracti128(Address(result, 160), c2Vec, 0);
|
||||
__ vextracti128(Address(result, 224), c2Vec, 1);
|
||||
__ vextracti128(Address(result, 176), d2Vec, 0);
|
||||
__ vextracti128(Address(result, 240), d2Vec, 1);
|
||||
}
|
||||
|
||||
// This function will always write 128 or 256 bytes into the
|
||||
// key stream buffer, depending on the length of the SIMD
|
||||
// registers. That length should be returned through %rax.
|
||||
__ mov64(rax, outlen);
|
||||
|
||||
__ leave();
|
||||
__ ret(0);
|
||||
return start;
|
||||
}
|
||||
|
||||
/* The AVX512-enabled ChaCha20 block function implementation. It works on
 * four ZMM register sets holding 4 blocks each (16 blocks, 1024 bytes). */
|
||||
address StubGenerator::generate_chacha20Block_avx512() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "chacha20Block");
  address start = __ pc();

  Label L_twoRounds;

  // Everything in this stub operates on full 512-bit registers, arranged
  // as four independent groups of a/b/c/d working vectors.
  const int vlen = Assembler::AVX_512bit;
  const int groupCount = 4;

  // Incoming arguments and the round-loop counter.
  const Register state = c_rarg0;
  const Register result = c_rarg1;
  const Register loopCounter = r8;

  // Start-state registers. The a/b/c rows are shared by all groups, while
  // every group gets its own d row (index 0 is the base dState).
  const XMMRegister aState = xmm0;
  const XMMRegister bState = xmm1;
  const XMMRegister cState = xmm2;
  const XMMRegister dStates[groupCount] = { xmm3, xmm20, xmm21, xmm22 };

  // Working registers, indexed by group.
  const XMMRegister aVecs[groupCount] = { xmm4, xmm8,  xmm12, xmm16 };
  const XMMRegister bVecs[groupCount] = { xmm5, xmm9,  xmm13, xmm17 };
  const XMMRegister cVecs[groupCount] = { xmm6, xmm10, xmm14, xmm18 };
  const XMMRegister dVecs[groupCount] = { xmm7, xmm11, xmm15, xmm19 };
  const XMMRegister scratch = xmm23;

  __ enter();

  // Load the initial state in columnar orientation: each 128-bit segment
  // of the state array is broadcast into all four double-quadword slots
  // of its state register. The first 64 bytes of the counter-add table
  // are folded into the base d row right away, and the second 64 bytes
  // are added repeatedly to derive the d rows of the remaining groups, so
  // no table access is needed when the start state is added back in at
  // the end of the block function.
  __ lea(rax, ExternalAddress(chacha20_ctradd_avx512()));
  __ evbroadcasti32x4(aState, Address(state, 0), vlen);
  __ evbroadcasti32x4(bState, Address(state, 16), vlen);
  __ evbroadcasti32x4(cState, Address(state, 32), vlen);
  __ evbroadcasti32x4(dStates[0], Address(state, 48), vlen);
  __ vpaddd(dStates[0], dStates[0], Address(rax, 0), vlen);
  __ evmovdqul(scratch, Address(rax, 64), vlen);
  for (int g = 1; g < groupCount; g++) {
    __ vpaddd(dStates[g], dStates[g - 1], scratch, vlen);
  }

  // Seed every group's working registers from the start state.
  for (int g = 0; g < groupCount; g++) {
    __ evmovdqul(aVecs[g], aState, vlen);
    __ evmovdqul(bVecs[g], bState, vlen);
    __ evmovdqul(cVecs[g], cState, vlen);
    __ evmovdqul(dVecs[g], dStates[g], vlen);
  }

  __ movl(loopCounter, 10);       // Set 10 2-round iterations
  __ BIND(L_twoRounds);

  // Column round: covers the first 4 quarter round operations
  //   Qround(state, 0, 4, 8,12)
  //   Qround(state, 1, 5, 9,13)
  //   Qround(state, 2, 6,10,14)
  //   Qround(state, 3, 7,11,15)
  // The shuffle-mask arguments are not needed for 512-bit vectors.
  for (int g = 0; g < groupCount; g++) {
    cc20_quarter_round_avx(aVecs[g], bVecs[g], cVecs[g], dVecs[g], scratch,
                           xnoreg, xnoreg, vlen);
  }

  // Reorganize the b/c/d vectors from columns to diagonals. The a vectors
  // keep their orientation.
  for (int g = 0; g < groupCount; g++) {
    cc20_shift_lane_org(bVecs[g], cVecs[g], dVecs[g], vlen, true);
  }

  // Diagonal round: covers the second 4 quarter round operations
  //   Qround(state, 0, 5,10,15)
  //   Qround(state, 1, 6,11,12)
  //   Qround(state, 2, 7, 8,13)
  //   Qround(state, 3, 4, 9,14)
  for (int g = 0; g < groupCount; g++) {
    cc20_quarter_round_avx(aVecs[g], bVecs[g], cVecs[g], dVecs[g], scratch,
                           xnoreg, xnoreg, vlen);
  }

  // Move the b/c/d vectors back from diagonal to columnar organization
  // before the next iteration starts.
  for (int g = 0; g < groupCount; g++) {
    cc20_shift_lane_org(bVecs[g], cVecs[g], dVecs[g], vlen, false);
  }

  __ decrement(loopCounter);
  __ jcc(Assembler::notZero, L_twoRounds);

  // Add the start state back into the final working values. Each group's
  // d row already carries its counter adjustment.
  for (int g = 0; g < groupCount; g++) {
    __ vpaddd(aVecs[g], aVecs[g], aState, vlen);
    __ vpaddd(bVecs[g], bVecs[g], bState, vlen);
    __ vpaddd(cVecs[g], cVecs[g], cState, vlen);
    __ vpaddd(dVecs[g], dVecs[g], dStates[g], vlen);
  }

  // Write the working registers out to the key stream buffer. Each group
  // fills 256 bytes; the collate helper places the 128-bit segments at
  // 64-byte displacements so the blocks end up serialized in order.
  for (int g = 0; g < groupCount; g++) {
    cc20_keystream_collate_avx512(aVecs[g], bVecs[g], cVecs[g], dVecs[g],
                                  result, g * 256);
  }

  // This function always writes 1024 bytes into the key stream buffer
  // and that length is returned through %rax.
  __ mov64(rax, 1024);

  __ leave();
  __ ret(0);
  return start;
}
|
||||
|
||||
/**
|
||||
* Provide a function that implements the ChaCha20 quarter round function.
|
||||
*
|
||||
* @param aVec the SIMD register containing only the "a" values
|
||||
* @param bVec the SIMD register containing only the "b" values
|
||||
* @param cVec the SIMD register containing only the "c" values
|
||||
* @param dVec the SIMD register containing only the "d" values
|
||||
* @param scratch SIMD register used for non-byte-aligned left rotations
|
||||
 * @param lrot8 shuffle control mask for an 8-bit left rotation (32-bit lane)
|
||||
 * @param lrot16 shuffle control mask for a 16-bit left rotation (32-bit lane)
|
||||
* @param vector_len the length of the vector
|
||||
*/
|
||||
void StubGenerator::cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec,
    XMMRegister cVec, XMMRegister dVec, XMMRegister scratch,
    XMMRegister lrot8, XMMRegister lrot16, int vector_len) {
  // 512-bit vectors use the rotate instruction directly; narrower vectors
  // fall back to a byte shuffle or a shift/shift/or sequence.
  const bool useRotateInsn = (vector_len == Assembler::AVX_512bit);

  // Left-rotate each 32-bit lane by a whole number of bytes. Without the
  // rotate instruction this is done with the supplied shuffle mask.
  auto rotateViaShuffle = [&](XMMRegister vec, int bits, XMMRegister mask) {
    if (useRotateInsn) {
      __ evprold(vec, vec, bits, vector_len);
    } else {
      __ vpshufb(vec, vec, mask, vector_len);
    }
  };

  // Left-rotate each 32-bit lane by an arbitrary bit count. Without the
  // rotate instruction: vec = (vec << bits) | (vec >>> (32 - bits)), with
  // the right-shifted half held in scratch.
  auto rotateViaShifts = [&](XMMRegister vec, int bits) {
    if (useRotateInsn) {
      __ evprold(vec, vec, bits, vector_len);
    } else {
      __ vpsrld(scratch, vec, 32 - bits, vector_len);
      __ vpslld(vec, vec, bits, vector_len);
      __ vpor(vec, vec, scratch, vector_len);
    }
  };

  // a += b; d ^= a; d <<<= 16
  __ vpaddd(aVec, aVec, bVec, vector_len);
  __ vpxor(dVec, dVec, aVec, vector_len);
  rotateViaShuffle(dVec, 16, lrot16);

  // c += d; b ^= c; b <<<= 12
  __ vpaddd(cVec, cVec, dVec, vector_len);
  __ vpxor(bVec, bVec, cVec, vector_len);
  rotateViaShifts(bVec, 12);

  // a += b; d ^= a; d <<<= 8
  __ vpaddd(aVec, aVec, bVec, vector_len);
  __ vpxor(dVec, dVec, aVec, vector_len);
  rotateViaShuffle(dVec, 8, lrot8);

  // c += d; b ^= c; b <<<= 7
  __ vpaddd(cVec, cVec, dVec, vector_len);
  __ vpxor(bVec, bVec, cVec, vector_len);
  rotateViaShifts(bVec, 7);
}
|
||||
|
||||
/**
|
||||
* Shift the b, c, and d vectors between columnar and diagonal representations.
|
||||
* Note that the "a" vector does not shift.
|
||||
*
|
||||
* @param bVec the SIMD register containing only the "b" values
|
||||
* @param cVec the SIMD register containing only the "c" values
|
||||
* @param dVec the SIMD register containing only the "d" values
|
||||
* @param vector_len the size of the SIMD register to operate upon
|
||||
* @param colToDiag true if moving columnar to diagonal, false if
|
||||
* moving diagonal back to columnar.
|
||||
*/
|
||||
void StubGenerator::cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec,
    XMMRegister dVec, int vector_len, bool colToDiag) {
  // vpshufd immediates, two bits per destination dword:
  //   0x39 picks source dwords 1,2,3,0
  //   0x4E picks source dwords 2,3,0,1
  //   0x93 picks source dwords 3,0,1,2
  // The b and d controls swap roles depending on the direction of the
  // conversion; the c control is its own inverse.
  int bControl;
  int dControl;
  if (colToDiag) {
    bControl = 0x39;
    dControl = 0x93;
  } else {
    bControl = 0x93;
    dControl = 0x39;
  }
  const int cControl = 0x4E;

  __ vpshufd(bVec, bVec, bControl, vector_len);
  __ vpshufd(cVec, cVec, cControl, vector_len);
  __ vpshufd(dVec, dVec, dControl, vector_len);
}
|
||||
|
||||
/**
|
||||
* Write 256 bytes of keystream output held in 4 AVX512 SIMD registers
|
||||
* in a quarter round parallel organization.
|
||||
*
|
||||
* @param aVec the SIMD register containing only the "a" values
|
||||
* @param bVec the SIMD register containing only the "b" values
|
||||
* @param cVec the SIMD register containing only the "c" values
|
||||
* @param dVec the SIMD register containing only the "d" values
|
||||
* @param baseAddr the register holding the base output address
|
||||
* @param baseOffset the offset from baseAddr for writes
|
||||
*/
|
||||
void StubGenerator::cc20_keystream_collate_avx512(XMMRegister aVec,
    XMMRegister bVec, XMMRegister cVec, XMMRegister dVec,
    Register baseAddr, int baseOffset) {
  // Rows in the order they appear inside one 64-byte keystream block.
  const XMMRegister rows[] = { aVec, bVec, cVec, dVec };
  const int rowBytes = 16;    // one 128-bit segment
  const int blockBytes = 64;  // distance between the same row of two blocks

  // Segment 'lane' of row 'row' lands at 16 * row inside block 'lane'.
  // Rows are emitted one register at a time, lanes 0..3 for each.
  for (int row = 0; row < 4; row++) {
    for (int lane = 0; lane < 4; lane++) {
      const int disp = baseOffset + (row * rowBytes) + (lane * blockBytes);
      __ vextracti32x4(Address(baseAddr, disp), rows[row], lane);
    }
  }
}
|
||||
|
||||
#undef __
|
@ -1122,6 +1122,22 @@ void VM_Version::get_processor_features() {
|
||||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
// ChaCha20 Intrinsics
|
||||
// As long as the system supports AVX as a baseline we can do a
|
||||
// SIMD-enabled block function. StubGenerator makes the determination
|
||||
// based on the VM capabilities whether to use an AVX2 or AVX512-enabled
|
||||
// version.
|
||||
if (UseAVX >= 1) {
|
||||
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
|
||||
UseChaCha20Intrinsics = true;
|
||||
}
|
||||
} else if (UseChaCha20Intrinsics) {
|
||||
if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
|
||||
warning("ChaCha20 intrinsic requires AVX instructions");
|
||||
}
|
||||
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
|
||||
}
|
||||
|
||||
// Base64 Intrinsics (Check the condition for which the intrinsic will be active)
|
||||
if ((UseAVX > 2) && supports_avx512vl() && supports_avx512bw()) {
|
||||
if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) {
|
||||
|
@ -475,6 +475,9 @@ bool vmIntrinsics::disabled_by_jvm_flags(vmIntrinsics::ID id) {
|
||||
case vmIntrinsics::_ghash_processBlocks:
|
||||
if (!UseGHASHIntrinsics) return true;
|
||||
break;
|
||||
case vmIntrinsics::_chacha20Block:
|
||||
if (!UseChaCha20Intrinsics) return true;
|
||||
break;
|
||||
case vmIntrinsics::_base64_encodeBlock:
|
||||
case vmIntrinsics::_base64_decodeBlock:
|
||||
if (!UseBASE64Intrinsics) return true;
|
||||
|
@ -532,6 +532,12 @@ class methodHandle;
|
||||
do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, ghash_processBlocks_signature, F_R) \
|
||||
do_name(processMultipleBlocks_name, "processMultipleBlocks") \
|
||||
\
|
||||
/* support for com.sun.crypto.provider.ChaCha20Cipher */ \
|
||||
do_class(com_sun_crypto_provider_chacha20cipher, "com/sun/crypto/provider/ChaCha20Cipher") \
|
||||
do_intrinsic(_chacha20Block, com_sun_crypto_provider_chacha20cipher, chacha20Block_name, chacha20Block_signature, F_S) \
|
||||
do_name(chacha20Block_name, "implChaCha20Block") \
|
||||
do_signature(chacha20Block_signature, "([I[B)I") \
|
||||
\
|
||||
/* support for java.util.zip */ \
|
||||
do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \
|
||||
do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \
|
||||
|
@ -318,6 +318,7 @@
|
||||
static_field(StubRoutines, _ghash_processBlocks, address) \
|
||||
static_field(StubRoutines, _md5_implCompress, address) \
|
||||
static_field(StubRoutines, _md5_implCompressMB, address) \
|
||||
static_field(StubRoutines, _chacha20Block, address) \
|
||||
static_field(StubRoutines, _sha1_implCompress, address) \
|
||||
static_field(StubRoutines, _sha1_implCompressMB, address) \
|
||||
static_field(StubRoutines, _sha256_implCompress, address) \
|
||||
|
@ -737,6 +737,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
|
||||
case vmIntrinsics::_bigIntegerLeftShiftWorker:
|
||||
case vmIntrinsics::_vectorizedMismatch:
|
||||
case vmIntrinsics::_ghash_processBlocks:
|
||||
case vmIntrinsics::_chacha20Block:
|
||||
case vmIntrinsics::_base64_encodeBlock:
|
||||
case vmIntrinsics::_base64_decodeBlock:
|
||||
case vmIntrinsics::_poly1305_processBlocks:
|
||||
|
@ -1168,6 +1168,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
|
||||
strcmp(call->as_CallLeaf()->_name, "galoisCounterMode_AESCrypt") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "poly1305_processBlocks") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "chacha20Block") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "encodeBlock") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "decodeBlock") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "md5_implCompress") == 0 ||
|
||||
|
@ -608,6 +608,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
|
||||
|
||||
case vmIntrinsics::_ghash_processBlocks:
|
||||
return inline_ghash_processBlocks();
|
||||
case vmIntrinsics::_chacha20Block:
|
||||
return inline_chacha20Block();
|
||||
case vmIntrinsics::_base64_encodeBlock:
|
||||
return inline_base64_encodeBlock();
|
||||
case vmIntrinsics::_base64_decodeBlock:
|
||||
@ -6897,6 +6899,36 @@ bool LibraryCallKit::inline_ghash_processBlocks() {
|
||||
return true;
|
||||
}
|
||||
|
||||
//------------------------------inline_chacha20Block
|
||||
bool LibraryCallKit::inline_chacha20Block() {
|
||||
address stubAddr;
|
||||
const char *stubName;
|
||||
assert(UseChaCha20Intrinsics, "need ChaCha20 intrinsics support");
|
||||
|
||||
stubAddr = StubRoutines::chacha20Block();
|
||||
stubName = "chacha20Block";
|
||||
|
||||
Node* state = argument(0);
|
||||
Node* result = argument(1);
|
||||
|
||||
state = must_be_not_null(state, true);
|
||||
result = must_be_not_null(result, true);
|
||||
|
||||
Node* state_start = array_element_address(state, intcon(0), T_INT);
|
||||
assert(state_start, "state is NULL");
|
||||
Node* result_start = array_element_address(result, intcon(0), T_BYTE);
|
||||
assert(result_start, "result is NULL");
|
||||
|
||||
Node* cc20Blk = make_runtime_call(RC_LEAF|RC_NO_FP,
|
||||
OptoRuntime::chacha20Block_Type(),
|
||||
stubAddr, stubName, TypePtr::BOTTOM,
|
||||
state_start, result_start);
|
||||
// return key stream length (int)
|
||||
Node* retvalue = _gvn.transform(new ProjNode(cc20Blk, TypeFunc::Parms));
|
||||
set_result(retvalue);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool LibraryCallKit::inline_base64_encodeBlock() {
|
||||
address stubAddr;
|
||||
const char *stubName;
|
||||
|
@ -291,6 +291,7 @@ class LibraryCallKit : public GraphKit {
|
||||
Node* inline_counterMode_AESCrypt_predicate();
|
||||
Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
|
||||
bool inline_ghash_processBlocks();
|
||||
bool inline_chacha20Block();
|
||||
bool inline_base64_encodeBlock();
|
||||
bool inline_base64_decodeBlock();
|
||||
bool inline_poly1305_processBlocks();
|
||||
|
@ -1222,6 +1222,26 @@ const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
|
||||
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
|
||||
return TypeFunc::make(domain, range);
|
||||
}
|
||||
|
||||
// ChaCha20 Block function
|
||||
const TypeFunc* OptoRuntime::chacha20Block_Type() {
  // Call signature of the ChaCha20 block stub:
  //   (state pointer, result pointer) -> int
  const int argCount = 2;
  const int retCount = 1;

  // Argument tuple: two non-null raw pointers.
  const Type** argFields = TypeTuple::fields(argCount);
  argFields[TypeFunc::Parms + 0] = TypePtr::NOTNULL;  // state
  argFields[TypeFunc::Parms + 1] = TypePtr::NOTNULL;  // result
  const TypeTuple* domain =
      TypeTuple::make(TypeFunc::Parms + argCount, argFields);

  // Return tuple: the key stream output length as an int.
  const Type** retFields = TypeTuple::fields(retCount);
  retFields[TypeFunc::Parms + 0] = TypeInt::INT;
  const TypeTuple* range =
      TypeTuple::make(TypeFunc::Parms + retCount, retFields);

  return TypeFunc::make(domain, range);
}
|
||||
|
||||
// Base64 encode function
|
||||
const TypeFunc* OptoRuntime::base64_encodeBlock_Type() {
|
||||
int argcnt = 6;
|
||||
|
@ -278,6 +278,7 @@ private:
|
||||
static const TypeFunc* vectorizedMismatch_Type();
|
||||
|
||||
static const TypeFunc* ghash_processBlocks_Type();
|
||||
static const TypeFunc* chacha20Block_Type();
|
||||
static const TypeFunc* base64_encodeBlock_Type();
|
||||
static const TypeFunc* base64_decodeBlock_Type();
|
||||
static const TypeFunc* poly1305_processBlocks_Type();
|
||||
|
@ -323,6 +323,9 @@ const int ObjectAlignmentInBytes = 8;
|
||||
product(bool, UseAESCTRIntrinsics, false, DIAGNOSTIC, \
|
||||
"Use intrinsics for the paralleled version of AES/CTR crypto") \
|
||||
\
|
||||
product(bool, UseChaCha20Intrinsics, false, DIAGNOSTIC, \
|
||||
"Use intrinsics for the vectorized version of ChaCha20") \
|
||||
\
|
||||
product(bool, UseMD5Intrinsics, false, DIAGNOSTIC, \
|
||||
"Use intrinsics for MD5 crypto hash function") \
|
||||
\
|
||||
|
@ -128,6 +128,7 @@ address StubRoutines::_electronicCodeBook_decryptAESCrypt = NULL;
|
||||
address StubRoutines::_counterMode_AESCrypt = NULL;
|
||||
address StubRoutines::_galoisCounterMode_AESCrypt = NULL;
|
||||
address StubRoutines::_ghash_processBlocks = NULL;
|
||||
address StubRoutines::_chacha20Block = NULL;
|
||||
address StubRoutines::_base64_encodeBlock = NULL;
|
||||
address StubRoutines::_base64_decodeBlock = NULL;
|
||||
address StubRoutines::_poly1305_processBlocks = NULL;
|
||||
|
@ -209,6 +209,7 @@ class StubRoutines: AllStatic {
|
||||
static address _counterMode_AESCrypt;
|
||||
static address _galoisCounterMode_AESCrypt;
|
||||
static address _ghash_processBlocks;
|
||||
static address _chacha20Block;
|
||||
static address _base64_encodeBlock;
|
||||
static address _base64_decodeBlock;
|
||||
static address _poly1305_processBlocks;
|
||||
@ -388,6 +389,7 @@ class StubRoutines: AllStatic {
|
||||
static address poly1305_processBlocks() { return _poly1305_processBlocks; }
|
||||
static address counterMode_AESCrypt() { return _counterMode_AESCrypt; }
|
||||
static address ghash_processBlocks() { return _ghash_processBlocks; }
|
||||
static address chacha20Block() { return _chacha20Block; }
|
||||
static address base64_encodeBlock() { return _base64_encodeBlock; }
|
||||
static address base64_decodeBlock() { return _base64_decodeBlock; }
|
||||
static address md5_implCompress() { return _md5_implCompress; }
|
||||
|
@ -541,6 +541,7 @@
|
||||
static_field(StubRoutines, _counterMode_AESCrypt, address) \
|
||||
static_field(StubRoutines, _galoisCounterMode_AESCrypt, address) \
|
||||
static_field(StubRoutines, _ghash_processBlocks, address) \
|
||||
static_field(StubRoutines, _chacha20Block, address) \
|
||||
static_field(StubRoutines, _base64_encodeBlock, address) \
|
||||
static_field(StubRoutines, _base64_decodeBlock, address) \
|
||||
static_field(StubRoutines, _poly1305_processBlocks, address) \
|
||||
|
@ -39,6 +39,9 @@ import javax.crypto.*;
|
||||
import javax.crypto.spec.ChaCha20ParameterSpec;
|
||||
import javax.crypto.spec.IvParameterSpec;
|
||||
import javax.crypto.spec.SecretKeySpec;
|
||||
|
||||
import jdk.internal.vm.annotation.ForceInline;
|
||||
import jdk.internal.vm.annotation.IntrinsicCandidate;
|
||||
import sun.security.util.DerValue;
|
||||
|
||||
/**
|
||||
@ -58,8 +61,9 @@ abstract class ChaCha20Cipher extends CipherSpi {
|
||||
private static final int STATE_CONST_3 = 0x6b206574;
|
||||
|
||||
// The keystream block size in bytes and as integers
|
||||
private static final int KEYSTREAM_SIZE = 64;
|
||||
private static final int KS_SIZE_INTS = KEYSTREAM_SIZE / Integer.BYTES;
|
||||
private static final int KS_MAX_LEN = 1024;
|
||||
private static final int KS_BLK_SIZE = 64;
|
||||
private static final int KS_SIZE_INTS = KS_BLK_SIZE / Integer.BYTES;
|
||||
private static final int CIPHERBUF_BASE = 1024;
|
||||
|
||||
// The initialization state of the cipher
|
||||
@ -85,14 +89,18 @@ abstract class ChaCha20Cipher extends CipherSpi {
|
||||
private long finalCounterValue;
|
||||
private long counter;
|
||||
|
||||
// Two arrays, both implemented as 16-element integer arrays:
|
||||
// The base state, created at initialization time, and a working
|
||||
// state which is a clone of the start state, and is then modified
|
||||
// with the counter and the ChaCha20 block function.
|
||||
// The base state is created at initialization time as a 16-int array
|
||||
// and then is copied into either local variables for computations (Java) or
|
||||
// into SIMD registers (intrinsics).
|
||||
private final int[] startState = new int[KS_SIZE_INTS];
|
||||
private final byte[] keyStream = new byte[KEYSTREAM_SIZE];
|
||||
|
||||
// The offset into the current keystream
|
||||
// The output keystream array is sized to hold keystream output from the
|
||||
// implChaCha20Block method. This can range from a single block at a time
|
||||
// (Java software) up to 16 blocks on x86_64 with AVX512 support.
|
||||
private final byte[] keyStream = new byte[KS_MAX_LEN];
|
||||
|
||||
// The keystream buffer limit and offset
|
||||
private int keyStrLimit;
|
||||
private int keyStrOffset;
|
||||
|
||||
// AEAD-related fields and constants
|
||||
@ -561,12 +569,14 @@ abstract class ChaCha20Cipher extends CipherSpi {
|
||||
}
|
||||
}
|
||||
|
||||
// We can also get one block's worth of keystream created
|
||||
// We can also generate the first block (or blocks if intrinsics
|
||||
// are capable of doing multiple blocks at a time) of keystream.
|
||||
finalCounterValue = counter + MAX_UINT32;
|
||||
generateKeystream();
|
||||
this.keyStrLimit = chaCha20Block(startState, counter, keyStream);
|
||||
this.keyStrOffset = 0;
|
||||
this.counter += (keyStrLimit / KS_BLK_SIZE);
|
||||
direction = opmode;
|
||||
aadDone = false;
|
||||
this.keyStrOffset = 0;
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
@ -831,31 +841,34 @@ abstract class ChaCha20Cipher extends CipherSpi {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Using the current state and counter create the next set of keystream
|
||||
* bytes. This method will generate the next 512 bits of keystream and
|
||||
* return it in the {@code keyStream} parameter. Following the
|
||||
* block function the counter will be incremented.
|
||||
*/
|
||||
private void generateKeystream() {
|
||||
chaCha20Block(startState, counter, keyStream);
|
||||
counter++;
|
||||
@ForceInline
|
||||
private static int chaCha20Block(int[] initState, long counter,
|
||||
byte[] result) {
|
||||
if (initState.length != KS_SIZE_INTS || result.length != KS_MAX_LEN) {
|
||||
throw new IllegalArgumentException(
|
||||
"Illegal state or keystream buffer length");
|
||||
}
|
||||
|
||||
// Set the counter value before sending into the underlying
|
||||
// private block method
|
||||
initState[12] = (int)counter;
|
||||
return implChaCha20Block(initState, result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform a full 20-round ChaCha20 transform on the initial state.
|
||||
*
|
||||
* @param initState the starting state, not including the counter
|
||||
* value.
|
||||
* @param counter the counter value to apply
|
||||
* @param initState the starting state using the current counter value.
|
||||
* @param result the array that will hold the result of the ChaCha20
|
||||
* block function.
|
||||
*
|
||||
* @note it is the caller's responsibility to ensure that the workState
|
||||
* is sized the same as the initState, no checking is performed internally.
|
||||
* @return the number of keystream bytes generated. In a pure Java method
|
||||
* this will always be 64 bytes, but intrinsics that make use of
|
||||
* AVX2 or AVX512 registers may generate multiple blocks of keystream
|
||||
* in a single call and therefore may be a larger multiple of 64.
|
||||
*/
|
||||
private static void chaCha20Block(int[] initState, long counter,
|
||||
byte[] result) {
|
||||
@IntrinsicCandidate
|
||||
private static int implChaCha20Block(int[] initState, byte[] result) {
|
||||
// Create an initial state and clone a working copy
|
||||
int ws00 = STATE_CONST_0;
|
||||
int ws01 = STATE_CONST_1;
|
||||
@ -869,7 +882,7 @@ abstract class ChaCha20Cipher extends CipherSpi {
|
||||
int ws09 = initState[9];
|
||||
int ws10 = initState[10];
|
||||
int ws11 = initState[11];
|
||||
int ws12 = (int)counter;
|
||||
int ws12 = initState[12];
|
||||
int ws13 = initState[13];
|
||||
int ws14 = initState[14];
|
||||
int ws15 = initState[15];
|
||||
@ -986,11 +999,12 @@ abstract class ChaCha20Cipher extends CipherSpi {
|
||||
asIntLittleEndian.set(result, 36, ws09 + initState[9]);
|
||||
asIntLittleEndian.set(result, 40, ws10 + initState[10]);
|
||||
asIntLittleEndian.set(result, 44, ws11 + initState[11]);
|
||||
// Add the counter back into workState[12]
|
||||
asIntLittleEndian.set(result, 48, ws12 + (int)counter);
|
||||
asIntLittleEndian.set(result, 48, ws12 + initState[12]);
|
||||
asIntLittleEndian.set(result, 52, ws13 + initState[13]);
|
||||
asIntLittleEndian.set(result, 56, ws14 + initState[14]);
|
||||
asIntLittleEndian.set(result, 60, ws15 + initState[15]);
|
||||
|
||||
return KS_BLK_SIZE;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1009,12 +1023,21 @@ abstract class ChaCha20Cipher extends CipherSpi {
|
||||
int remainingData = inLen;
|
||||
|
||||
while (remainingData > 0) {
|
||||
int ksRemain = keyStream.length - keyStrOffset;
|
||||
int ksRemain = keyStrLimit - keyStrOffset;
|
||||
if (ksRemain <= 0) {
|
||||
if (counter <= finalCounterValue) {
|
||||
generateKeystream();
|
||||
// Intrinsics can do multiple blocks at once. This means
|
||||
// it may overrun the counter. In order to prevent key
|
||||
// stream reuse, we adjust the key stream limit to only the
|
||||
// key stream length that is calculated from unique
|
||||
// counter values.
|
||||
keyStrLimit = chaCha20Block(startState, counter, keyStream);
|
||||
counter += (keyStrLimit / KS_BLK_SIZE);
|
||||
if (counter > finalCounterValue) {
|
||||
keyStrLimit -= (int)(counter - finalCounterValue) * 64;
|
||||
}
|
||||
keyStrOffset = 0;
|
||||
ksRemain = keyStream.length;
|
||||
ksRemain = keyStrLimit;
|
||||
} else {
|
||||
throw new KeyException("Counter exhausted. " +
|
||||
"Reinitialize with new key and/or nonce");
|
||||
@ -1060,9 +1083,10 @@ abstract class ChaCha20Cipher extends CipherSpi {
|
||||
private void initAuthenticator() throws InvalidKeyException {
|
||||
authenticator = new Poly1305();
|
||||
|
||||
// Derive the Poly1305 key from the starting state
|
||||
byte[] serializedKey = new byte[KEYSTREAM_SIZE];
|
||||
chaCha20Block(startState, 0, serializedKey);
|
||||
// Derive the Poly1305 key from the starting state with the counter
|
||||
// value forced to zero.
|
||||
byte[] serializedKey = new byte[KS_MAX_LEN];
|
||||
chaCha20Block(startState, 0L, serializedKey);
|
||||
|
||||
authenticator.engineInit(new SecretKeySpec(serializedKey, 0, 32,
|
||||
authAlgName), null);
|
||||
|
@ -0,0 +1,317 @@
|
||||
/*
|
||||
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package compiler.intrinsics.chacha;
|
||||
|
||||
import javax.crypto.Cipher;
|
||||
import javax.crypto.spec.ChaCha20ParameterSpec;
|
||||
import javax.crypto.spec.SecretKeySpec;
|
||||
import java.security.GeneralSecurityException;
|
||||
import java.util.*;
|
||||
|
||||
public class ExerciseChaCha20 {
|
||||
|
||||
private static final int WARMUP_CYCLES = 200000;
|
||||
|
||||
// Use the test vectors from RFC 7539 to exercise the ChaCha20 block
|
||||
// intrinsic
|
||||
public static final List<TestData> testList = List.of(
|
||||
new TestData("RFC 7539 Sample Test Vector",
|
||||
"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f",
|
||||
"000000000000004a00000000",
|
||||
1, Cipher.ENCRYPT_MODE,
|
||||
"4c616469657320616e642047656e746c656d656e206f662074686520636c6173" +
|
||||
"73206f66202739393a204966204920636f756c64206f6666657220796f75206f" +
|
||||
"6e6c79206f6e652074697020666f7220746865206675747572652c2073756e73" +
|
||||
"637265656e20776f756c642062652069742e",
|
||||
null,
|
||||
"6e2e359a2568f98041ba0728dd0d6981e97e7aec1d4360c20a27afccfd9fae0b" +
|
||||
"f91b65c5524733ab8f593dabcd62b3571639d624e65152ab8f530c359f0861d8" +
|
||||
"07ca0dbf500d6a6156a38e088a22b65e52bc514d16ccf806818ce91ab7793736" +
|
||||
"5af90bbf74a35be6b40b8eedf2785e42874d"),
|
||||
new TestData("RFC 7539 Test Vector 1 (all zeroes)",
|
||||
"0000000000000000000000000000000000000000000000000000000000000000",
|
||||
"000000000000000000000000",
|
||||
0, Cipher.ENCRYPT_MODE,
|
||||
"0000000000000000000000000000000000000000000000000000000000000000" +
|
||||
"0000000000000000000000000000000000000000000000000000000000000000",
|
||||
null,
|
||||
"76b8e0ada0f13d90405d6ae55386bd28bdd219b8a08ded1aa836efcc8b770dc7" +
|
||||
"da41597c5157488d7724e03fb8d84a376a43b8f41518a11cc387b669b2ee6586"),
|
||||
new TestData("RFC 7539 Test Vector 2",
|
||||
"0000000000000000000000000000000000000000000000000000000000000001",
|
||||
"000000000000000000000002",
|
||||
1, Cipher.ENCRYPT_MODE,
|
||||
"416e79207375626d697373696f6e20746f20746865204945544620696e74656e" +
|
||||
"6465642062792074686520436f6e7472696275746f7220666f72207075626c69" +
|
||||
"636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" +
|
||||
"20496e7465726e65742d4472616674206f722052464320616e6420616e792073" +
|
||||
"746174656d656e74206d6164652077697468696e2074686520636f6e74657874" +
|
||||
"206f6620616e204945544620616374697669747920697320636f6e7369646572" +
|
||||
"656420616e20224945544620436f6e747269627574696f6e222e205375636820" +
|
||||
"73746174656d656e747320696e636c756465206f72616c2073746174656d656e" +
|
||||
"747320696e20494554462073657373696f6e732c2061732077656c6c20617320" +
|
||||
"7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" +
|
||||
"74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" +
|
||||
"207768696368206172652061646472657373656420746f",
|
||||
null,
|
||||
"a3fbf07df3fa2fde4f376ca23e82737041605d9f4f4f57bd8cff2c1d4b7955ec" +
|
||||
"2a97948bd3722915c8f3d337f7d370050e9e96d647b7c39f56e031ca5eb6250d" +
|
||||
"4042e02785ececfa4b4bb5e8ead0440e20b6e8db09d881a7c6132f420e527950" +
|
||||
"42bdfa7773d8a9051447b3291ce1411c680465552aa6c405b7764d5e87bea85a" +
|
||||
"d00f8449ed8f72d0d662ab052691ca66424bc86d2df80ea41f43abf937d3259d" +
|
||||
"c4b2d0dfb48a6c9139ddd7f76966e928e635553ba76c5c879d7b35d49eb2e62b" +
|
||||
"0871cdac638939e25e8a1e0ef9d5280fa8ca328b351c3c765989cbcf3daa8b6c" +
|
||||
"cc3aaf9f3979c92b3720fc88dc95ed84a1be059c6499b9fda236e7e818b04b0b" +
|
||||
"c39c1e876b193bfe5569753f88128cc08aaa9b63d1a16f80ef2554d7189c411f" +
|
||||
"5869ca52c5b83fa36ff216b9c1d30062bebcfd2dc5bce0911934fda79a86f6e6" +
|
||||
"98ced759c3ff9b6477338f3da4f9cd8514ea9982ccafb341b2384dd902f3d1ab" +
|
||||
"7ac61dd29c6f21ba5b862f3730e37cfdc4fd806c22f221"),
|
||||
new TestData("RFC 7539 Test Vector 3",
|
||||
"1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0",
|
||||
"000000000000000000000002",
|
||||
42, Cipher.ENCRYPT_MODE,
|
||||
"2754776173206272696c6c69672c20616e642074686520736c6974687920746f" +
|
||||
"7665730a446964206779726520616e642067696d626c6520696e207468652077" +
|
||||
"6162653a0a416c6c206d696d737920776572652074686520626f726f676f7665" +
|
||||
"732c0a416e6420746865206d6f6d65207261746873206f757467726162652e",
|
||||
null,
|
||||
"62e6347f95ed87a45ffae7426f27a1df5fb69110044c0d73118effa95b01e5cf" +
|
||||
"166d3df2d721caf9b21e5fb14c616871fd84c54f9d65b283196c7fe4f60553eb" +
|
||||
"f39c6402c42234e32a356b3e764312a61a5532055716ead6962568f87d3f3f77" +
|
||||
"04c6a8d1bcd1bf4d50d6154b6da731b187b58dfd728afa36757a797ac188d1")
|
||||
);
|
||||
|
||||
public static class TestData {
|
||||
public TestData(String name, String keyStr, String nonceStr, int ctr,
|
||||
int dir, String inputStr, String aadStr, String outStr) {
|
||||
testName = Objects.requireNonNull(name);
|
||||
HexFormat hex = HexFormat.of();
|
||||
key = hex.parseHex(Objects.requireNonNull(keyStr));
|
||||
nonce = hex.parseHex(Objects.requireNonNull(nonceStr));
|
||||
if ((counter = ctr) < 0) {
|
||||
throw new IllegalArgumentException(
|
||||
"counter must be 0 or greater");
|
||||
}
|
||||
direction = dir;
|
||||
if ((direction != Cipher.ENCRYPT_MODE) &&
|
||||
(direction != Cipher.DECRYPT_MODE)) {
|
||||
throw new IllegalArgumentException(
|
||||
"Direction must be ENCRYPT_MODE or DECRYPT_MODE");
|
||||
}
|
||||
input = hex.parseHex(Objects.requireNonNull(inputStr));
|
||||
aad = (aadStr != null) ? hex.parseHex(aadStr) : null;
|
||||
expOutput = hex.parseHex(Objects.requireNonNull(outStr));
|
||||
}
|
||||
|
||||
public final String testName;
|
||||
public final byte[] key;
|
||||
public final byte[] nonce;
|
||||
public final int counter;
|
||||
public final int direction;
|
||||
public final byte[] input;
|
||||
public final byte[] aad;
|
||||
public final byte[] expOutput;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
int testsPassed = 0;
|
||||
int testNumber = 0;
|
||||
|
||||
// Use the first test vector to warm up the JVM and activate
|
||||
// the intrinsics.
|
||||
System.out.println("Running " + WARMUP_CYCLES + " warm up cycles");
|
||||
for (int i = 0; i < WARMUP_CYCLES; i++) {
|
||||
runSinglePartTest(testList.get(0));
|
||||
}
|
||||
|
||||
System.out.println("----- Single-part Tests -----");
|
||||
for (TestData test : testList) {
|
||||
System.out.println("*** Test " + ++testNumber + ": " +
|
||||
test.testName);
|
||||
if (runSinglePartTest(test)) {
|
||||
testsPassed++;
|
||||
}
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
System.out.println("----- Multi-part Tests -----");
|
||||
for (TestData test : testList) {
|
||||
System.out.println("*** Test " + ++testNumber + ": " +
|
||||
test.testName);
|
||||
if (runMultiPartTest(test)) {
|
||||
testsPassed++;
|
||||
}
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
System.out.println("Total tests: " + testNumber +
|
||||
", Passed: " + testsPassed + ", Failed: " +
|
||||
(testNumber - testsPassed));
|
||||
if (testsPassed != testNumber) {
|
||||
throw new RuntimeException("One or more tests failed. " +
|
||||
"Check output for details");
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean runSinglePartTest(TestData testData)
|
||||
throws GeneralSecurityException {
|
||||
boolean encRes = false;
|
||||
boolean decRes = false;
|
||||
byte[] encryptedResult;
|
||||
byte[] decryptedResult;
|
||||
|
||||
// Get a Cipher instance and set up the parameters
|
||||
Cipher mambo = Cipher.getInstance("ChaCha20");
|
||||
SecretKeySpec mamboKey = new SecretKeySpec(testData.key, "ChaCha20");
|
||||
ChaCha20ParameterSpec mamboSpec = new ChaCha20ParameterSpec(
|
||||
testData.nonce, testData.counter);
|
||||
|
||||
// Encrypt our input
|
||||
mambo.init(Cipher.ENCRYPT_MODE, mamboKey, mamboSpec);
|
||||
encryptedResult = mambo.doFinal(testData.input);
|
||||
|
||||
if (!Arrays.equals(encryptedResult, testData.expOutput)) {
|
||||
System.out.println("ERROR - Output Mismatch!");
|
||||
System.out.println("Expected:\n" +
|
||||
dumpHexBytes(testData.expOutput, 16, "\n", " "));
|
||||
System.out.println("Actual:\n" +
|
||||
dumpHexBytes(encryptedResult, 16, "\n", " "));
|
||||
System.out.println();
|
||||
} else {
|
||||
encRes = true;
|
||||
}
|
||||
|
||||
// Decrypt the result of the encryption operation
|
||||
mambo = Cipher.getInstance("ChaCha20");
|
||||
mambo.init(Cipher.DECRYPT_MODE, mamboKey, mamboSpec);
|
||||
decryptedResult = mambo.doFinal(encryptedResult);
|
||||
|
||||
if (!Arrays.equals(decryptedResult, testData.input)) {
|
||||
System.out.println("ERROR - Output Mismatch!");
|
||||
System.out.println("Expected:\n" +
|
||||
dumpHexBytes(testData.input, 16, "\n", " "));
|
||||
System.out.println("Actual:\n" +
|
||||
dumpHexBytes(decryptedResult, 16, "\n", " "));
|
||||
System.out.println();
|
||||
} else {
|
||||
decRes = true;
|
||||
}
|
||||
|
||||
return (encRes && decRes);
|
||||
}
|
||||
|
||||
private static boolean runMultiPartTest(TestData testData)
|
||||
throws GeneralSecurityException {
|
||||
boolean encRes = false;
|
||||
boolean decRes = false;
|
||||
|
||||
// Get a cipher instance and initialize it
|
||||
Cipher mambo = Cipher.getInstance("ChaCha20");
|
||||
SecretKeySpec mamboKey = new SecretKeySpec(testData.key, "ChaCha20");
|
||||
ChaCha20ParameterSpec mamboSpec = new ChaCha20ParameterSpec(
|
||||
testData.nonce, testData.counter);
|
||||
|
||||
byte[] encryptedResult = new byte[testData.input.length];
|
||||
mambo.init(Cipher.ENCRYPT_MODE, mamboKey, mamboSpec);
|
||||
System.out.print("Encrypt - ");
|
||||
doMulti(mambo, testData.input, encryptedResult);
|
||||
|
||||
if (!Arrays.equals(encryptedResult, testData.expOutput)) {
|
||||
System.out.println("ERROR - Output Mismatch!");
|
||||
System.out.println("Expected:\n" +
|
||||
dumpHexBytes(testData.expOutput));
|
||||
System.out.println("Actual:\n" +
|
||||
dumpHexBytes(encryptedResult));
|
||||
System.out.println();
|
||||
} else {
|
||||
encRes = true;
|
||||
}
|
||||
|
||||
// Decrypt the result of the encryption operation
|
||||
mambo = Cipher.getInstance("ChaCha20");
|
||||
byte[] decryptedResult = new byte[encryptedResult.length];
|
||||
mambo.init(Cipher.DECRYPT_MODE, mamboKey, mamboSpec);
|
||||
System.out.print("Decrypt - ");
|
||||
doMulti(mambo, encryptedResult, decryptedResult);
|
||||
|
||||
if (!Arrays.equals(decryptedResult, testData.input)) {
|
||||
System.out.println("ERROR - Output Mismatch!");
|
||||
System.out.println("Expected:\n" + dumpHexBytes(testData.input));
|
||||
System.out.println("Actual:\n" + dumpHexBytes(decryptedResult));
|
||||
System.out.println();
|
||||
} else {
|
||||
decRes = true;
|
||||
}
|
||||
|
||||
return (encRes && decRes);
|
||||
}
|
||||
|
||||
private static void doMulti(Cipher c, byte[] input, byte[] output)
|
||||
throws GeneralSecurityException {
|
||||
int offset = 0;
|
||||
boolean done = false;
|
||||
Random randIn = new Random(System.currentTimeMillis());
|
||||
|
||||
// Send small updates between 1 - 8 bytes in length until we get
|
||||
// 8 or less bytes from the end of the input, then finalize.
|
||||
System.out.println("Input length: " + input.length);
|
||||
System.out.print("Multipart (bytes in/out): ");
|
||||
while (!done) {
|
||||
int mPartLen = randIn.nextInt(8) + 1;
|
||||
int bytesLeft = input.length - offset;
|
||||
int processed;
|
||||
if (mPartLen < bytesLeft) {
|
||||
System.out.print(mPartLen + "/");
|
||||
processed = c.update(input, offset, mPartLen,
|
||||
output, offset);
|
||||
offset += processed;
|
||||
System.out.print(processed + " ");
|
||||
} else {
|
||||
processed = c.doFinal(input, offset, bytesLeft,
|
||||
output, offset);
|
||||
System.out.print(bytesLeft + "/" + processed + " ");
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
private static String dumpHexBytes(byte[] data) {
|
||||
return dumpHexBytes(data, 16, "\n", " ");
|
||||
}
|
||||
|
||||
private static String dumpHexBytes(byte[] data, int itemsPerLine,
|
||||
String lineDelim, String itemDelim) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (data != null) {
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
if (i % itemsPerLine == 0 && i != 0) {
|
||||
sb.append(lineDelim);
|
||||
}
|
||||
sb.append(String.format("%02X", data[i])).append(itemDelim);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
171
test/hotspot/jtreg/compiler/intrinsics/chacha/TestChaCha20.java
Normal file
171
test/hotspot/jtreg/compiler/intrinsics/chacha/TestChaCha20.java
Normal file
@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Copyright (c) 2021, Red Hat, Inc. All rights reserved.
|
||||
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package compiler.intrinsics.chacha;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import jdk.test.lib.Platform;
|
||||
import jdk.test.lib.process.OutputAnalyzer;
|
||||
import jdk.test.lib.process.ProcessTools;
|
||||
import jdk.test.whitebox.cpuinfo.CPUInfo;
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8247645
|
||||
* @summary ChaCha20 Intrinsics
|
||||
* @library /test/lib
|
||||
* @build compiler.intrinsics.chacha.ExerciseChaCha20
|
||||
* jdk.test.whitebox.WhiteBox
|
||||
* @run driver jdk.test.lib.helpers.ClassFileInstaller jdk.test.whitebox.WhiteBox
|
||||
* @run main/othervm/timeout=7200
|
||||
* -Xbootclasspath/a:. -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI
|
||||
* compiler.intrinsics.chacha.TestChaCha20
|
||||
*/
|
||||
public class TestChaCha20 {
|
||||
|
||||
// Default to 1/4 of the CPUs, and allow users to override.
|
||||
static final int MAX_PARALLELISM = Integer.getInteger("maxParallelism",
|
||||
Math.max(1, Runtime.getRuntime().availableProcessors() / 4));
|
||||
|
||||
private static List<String> mix(List<String> o, String... mix) {
|
||||
List<String> n = new ArrayList<>(o);
|
||||
for (String m : mix) {
|
||||
n.add(m);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
private static boolean containsFuzzy(List<String> list, String sub) {
|
||||
for (String s : list) {
|
||||
if (s.contains(sub)) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static void main(String... args) throws Exception {
|
||||
List<List<String>> configs = new ArrayList<>();
|
||||
List<String> cpuFeatures = CPUInfo.getFeatures();
|
||||
|
||||
System.out.print("CPU Features: ");
|
||||
cpuFeatures.forEach(f -> System.out.print(f + " "));
|
||||
System.out.println();
|
||||
|
||||
if (Platform.isX64()) {
|
||||
// If CPU features were not found, provide a default config.
|
||||
if (cpuFeatures.isEmpty()) {
|
||||
configs.add(new ArrayList());
|
||||
}
|
||||
|
||||
// Otherwise, select the tests that make sense on current platform.
|
||||
if (containsFuzzy(cpuFeatures, "avx512")) {
|
||||
System.out.println("Setting up AVX512 worker");
|
||||
configs.add(List.of("-XX:UseAVX=3"));
|
||||
}
|
||||
if (containsFuzzy(cpuFeatures, "avx2")) {
|
||||
System.out.println("Setting up AVX2 worker");
|
||||
configs.add(List.of("-XX:UseAVX=2"));
|
||||
}
|
||||
if (containsFuzzy(cpuFeatures, "avx")) {
|
||||
System.out.println("Setting up AVX worker");
|
||||
configs.add(List.of("-XX:UseAVX=1"));
|
||||
}
|
||||
} else if (Platform.isAArch64()) {
|
||||
// AArch64 intrinsics require the advanced simd instructions
|
||||
if (containsFuzzy(cpuFeatures, "simd")) {
|
||||
System.out.println("Setting up ASIMD worker");
|
||||
configs.add(new ArrayList());
|
||||
}
|
||||
} else {
|
||||
// We only have ChaCha20 intrinsics on x64 and aarch64
|
||||
// currently. If the platform is neither of these then
|
||||
// the ChaCha20 known answer tests in
|
||||
// com/sun/crypto/provider/Cipher are sufficient.
|
||||
return;
|
||||
}
|
||||
|
||||
// If by this point we have no configs, it means we are running
|
||||
// on a platform that intrinsics have been written for, but does
|
||||
// not possess the necessary instruction sets for that processor.
|
||||
// We can exit out if that is the case.
|
||||
if (configs.isEmpty()) {
|
||||
System.out.println("No intrinsics-capable configurations found");
|
||||
return;
|
||||
}
|
||||
|
||||
// We can expand this array later to include other tests if new
|
||||
// ChaCha20 intrinsics are developed.
|
||||
String[] classNames = {
|
||||
"compiler.intrinsics.chacha.ExerciseChaCha20"
|
||||
};
|
||||
|
||||
ArrayList<Fork> forks = new ArrayList<>();
|
||||
int jobs = 0;
|
||||
|
||||
for (List<String> c : configs) {
|
||||
for (String className : classNames) {
|
||||
// Start a new job
|
||||
{
|
||||
ProcessBuilder pb = ProcessTools.createTestJvm(
|
||||
mix(c, "-Xmx256m", className));
|
||||
Process p = pb.start();
|
||||
OutputAnalyzer oa = new OutputAnalyzer(p);
|
||||
forks.add(new Fork(p, oa));
|
||||
jobs++;
|
||||
}
|
||||
|
||||
// Wait for the completion of other jobs
|
||||
while (jobs >= MAX_PARALLELISM) {
|
||||
Fork f = findDone(forks);
|
||||
if (f != null) {
|
||||
OutputAnalyzer oa = f.oa();
|
||||
oa.shouldHaveExitValue(0);
|
||||
forks.remove(f);
|
||||
jobs--;
|
||||
} else {
|
||||
// Nothing is done, wait a little.
|
||||
Thread.sleep(200);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Drain the rest
|
||||
for (Fork f : forks) {
|
||||
OutputAnalyzer oa = f.oa();
|
||||
oa.shouldHaveExitValue(0);
|
||||
}
|
||||
}
|
||||
|
||||
private static Fork findDone(List<Fork> forks) {
|
||||
for (Fork f : forks) {
|
||||
if (!f.p().isAlive()) {
|
||||
return f;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static record Fork(Process p, OutputAnalyzer oa) {};
|
||||
}
|
@ -198,7 +198,7 @@ public abstract class CipherBench extends CryptoBase {
|
||||
@Param({"256"})
|
||||
private int keyLength;
|
||||
|
||||
@Param({"1024", "" + 16 * 1024})
|
||||
@Param({"256", "1024", "4096", "16384"})
|
||||
private int dataSize;
|
||||
|
||||
protected int ivLength() {
|
||||
@ -223,7 +223,7 @@ public abstract class CipherBench extends CryptoBase {
|
||||
@Param({"256"})
|
||||
private int keyLength;
|
||||
|
||||
@Param({"1024", "" + 16 * 1024})
|
||||
@Param({"256", "1024", "4096", "16384"})
|
||||
private int dataSize;
|
||||
|
||||
protected int ivLength() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user