8247645: ChaCha20 intrinsics

Reviewed-by: sviswanathan, ngasson, vlivanov, ascarpino
This commit is contained in:
Jamil Nimeh 2022-11-29 14:40:20 +00:00
parent 33587ffd35
commit cd6bebbf34
28 changed files with 1590 additions and 38 deletions

View File

@ -2322,6 +2322,40 @@ public:
}
}
// Single-structure load/store method (all addressing variants).
// Emits one AdvSIMD LD1..LD4/ST1..ST4 "single structure" instruction,
// transferring a single lane of 1-4 consecutive vector registers.
//
//   Vt    - first (or only) vector register of the transfer
//   T     - element size variant (B/H/S/D)
//   index - lane index within the vector
//   a     - base, post-indexed-immediate or post-indexed-register address
//   op1   - opcode template bits 29..21
//   op2   - opcode template seed for bits 15..12
//   regs  - number of consecutive vector registers transferred (1..4)
void ld_st(FloatRegister Vt, SIMD_RegVariant T, int index, Address a,
           int op1, int op2, int regs) {
  // A post-index immediate must equal the bytes transferred:
  // (element size in bytes) * (register count).
  int expectedImmediate = (regVariant_to_elemBits(T) >> 3) * regs;
  // The lane index is spread across Q (bit 30), S (bit 12) and the
  // size field (bits 11:10) per the A64 encoding:
  //   B: index = Q:S:size<1:0>    H: index = Q:S:size<1> (size<0> == 0)
  //   S: index = Q:S (size == 00) D: index = Q (size fixed at 0b01)
  int sVal = (T < D) ? (index >> (2 - T)) & 0x01 : 0;
  int opcode = (T < D) ? (T << 2) : ((T & 0x02) << 2);
  // Shift the low index bits up into size<1:0>.  The form
  // (index & (0x3 << T)) mis-encoded the H variant (and could
  // overflow the 2-bit field); only the low 2 bits matter.
  int size = (T < D) ? ((index << T) & 0x3) : 1;
  Register Xn = a.base();
  int Rm;
  switch (a.getMode()) {
  case Address::base_plus_offset:
    guarantee(a.offset() == 0, "no offset allowed here");
    Rm = 0;
    break;
  case Address::post:
    guarantee(a.offset() == expectedImmediate, "bad offset");
    op1 |= 0b100;      // post-index form
    Rm = 0b11111;      // Rm == 31 selects the immediate post-index variant
    break;
  case Address::post_reg:
    op1 |= 0b100;      // post-index, increment supplied by Rm
    Rm = a.index()->encoding();
    break;
  default:
    ShouldNotReachHere();
  }
  starti;
  f(0,31), f((index >> (3 - T)), 30);
  f(op1, 29, 21), f(Rm, 20, 16), f(op2 | opcode | sVal, 15, 12);
  f(size, 11, 10), srf(Xn, 5), rf(Vt, 0);
}
public:
#define INSN1(NAME, op1, op2) \
@ -2379,6 +2413,66 @@ public:
#undef INSN3
#undef INSN4
// Handle common single-structure ld/st parameter sanity checks
// for all variations (1 to 4) of SIMD register inputs. This
// method will call the routine that generates the opcode.
template<typename R, typename... Rx>
void ldst_sstr(SIMD_RegVariant T, int index, const Address &a,
               int op1, int op2, R firstReg, Rx... otherRegs) {
  // Collect the register operands into an array so they can be
  // validated uniformly regardless of how many were passed.
  const FloatRegister vtSet[] = { firstReg, otherRegs... };
  const int regCount = sizeof...(otherRegs) + 1;
  // The lane index must fit the element size: a 128-bit vector holds
  // 16 B lanes, 8 H lanes, 4 S lanes or 2 D lanes.
  assert(index >= 0 && (T <= D) && ((T == B && index <= 15) ||
         (T == H && index <= 7) || (T == S && index <= 3) ||
         (T == D && index <= 1)), "invalid index");
  assert(regCount >= 1 && regCount <= 4, "illegal register count");
  // Check to make sure when multiple SIMD registers are used
  // that they are in successive order.
  for (int i = 0; i < regCount - 1; i++) {
    assert(vtSet[i]->successor() == vtSet[i + 1],
           "Registers must be ordered");
  }
  ld_st(firstReg, T, index, a, op1, op2, regCount);
}
// Define a set of INSN1/2/3/4 macros to handle single-structure
// load/store instructions.  Each macro declares a method taking the
// appropriate number of consecutive vector registers and forwards to
// ldst_sstr() for validation and encoding.
#define INSN1(NAME, op1, op2)                                       \
  void NAME(FloatRegister Vt, SIMD_RegVariant T, int index,         \
            const Address &a) {                                     \
    ldst_sstr(T, index, a, op1, op2, Vt);                           \
  }

#define INSN2(NAME, op1, op2)                                       \
  void NAME(FloatRegister Vt, FloatRegister Vt2, SIMD_RegVariant T, \
            int index, const Address &a) {                          \
    ldst_sstr(T, index, a, op1, op2, Vt, Vt2);                      \
  }

#define INSN3(NAME, op1, op2)                                            \
  void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,      \
            SIMD_RegVariant T, int index, const Address &a) {            \
    ldst_sstr(T, index, a, op1, op2, Vt, Vt2, Vt3);                      \
  }

#define INSN4(NAME, op1, op2)                                            \
  void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,      \
            FloatRegister Vt4, SIMD_RegVariant T, int index,             \
            const Address &a) {                                          \
    ldst_sstr(T, index, a, op1, op2, Vt, Vt2, Vt3, Vt4);                 \
  }

// Single-structure stores of 1..4 registers (ST1..ST4).  The op1/op2
// arguments supply the fixed opcode bits distinguishing the encodings.
INSN1(st1, 0b001101000, 0b0000);
INSN2(st2, 0b001101001, 0b0000);
INSN3(st3, 0b001101000, 0b0010);
INSN4(st4, 0b001101001, 0b0010);

#undef INSN1
#undef INSN2
#undef INSN3
#undef INSN4
#define INSN(NAME, opc) \
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
starti; \
@ -2749,6 +2843,7 @@ public:
INSN(ushr, 1, 0b000001, /* isSHR = */ true);
INSN(usra, 1, 0b000101, /* isSHR = */ true);
INSN(ssra, 0, 0b000101, /* isSHR = */ true);
INSN(sli, 1, 0b010101, /* isSHR = */ false);
#undef INSN

View File

@ -1450,6 +1450,13 @@ public:
void aesecb_decrypt(Register from, Register to, Register key, Register keylen);
void aes_round(FloatRegister input, FloatRegister subkey);
// ChaCha20 functions support block
void cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
FloatRegister tbl);
void cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
FloatRegister dVec, bool colToDiag);
// Place an ISB after code may have been modified due to a safepoint.
void safepoint_isb();

View File

@ -0,0 +1,90 @@
/*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "macroAssembler_aarch64.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/stubRoutines.hpp"
/**
 * Perform the quarter round calculations on values contained within
 * four SIMD registers.  Implements the ChaCha20 quarter-round
 * (add / xor / rotate-left by 16, 12, 8 and 7) on four 32-bit lanes
 * at once.
 *
 * @param aVec the SIMD register containing only the "a" values
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param scratch scratch SIMD register used for 12 and 7 bit left rotations
 * @param table the SIMD register used as a table for 8 bit left rotations
 */
void MacroAssembler::cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
    FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
    FloatRegister table) {
  // a += b, d ^= a, d <<<= 16
  addv(aVec, T4S, aVec, bVec);
  eor(dVec, T16B, dVec, aVec);
  // A 16-bit left-rotate of a 32-bit lane is a half-word swap: rev32 on 8H.
  rev32(dVec, T8H, dVec);

  // c += d, b ^= c, b <<<= 12
  addv(cVec, T4S, cVec, dVec);
  eor(scratch, T16B, bVec, cVec);
  // rotate-left-12 built from (x >> 20) combined with (x << 12) via ushr+sli
  ushr(bVec, T4S, scratch, 20);
  sli(bVec, T4S, scratch, 12);

  // a += b, d ^= a, d <<<= 8
  addv(aVec, T4S, aVec, bVec);
  eor(dVec, T16B, dVec, aVec);
  // The 8-bit rotate is a byte permutation through the tbl constant.
  tbl(dVec, T16B, dVec, 1, table);

  // c += d, b ^= c, b <<<= 7
  addv(cVec, T4S, cVec, dVec);
  eor(scratch, T16B, bVec, cVec);
  ushr(bVec, T4S, scratch, 25);
  sli(bVec, T4S, scratch, 7);
}
/**
 * Rotate the lanes of the b, c and d state vectors so that the
 * quarter-round operands line up along diagonals, or back along
 * columns.  The "a" vector never moves.
 *
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param colToDiag true if moving columnar to diagonal, false if
 *                  moving diagonal back to columnar.
 */
void MacroAssembler::cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
    FloatRegister dVec, bool colToDiag) {
  // ext of a vector against itself is a byte-wise rotate.  One 32-bit
  // lane is 4 bytes: rotating b by one lane, c by two and d by three
  // diagonalizes the state; the inverse amounts (3/2/1 lanes) restore
  // the columnar layout.  c's rotation is its own inverse.
  const int lane = 4; // bytes per 32-bit lane
  int bShift, dShift;
  if (colToDiag) {
    bShift = lane;
    dShift = 3 * lane;
  } else {
    bShift = 3 * lane;
    dShift = lane;
  }
  ext(bVec, T16B, bVec, bVec, bShift);
  ext(cVec, T16B, cVec, cVec, 2 * lane);
  ext(dVec, T16B, dVec, dVec, dShift);
}

View File

@ -4081,6 +4081,132 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// ChaCha20 block function.  This version parallelizes by loading
// individual 32-bit state elements into vectors for four blocks
// (e.g. all four blocks' worth of state[0] in one register, etc.)
//
// state (int[16]) = c_rarg0
// keystream (byte[256]) = c_rarg1    -- 4 blocks x 64 bytes are written
// return - number of bytes of keystream (always 256)
address generate_chacha20Block_blockpar() {
  Label L_twoRounds, L_cc20_const;
  // The constant data is broken into two 128-bit segments to be loaded
  // onto FloatRegisters.  The first 128 bits are a counter add overlay
  // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
  // The second 128-bits is a table constant used for 8-bit left rotations.
  // Note: the data is emitted here, just ahead of the aligned stub entry
  // point, and is reached below via adr(tmpAddr, L_cc20_const).
  __ BIND(L_cc20_const);
  __ emit_int64(0x0000000100000000UL);
  __ emit_int64(0x0000000300000002UL);
  __ emit_int64(0x0605040702010003UL);
  __ emit_int64(0x0E0D0C0F0A09080BUL);

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "chacha20Block");
  address start = __ pc();
  __ enter();

  int i, j;
  const Register state = c_rarg0;
  const Register keystream = c_rarg1;
  const Register loopCtr = r10;
  const Register tmpAddr = r11;

  const FloatRegister stateFirst = v0;
  const FloatRegister stateSecond = v1;
  const FloatRegister stateThird = v2;
  const FloatRegister stateFourth = v3;
  const FloatRegister origCtrState = v28;
  const FloatRegister scratch = v29;
  const FloatRegister lrot8Tbl = v30;

  // Organize SIMD registers in an array that facilitates
  // putting repetitive opcodes into loop structures.  It is
  // important that each grouping of 4 registers is monotonically
  // increasing to support the requirements of multi-register
  // instructions (e.g. ld4r, st4, etc.)
  const FloatRegister workSt[16] = {
    v4, v5, v6, v7, v16, v17, v18, v19,
    v20, v21, v22, v23, v24, v25, v26, v27
  };

  // Load from memory and interlace across 16 SIMD registers,
  // with each word from memory being broadcast to all lanes of
  // each successive SIMD register.
  //      Addr(0) -> All lanes in workSt[i]
  //      Addr(4) -> All lanes workSt[i + 1], etc.
  __ mov(tmpAddr, state);
  for (i = 0; i < 16; i += 4) {
    __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
        __ post(tmpAddr, 16));
  }

  // Pull in constant data.  The first 16 bytes are the add overlay
  // which is applied to the vector holding the counter (state[12]).
  // The second 16 bytes is the index register for the 8-bit left
  // rotation tbl instruction.
  __ adr(tmpAddr, L_cc20_const);
  __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
  __ addv(workSt[12], __ T4S, workSt[12], origCtrState);

  // Set up the 10 iteration loop and perform all 8 quarter round ops.
  // Because each vector holds one state element for four blocks, the
  // "diagonal" rounds need no lane shuffling here -- diagonalization is
  // achieved purely by which registers are passed to each quarter round.
  __ mov(loopCtr, 10);
  __ BIND(L_twoRounds);

  // Columnar rounds: QR(0,4,8,12) .. QR(3,7,11,15)
  __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
      scratch, lrot8Tbl);
  __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
      scratch, lrot8Tbl);
  __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
      scratch, lrot8Tbl);
  __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
      scratch, lrot8Tbl);

  // Diagonal rounds: QR(0,5,10,15) .. QR(3,4,9,14)
  __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
      scratch, lrot8Tbl);
  __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
      scratch, lrot8Tbl);
  __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
      scratch, lrot8Tbl);
  __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
      scratch, lrot8Tbl);

  // Decrement and iterate
  __ sub(loopCtr, loopCtr, 1);
  __ cbnz(loopCtr, L_twoRounds);

  __ mov(tmpAddr, state);

  // Add the starting state back to the post-loop keystream
  // state.  We read/interlace the state array from memory into
  // 4 registers similar to what we did in the beginning.  Then
  // add the counter overlay onto workSt[12] at the end.
  for (i = 0; i < 16; i += 4) {
    __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
        __ post(tmpAddr, 16));
    __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
    __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
    __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
    __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
  }
  __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask

  // Write to key stream, storing the same element out of workSt[0..15]
  // to consecutive 4-byte offsets in the key stream buffer, then repeating
  // for the next element position.  Lane i of every register belongs to
  // block i, so this de-interlaces the four blocks into serial order.
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 16; j += 4) {
      __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
          __ post(keystream, 16));
    }
  }

  __ mov(r0, 256); // Return length of output keystream
  __ leave();
  __ ret(lr);

  return start;
}
/**
* Arguments:
*
@ -7919,6 +8045,10 @@ class StubGenerator: public StubCodeGenerator {
}
#endif // COMPILER2
if (UseChaCha20Intrinsics) {
StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
}
if (UseBASE64Intrinsics) {
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();

View File

@ -366,6 +366,17 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
if (_features & CPU_ASIMD) {
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
UseChaCha20Intrinsics = true;
}
} else if (UseChaCha20Intrinsics) {
if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
warning("ChaCha20 intrinsic requires ASIMD instructions");
}
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
}
if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) {
UseBASE64Intrinsics = true;
}

View File

@ -5269,6 +5269,16 @@ void Assembler::pshufhw(XMMRegister dst, XMMRegister src, int mode) {
emit_int24(0x70, (0xC0 | encode), mode & 0xFF);
}
// Vector (VEX/EVEX-encoded) form of pshufhw: shuffle the high four 16-bit
// words of each 128-bit lane of src into dst per the immediate "mode";
// the low four words of each lane are copied through unchanged.
void Assembler::vpshufhw(XMMRegister dst, XMMRegister src, int mode, int vector_len) {
  // Same sanity check as the scalar pshuflw/pshufhw forms: the shuffle
  // control must fit in the instruction's 8-bit immediate.
  assert(isByte(mode), "invalid value");
  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
         (vector_len == AVX_256bit ? VM_Version::supports_avx2() :
         (vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : false)), "");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
  // Encoding: F3 0F 70 /r ib
  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
  emit_int24(0x70, (0xC0 | encode), mode & 0xFF);
}
void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
assert(isByte(mode), "invalid value");
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@ -5290,6 +5300,16 @@ void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
emit_int8(mode & 0xFF);
}
// Vector (VEX/EVEX-encoded) form of pshuflw: shuffle the low four 16-bit
// words of each 128-bit lane of src into dst per the immediate "mode";
// the high four words of each lane are copied through unchanged.
void Assembler::vpshuflw(XMMRegister dst, XMMRegister src, int mode, int vector_len) {
  // Same sanity check as the scalar pshuflw forms: the shuffle control
  // must fit in the instruction's 8-bit immediate.
  assert(isByte(mode), "invalid value");
  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
         (vector_len == AVX_256bit ? VM_Version::supports_avx2() :
         (vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : false)), "");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
  // Encoding: F2 0F 70 /r ib
  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
  emit_int24(0x70, (0xC0 | encode), mode & 0xFF);
}
void Assembler::evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
assert(VM_Version::supports_evex(), "requires EVEX support");
assert(vector_len == Assembler::AVX_256bit || vector_len == Assembler::AVX_512bit, "");

View File

@ -1946,6 +1946,8 @@ private:
void pshufhw(XMMRegister dst, XMMRegister src, int mode);
void pshuflw(XMMRegister dst, XMMRegister src, int mode);
void pshuflw(XMMRegister dst, Address src, int mode);
void vpshufhw(XMMRegister dst, XMMRegister src, int mode, int vector_len);
void vpshuflw(XMMRegister dst, XMMRegister src, int mode, int vector_len);
//shuffle floats and doubles
void shufps(XMMRegister, XMMRegister, int);

View File

@ -3809,6 +3809,8 @@ void StubGenerator::generate_all() {
generate_ghash_stubs();
generate_chacha_stubs();
if (UseMD5Intrinsics) {
StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");

View File

@ -387,6 +387,18 @@ class StubGenerator: public StubCodeGenerator {
// Ghash single and multi block operations using AVX instructions
address generate_avx_ghash_processBlocks();
// ChaCha20 stubs and helper functions
void generate_chacha_stubs();
address generate_chacha20Block_avx();
address generate_chacha20Block_avx512();
void cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec,
XMMRegister cVec, XMMRegister dVec, XMMRegister scratch,
XMMRegister lrot8, XMMRegister lrot16, int vector_len);
void cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec,
XMMRegister dVec, int vector_len, bool colToDiag);
void cc20_keystream_collate_avx512(XMMRegister aVec, XMMRegister bVec,
XMMRegister cVec, XMMRegister dVec, Register baseAddr, int baseOffset);
// Poly1305 multiblock using IFMA instructions
address generate_poly1305_processBlocks();
void poly1305_process_blocks_avx512(const Register input, const Register length,

View File

@ -0,0 +1,582 @@
/*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"
#include "stubGenerator_x86_64.hpp"
#define __ _masm->
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Constants
/**
* This AVX/AVX2 add mask generation can be used for multiple duties:
* 1.) Provide +0/+1 counter increments by loading 256 bits
* at offset 0
* 2.) Provide +2/+2 counter increments for the second set
* of 4 AVX2 registers at offset 32 (256-bit load)
* 3.) Provide a +1 increment for the second set of 4 AVX
* registers at offset 16 (128-bit load)
*/
ATTRIBUTE_ALIGNED(64) uint64_t CC20_COUNTER_ADD_AVX[] = {
0x0000000000000000UL, 0x0000000000000000UL,
0x0000000000000001UL, 0x0000000000000000UL,
0x0000000000000002UL, 0x0000000000000000UL,
0x0000000000000002UL, 0x0000000000000000UL,
};
// Base address of the AVX/AVX2 counter-increment overlay table.
static address chacha20_ctradd_avx() {
  return reinterpret_cast<address>(&CC20_COUNTER_ADD_AVX[0]);
}
/**
* Add masks for 4-block ChaCha20 Block calculations
* The first 512 bits creates a +0/+1/+2/+3 add overlay.
* The second 512 bits is a +4/+4/+4/+4 add overlay. This
* can be used to increment the counter fields for the next 4 blocks.
*/
ATTRIBUTE_ALIGNED(64) uint64_t CC20_COUNTER_ADD_AVX512[] = {
0x0000000000000000UL, 0x0000000000000000UL,
0x0000000000000001UL, 0x0000000000000000UL,
0x0000000000000002UL, 0x0000000000000000UL,
0x0000000000000003UL, 0x0000000000000000UL,
0x0000000000000004UL, 0x0000000000000000UL,
0x0000000000000004UL, 0x0000000000000000UL,
0x0000000000000004UL, 0x0000000000000000UL,
0x0000000000000004UL, 0x0000000000000000UL
};
// Base address of the AVX-512 counter-increment overlay table.
static address chacha20_ctradd_avx512() {
  return reinterpret_cast<address>(&CC20_COUNTER_ADD_AVX512[0]);
}
/**
* The first 256 bits represents a byte-wise permutation
* for an 8-bit left-rotation on 32-bit lanes.
* The second 256 bits is a 16-bit rotation on 32-bit lanes.
*/
ATTRIBUTE_ALIGNED(64) uint64_t CC20_LROT_CONSTS[] = {
0x0605040702010003UL, 0x0E0D0C0F0A09080BUL,
0x0605040702010003UL, 0x0E0D0C0F0A09080BUL,
0x0504070601000302UL, 0x0D0C0F0E09080B0AUL,
0x0504070601000302UL, 0x0D0C0F0E09080B0AUL
};
// Base address of the 8-bit/16-bit left-rotation shuffle constants.
static address chacha20_lrot_consts() {
  return reinterpret_cast<address>(&CC20_LROT_CONSTS[0]);
}
// Install the ChaCha20 block-function stub suited to the current CPU:
// the 4-way AVX-512 implementation when EVEX is available, otherwise
// the 2-way AVX/AVX2 implementation.  No stub is installed when the
// intrinsic is disabled.
void StubGenerator::generate_chacha_stubs() {
  // Generate ChaCha20 intrinsics code
  if (UseChaCha20Intrinsics) {
    if (VM_Version::supports_evex()) {
      StubRoutines::_chacha20Block = generate_chacha20Block_avx512();
    } else { // Either AVX or AVX2 is supported
      // Idiomatic boolean check (was "supports_avx() == true").
      assert(VM_Version::supports_avx(), "Must at least support AVX instructions");
      StubRoutines::_chacha20Block = generate_chacha20Block_avx();
    }
  }
}
/* The 2-block AVX/AVX2-enabled ChaCha20 block function implementation.
 *
 *   state (int[16])  = c_rarg0
 *   keystream buffer = c_rarg1
 *   return (rax)     = number of keystream bytes written: 256 with AVX2
 *                      (256-bit registers), 128 with AVX-only (128-bit).
 */
address StubGenerator::generate_chacha20Block_avx() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "chacha20Block");
  address start = __ pc();

  Label L_twoRounds;

  const Register state = c_rarg0;
  const Register result = c_rarg1;
  const Register loopCounter = r8;
  const Register rotAddr = r9;

  const XMMRegister aState = xmm0;
  const XMMRegister bState = xmm1;
  const XMMRegister cState = xmm2;
  const XMMRegister dState = xmm3;
  const XMMRegister a1Vec = xmm4;
  const XMMRegister b1Vec = xmm5;
  const XMMRegister c1Vec = xmm6;
  const XMMRegister d1Vec = xmm7;
  const XMMRegister a2Vec = xmm8;
  const XMMRegister b2Vec = xmm9;
  const XMMRegister c2Vec = xmm10;
  const XMMRegister d2Vec = xmm11;
  const XMMRegister scratch = xmm12;
  const XMMRegister d2State = xmm13;
  const XMMRegister lrot8 = xmm14;
  const XMMRegister lrot16 = xmm15;

  int vector_len;
  int outlen;

  // This function will only be called if AVX2 or AVX are supported;
  // AVX512 uses a different function.  Use if/else (not else-if) so
  // both locals are initialized on every path -- previously a failed
  // else-if guard would have left them uninitialized.
  if (VM_Version::supports_avx2()) {
    vector_len = Assembler::AVX_256bit;
    outlen = 256;
  } else {
    assert(VM_Version::supports_avx(), "Must at least support AVX instructions");
    vector_len = Assembler::AVX_128bit;
    outlen = 128;
  }

  __ enter();

  // Load the initial state in columnar orientation and then copy
  // that starting state to the working register set.
  // Also load the address of the add mask for later use in handling
  // multi-block counter increments.
  __ lea(rotAddr, ExternalAddress(chacha20_lrot_consts()));
  __ lea(rax, ExternalAddress(chacha20_ctradd_avx()));
  if (vector_len == Assembler::AVX_128bit) {
    __ movdqu(aState, Address(state, 0));      // Bytes 0 - 15 -> a1Vec
    __ movdqu(bState, Address(state, 16));     // Bytes 16 - 31 -> b1Vec
    __ movdqu(cState, Address(state, 32));     // Bytes 32 - 47 -> c1Vec
    __ movdqu(dState, Address(state, 48));     // Bytes 48 - 63 -> d1Vec

    __ movdqu(a1Vec, aState);
    __ movdqu(b1Vec, bState);
    __ movdqu(c1Vec, cState);
    __ movdqu(d1Vec, dState);

    __ movdqu(a2Vec, aState);
    __ movdqu(b2Vec, bState);
    __ movdqu(c2Vec, cState);
    // Second block's counter is the first block's counter + 1
    // (+1 overlay lives at offset 16 in the add-mask table).
    __ vpaddd(d2State, dState, Address(rax, 16), vector_len);
    __ movdqu(d2Vec, d2State);
    __ movdqu(lrot8, Address(rotAddr, 0));     // Load 8-bit lrot const
    __ movdqu(lrot16, Address(rotAddr, 32));   // Load 16-bit lrot const
  } else {
    // We will broadcast each 128-bit segment of the state array into
    // the high and low halves of ymm state registers.  Then apply the add
    // mask to the dState register.  These will then be copied into the
    // a/b/c/d1Vec working registers.
    __ vbroadcastf128(aState, Address(state, 0), vector_len);
    __ vbroadcastf128(bState, Address(state, 16), vector_len);
    __ vbroadcastf128(cState, Address(state, 32), vector_len);
    __ vbroadcastf128(dState, Address(state, 48), vector_len);
    // dState gets the +0/+1 overlay; d2State the additional +2/+2.
    __ vpaddd(dState, dState, Address(rax, 0), vector_len);
    __ vpaddd(d2State, dState, Address(rax, 32), vector_len);

    __ vmovdqu(a1Vec, aState);
    __ vmovdqu(b1Vec, bState);
    __ vmovdqu(c1Vec, cState);
    __ vmovdqu(d1Vec, dState);

    __ vmovdqu(a2Vec, aState);
    __ vmovdqu(b2Vec, bState);
    __ vmovdqu(c2Vec, cState);
    __ vmovdqu(d2Vec, d2State);
    __ vmovdqu(lrot8, Address(rotAddr, 0));    // Load 8-bit lrot const
    __ vmovdqu(lrot16, Address(rotAddr, 32));  // Load 16-bit lrot const
  }

  __ movl(loopCounter, 10);                    // Set 10 2-round iterations
  __ BIND(L_twoRounds);

  // The first quarter round macro call covers the first 4 QR operations:
  //  Qround(state, 0, 4, 8,12)
  //  Qround(state, 1, 5, 9,13)
  //  Qround(state, 2, 6,10,14)
  //  Qround(state, 3, 7,11,15)
  cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
      lrot8, lrot16, vector_len);
  cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
      lrot8, lrot16, vector_len);

  // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors
  // to diagonals.  The a1Vec does not need to change orientation.
  cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, true);
  cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, true);

  // The second set of operations on the vectors covers the second 4 quarter
  // round operations, now acting on the diagonals:
  //  Qround(state, 0, 5,10,15)
  //  Qround(state, 1, 6,11,12)
  //  Qround(state, 2, 7, 8,13)
  //  Qround(state, 3, 4, 9,14)
  cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
      lrot8, lrot16, vector_len);
  cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
      lrot8, lrot16, vector_len);

  // Before we start the next iteration, we need to perform shuffles
  // on the b/c/d vectors to move them back to columnar organizations
  // from their current diagonal orientation.
  cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, false);
  cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, false);

  __ decrement(loopCounter);
  __ jcc(Assembler::notZero, L_twoRounds);

  // Add the original start state back into the current state.
  __ vpaddd(a1Vec, a1Vec, aState, vector_len);
  __ vpaddd(b1Vec, b1Vec, bState, vector_len);
  __ vpaddd(c1Vec, c1Vec, cState, vector_len);
  __ vpaddd(d1Vec, d1Vec, dState, vector_len);

  __ vpaddd(a2Vec, a2Vec, aState, vector_len);
  __ vpaddd(b2Vec, b2Vec, bState, vector_len);
  __ vpaddd(c2Vec, c2Vec, cState, vector_len);
  __ vpaddd(d2Vec, d2Vec, d2State, vector_len);

  // Write the data to the keystream array
  if (vector_len == Assembler::AVX_128bit) {
    __ movdqu(Address(result, 0), a1Vec);
    __ movdqu(Address(result, 16), b1Vec);
    __ movdqu(Address(result, 32), c1Vec);
    __ movdqu(Address(result, 48), d1Vec);
    __ movdqu(Address(result, 64), a2Vec);
    __ movdqu(Address(result, 80), b2Vec);
    __ movdqu(Address(result, 96), c2Vec);
    __ movdqu(Address(result, 112), d2Vec);
  } else {
    // Each half of the YMM has to be written 64 bytes apart from
    // each other in memory so the final keystream buffer holds
    // two consecutive keystream blocks.
    __ vextracti128(Address(result, 0), a1Vec, 0);
    __ vextracti128(Address(result, 64), a1Vec, 1);
    __ vextracti128(Address(result, 16), b1Vec, 0);
    __ vextracti128(Address(result, 80), b1Vec, 1);
    __ vextracti128(Address(result, 32), c1Vec, 0);
    __ vextracti128(Address(result, 96), c1Vec, 1);
    __ vextracti128(Address(result, 48), d1Vec, 0);
    __ vextracti128(Address(result, 112), d1Vec, 1);

    __ vextracti128(Address(result, 128), a2Vec, 0);
    __ vextracti128(Address(result, 192), a2Vec, 1);
    __ vextracti128(Address(result, 144), b2Vec, 0);
    __ vextracti128(Address(result, 208), b2Vec, 1);
    __ vextracti128(Address(result, 160), c2Vec, 0);
    __ vextracti128(Address(result, 224), c2Vec, 1);
    __ vextracti128(Address(result, 176), d2Vec, 0);
    __ vextracti128(Address(result, 240), d2Vec, 1);
  }

  // This function will always write 128 or 256 bytes into the
  // key stream buffer, depending on the length of the SIMD
  // registers.  That length should be returned through %rax.
  __ mov64(rax, outlen);

  __ leave();
  __ ret(0);
  return start;
}
/* The 4-block AVX512-enabled ChaCha20 block function implementation */
address StubGenerator::generate_chacha20Block_avx512() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "chacha20Block");
address start = __ pc();
Label L_twoRounds;
const Register state = c_rarg0;
const Register result = c_rarg1;
const Register loopCounter = r8;
const XMMRegister aState = xmm0;
const XMMRegister bState = xmm1;
const XMMRegister cState = xmm2;
const XMMRegister dState = xmm3;
const XMMRegister a1Vec = xmm4;
const XMMRegister b1Vec = xmm5;
const XMMRegister c1Vec = xmm6;
const XMMRegister d1Vec = xmm7;
const XMMRegister a2Vec = xmm8;
const XMMRegister b2Vec = xmm9;
const XMMRegister c2Vec = xmm10;
const XMMRegister d2Vec = xmm11;
const XMMRegister a3Vec = xmm12;
const XMMRegister b3Vec = xmm13;
const XMMRegister c3Vec = xmm14;
const XMMRegister d3Vec = xmm15;
const XMMRegister a4Vec = xmm16;
const XMMRegister b4Vec = xmm17;
const XMMRegister c4Vec = xmm18;
const XMMRegister d4Vec = xmm19;
const XMMRegister d2State = xmm20;
const XMMRegister d3State = xmm21;
const XMMRegister d4State = xmm22;
const XMMRegister scratch = xmm23;
__ enter();
// Load the initial state in columnar orientation.
// We will broadcast each 128-bit segment of the state array into
// all four double-quadword slots on ZMM State registers. They will
// be copied into the working ZMM registers and then added back in
// at the very end of the block function. The add mask should be
// applied to the dState register so it does not need to be fetched
// when adding the start state back into the final working state.
__ lea(rax, ExternalAddress(chacha20_ctradd_avx512()));
__ evbroadcasti32x4(aState, Address(state, 0), Assembler::AVX_512bit);
__ evbroadcasti32x4(bState, Address(state, 16), Assembler::AVX_512bit);
__ evbroadcasti32x4(cState, Address(state, 32), Assembler::AVX_512bit);
__ evbroadcasti32x4(dState, Address(state, 48), Assembler::AVX_512bit);
__ vpaddd(dState, dState, Address(rax, 0), Assembler::AVX_512bit);
__ evmovdqul(scratch, Address(rax, 64), Assembler::AVX_512bit);
__ vpaddd(d2State, dState, scratch, Assembler::AVX_512bit);
__ vpaddd(d3State, d2State, scratch, Assembler::AVX_512bit);
__ vpaddd(d4State, d3State, scratch, Assembler::AVX_512bit);
__ evmovdqul(a1Vec, aState, Assembler::AVX_512bit);
__ evmovdqul(b1Vec, bState, Assembler::AVX_512bit);
__ evmovdqul(c1Vec, cState, Assembler::AVX_512bit);
__ evmovdqul(d1Vec, dState, Assembler::AVX_512bit);
__ evmovdqul(a2Vec, aState, Assembler::AVX_512bit);
__ evmovdqul(b2Vec, bState, Assembler::AVX_512bit);
__ evmovdqul(c2Vec, cState, Assembler::AVX_512bit);
__ evmovdqul(d2Vec, d2State, Assembler::AVX_512bit);
__ evmovdqul(a3Vec, aState, Assembler::AVX_512bit);
__ evmovdqul(b3Vec, bState, Assembler::AVX_512bit);
__ evmovdqul(c3Vec, cState, Assembler::AVX_512bit);
__ evmovdqul(d3Vec, d3State, Assembler::AVX_512bit);
__ evmovdqul(a4Vec, aState, Assembler::AVX_512bit);
__ evmovdqul(b4Vec, bState, Assembler::AVX_512bit);
__ evmovdqul(c4Vec, cState, Assembler::AVX_512bit);
__ evmovdqul(d4Vec, d4State, Assembler::AVX_512bit);
__ movl(loopCounter, 10); // Set 10 2-round iterations
__ BIND(L_twoRounds);
// The first set of operations on the vectors covers the first 4 quarter
// round operations:
// Qround(state, 0, 4, 8,12)
// Qround(state, 1, 5, 9,13)
// Qround(state, 2, 6,10,14)
// Qround(state, 3, 7,11,15)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a3Vec, b3Vec, c3Vec, d3Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a4Vec, b4Vec, c4Vec, d4Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
// Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors
// to diagonals. The a1Vec does not need to change orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, Assembler::AVX_512bit, true);
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, Assembler::AVX_512bit, true);
cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, Assembler::AVX_512bit, true);
cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, Assembler::AVX_512bit, true);
// The second set of operations on the vectors covers the second 4 quarter
// round operations, now acting on the diagonals:
// Qround(state, 0, 5,10,15)
// Qround(state, 1, 6,11,12)
// Qround(state, 2, 7, 8,13)
// Qround(state, 3, 4, 9,14)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a3Vec, b3Vec, c3Vec, d3Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a4Vec, b4Vec, c4Vec, d4Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
// Before we start the next iteration, we need to perform shuffles
// on the b/c/d vectors to move them back to columnar organizations
// from their current diagonal orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, Assembler::AVX_512bit, false);
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, Assembler::AVX_512bit, false);
cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, Assembler::AVX_512bit, false);
cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, Assembler::AVX_512bit, false);
__ decrement(loopCounter);
__ jcc(Assembler::notZero, L_twoRounds);
// Add the initial state now held on the a/b/c/dState registers to the
// final working register values. We will also add in the counter add
// mask onto zmm3 after adding in the start state.
__ vpaddd(a1Vec, a1Vec, aState, Assembler::AVX_512bit);
__ vpaddd(b1Vec, b1Vec, bState, Assembler::AVX_512bit);
__ vpaddd(c1Vec, c1Vec, cState, Assembler::AVX_512bit);
__ vpaddd(d1Vec, d1Vec, dState, Assembler::AVX_512bit);
__ vpaddd(a2Vec, a2Vec, aState, Assembler::AVX_512bit);
__ vpaddd(b2Vec, b2Vec, bState, Assembler::AVX_512bit);
__ vpaddd(c2Vec, c2Vec, cState, Assembler::AVX_512bit);
__ vpaddd(d2Vec, d2Vec, d2State, Assembler::AVX_512bit);
__ vpaddd(a3Vec, a3Vec, aState, Assembler::AVX_512bit);
__ vpaddd(b3Vec, b3Vec, bState, Assembler::AVX_512bit);
__ vpaddd(c3Vec, c3Vec, cState, Assembler::AVX_512bit);
__ vpaddd(d3Vec, d3Vec, d3State, Assembler::AVX_512bit);
__ vpaddd(a4Vec, a4Vec, aState, Assembler::AVX_512bit);
__ vpaddd(b4Vec, b4Vec, bState, Assembler::AVX_512bit);
__ vpaddd(c4Vec, c4Vec, cState, Assembler::AVX_512bit);
__ vpaddd(d4Vec, d4Vec, d4State, Assembler::AVX_512bit);
// Write the ZMM state registers out to the key stream buffer
// Each ZMM is divided into 4 128-bit segments. Each segment
// is written to memory at 64-byte displacements from one
// another. The result is that all 4 blocks will be in their
// proper order when serialized.
cc20_keystream_collate_avx512(a1Vec, b1Vec, c1Vec, d1Vec, result, 0);
cc20_keystream_collate_avx512(a2Vec, b2Vec, c2Vec, d2Vec, result, 256);
cc20_keystream_collate_avx512(a3Vec, b3Vec, c3Vec, d3Vec, result, 512);
cc20_keystream_collate_avx512(a4Vec, b4Vec, c4Vec, d4Vec, result, 768);
// This function will always write 1024 bytes into the key stream buffer
// and that length should be returned through %rax.
__ mov64(rax, 1024);
__ leave();
__ ret(0);
return start;
}
/**
 * Provide a function that implements the ChaCha20 quarter round function.
 *
 * All four lane-wise operations of the quarter round (add, xor, rotate) are
 * emitted here.  On AVX-512 the 32-bit lane rotations use the native
 * evprold instruction; on narrower vectors the byte-aligned rotations
 * (8/16 bits) are done with a byte shuffle and the non-byte-aligned ones
 * (7/12 bits) with a shift/shift/or sequence through the scratch register.
 *
 * @param aVec the SIMD register containing only the "a" values
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param scratch SIMD register used for non-byte-aligned left rotations
 *        (unused on the AVX-512 path)
 * @param lrot8 shuffle control mask for an 8-bit left rotation of each
 *        32-bit lane (unused on the AVX-512 path)
 * @param lrot16 shuffle control mask for a 16-bit left rotation of each
 *        32-bit lane (unused on the AVX-512 path)
 * @param vector_len the length of the vector
 */
void StubGenerator::cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec,
    XMMRegister cVec, XMMRegister dVec, XMMRegister scratch,
    XMMRegister lrot8, XMMRegister lrot16, int vector_len) {

  // a += b; d ^= a; d <<<= 16
  __ vpaddd(aVec, aVec, bVec, vector_len);
  __ vpxor(dVec, dVec, aVec, vector_len);
  if (vector_len == Assembler::AVX_512bit) {
    // AVX-512 provides a per-lane rotate instruction
    __ evprold(dVec, dVec, 16, vector_len);
  } else {
    // A 16-bit rotate is byte-aligned, so a byte shuffle suffices
    __ vpshufb(dVec, dVec, lrot16, vector_len);
  }

  // c += d; b ^= c; b <<<= 12 (b << 12 | scratch >>> 20)
  __ vpaddd(cVec, cVec, dVec, vector_len);
  __ vpxor(bVec, bVec, cVec, vector_len);
  if (vector_len == Assembler::AVX_512bit) {
    __ evprold(bVec, bVec, 12, vector_len);
  } else {
    // Non-byte-aligned rotate: combine left and right shifts
    __ vpsrld(scratch, bVec, 20, vector_len);
    __ vpslld(bVec, bVec, 12, vector_len);
    __ vpor(bVec, bVec, scratch, vector_len);
  }

  // a += b; d ^= a; d <<<= 8 (d << 8 | scratch >>> 24)
  __ vpaddd(aVec, aVec, bVec, vector_len);
  __ vpxor(dVec, dVec, aVec, vector_len);
  if (vector_len == Assembler::AVX_512bit) {
    __ evprold(dVec, dVec, 8, vector_len);
  } else {
    // An 8-bit rotate is byte-aligned, so a byte shuffle suffices
    __ vpshufb(dVec, dVec, lrot8, vector_len);
  }

  // c += d; b ^= c; b <<<= 7 (b << 7 | scratch >>> 25)
  __ vpaddd(cVec, cVec, dVec, vector_len);
  __ vpxor(bVec, bVec, cVec, vector_len);
  if (vector_len == Assembler::AVX_512bit) {
    __ evprold(bVec, bVec, 7, vector_len);
  } else {
    // Non-byte-aligned rotate: combine left and right shifts
    __ vpsrld(scratch, bVec, 25, vector_len);
    __ vpslld(bVec, bVec, 7, vector_len);
    __ vpor(bVec, bVec, scratch, vector_len);
  }
}
/**
 * Shift the b, c, and d vectors between columnar and diagonal representations.
 * Note that the "a" vector does not shift.
 *
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param vector_len the size of the SIMD register to operate upon
 * @param colToDiag true if moving columnar to diagonal, false if
 *                  moving diagonal back to columnar.
 */
void StubGenerator::cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec,
    XMMRegister dVec, int vector_len, bool colToDiag) {
  // vpshufd immediates rotating the four 32-bit lanes of each 128-bit
  // segment by one, two, or three positions respectively.
  const int rotOne   = 0x39;
  const int rotTwo   = 0x4E;
  const int rotThree = 0x93;

  // Columnar-to-diagonal moves b by one lane and d by three; the inverse
  // transform swaps those immediates.  The c vector always moves by two
  // lanes, which is its own inverse.
  __ vpshufd(bVec, bVec, colToDiag ? rotOne : rotThree, vector_len);
  __ vpshufd(cVec, cVec, rotTwo, vector_len);
  __ vpshufd(dVec, dVec, colToDiag ? rotThree : rotOne, vector_len);
}
/**
 * Write 256 bytes of keystream output held in 4 AVX512 SIMD registers
 * in a quarter round parallel organization.
 *
 * @param aVec the SIMD register containing only the "a" values
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param baseAddr the register holding the base output address
 * @param baseOffset the offset from baseAddr for writes
 */
void StubGenerator::cc20_keystream_collate_avx512(XMMRegister aVec, XMMRegister
    bVec,
    XMMRegister cVec, XMMRegister dVec, Register baseAddr, int baseOffset) {
  // Each 512-bit register holds four 128-bit lanes, one lane per parallel
  // ChaCha20 block.  Lane L of register R belongs at displacement
  // (L * 64) + (R * 16) from the base, interleaving the a/b/c/d segments
  // so each 64-byte block is serialized in its proper order.
  const XMMRegister stateRegs[] = { aVec, bVec, cVec, dVec };
  for (int reg = 0; reg < 4; reg++) {
    for (int lane = 0; lane < 4; lane++) {
      __ vextracti32x4(Address(baseAddr, baseOffset + lane * 64 + reg * 16),
                       stateRegs[reg], lane);
    }
  }
}
#undef __

View File

@ -1122,6 +1122,22 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
// ChaCha20 Intrinsics
// As long as the system supports AVX as a baseline we can do a
// SIMD-enabled block function. StubGenerator makes the determination
// based on the VM capabilities whether to use an AVX2 or AVX512-enabled
// version.
if (UseAVX >= 1) {
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
UseChaCha20Intrinsics = true;
}
} else if (UseChaCha20Intrinsics) {
if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
warning("ChaCha20 intrinsic requires AVX instructions");
}
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
}
// Base64 Intrinsics (Check the condition for which the intrinsic will be active)
if ((UseAVX > 2) && supports_avx512vl() && supports_avx512bw()) {
if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) {

View File

@ -475,6 +475,9 @@ bool vmIntrinsics::disabled_by_jvm_flags(vmIntrinsics::ID id) {
case vmIntrinsics::_ghash_processBlocks:
if (!UseGHASHIntrinsics) return true;
break;
case vmIntrinsics::_chacha20Block:
if (!UseChaCha20Intrinsics) return true;
break;
case vmIntrinsics::_base64_encodeBlock:
case vmIntrinsics::_base64_decodeBlock:
if (!UseBASE64Intrinsics) return true;

View File

@ -532,6 +532,12 @@ class methodHandle;
do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, ghash_processBlocks_signature, F_R) \
do_name(processMultipleBlocks_name, "processMultipleBlocks") \
\
/* support for com.sun.crypto.provider.ChaCha20Cipher */ \
do_class(com_sun_crypto_provider_chacha20cipher, "com/sun/crypto/provider/ChaCha20Cipher") \
do_intrinsic(_chacha20Block, com_sun_crypto_provider_chacha20cipher, chacha20Block_name, chacha20Block_signature, F_S) \
do_name(chacha20Block_name, "implChaCha20Block") \
do_signature(chacha20Block_signature, "([I[B)I") \
\
/* support for java.util.zip */ \
do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \
do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \

View File

@ -318,6 +318,7 @@
static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _md5_implCompress, address) \
static_field(StubRoutines, _md5_implCompressMB, address) \
static_field(StubRoutines, _chacha20Block, address) \
static_field(StubRoutines, _sha1_implCompress, address) \
static_field(StubRoutines, _sha1_implCompressMB, address) \
static_field(StubRoutines, _sha256_implCompress, address) \

View File

@ -737,6 +737,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
case vmIntrinsics::_bigIntegerLeftShiftWorker:
case vmIntrinsics::_vectorizedMismatch:
case vmIntrinsics::_ghash_processBlocks:
case vmIntrinsics::_chacha20Block:
case vmIntrinsics::_base64_encodeBlock:
case vmIntrinsics::_base64_decodeBlock:
case vmIntrinsics::_poly1305_processBlocks:

View File

@ -1168,6 +1168,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
strcmp(call->as_CallLeaf()->_name, "galoisCounterMode_AESCrypt") == 0 ||
strcmp(call->as_CallLeaf()->_name, "poly1305_processBlocks") == 0 ||
strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
strcmp(call->as_CallLeaf()->_name, "chacha20Block") == 0 ||
strcmp(call->as_CallLeaf()->_name, "encodeBlock") == 0 ||
strcmp(call->as_CallLeaf()->_name, "decodeBlock") == 0 ||
strcmp(call->as_CallLeaf()->_name, "md5_implCompress") == 0 ||

View File

@ -608,6 +608,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
case vmIntrinsics::_ghash_processBlocks:
return inline_ghash_processBlocks();
case vmIntrinsics::_chacha20Block:
return inline_chacha20Block();
case vmIntrinsics::_base64_encodeBlock:
return inline_base64_encodeBlock();
case vmIntrinsics::_base64_decodeBlock:
@ -6897,6 +6899,36 @@ bool LibraryCallKit::inline_ghash_processBlocks() {
return true;
}
//------------------------------inline_chacha20Block
bool LibraryCallKit::inline_chacha20Block() {
address stubAddr;
const char *stubName;
assert(UseChaCha20Intrinsics, "need ChaCha20 intrinsics support");
stubAddr = StubRoutines::chacha20Block();
stubName = "chacha20Block";
Node* state = argument(0);
Node* result = argument(1);
state = must_be_not_null(state, true);
result = must_be_not_null(result, true);
Node* state_start = array_element_address(state, intcon(0), T_INT);
assert(state_start, "state is NULL");
Node* result_start = array_element_address(result, intcon(0), T_BYTE);
assert(result_start, "result is NULL");
Node* cc20Blk = make_runtime_call(RC_LEAF|RC_NO_FP,
OptoRuntime::chacha20Block_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
state_start, result_start);
// return key stream length (int)
Node* retvalue = _gvn.transform(new ProjNode(cc20Blk, TypeFunc::Parms));
set_result(retvalue);
return true;
}
bool LibraryCallKit::inline_base64_encodeBlock() {
address stubAddr;
const char *stubName;

View File

@ -291,6 +291,7 @@ class LibraryCallKit : public GraphKit {
Node* inline_counterMode_AESCrypt_predicate();
Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
bool inline_ghash_processBlocks();
bool inline_chacha20Block();
bool inline_base64_encodeBlock();
bool inline_base64_decodeBlock();
bool inline_poly1305_processBlocks();

View File

@ -1222,6 +1222,26 @@ const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
return TypeFunc::make(domain, range);
}
// ChaCha20 Block function
const TypeFunc* OptoRuntime::chacha20Block_Type() {
int argcnt = 2;
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypePtr::NOTNULL; // state
fields[argp++] = TypePtr::NOTNULL; // result
assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
// result type needed
fields = TypeTuple::fields(1);
fields[TypeFunc::Parms + 0] = TypeInt::INT; // key stream outlen as int
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
return TypeFunc::make(domain, range);
}
// Base64 encode function
const TypeFunc* OptoRuntime::base64_encodeBlock_Type() {
int argcnt = 6;

View File

@ -278,6 +278,7 @@ private:
static const TypeFunc* vectorizedMismatch_Type();
static const TypeFunc* ghash_processBlocks_Type();
static const TypeFunc* chacha20Block_Type();
static const TypeFunc* base64_encodeBlock_Type();
static const TypeFunc* base64_decodeBlock_Type();
static const TypeFunc* poly1305_processBlocks_Type();

View File

@ -323,6 +323,9 @@ const int ObjectAlignmentInBytes = 8;
product(bool, UseAESCTRIntrinsics, false, DIAGNOSTIC, \
"Use intrinsics for the paralleled version of AES/CTR crypto") \
\
product(bool, UseChaCha20Intrinsics, false, DIAGNOSTIC, \
"Use intrinsics for the vectorized version of ChaCha20") \
\
product(bool, UseMD5Intrinsics, false, DIAGNOSTIC, \
"Use intrinsics for MD5 crypto hash function") \
\

View File

@ -128,6 +128,7 @@ address StubRoutines::_electronicCodeBook_decryptAESCrypt = NULL;
address StubRoutines::_counterMode_AESCrypt = NULL;
address StubRoutines::_galoisCounterMode_AESCrypt = NULL;
address StubRoutines::_ghash_processBlocks = NULL;
address StubRoutines::_chacha20Block = NULL;
address StubRoutines::_base64_encodeBlock = NULL;
address StubRoutines::_base64_decodeBlock = NULL;
address StubRoutines::_poly1305_processBlocks = NULL;

View File

@ -209,6 +209,7 @@ class StubRoutines: AllStatic {
static address _counterMode_AESCrypt;
static address _galoisCounterMode_AESCrypt;
static address _ghash_processBlocks;
static address _chacha20Block;
static address _base64_encodeBlock;
static address _base64_decodeBlock;
static address _poly1305_processBlocks;
@ -388,6 +389,7 @@ class StubRoutines: AllStatic {
static address poly1305_processBlocks() { return _poly1305_processBlocks; }
static address counterMode_AESCrypt() { return _counterMode_AESCrypt; }
static address ghash_processBlocks() { return _ghash_processBlocks; }
static address chacha20Block() { return _chacha20Block; }
static address base64_encodeBlock() { return _base64_encodeBlock; }
static address base64_decodeBlock() { return _base64_decodeBlock; }
static address md5_implCompress() { return _md5_implCompress; }

View File

@ -541,6 +541,7 @@
static_field(StubRoutines, _counterMode_AESCrypt, address) \
static_field(StubRoutines, _galoisCounterMode_AESCrypt, address) \
static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _chacha20Block, address) \
static_field(StubRoutines, _base64_encodeBlock, address) \
static_field(StubRoutines, _base64_decodeBlock, address) \
static_field(StubRoutines, _poly1305_processBlocks, address) \

View File

@ -39,6 +39,9 @@ import javax.crypto.*;
import javax.crypto.spec.ChaCha20ParameterSpec;
import javax.crypto.spec.IvParameterSpec;
import javax.crypto.spec.SecretKeySpec;
import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.annotation.IntrinsicCandidate;
import sun.security.util.DerValue;
/**
@ -58,8 +61,9 @@ abstract class ChaCha20Cipher extends CipherSpi {
private static final int STATE_CONST_3 = 0x6b206574;
// The keystream block size in bytes and as integers
private static final int KEYSTREAM_SIZE = 64;
private static final int KS_SIZE_INTS = KEYSTREAM_SIZE / Integer.BYTES;
private static final int KS_MAX_LEN = 1024;
private static final int KS_BLK_SIZE = 64;
private static final int KS_SIZE_INTS = KS_BLK_SIZE / Integer.BYTES;
private static final int CIPHERBUF_BASE = 1024;
// The initialization state of the cipher
@ -85,14 +89,18 @@ abstract class ChaCha20Cipher extends CipherSpi {
private long finalCounterValue;
private long counter;
// Two arrays, both implemented as 16-element integer arrays:
// The base state, created at initialization time, and a working
// state which is a clone of the start state, and is then modified
// with the counter and the ChaCha20 block function.
// The base state is created at initialization time as a 16-int array
// and then is copied into either local variables for computations (Java) or
// into SIMD registers (intrinsics).
private final int[] startState = new int[KS_SIZE_INTS];
private final byte[] keyStream = new byte[KEYSTREAM_SIZE];
// The offset into the current keystream
// The output keystream array is sized to hold keystream output from the
// implChaCha20Block method. This can range from a single block at a time
// (Java software) up to 16 blocks on x86_64 with AVX512 support.
private final byte[] keyStream = new byte[KS_MAX_LEN];
// The keystream buffer limit and offset
private int keyStrLimit;
private int keyStrOffset;
// AEAD-related fields and constants
@ -561,12 +569,14 @@ abstract class ChaCha20Cipher extends CipherSpi {
}
}
// We can also get one block's worth of keystream created
// We can also generate the first block (or blocks if intrinsics
// are capable of doing multiple blocks at a time) of keystream.
finalCounterValue = counter + MAX_UINT32;
generateKeystream();
this.keyStrLimit = chaCha20Block(startState, counter, keyStream);
this.keyStrOffset = 0;
this.counter += (keyStrLimit / KS_BLK_SIZE);
direction = opmode;
aadDone = false;
this.keyStrOffset = 0;
initialized = true;
}
@ -831,31 +841,34 @@ abstract class ChaCha20Cipher extends CipherSpi {
}
}
/**
* Using the current state and counter create the next set of keystream
* bytes. This method will generate the next 512 bits of keystream and
* return it in the {@code keyStream} parameter. Following the
* block function the counter will be incremented.
*/
private void generateKeystream() {
chaCha20Block(startState, counter, keyStream);
counter++;
@ForceInline
private static int chaCha20Block(int[] initState, long counter,
byte[] result) {
if (initState.length != KS_SIZE_INTS || result.length != KS_MAX_LEN) {
throw new IllegalArgumentException(
"Illegal state or keystream buffer length");
}
// Set the counter value before sending into the underlying
// private block method
initState[12] = (int)counter;
return implChaCha20Block(initState, result);
}
/**
* Perform a full 20-round ChaCha20 transform on the initial state.
*
* @param initState the starting state, not including the counter
* value.
* @param counter the counter value to apply
* @param initState the starting state using the current counter value.
* @param result the array that will hold the result of the ChaCha20
* block function.
*
* @note it is the caller's responsibility to ensure that the workState
* is sized the same as the initState, no checking is performed internally.
* @return the number of keystream bytes generated. In a pure Java method
* this will always be 64 bytes, but intrinsics that make use of
* AVX2 or AVX512 registers may generate multiple blocks of keystream
* in a single call and therefore may be a larger multiple of 64.
*/
private static void chaCha20Block(int[] initState, long counter,
byte[] result) {
@IntrinsicCandidate
private static int implChaCha20Block(int[] initState, byte[] result) {
// Create an initial state and clone a working copy
int ws00 = STATE_CONST_0;
int ws01 = STATE_CONST_1;
@ -869,7 +882,7 @@ abstract class ChaCha20Cipher extends CipherSpi {
int ws09 = initState[9];
int ws10 = initState[10];
int ws11 = initState[11];
int ws12 = (int)counter;
int ws12 = initState[12];
int ws13 = initState[13];
int ws14 = initState[14];
int ws15 = initState[15];
@ -986,11 +999,12 @@ abstract class ChaCha20Cipher extends CipherSpi {
asIntLittleEndian.set(result, 36, ws09 + initState[9]);
asIntLittleEndian.set(result, 40, ws10 + initState[10]);
asIntLittleEndian.set(result, 44, ws11 + initState[11]);
// Add the counter back into workState[12]
asIntLittleEndian.set(result, 48, ws12 + (int)counter);
asIntLittleEndian.set(result, 48, ws12 + initState[12]);
asIntLittleEndian.set(result, 52, ws13 + initState[13]);
asIntLittleEndian.set(result, 56, ws14 + initState[14]);
asIntLittleEndian.set(result, 60, ws15 + initState[15]);
return KS_BLK_SIZE;
}
/**
@ -1009,12 +1023,21 @@ abstract class ChaCha20Cipher extends CipherSpi {
int remainingData = inLen;
while (remainingData > 0) {
int ksRemain = keyStream.length - keyStrOffset;
int ksRemain = keyStrLimit - keyStrOffset;
if (ksRemain <= 0) {
if (counter <= finalCounterValue) {
generateKeystream();
// Intrinsics can do multiple blocks at once. This means
// it may overrun the counter. In order to prevent key
// stream reuse, we adjust the key stream limit to only the
// key stream length that is calculated from unique
// counter values.
keyStrLimit = chaCha20Block(startState, counter, keyStream);
counter += (keyStrLimit / KS_BLK_SIZE);
if (counter > finalCounterValue) {
keyStrLimit -= (int)(counter - finalCounterValue) * 64;
}
keyStrOffset = 0;
ksRemain = keyStream.length;
ksRemain = keyStrLimit;
} else {
throw new KeyException("Counter exhausted. " +
"Reinitialize with new key and/or nonce");
@ -1060,9 +1083,10 @@ abstract class ChaCha20Cipher extends CipherSpi {
private void initAuthenticator() throws InvalidKeyException {
authenticator = new Poly1305();
// Derive the Poly1305 key from the starting state
byte[] serializedKey = new byte[KEYSTREAM_SIZE];
chaCha20Block(startState, 0, serializedKey);
// Derive the Poly1305 key from the starting state with the counter
// value forced to zero.
byte[] serializedKey = new byte[KS_MAX_LEN];
chaCha20Block(startState, 0L, serializedKey);
authenticator.engineInit(new SecretKeySpec(serializedKey, 0, 32,
authAlgName), null);

View File

@ -0,0 +1,317 @@
/*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.intrinsics.chacha;
import javax.crypto.Cipher;
import javax.crypto.spec.ChaCha20ParameterSpec;
import javax.crypto.spec.SecretKeySpec;
import java.security.GeneralSecurityException;
import java.util.*;
public class ExerciseChaCha20 {
private static final int WARMUP_CYCLES = 200000;
// Use the test vectors from RFC 7539 to exercise the ChaCha20 block
// intrinsic
public static final List<TestData> testList = List.of(
new TestData("RFC 7539 Sample Test Vector",
"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f",
"000000000000004a00000000",
1, Cipher.ENCRYPT_MODE,
"4c616469657320616e642047656e746c656d656e206f662074686520636c6173" +
"73206f66202739393a204966204920636f756c64206f6666657220796f75206f" +
"6e6c79206f6e652074697020666f7220746865206675747572652c2073756e73" +
"637265656e20776f756c642062652069742e",
null,
"6e2e359a2568f98041ba0728dd0d6981e97e7aec1d4360c20a27afccfd9fae0b" +
"f91b65c5524733ab8f593dabcd62b3571639d624e65152ab8f530c359f0861d8" +
"07ca0dbf500d6a6156a38e088a22b65e52bc514d16ccf806818ce91ab7793736" +
"5af90bbf74a35be6b40b8eedf2785e42874d"),
new TestData("RFC 7539 Test Vector 1 (all zeroes)",
"0000000000000000000000000000000000000000000000000000000000000000",
"000000000000000000000000",
0, Cipher.ENCRYPT_MODE,
"0000000000000000000000000000000000000000000000000000000000000000" +
"0000000000000000000000000000000000000000000000000000000000000000",
null,
"76b8e0ada0f13d90405d6ae55386bd28bdd219b8a08ded1aa836efcc8b770dc7" +
"da41597c5157488d7724e03fb8d84a376a43b8f41518a11cc387b669b2ee6586"),
new TestData("RFC 7539 Test Vector 2",
"0000000000000000000000000000000000000000000000000000000000000001",
"000000000000000000000002",
1, Cipher.ENCRYPT_MODE,
"416e79207375626d697373696f6e20746f20746865204945544620696e74656e" +
"6465642062792074686520436f6e7472696275746f7220666f72207075626c69" +
"636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" +
"20496e7465726e65742d4472616674206f722052464320616e6420616e792073" +
"746174656d656e74206d6164652077697468696e2074686520636f6e74657874" +
"206f6620616e204945544620616374697669747920697320636f6e7369646572" +
"656420616e20224945544620436f6e747269627574696f6e222e205375636820" +
"73746174656d656e747320696e636c756465206f72616c2073746174656d656e" +
"747320696e20494554462073657373696f6e732c2061732077656c6c20617320" +
"7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" +
"74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" +
"207768696368206172652061646472657373656420746f",
null,
"a3fbf07df3fa2fde4f376ca23e82737041605d9f4f4f57bd8cff2c1d4b7955ec" +
"2a97948bd3722915c8f3d337f7d370050e9e96d647b7c39f56e031ca5eb6250d" +
"4042e02785ececfa4b4bb5e8ead0440e20b6e8db09d881a7c6132f420e527950" +
"42bdfa7773d8a9051447b3291ce1411c680465552aa6c405b7764d5e87bea85a" +
"d00f8449ed8f72d0d662ab052691ca66424bc86d2df80ea41f43abf937d3259d" +
"c4b2d0dfb48a6c9139ddd7f76966e928e635553ba76c5c879d7b35d49eb2e62b" +
"0871cdac638939e25e8a1e0ef9d5280fa8ca328b351c3c765989cbcf3daa8b6c" +
"cc3aaf9f3979c92b3720fc88dc95ed84a1be059c6499b9fda236e7e818b04b0b" +
"c39c1e876b193bfe5569753f88128cc08aaa9b63d1a16f80ef2554d7189c411f" +
"5869ca52c5b83fa36ff216b9c1d30062bebcfd2dc5bce0911934fda79a86f6e6" +
"98ced759c3ff9b6477338f3da4f9cd8514ea9982ccafb341b2384dd902f3d1ab" +
"7ac61dd29c6f21ba5b862f3730e37cfdc4fd806c22f221"),
new TestData("RFC 7539 Test Vector 3",
"1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0",
"000000000000000000000002",
42, Cipher.ENCRYPT_MODE,
"2754776173206272696c6c69672c20616e642074686520736c6974687920746f" +
"7665730a446964206779726520616e642067696d626c6520696e207468652077" +
"6162653a0a416c6c206d696d737920776572652074686520626f726f676f7665" +
"732c0a416e6420746865206d6f6d65207261746873206f757467726162652e",
null,
"62e6347f95ed87a45ffae7426f27a1df5fb69110044c0d73118effa95b01e5cf" +
"166d3df2d721caf9b21e5fb14c616871fd84c54f9d65b283196c7fe4f60553eb" +
"f39c6402c42234e32a356b3e764312a61a5532055716ead6962568f87d3f3f77" +
"04c6a8d1bcd1bf4d50d6154b6da731b187b58dfd728afa36757a797ac188d1")
);
public static class TestData {
public TestData(String name, String keyStr, String nonceStr, int ctr,
int dir, String inputStr, String aadStr, String outStr) {
testName = Objects.requireNonNull(name);
HexFormat hex = HexFormat.of();
key = hex.parseHex(Objects.requireNonNull(keyStr));
nonce = hex.parseHex(Objects.requireNonNull(nonceStr));
if ((counter = ctr) < 0) {
throw new IllegalArgumentException(
"counter must be 0 or greater");
}
direction = dir;
if ((direction != Cipher.ENCRYPT_MODE) &&
(direction != Cipher.DECRYPT_MODE)) {
throw new IllegalArgumentException(
"Direction must be ENCRYPT_MODE or DECRYPT_MODE");
}
input = hex.parseHex(Objects.requireNonNull(inputStr));
aad = (aadStr != null) ? hex.parseHex(aadStr) : null;
expOutput = hex.parseHex(Objects.requireNonNull(outStr));
}
public final String testName;
public final byte[] key;
public final byte[] nonce;
public final int counter;
public final int direction;
public final byte[] input;
public final byte[] aad;
public final byte[] expOutput;
}
public static void main(String[] args) throws Exception {
int testsPassed = 0;
int testNumber = 0;
// Use the first test vector to warm up the JVM and activate
// the intrinsics.
System.out.println("Running " + WARMUP_CYCLES + " warm up cycles");
for (int i = 0; i < WARMUP_CYCLES; i++) {
runSinglePartTest(testList.get(0));
}
System.out.println("----- Single-part Tests -----");
for (TestData test : testList) {
System.out.println("*** Test " + ++testNumber + ": " +
test.testName);
if (runSinglePartTest(test)) {
testsPassed++;
}
}
System.out.println();
System.out.println("----- Multi-part Tests -----");
for (TestData test : testList) {
System.out.println("*** Test " + ++testNumber + ": " +
test.testName);
if (runMultiPartTest(test)) {
testsPassed++;
}
}
System.out.println();
System.out.println("Total tests: " + testNumber +
", Passed: " + testsPassed + ", Failed: " +
(testNumber - testsPassed));
if (testsPassed != testNumber) {
throw new RuntimeException("One or more tests failed. " +
"Check output for details");
}
}
private static boolean runSinglePartTest(TestData testData)
throws GeneralSecurityException {
boolean encRes = false;
boolean decRes = false;
byte[] encryptedResult;
byte[] decryptedResult;
// Get a Cipher instance and set up the parameters
Cipher mambo = Cipher.getInstance("ChaCha20");
SecretKeySpec mamboKey = new SecretKeySpec(testData.key, "ChaCha20");
ChaCha20ParameterSpec mamboSpec = new ChaCha20ParameterSpec(
testData.nonce, testData.counter);
// Encrypt our input
mambo.init(Cipher.ENCRYPT_MODE, mamboKey, mamboSpec);
encryptedResult = mambo.doFinal(testData.input);
if (!Arrays.equals(encryptedResult, testData.expOutput)) {
System.out.println("ERROR - Output Mismatch!");
System.out.println("Expected:\n" +
dumpHexBytes(testData.expOutput, 16, "\n", " "));
System.out.println("Actual:\n" +
dumpHexBytes(encryptedResult, 16, "\n", " "));
System.out.println();
} else {
encRes = true;
}
// Decrypt the result of the encryption operation
mambo = Cipher.getInstance("ChaCha20");
mambo.init(Cipher.DECRYPT_MODE, mamboKey, mamboSpec);
decryptedResult = mambo.doFinal(encryptedResult);
if (!Arrays.equals(decryptedResult, testData.input)) {
System.out.println("ERROR - Output Mismatch!");
System.out.println("Expected:\n" +
dumpHexBytes(testData.input, 16, "\n", " "));
System.out.println("Actual:\n" +
dumpHexBytes(decryptedResult, 16, "\n", " "));
System.out.println();
} else {
decRes = true;
}
return (encRes && decRes);
}
private static boolean runMultiPartTest(TestData testData)
        throws GeneralSecurityException {
    // Build the cipher, key and nonce/counter parameters for this vector.
    Cipher cc20 = Cipher.getInstance("ChaCha20");
    SecretKeySpec keySpec = new SecretKeySpec(testData.key, "ChaCha20");
    ChaCha20ParameterSpec paramSpec = new ChaCha20ParameterSpec(
            testData.nonce, testData.counter);

    // Encrypt the plaintext in randomly-sized chunks and compare the
    // assembled ciphertext against the expected output.
    byte[] ctOut = new byte[testData.input.length];
    cc20.init(Cipher.ENCRYPT_MODE, keySpec, paramSpec);
    System.out.print("Encrypt - ");
    doMulti(cc20, testData.input, ctOut);
    boolean encPassed = Arrays.equals(ctOut, testData.expOutput);
    if (!encPassed) {
        System.out.println("ERROR - Output Mismatch!");
        System.out.println("Expected:\n" +
                dumpHexBytes(testData.expOutput));
        System.out.println("Actual:\n" +
                dumpHexBytes(ctOut));
        System.out.println();
    }

    // Decrypt the ciphertext (again in chunks) on a fresh cipher
    // instance and verify the original plaintext is recovered.
    cc20 = Cipher.getInstance("ChaCha20");
    byte[] ptOut = new byte[ctOut.length];
    cc20.init(Cipher.DECRYPT_MODE, keySpec, paramSpec);
    System.out.print("Decrypt - ");
    doMulti(cc20, ctOut, ptOut);
    boolean decPassed = Arrays.equals(ptOut, testData.input);
    if (!decPassed) {
        System.out.println("ERROR - Output Mismatch!");
        System.out.println("Expected:\n" + dumpHexBytes(testData.input));
        System.out.println("Actual:\n" + dumpHexBytes(ptOut));
        System.out.println();
    }

    return (encPassed && decPassed);
}
/**
 * Push {@code input} through the (already initialized) cipher in
 * randomly-sized chunks, writing the result into {@code output}.
 *
 * @param c the initialized ChaCha20 cipher to drive.
 * @param input the bytes to process.
 * @param output receives the processed bytes; must be at least
 *      {@code input.length} long.
 *
 * @throws GeneralSecurityException if any update/doFinal call fails.
 */
private static void doMulti(Cipher c, byte[] input, byte[] output)
        throws GeneralSecurityException {
    int offset = 0;
    boolean done = false;

    // Seed the RNG from the clock, but log the seed so a failing
    // chunk sequence can be reproduced exactly.
    long seed = System.currentTimeMillis();
    Random randIn = new Random(seed);
    System.out.println("Multipart RNG seed: " + seed);

    // Send small updates between 1 - 8 bytes in length until we get
    // 8 or less bytes from the end of the input, then finalize.
    System.out.println("Input length: " + input.length);
    System.out.print("Multipart (bytes in/out): ");
    while (!done) {
        int mPartLen = randIn.nextInt(8) + 1;       // chunk size: 1..8
        int bytesLeft = input.length - offset;
        int processed;
        if (mPartLen < bytesLeft) {
            // Interior chunk: goes through update()
            System.out.print(mPartLen + "/");
            processed = c.update(input, offset, mPartLen,
                    output, offset);
            offset += processed;
            System.out.print(processed + " ");
        } else {
            // Final chunk: everything remaining goes through doFinal()
            processed = c.doFinal(input, offset, bytesLeft,
                    output, offset);
            System.out.print(bytesLeft + "/" + processed + " ");
            done = true;
        }
    }
    System.out.println();
}
private static String dumpHexBytes(byte[] data) {
    // Convenience overload: standard hexdump layout of 16 bytes per
    // line, newline-separated lines, space-separated bytes.
    final int bytesPerLine = 16;
    return dumpHexBytes(data, bytesPerLine, "\n", " ");
}
/**
 * Render a byte array as uppercase hex, one delimiter after every
 * byte (including the last) and a line delimiter before each new
 * group of {@code itemsPerLine} bytes. A null array yields "".
 */
private static String dumpHexBytes(byte[] data, int itemsPerLine,
        String lineDelim, String itemDelim) {
    if (data == null) {
        return "";
    }
    StringBuilder buf = new StringBuilder();
    for (int idx = 0; idx < data.length; idx++) {
        // Break the line before every itemsPerLine-th byte, but
        // never before the very first one.
        if (idx != 0 && idx % itemsPerLine == 0) {
            buf.append(lineDelim);
        }
        buf.append(String.format("%02X", data[idx])).append(itemDelim);
    }
    return buf.toString();
}
}

View File

@ -0,0 +1,171 @@
/*
* Copyright (c) 2021, Red Hat, Inc. All rights reserved.
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.intrinsics.chacha;
import java.util.ArrayList;
import java.util.List;
import jdk.test.lib.Platform;
import jdk.test.lib.process.OutputAnalyzer;
import jdk.test.lib.process.ProcessTools;
import jdk.test.whitebox.cpuinfo.CPUInfo;
/**
* @test
* @bug 8247645
* @summary ChaCha20 Intrinsics
* @library /test/lib
* @build compiler.intrinsics.chacha.ExerciseChaCha20
* jdk.test.whitebox.WhiteBox
* @run driver jdk.test.lib.helpers.ClassFileInstaller jdk.test.whitebox.WhiteBox
* @run main/othervm/timeout=7200
* -Xbootclasspath/a:. -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI
* compiler.intrinsics.chacha.TestChaCha20
*/
/**
 * Driver that forks ExerciseChaCha20 under every SIMD configuration
 * that makes sense for the current CPU, running up to MAX_PARALLELISM
 * child JVMs concurrently and requiring each to exit 0.
 */
public class TestChaCha20 {
    // Default to 1/4 of the CPUs, and allow users to override.
    static final int MAX_PARALLELISM = Integer.getInteger("maxParallelism",
        Math.max(1, Runtime.getRuntime().availableProcessors() / 4));

    // Return a copy of o with the extra arguments appended.
    private static List<String> mix(List<String> o, String... mix) {
        List<String> n = new ArrayList<>(o);
        for (String m : mix) {
            n.add(m);
        }
        return n;
    }

    // True when any element of list contains sub as a substring.
    private static boolean containsFuzzy(List<String> list, String sub) {
        for (String s : list) {
            if (s.contains(sub)) return true;
        }
        return false;
    }

    public static void main(String... args) throws Exception {
        List<List<String>> configs = new ArrayList<>();
        List<String> cpuFeatures = CPUInfo.getFeatures();

        System.out.print("CPU Features: ");
        cpuFeatures.forEach(f -> System.out.print(f + " "));
        System.out.println();

        if (Platform.isX64()) {
            // If CPU features were not found, provide a default config.
            if (cpuFeatures.isEmpty()) {
                configs.add(new ArrayList<>());
            }

            // Otherwise, select the tests that make sense on current platform.
            if (containsFuzzy(cpuFeatures, "avx512")) {
                System.out.println("Setting up AVX512 worker");
                configs.add(List.of("-XX:UseAVX=3"));
            }
            if (containsFuzzy(cpuFeatures, "avx2")) {
                System.out.println("Setting up AVX2 worker");
                configs.add(List.of("-XX:UseAVX=2"));
            }
            if (containsFuzzy(cpuFeatures, "avx")) {
                System.out.println("Setting up AVX worker");
                configs.add(List.of("-XX:UseAVX=1"));
            }
        } else if (Platform.isAArch64()) {
            // AArch64 intrinsics require the advanced simd instructions
            if (containsFuzzy(cpuFeatures, "simd")) {
                System.out.println("Setting up ASIMD worker");
                configs.add(new ArrayList<>());
            }
        } else {
            // We only have ChaCha20 intrinsics on x64 and aarch64
            // currently.  If the platform is neither of these then
            // the ChaCha20 known answer tests in
            // com/sun/crypto/provider/Cipher are sufficient.
            return;
        }

        // If by this point we have no configs, it means we are running
        // on a platform that intrinsics have been written for, but does
        // not possess the necessary instruction sets for that processor.
        // We can exit out if that is the case.
        if (configs.isEmpty()) {
            System.out.println("No intrinsics-capable configurations found");
            return;
        }

        // We can expand this array later to include other tests if new
        // ChaCha20 intrinsics are developed.
        String[] classNames = {
            "compiler.intrinsics.chacha.ExerciseChaCha20"
        };

        ArrayList<Fork> forks = new ArrayList<>();
        int jobs = 0;

        for (List<String> c : configs) {
            for (String className : classNames) {
                // Start a new job
                {
                    ProcessBuilder pb = ProcessTools.createTestJvm(
                            mix(c, "-Xmx256m", className));
                    Process p = pb.start();
                    OutputAnalyzer oa = new OutputAnalyzer(p);
                    forks.add(new Fork(p, oa));
                    jobs++;
                }

                // Wait for the completion of other jobs
                while (jobs >= MAX_PARALLELISM) {
                    Fork f = findDone(forks);
                    if (f != null) {
                        OutputAnalyzer oa = f.oa();
                        oa.shouldHaveExitValue(0);
                        forks.remove(f);
                        jobs--;
                    } else {
                        // Nothing is done, wait a little.
                        Thread.sleep(200);
                    }
                }
            }
        }

        // Drain the rest
        for (Fork f : forks) {
            OutputAnalyzer oa = f.oa();
            oa.shouldHaveExitValue(0);
        }
    }

    // Return the first fork whose child process has exited, or null
    // if all are still running.
    private static Fork findDone(List<Fork> forks) {
        for (Fork f : forks) {
            if (!f.p().isAlive()) {
                return f;
            }
        }
        return null;
    }

    // Pairs a child process with the analyzer capturing its output.
    private static record Fork(Process p, OutputAnalyzer oa) {}
}

View File

@ -198,7 +198,7 @@ public abstract class CipherBench extends CryptoBase {
@Param({"256"})
private int keyLength;
@Param({"1024", "" + 16 * 1024})
@Param({"256", "1024", "4096", "16384"})
private int dataSize;
protected int ivLength() {
@ -223,7 +223,7 @@ public abstract class CipherBench extends CryptoBase {
@Param({"256"})
private int keyLength;
@Param({"1024", "" + 16 * 1024})
@Param({"256", "1024", "4096", "16384"})
private int dataSize;
protected int ivLength() {