8315716: RISC-V: implement ChaCha20 intrinsic
Reviewed-by: luhenry, fyang
parent 8a9c4d5266
commit 8f8c45b54a
@@ -1790,6 +1790,11 @@ enum Nf {
  INSN(vlse32_v, 0b0000111, 0b110, 0b10, 0b0);
  INSN(vlse64_v, 0b0000111, 0b111, 0b10, 0b0);

  INSN(vsse8_v, 0b0100111, 0b000, 0b10, 0b0);
  INSN(vsse16_v, 0b0100111, 0b101, 0b10, 0b0);
  INSN(vsse32_v, 0b0100111, 0b110, 0b10, 0b0);
  INSN(vsse64_v, 0b0100111, 0b111, 0b10, 0b0);

#undef INSN

#undef patch_VLdSt

@@ -1289,6 +1289,13 @@ public:
  }

  // vector pseudo instructions
  // rotate each 32-bit element of the vector register left by `shift` bits
  inline void vrole32_vi(VectorRegister vd, uint32_t shift, VectorRegister tmp_vr) {
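    // The rotate-left is composed from two shifts and an OR: tmp_vr receives the bits
    // that wrap around from the top, so that vd can be overwritten by the left shift.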
    vsrl_vi(tmp_vr, vd, 32 - shift);
    vsll_vi(vd, vd, shift);
    vor_vv(vd, vd, tmp_vr);
  }

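  // load one whole vector register from memory at (rs); whole-register loads
  // do not depend on the current vtype setting, hence the fixed EEW=8 form.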
  inline void vl1r_v(VectorRegister vd, Register rs) {
    vl1re8_v(vd, rs);
  }

@@ -4277,6 +4277,142 @@ class StubGenerator: public StubCodeGenerator {
    return (address) start;
  }

  /**
   * Perform the quarter round calculations on values contained within four vector registers.
   *
   * @param aVec the SIMD register containing only the "a" values
   * @param bVec the SIMD register containing only the "b" values
   * @param cVec the SIMD register containing only the "c" values
   * @param dVec the SIMD register containing only the "d" values
   * @param tmp_vr temporary vector register that holds intermediate values
   */
  void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
                              VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
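    // Each 32-bit lane of the four vectors belongs to one independent ChaCha20 block,
    // so every instruction below advances all of the parallel blocks at once.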
    // a += b, d ^= a, d <<<= 16
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 16, tmp_vr);

    // c += d, b ^= c, b <<<= 12
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 12, tmp_vr);

    // a += b, d ^= a, d <<<= 8
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 8, tmp_vr);

    // c += d, b ^= c, b <<<= 7
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 7, tmp_vr);
  }

  /**
   * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
   *
   * Input arguments:
   *  c_rarg0 - state, the starting state
   *  c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function
   *
   * Implementation Note:
   *  Parallelization is achieved by loading individual state elements into vectors for N blocks.
   *  N depends on the vector register length (VLEN).
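   *  For example, with VLEN = 128 each vector register holds four 32-bit elements, so
   *  N = 4 and a single call produces 4 * 64 = 256 bytes of key stream.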
   */
  address generate_chacha20Block() {
    Label L_Rounds;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "chacha20Block");
    address start = __ pc();
    __ enter();

    const int states_len = 16;
    const int step = 4;
    const Register state = c_rarg0;
    const Register key_stream = c_rarg1;
    const Register tmp_addr = t0;
    const Register length = t1;

    // Organize vector registers in an array that facilitates
    // putting repetitive opcodes into loop structures below.
    const VectorRegister work_vrs[16] = {
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    };
    const VectorRegister tmp_vr = v16;
    const VectorRegister counter_vr = v17;

    {
      // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
      // at the Java level.
      __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
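      // The granted vl is min(16, VLMAX), so a wider vector unit processes more
      // blocks per call while the output never exceeds 16 * 64 = 1024 bytes.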
    }

    // Load from source state.
    // Every element in source state is duplicated to all elements in the corresponding vector.
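    // A stride of zero (the zr register) makes vlse32_v read the same 32-bit word
    // for every element, i.e. a broadcast load.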
    __ mv(tmp_addr, state);
    for (int i = 0; i < states_len; i += 1) {
      __ vlse32_v(work_vrs[i], tmp_addr, zr);
      __ addi(tmp_addr, tmp_addr, step);
    }
    // Adjust counter for every individual block.
    __ vid_v(counter_vr);
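    // vid_v fills counter_vr with the element indices 0, 1, 2, ..., so lane i works
    // on the block whose counter is state[12] + i.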
    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);

    // Perform 10 iterations of the 8 quarter round set
    {
      const Register loop = t2; // share t2 with other non-overlapping usages.
      __ mv(loop, 10);
      __ BIND(L_Rounds);

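      // Column round: quarter rounds on the four columns of the 4x4 state matrix.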
      chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr);
      chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr);
      chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
      chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);

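      // Diagonal round: quarter rounds on the four diagonals of the state matrix.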
      chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
      chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
      chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr);
      chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr);

      __ sub(loop, loop, 1);
      __ bnez(loop, L_Rounds);
    }

    // Add the original state into the final working state.
    // We do this by first duplicating every element in the source state array to the corresponding
    // vector, then adding it to the post-loop working state.
    __ mv(tmp_addr, state);
    for (int i = 0; i < states_len; i += 1) {
      __ vlse32_v(tmp_vr, tmp_addr, zr);
      __ addi(tmp_addr, tmp_addr, step);
      __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
    }
    // Add the counter overlay onto work_vrs[12] at the end.
    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);

    // Store result to key stream.
    {
      const Register stride = t2; // share t2 with other non-overlapping usages.
      // Every block occupies 64 bytes, so we use 64 as the stride of the vector store.
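      // vsse32_v scatters element i of each vector to key_stream + i * 64, which
      // de-interleaves the N parallel blocks back into consecutive 64-byte blocks.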
      __ mv(stride, 64);
      for (int i = 0; i < states_len; i += 1) {
        __ vsse32_v(work_vrs[i], key_stream, stride);
        __ addi(key_stream, key_stream, step);
      }
    }

    // Return length of output key_stream
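    // `length` still holds the granted vl (the number of blocks), and each block is
    // 64 bytes, so the byte count is vl << 6.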
    __ slli(c_rarg0, length, 6);

    __ leave();
    __ ret();

    return (address) start;
  }

#if INCLUDE_JFR

  static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {

@@ -4496,6 +4632,11 @@ class StubGenerator: public StubCodeGenerator {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
    }

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block();
    }

#endif // COMPILER2_OR_JVMCI
  }

@@ -253,6 +253,16 @@ void VM_Version::initialize() {
    warning("Block zeroing is not available");
    FLAG_SET_DEFAULT(UseBlockZeroing, false);
  }
  if (UseRVV) {
    if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
      FLAG_SET_DEFAULT(UseChaCha20Intrinsics, true);
    }
  } else if (UseChaCha20Intrinsics) {
    if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
      warning("Chacha20 intrinsic requires RVV instructions (not available on this CPU)");
    }
    FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
  }

#ifdef COMPILER2
  c2_initialize();