8315716: RISC-V: implement ChaCha20 intrinsic

Reviewed-by: luhenry, fyang
This commit is contained in:
Hamlin Li 2023-10-11 14:48:28 +00:00
parent 8a9c4d5266
commit 8f8c45b54a
4 changed files with 163 additions and 0 deletions

View File

@ -1790,6 +1790,11 @@ enum Nf {
INSN(vlse32_v, 0b0000111, 0b110, 0b10, 0b0);
INSN(vlse64_v, 0b0000111, 0b111, 0b10, 0b0);
INSN(vsse8_v, 0b0100111, 0b000, 0b10, 0b0);
INSN(vsse16_v, 0b0100111, 0b101, 0b10, 0b0);
INSN(vsse32_v, 0b0100111, 0b110, 0b10, 0b0);
INSN(vsse64_v, 0b0100111, 0b111, 0b10, 0b0);
#undef INSN
#undef patch_VLdSt

View File

@ -1289,6 +1289,13 @@ public:
}
// vector pseudo instructions
// rotate vector register left with shift bits, 32-bit version
inline void vrole32_vi(VectorRegister vd, uint32_t shift, VectorRegister tmp_vr) {
vsrl_vi(tmp_vr, vd, 32 - shift);
vsll_vi(vd, vd, shift);
vor_vv(vd, vd, tmp_vr);
}
inline void vl1r_v(VectorRegister vd, Register rs) {
vl1re8_v(vd, rs);
}

View File

@ -4277,6 +4277,142 @@ class StubGenerator: public StubCodeGenerator {
return (address) start;
}
/**
* Perform the quarter round calculations on values contained within four vector registers.
*
* @param aVec the SIMD register containing only the "a" values
* @param bVec the SIMD register containing only the "b" values
* @param cVec the SIMD register containing only the "c" values
* @param dVec the SIMD register containing only the "d" values
* @param tmp_vr temporary vector register holds intermedia values.
*/
void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
// a += b, d ^= a, d <<<= 16
__ vadd_vv(aVec, aVec, bVec);
__ vxor_vv(dVec, dVec, aVec);
__ vrole32_vi(dVec, 16, tmp_vr);
// c += d, b ^= c, b <<<= 12
__ vadd_vv(cVec, cVec, dVec);
__ vxor_vv(bVec, bVec, cVec);
__ vrole32_vi(bVec, 12, tmp_vr);
// a += b, d ^= a, d <<<= 8
__ vadd_vv(aVec, aVec, bVec);
__ vxor_vv(dVec, dVec, aVec);
__ vrole32_vi(dVec, 8, tmp_vr);
// c += d, b ^= c, b <<<= 7
__ vadd_vv(cVec, cVec, dVec);
__ vxor_vv(bVec, bVec, cVec);
__ vrole32_vi(bVec, 7, tmp_vr);
}
/**
* int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
*
* Input arguments:
* c_rarg0 - state, the starting state
* c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function
*
* Implementation Note:
* Parallelization is achieved by loading individual state elements into vectors for N blocks.
* N depends on single vector register length.
*/
address generate_chacha20Block() {
Label L_Rounds;
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "chacha20Block");
address start = __ pc();
__ enter();
const int states_len = 16;
const int step = 4;
const Register state = c_rarg0;
const Register key_stream = c_rarg1;
const Register tmp_addr = t0;
const Register length = t1;
// Organize vector registers in an array that facilitates
// putting repetitive opcodes into loop structures below.
const VectorRegister work_vrs[16] = {
v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15
};
const VectorRegister tmp_vr = v16;
const VectorRegister counter_vr = v17;
{
// Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024
// in java level.
__ vsetivli(length, 16, Assembler::e32, Assembler::m1);
}
// Load from source state.
// Every element in source state is duplicated to all elements in the corresponding vector.
__ mv(tmp_addr, state);
for (int i = 0; i < states_len; i += 1) {
__ vlse32_v(work_vrs[i], tmp_addr, zr);
__ addi(tmp_addr, tmp_addr, step);
}
// Adjust counter for every individual block.
__ vid_v(counter_vr);
__ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
// Perform 10 iterations of the 8 quarter round set
{
const Register loop = t2; // share t2 with other non-overlapping usages.
__ mv(loop, 10);
__ BIND(L_Rounds);
chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr);
chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr);
chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr);
chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr);
__ sub(loop, loop, 1);
__ bnez(loop, L_Rounds);
}
// Add the original state into the end working state.
// We do this by first duplicating every element in source state array to the corresponding
// vector, then adding it to the post-loop working state.
__ mv(tmp_addr, state);
for (int i = 0; i < states_len; i += 1) {
__ vlse32_v(tmp_vr, tmp_addr, zr);
__ addi(tmp_addr, tmp_addr, step);
__ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
}
// Add the counter overlay onto work_vrs[12] at the end.
__ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
// Store result to key stream.
{
const Register stride = t2; // share t2 with other non-overlapping usages.
// Every block occupies 64 bytes, so we use 64 as stride of the vector store.
__ mv(stride, 64);
for (int i = 0; i < states_len; i += 1) {
__ vsse32_v(work_vrs[i], key_stream, stride);
__ addi(key_stream, key_stream, step);
}
}
// Return length of output key_stream
__ slli(c_rarg0, length, 6);
__ leave();
__ ret();
return (address) start;
}
#if INCLUDE_JFR
static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
@ -4496,6 +4632,11 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
}
if (UseChaCha20Intrinsics) {
StubRoutines::_chacha20Block = generate_chacha20Block();
}
#endif // COMPILER2_OR_JVMCI
}

View File

@ -253,6 +253,16 @@ void VM_Version::initialize() {
warning("Block zeroing is not available");
FLAG_SET_DEFAULT(UseBlockZeroing, false);
}
if (UseRVV) {
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, true);
}
} else if (UseChaCha20Intrinsics) {
if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
warning("Chacha20 intrinsic requires RVV instructions (not available on this CPU)");
}
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
}
#ifdef COMPILER2
c2_initialize();