From 8f8c45b54a0ca2d676b76521fef87fb3a3ccad97 Mon Sep 17 00:00:00 2001 From: Hamlin Li Date: Wed, 11 Oct 2023 14:48:28 +0000 Subject: [PATCH] 8315716: RISC-V: implement ChaCha20 intrinsic Reviewed-by: luhenry, fyang --- src/hotspot/cpu/riscv/assembler_riscv.hpp | 5 + .../cpu/riscv/macroAssembler_riscv.hpp | 7 + src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 141 ++++++++++++++++++ src/hotspot/cpu/riscv/vm_version_riscv.cpp | 10 ++ 4 files changed, 163 insertions(+) diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp index 1278954740e..bdbf472f0b6 100644 --- a/src/hotspot/cpu/riscv/assembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp @@ -1790,6 +1790,11 @@ enum Nf { INSN(vlse32_v, 0b0000111, 0b110, 0b10, 0b0); INSN(vlse64_v, 0b0000111, 0b111, 0b10, 0b0); + INSN(vsse8_v, 0b0100111, 0b000, 0b10, 0b0); + INSN(vsse16_v, 0b0100111, 0b101, 0b10, 0b0); + INSN(vsse32_v, 0b0100111, 0b110, 0b10, 0b0); + INSN(vsse64_v, 0b0100111, 0b111, 0b10, 0b0); + #undef INSN #undef patch_VLdSt diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp index da3487392e6..9c664d9d278 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp @@ -1289,6 +1289,13 @@ public: } // vector pseudo instructions + // rotate vector register left with shift bits, 32-bit version + inline void vrole32_vi(VectorRegister vd, uint32_t shift, VectorRegister tmp_vr) { + vsrl_vi(tmp_vr, vd, 32 - shift); + vsll_vi(vd, vd, shift); + vor_vv(vd, vd, tmp_vr); + } + inline void vl1r_v(VectorRegister vd, Register rs) { vl1re8_v(vd, rs); } diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index 4bd3938f3fc..6f302f747bb 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -4277,6 +4277,142 @@ class StubGenerator: public StubCodeGenerator { return (address) start; } + /** + * Perform the quarter round calculations on values contained within four vector registers. + * + * @param aVec the SIMD register containing only the "a" values + * @param bVec the SIMD register containing only the "b" values + * @param cVec the SIMD register containing only the "c" values + * @param dVec the SIMD register containing only the "d" values + * @param tmp_vr temporary vector register holds intermedia values. + */ + void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec, + VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) { + // a += b, d ^= a, d <<<= 16 + __ vadd_vv(aVec, aVec, bVec); + __ vxor_vv(dVec, dVec, aVec); + __ vrole32_vi(dVec, 16, tmp_vr); + + // c += d, b ^= c, b <<<= 12 + __ vadd_vv(cVec, cVec, dVec); + __ vxor_vv(bVec, bVec, cVec); + __ vrole32_vi(bVec, 12, tmp_vr); + + // a += b, d ^= a, d <<<= 8 + __ vadd_vv(aVec, aVec, bVec); + __ vxor_vv(dVec, dVec, aVec); + __ vrole32_vi(dVec, 8, tmp_vr); + + // c += d, b ^= c, b <<<= 7 + __ vadd_vv(cVec, cVec, dVec); + __ vxor_vv(bVec, bVec, cVec); + __ vrole32_vi(bVec, 7, tmp_vr); + } + + /** + * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result) + * + * Input arguments: + * c_rarg0 - state, the starting state + * c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function + * + * Implementation Note: + * Parallelization is achieved by loading individual state elements into vectors for N blocks. + * N depends on single vector register length. + */ + address generate_chacha20Block() { + Label L_Rounds; + + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "chacha20Block"); + address start = __ pc(); + __ enter(); + + const int states_len = 16; + const int step = 4; + const Register state = c_rarg0; + const Register key_stream = c_rarg1; + const Register tmp_addr = t0; + const Register length = t1; + + // Organize vector registers in an array that facilitates + // putting repetitive opcodes into loop structures below. + const VectorRegister work_vrs[16] = { + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15 + }; + const VectorRegister tmp_vr = v16; + const VectorRegister counter_vr = v17; + + { + // Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024 + // in java level. + __ vsetivli(length, 16, Assembler::e32, Assembler::m1); + } + + // Load from source state. + // Every element in source state is duplicated to all elements in the corresponding vector. + __ mv(tmp_addr, state); + for (int i = 0; i < states_len; i += 1) { + __ vlse32_v(work_vrs[i], tmp_addr, zr); + __ addi(tmp_addr, tmp_addr, step); + } + // Adjust counter for every individual block. + __ vid_v(counter_vr); + __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr); + + // Perform 10 iterations of the 8 quarter round set + { + const Register loop = t2; // share t2 with other non-overlapping usages. + __ mv(loop, 10); + __ BIND(L_Rounds); + + chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr); + chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr); + chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr); + chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr); + + chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr); + chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr); + chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr); + chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr); + + __ sub(loop, loop, 1); + __ bnez(loop, L_Rounds); + } + + // Add the original state into the end working state. + // We do this by first duplicating every element in source state array to the corresponding + // vector, then adding it to the post-loop working state. + __ mv(tmp_addr, state); + for (int i = 0; i < states_len; i += 1) { + __ vlse32_v(tmp_vr, tmp_addr, zr); + __ addi(tmp_addr, tmp_addr, step); + __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr); + } + // Add the counter overlay onto work_vrs[12] at the end. + __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr); + + // Store result to key stream. + { + const Register stride = t2; // share t2 with other non-overlapping usages. + // Every block occupies 64 bytes, so we use 64 as stride of the vector store. + __ mv(stride, 64); + for (int i = 0; i < states_len; i += 1) { + __ vsse32_v(work_vrs[i], key_stream, stride); + __ addi(key_stream, key_stream, step); + } + } + + // Return length of output key_stream + __ slli(c_rarg0, length, 6); + + __ leave(); + __ ret(); + + return (address) start; + } + #if INCLUDE_JFR static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { @@ -4496,6 +4632,11 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); } + + if (UseChaCha20Intrinsics) { + StubRoutines::_chacha20Block = generate_chacha20Block(); + } + #endif // COMPILER2_OR_JVMCI } diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp index a19a4e4b08a..9a88be33d90 100644 --- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp +++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp @@ -253,6 +253,16 @@ void VM_Version::initialize() { warning("Block zeroing is not available"); FLAG_SET_DEFAULT(UseBlockZeroing, false); } + if (UseRVV) { + if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) { + FLAG_SET_DEFAULT(UseChaCha20Intrinsics, true); + } + } else if (UseChaCha20Intrinsics) { + if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) { + warning("Chacha20 intrinsic requires RVV instructions (not available on this CPU)"); + } + FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false); + } #ifdef COMPILER2 c2_initialize();