8316592: RISC-V: implement poly1305 intrinsic

Reviewed-by: fyang, luhenry, mli
This commit is contained in:
ArsenyBochkarev 2023-11-21 07:36:55 +00:00 committed by Vladimir Kempik
parent 3544d2dd86
commit 46e4028adf
4 changed files with 237 additions and 0 deletions

View File

@ -2048,6 +2048,23 @@ void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1
beq(trial_klass, tmp1, L);
}
// Multiply and multiply-accumulate unsigned 64-bit registers.
// Emits a full 64 x 64 -> 128-bit unsigned multiply: prod_hi:prod_lo = n * m.
//
// prod_lo is written by the first instruction, before the second one reads
// n and m again, so prod_lo must not alias either input (otherwise the high
// word would be computed from a clobbered operand). prod_hi is written last
// and is therefore only required to differ from prod_lo.
void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
  assert_different_registers(prod_lo, prod_hi);
  // Checked pairwise rather than as one set so that n == m (squaring) stays legal.
  assert_different_registers(prod_lo, n);
  assert_different_registers(prod_lo, m);

  mul(prod_lo, n, m);    // low 64 bits of the product
  mulhu(prod_hi, n, m);  // high 64 bits, both operands treated as unsigned
}
// Emits a 128-bit multiply-accumulate: sum_hi:sum_lo += n * m (unsigned).
//
// tmp1 and tmp2 are scratch registers: they receive the low and high words
// of the product and are clobbered. Any carry out of sum_hi is discarded.
//
// Register constraints:
//  - sum_lo, sum_hi, tmp1 and tmp2 must all be distinct, otherwise the
//    product would overwrite part of the accumulator (or of itself) before
//    it has been added in.
//  - tmp1 must not alias n or m, because it is written before the high word
//    of the product is computed from those same inputs.
void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
                               Register m, Register tmp1, Register tmp2) {
  assert_different_registers(sum_lo, sum_hi, tmp1, tmp2);
  // Checked pairwise so that n == m (squaring) stays legal.
  assert_different_registers(tmp1, n);
  assert_different_registers(tmp1, m);

  wide_mul(tmp1, tmp2, n, m);       // tmp2:tmp1 = n * m
  cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
  adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
}
// Move an oop into a register.
void MacroAssembler::movoop(Register dst, jobject obj) {
int oop_index;

View File

@ -198,6 +198,10 @@ class MacroAssembler: public Assembler {
void store_klass(Register dst, Register src, Register tmp = t0);
void cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L);
// Emits mul + mulhu so that prod_hi:prod_lo receives the unsigned 128-bit
// product n * m. prod_lo and prod_hi must be different registers.
void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m);
// Emits code that adds the unsigned 128-bit product n * m into sum_hi:sum_lo.
// tmp1 and tmp2 hold the intermediate product and are clobbered.
void wide_madd(Register sum_lo, Register sum_hi, Register n,
Register m, Register tmp1, Register tmp2);
void encode_klass_not_null(Register r, Register tmp = t0);
void decode_klass_not_null(Register r, Register tmp = t0);
void encode_klass_not_null(Register dst, Register src, Register tmp);

View File

@ -4419,6 +4419,214 @@ class StubGenerator: public StubCodeGenerator {
return (address) start;
}
#ifdef COMPILER2
// Immediate masks used by the Poly1305 stub below: right_2_bits keeps only
// the lowest two bits of a register (used when reducing U_2), right_3_bits
// keeps the lowest three (used when storing the top limb of the result).
static const int64_t right_2_bits = right_n_bits(2);
static const int64_t right_3_bits = right_n_bits(3);
// In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
// are represented as long[5], with BITS_PER_LIMB = 26.
// Pack five 26-bit limbs into three 64-bit registers.
// Emits code that loads the five jlong limbs stored at src and combines them
// as limb0 + (limb1 << 26) + (limb2 << 52) + (limb3 << 78) + (limb4 << 104),
// leaving the result in dest2:dest1:dest0.
//
//   dest0, dest1 - receive bits 0..63 and 64..127 of the packed value
//   dest2        - receives the bits above 127 (limb4 >> 24), or noreg if the
//                  caller only wants 128 bits; in that case debug builds emit
//                  a check that those bits are zero
//   src          - address of the first limb; not modified
//   tmp1, tmp2   - scratch registers, clobbered
//
// Limbs are combined with add rather than or, exactly as loaded.
void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
  assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);

  // The goal is to have 128-bit value in dest2:dest1:dest0
  __ ld(dest0, Address(src, 0));                 // 26 bits in dest0

  __ ld(tmp1, Address(src, sizeof(jlong)));
  __ slli(tmp1, tmp1, 26);
  __ add(dest0, dest0, tmp1);                    // 52 bits in dest0

  // The third limb straddles the dest0/dest1 boundary: its low 12 bits
  // complete dest0, the remaining 14 bits start dest1.
  __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
  __ slli(tmp1, tmp2, 52);
  __ add(dest0, dest0, tmp1);                    // dest0 is full

  __ srli(dest1, tmp2, 12);                      // 14-bit in dest1

  __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
  __ slli(tmp1, tmp1, 14);
  __ add(dest1, dest1, tmp1);                    // 40-bit in dest1

  // The fifth limb straddles the dest1/dest2 boundary: its low 24 bits
  // complete dest1, whatever lies above them belongs in dest2.
  __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
  __ slli(tmp2, tmp1, 40);
  __ add(dest1, dest1, tmp2);                    // dest1 is full

  if (dest2->is_valid()) {
    // Shift straight into dest2; tmp1 is scratch, so there is no need to
    // shift it in place and then copy it.
    __ srli(dest2, tmp1, 24);                    // 2 bits in dest2
  } else {
#ifdef ASSERT
    Label OK;
    __ srli(tmp1, tmp1, 24);
    __ beq(zr, tmp1, OK);                        // 2 bits
    __ stop("high bits of Poly1305 integer should be zero");
    __ should_not_reach_here();
    __ bind(OK);
#endif
  }
}
// As above, but return only a 128-bit integer, packed into two
// 64-bit registers.
// Convenience overload: forwards to the three-destination packer with no
// register for the word above bit 127, so only dest1:dest0 is produced.
// tmp1 and tmp2 are scratch registers and are clobbered.
void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
  const Register no_top_word = noreg;
  poly1305_pack_26(dest0, dest1, no_top_word, src, tmp1, tmp2);
}
// U_2:U_1:U_0 = (U_2 & 3):U_1:U_0 + (U_2 >> 2) * 5
// Emits a partial reduction of the 192-bit value U_2:U_1:U_0: everything in
// U_2 above its two lowest bits is removed and added back into the low end
// multiplied by 5. The multiplication is done as two additions, x and 4 * x,
// where x = U_2 >> 2. tmp1 and tmp2 are scratch registers and are clobbered.
void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
  assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);

  // Pushes the carry currently held in tmp2 up through U_1 and into U_2.
  auto ripple_carry = [&]() {
    __ cad(U_1, U_1, tmp2, tmp2);    // U_1 += carry, new carry back into tmp2
    __ add(U_2, U_2, tmp2);          // U_2 += carry
  };

  // Step 1: add x = (U_2 >> 2) once, keeping only the low two bits of U_2.
  __ srli(tmp1, U_2, 2);
  __ cad(U_0, U_0, tmp1, tmp2);      // U_0 += x, carry out in tmp2
  __ andi(U_2, U_2, right_2_bits);   // drop the bits that were just folded in
  ripple_carry();

  // Step 2: add 4 * x, which together with step 1 amounts to 5 * x.
  __ slli(tmp1, tmp1, 2);
  __ cad(U_0, U_0, tmp1, tmp2);      // U_0 += 4 * x, carry out in tmp2
  ripple_carry();
}
// Poly1305, RFC 7539
// void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
// Arguments:
// c_rarg0: input_start -- where the input is stored
// c_rarg1: length
// c_rarg2: acc_start -- where the output will be stored
// c_rarg3: r_start -- where the randomly generated 128-bit key is stored
// See https://loup-vaillant.fr/tutorials/poly1305-design for a
// description of the tricks used to simplify and accelerate this
// computation.
// Generates the poly1305_processBlocks stub and returns its entry address.
//
// The generated code packs the key (r_start) and the accumulator (acc_start)
// from five 26-bit limbs into 64-bit registers, then for every complete
// 16-byte block of input computes acc = (acc + block + 2^128) * r with a
// partial reduction modulo 2^130 - 5, and finally writes the accumulator
// back to acc_start as five 26-bit limbs. If fewer than 16 bytes are
// supplied the accumulator in memory is left untouched. Trailing bytes that
// do not form a complete block are not consumed.
//
// Clobbers t0 (implicitly, via macro instructions), t1, t2 and the registers
// drawn from the pool below; x18-x21 are saved and restored.
address generate_poly1305_processBlocks() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
  address start = __ pc();
  __ enter();

  // x18-x21 are part of the register pool handed out by `regs`, so they are
  // pushed here and popped again before returning.
  RegSet saved_regs = RegSet::range(x18, x21);
  // Pool: x14-x21 and x28-x31, i.e. exactly the twelve registers consumed
  // below (R_0/1, RR_0/1, U_0/1/2, S_0/1/2, U_0HI, U_1HI).
  RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
  __ push_reg(saved_regs, sp);

  // Arguments
  const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;

  // R_n is the 128-bit randomly-generated key, packed into two
  // registers. The caller passes this key to us as long[5], with
  // BITS_PER_LIMB = 26.
  const Register R_0 = *regs, R_1 = *++regs;
  poly1305_pack_26(R_0, R_1, r_start, t1, t2);

  // RR_n is (R_n >> 2) * 5
  const Register RR_0 = *++regs, RR_1 = *++regs;
  __ srli(t1, R_0, 2);
  __ shadd(RR_0, t1, t1, t2, 2);   // RR_0 = (t1 << 2) + t1
  __ srli(t1, R_1, 2);
  __ shadd(RR_1, t1, t1, t2, 2);   // RR_1 = (t1 << 2) + t1

  // U_n is the current checksum
  const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
  poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);

  static constexpr int BLOCK_LENGTH = 16;
  Label DONE, LOOP;

  // Nothing to do unless at least one complete block is available.
  __ mv(t1, BLOCK_LENGTH);
  __ blt(length, t1, DONE);
  {
    __ bind(LOOP);

    // S_n is to be the sum of U_n and the next block of data
    const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
    // NOTE(review): input_start points into a byte[], so these 8-byte loads
    // are presumably not guaranteed to be naturally aligned -- confirm that
    // this is acceptable on the targeted hardware.
    __ ld(S_0, Address(input_start, 0));
    __ ld(S_1, Address(input_start, wordSize));

    __ cad(S_0, S_0, U_0, t1);   // Add U_0 to S_0 with carry output to t1
    __ cadc(S_1, S_1, U_1, t1);  // Add U_1 with carry to S_1 with carry output to t1
    __ add(S_2, U_2, t1);

    // Add 1 to the third word, i.e. 2^128 to the sum: the extra high bit
    // that accompanies every complete 16-byte block.
    __ addi(S_2, S_2, 1);

    const Register U_0HI = *++regs, U_1HI = *++regs;

    // NB: this logic depends on some of the special properties of
    // Poly1305 keys. In particular, because we know that the top
    // four bits of R_0 and R_1 are zero, we can add together
    // partial products without any risk of needing to propagate a
    // carry out.
    __ wide_mul(U_0, U_0HI, S_0, R_0);
    __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
    __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);

    __ wide_mul(U_1, U_1HI, S_0, R_1);
    __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
    __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);

    // U_2 = S_2 * (R_0 & 3)
    __ andi(U_2, R_0, right_2_bits);
    __ mul(U_2, S_2, U_2);

    // Partial reduction mod 2**130 - 5
    __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
    __ adc(U_2, U_2, U_1HI, t1);
    // Sum is now in U_2:U_1:U_0.

    // U_2:U_1:U_0 += (U_2 >> 2) * 5
    poly1305_reduce(U_2, U_1, U_0, t1, t2);

    // Advance to the next block; t1 was clobbered above, so the block
    // length has to be re-materialized for the loop test.
    __ sub(length, length, BLOCK_LENGTH);
    __ addi(input_start, input_start, BLOCK_LENGTH);
    __ mv(t1, BLOCK_LENGTH);
    __ bge(length, t1, LOOP);
  }

  // Further reduce modulo 2^130 - 5
  poly1305_reduce(U_2, U_1, U_0, t1, t2);

  // Unpack the sum into five 26-bit limbs and write to memory.

  // First 26 bits is the first limb
  __ slli(t1, U_0, 38); // Take lowest 26 bits
  __ srli(t1, t1, 38);
  __ sd(t1, Address(acc_start)); // First 26-bit limb

  // 27-52 bits of U_0 is the second limb
  __ slli(t1, U_0, 12); // Take next 27-52 bits
  __ srli(t1, t1, 38);
  __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb

  // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
  __ srli(t1, U_0, 52);
  __ slli(t2, U_1, 50);
  __ srli(t2, t2, 38);
  __ add(t1, t1, t2);
  __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb

  // Storing 15-40 bits of U_1
  __ slli(t1, U_1, 24); // Already used up 14 bits
  __ srli(t1, t1, 38); // Clear all other bits from t1
  __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb

  // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
  __ srli(t1, U_1, 40);
  __ andi(t2, U_2, right_3_bits);
  __ slli(t2, t2, 24);
  __ add(t1, t1, t2);
  __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb

  __ bind(DONE);
  __ pop_reg(saved_regs, sp);
  __ leave(); // Required for proper stackwalking
  __ ret();

  return start;
}
#endif // COMPILER2
#if INCLUDE_JFR
static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
@ -4640,6 +4848,10 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_montgomerySquare = g.generate_square();
}
if (UsePoly1305Intrinsics) {
StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
}
if (UseRVVForBigIntegerShiftIntrinsics) {
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();

View File

@ -191,6 +191,10 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseMD5Intrinsics, true);
}
if (FLAG_IS_DEFAULT(UsePoly1305Intrinsics)) {
FLAG_SET_DEFAULT(UsePoly1305Intrinsics, true);
}
if (FLAG_IS_DEFAULT(UseCopySignIntrinsic)) {
FLAG_SET_DEFAULT(UseCopySignIntrinsic, true);
}