8186915: AARCH64: Intrinsify squareToLen and mulAdd
Reviewed-by: aph
This commit is contained in:
parent
87a8a4301f
commit
b787f1b55b
@ -2840,6 +2840,44 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Regi
|
||||
bind(L_done);
|
||||
}
|
||||
|
||||
// Code for BigInteger::mulAdd intrinsic
|
||||
// out = r0
|
||||
// in = r1
|
||||
// offset = r2 (already out.length-offset)
|
||||
// len = r3
|
||||
// k = r4
|
||||
//
|
||||
// pseudo code from java implementation:
|
||||
// carry = 0;
|
||||
// offset = out.length-offset - 1;
|
||||
// for (int j=len-1; j >= 0; j--) {
|
||||
// product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
|
||||
// out[offset--] = (int)product;
|
||||
// carry = product >>> 32;
|
||||
// }
|
||||
// return (int)carry;
|
||||
// Emits the multiply-accumulate loop: walks `in` from its last element down
// to its first, multiplying each 32-bit word by k and adding it (plus the
// running carry) into the corresponding word of `out`.
// On exit `out` holds the final carry (0 when len == 0).
// The registers passed as in, offset and len are overwritten, as are
// rscratch1 and rscratch2.
// NOTE(review): the Java pseudo code multiplies by kLong (k masked to 32
// bits); here k is used as passed, so presumably the caller guarantees the
// upper half of k is clear - confirm.
void MacroAssembler::mul_add(Register out, Register in, Register offset,
|
||||
Register len, Register k) {
|
||||
Label LOOP, END;
|
||||
// pre-loop
|
||||
cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
|
||||
// len == 0: the result (carry) is 0, so clear out and skip the loop.
csel(out, zr, out, Assembler::EQ);
|
||||
br(Assembler::EQ, END);
|
||||
// Point both cursors one 4-byte element past the first word to process;
// the loop pre-decrements before every access.
add(in, in, len, LSL, 2); // in[j+1] address
|
||||
add(offset, out, offset, LSL, 2); // out[offset + 1] address
|
||||
// The out base address is no longer needed once offset is absolute.
mov(out, zr); // used to keep carry now
|
||||
BIND(LOOP);
|
||||
// rscratch1 = in[j] (cursor moved back by one element first)
ldrw(rscratch1, Address(pre(in, -4)));
|
||||
// rscratch1 = in[j] * k + carry
madd(rscratch1, rscratch1, k, out);
|
||||
// rscratch2 = out[offset] (cursor moved back by one element first)
ldrw(rscratch2, Address(pre(offset, -4)));
|
||||
// rscratch1 = product
add(rscratch1, rscratch1, rscratch2);
|
||||
// out[offset] = (int)product
strw(rscratch1, Address(offset));
|
||||
// carry = product >>> 32
lsr(out, rscratch1, 32);
|
||||
// One element done; loop until len reaches zero.
subs(len, len, 1);
|
||||
br(Assembler::NE, LOOP);
|
||||
BIND(END);
|
||||
}
|
||||
|
||||
/**
|
||||
* Emits code to update CRC-32 with a byte value according to constants in table
|
||||
*
|
||||
|
@ -1265,6 +1265,7 @@ public:
|
||||
void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
|
||||
Register zlen, Register tmp1, Register tmp2, Register tmp3,
|
||||
Register tmp4, Register tmp5, Register tmp6, Register tmp7);
|
||||
// Emits code for the BigInteger::mulAdd intrinsic.
void mul_add(Register out, Register in, Register offs, Register len, Register k);
|
||||
// Calls isb() unconditionally.
// An ISB may be needed because of a safepoint.
|
||||
void maybe_isb() { isb(); }
|
||||
|
||||
|
@ -3607,6 +3607,63 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
// Generates the "squareToLen" stub and returns its entry address.
// The stub squares x by calling multiply_to_len with both operands set to
// x/xlen, writing the result through z/zlen.
address generate_squareToLen() {
|
||||
// The dedicated squareToLen algorithm described in the Java code (sizes
|
||||
// 1..127) is faster than multiply_to_len on some CPUs and slower on others;
|
||||
// multiply_to_len shows slightly better results overall, so it is used here.
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "squareToLen");
|
||||
address start = __ pc();
|
||||

|
||||
// Incoming arguments.
const Register x = r0;
|
||||
const Register xlen = r1;
|
||||
const Register z = r2;
|
||||
const Register zlen = r3;
|
||||
// Second operand for multiply_to_len; filled with copies of x/xlen below.
const Register y = r4; // == x
|
||||
const Register ylen = r5; // == xlen
|
||||

|
||||
// Scratch registers handed to multiply_to_len.
const Register tmp1 = r10;
|
||||
const Register tmp2 = r11;
|
||||
const Register tmp3 = r12;
|
||||
const Register tmp4 = r13;
|
||||
const Register tmp5 = r14;
|
||||
const Register tmp6 = r15;
|
||||
const Register tmp7 = r16;
|
||||

|
||||
// r4/r5 are overwritten with the duplicated operand, so they are saved on
// entry and restored before returning.
RegSet spilled_regs = RegSet::of(y, ylen);
|
||||
BLOCK_COMMENT("Entry:");
|
||||
__ enter();
|
||||
__ push(spilled_regs, sp);
|
||||
// y = x, ylen = xlen: the multiplication becomes x * x.
__ mov(y, x);
|
||||
__ mov(ylen, xlen);
|
||||
__ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
|
||||
__ pop(spilled_regs, sp);
|
||||
__ leave();
|
||||
__ ret(lr);
|
||||
return start;
|
||||
}
|
||||
|
||||
// Generates the "mulAdd" stub and returns its entry address.
// The stub body is the code emitted by MacroAssembler::mul_add, wrapped in
// enter()/leave() and followed by a return through lr.
address generate_mulAdd() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "mulAdd");
|
||||

|
||||
address start = __ pc();
|
||||

|
||||
// Registers handed to mul_add, in its parameter order.
const Register out = r0;
|
||||
const Register in = r1;
|
||||
const Register offset = r2;
|
||||
const Register len = r3;
|
||||
const Register k = r4;
|
||||

|
||||
BLOCK_COMMENT("Entry:");
|
||||
__ enter();
|
||||
__ mul_add(out, in, offset, len, k);
|
||||
__ leave();
|
||||
__ ret(lr);
|
||||

|
||||
return start;
|
||||
}
|
||||
|
||||
void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
|
||||
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
|
||||
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
|
||||
@ -4913,6 +4970,14 @@ class StubGenerator: public StubCodeGenerator {
|
||||
StubRoutines::_multiplyToLen = generate_multiplyToLen();
|
||||
}
|
||||
|
||||
if (UseSquareToLenIntrinsic) {
|
||||
StubRoutines::_squareToLen = generate_squareToLen();
|
||||
}
|
||||
|
||||
if (UseMulAddIntrinsic) {
|
||||
StubRoutines::_mulAdd = generate_mulAdd();
|
||||
}
|
||||
|
||||
if (UseMontgomeryMultiplyIntrinsic) {
|
||||
StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
|
||||
MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
|
||||
|
@ -340,6 +340,14 @@ void VM_Version::get_processor_features() {
|
||||
UseMultiplyToLenIntrinsic = true;
|
||||
}
|
||||
|
||||
if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
|
||||
UseSquareToLenIntrinsic = true;
|
||||
}
|
||||
|
||||
if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
|
||||
UseMulAddIntrinsic = true;
|
||||
}
|
||||
|
||||
if (FLAG_IS_DEFAULT(UseBarriersForVolatile)) {
|
||||
UseBarriersForVolatile = (_features & CPU_DMB_ATOMICS) != 0;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user