8186915: AARCH64: Intrinsify squareToLen and mulAdd

Reviewed-by: aph
This commit is contained in:
Dmitrij Pochepko 2017-10-02 17:20:14 +03:00
parent 87a8a4301f
commit b787f1b55b
4 changed files with 112 additions and 0 deletions

@ -2840,6 +2840,44 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Regi
bind(L_done);
}
// Code for BigInteger::mulAdd instrinsic
// out = r0
// in = r1
// offset = r2 (already out.length-offset)
// len = r3
// k = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
// product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
// out[offset--] = (int)product;
// carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
Register len, Register k) {
Label LOOP, END;
// pre-loop
cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
csel(out, zr, out, Assembler::EQ);
br(Assembler::EQ, END);
add(in, in, len, LSL, 2); // in[j+1] address
add(offset, out, offset, LSL, 2); // out[offset + 1] address
mov(out, zr); // used to keep carry now
BIND(LOOP);
ldrw(rscratch1, Address(pre(in, -4)));
madd(rscratch1, rscratch1, k, out);
ldrw(rscratch2, Address(pre(offset, -4)));
add(rscratch1, rscratch1, rscratch2);
strw(rscratch1, Address(offset));
lsr(out, rscratch1, 32);
subs(len, len, 1);
br(Assembler::NE, LOOP);
BIND(END);
}
/**
* Emits code to update CRC-32 with a byte value according to constants in table
*

@ -1265,6 +1265,7 @@ public:
void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
Register zlen, Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6, Register tmp7);
void mul_add(Register out, Register in, Register offs, Register len, Register k);
// ISB may be needed because of a safepoint
void maybe_isb() { isb(); }

@ -3607,6 +3607,63 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
address generate_squareToLen() {
// squareToLen algorithm for sizes 1..127 described in java code works
// faster than multiply_to_len on some CPUs and slower on others, but
// multiply_to_len shows a bit better overall results
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "squareToLen");
address start = __ pc();
const Register x = r0;
const Register xlen = r1;
const Register z = r2;
const Register zlen = r3;
const Register y = r4; // == x
const Register ylen = r5; // == xlen
const Register tmp1 = r10;
const Register tmp2 = r11;
const Register tmp3 = r12;
const Register tmp4 = r13;
const Register tmp5 = r14;
const Register tmp6 = r15;
const Register tmp7 = r16;
RegSet spilled_regs = RegSet::of(y, ylen);
BLOCK_COMMENT("Entry:");
__ enter();
__ push(spilled_regs, sp);
__ mov(y, x);
__ mov(ylen, xlen);
__ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
__ pop(spilled_regs, sp);
__ leave();
__ ret(lr);
return start;
}
address generate_mulAdd() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "mulAdd");
address start = __ pc();
const Register out = r0;
const Register in = r1;
const Register offset = r2;
const Register len = r3;
const Register k = r4;
BLOCK_COMMENT("Entry:");
__ enter();
__ mul_add(out, in, offset, len, k);
__ leave();
__ ret(lr);
return start;
}
void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
@ -4913,6 +4970,14 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
if (UseSquareToLenIntrinsic) {
StubRoutines::_squareToLen = generate_squareToLen();
}
if (UseMulAddIntrinsic) {
StubRoutines::_mulAdd = generate_mulAdd();
}
if (UseMontgomeryMultiplyIntrinsic) {
StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);

@ -340,6 +340,14 @@ void VM_Version::get_processor_features() {
UseMultiplyToLenIntrinsic = true;
}
if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
UseSquareToLenIntrinsic = true;
}
if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
UseMulAddIntrinsic = true;
}
if (FLAG_IS_DEFAULT(UseBarriersForVolatile)) {
UseBarriersForVolatile = (_features & CPU_DMB_ATOMICS) != 0;
}