8243114: Implement montgomery{Multiply,Square}intrinsics on Windows

Reviewed-by: dholmes, andrew
Andrew Haley 2020-06-25 14:19:37 +01:00 committed by Simon Tooke
parent 0f2ac2021b
commit 47e465cf1b
2 changed files with 74 additions and 57 deletions


@@ -3679,14 +3679,11 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
#ifndef _WINDOWS
#define ASM_SUBTRACT
#ifdef ASM_SUBTRACT
// Subtract 0:b from carry:a. Return carry.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
long i = 0, cnt = len;
unsigned long tmp;
static julong
sub(julong a[], julong b[], julong carry, long len) {
long long i = 0, cnt = len;
julong tmp;
asm volatile("clc; "
"0: ; "
"mov (%[b], %[i], 8), %[tmp]; "
@@ -3699,24 +3696,6 @@ sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
: "memory");
return tmp;
}
#else // ASM_SUBTRACT
typedef int __attribute__((mode(TI))) int128;
// Subtract 0:b from carry:a. Return carry.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
int128 tmp = 0;
int i;
for (i = 0; i < len; i++) {
tmp += a[i];
tmp -= b[i];
a[i] = tmp;
tmp >>= 64;
assert(-1 <= tmp && tmp <= 0, "invariant");
}
return tmp + carry;
}
#endif // ! ASM_SUBTRACT
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
@@ -3739,17 +3718,59 @@ do { \
: "r"(A), "a"(B) : "cc"); \
} while(0)
#else //_WINDOWS
static julong
sub(julong a[], julong b[], julong carry, long len) {
long i;
julong tmp;
unsigned char c = 1;
for (i = 0; i < len; i++) {
c = _addcarry_u64(c, a[i], ~b[i], &tmp);
a[i] = tmp;
}
c = _addcarry_u64(c, carry, ~0, &tmp);
return tmp;
}
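
MSVC has no subtract-with-borrow intrinsic, so the new sub() above rewrites a - b - borrow as an add-with-carry of the complemented operand: a - b equals a + ~b + 1, with the +1 supplied by seeding the carry chain with c = 1. A minimal standalone sketch of the same trick (not code from the patch; word_t and borrow_sub are hypothetical names; MSVC x64 and <intrin.h> assumed):

#include <intrin.h>

typedef unsigned long long word_t;

// Computes a -= b over len words and returns the outgoing borrow (0 or 1).
// a - b == a + ~b + 1, so seed the carry chain with 1 and feed it ~b[i].
static word_t borrow_sub(word_t a[], const word_t b[], long len) {
  unsigned char c = 1;                       // 1 == "no incoming borrow"
  for (long i = 0; i < len; i++) {
    word_t tmp;
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  return 1 - c;                              // a lost carry means a borrow escaped
}

The patch's sub() goes one step further: its final _addcarry_u64(c, carry, ~0, &tmp) folds that borrow into the incoming carry word and returns carry - borrow rather than a 0/1 flag.
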
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2) \
do { \
julong hi, lo; \
lo = _umul128(A, B, &hi); \
unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
c = _addcarry_u64(c, hi, T1, &T1); \
_addcarry_u64(c, T2, 0, &T2); \
} while(0)
// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2) \
do { \
julong hi, lo; \
lo = _umul128(A, B, &hi); \
unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
c = _addcarry_u64(c, hi, T1, &T1); \
_addcarry_u64(c, T2, 0, &T2); \
c = _addcarry_u64(0, lo, T0, &T0); \
c = _addcarry_u64(c, hi, T1, &T1); \
_addcarry_u64(c, T2, 0, &T2); \
} while(0)
#endif //_WINDOWS
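
To make the Windows macros concrete (illustration only; macc_step is a hypothetical name; MSVC x64 and <intrin.h> assumed): MACC adds the full 128-bit product A*B into the triple-precision accumulator T2:T1:T0, using _umul128 for the product and a chain of _addcarry_u64 calls for the accumulate; MACC2 runs the same two add chains twice, i.e. it adds the product twice, which is what the squaring routine needs for the cross terms a[i]*a[j] (i != j) that occur twice.

#include <intrin.h>

typedef unsigned long long u64;

// Equivalent of one MACC expansion: t2:t1:t0 += a * b (192-bit accumulate).
static void macc_step(u64 a, u64 b, u64 &t0, u64 &t1, u64 &t2) {
  u64 hi, lo;
  lo = _umul128(a, b, &hi);                         // lo:hi = full 128-bit product
  unsigned char c = _addcarry_u64(0, lo, t0, &t0);  // t0 += lo
  c = _addcarry_u64(c, hi, t1, &t1);                // t1 += hi, plus carry from t0
  _addcarry_u64(c, t2, 0, &t2);                     // propagate the final carry into t2
}
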
// Fast Montgomery multiplication. The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
static void __attribute__((noinline))
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
unsigned long m[], unsigned long inv, int len) {
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
julong m[], julong inv, int len) {
julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
assert(inv * n[0] == -1ULL, "broken inverse in Montgomery multiply");
for (i = 0; i < len; i++) {
int j;
@@ -3785,13 +3806,13 @@ montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
static void __attribute__((noinline))
montgomery_square(unsigned long a[], unsigned long n[],
unsigned long m[], unsigned long inv, int len) {
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
static void NOINLINE
montgomery_square(julong a[], julong n[],
julong m[], julong inv, int len) {
julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
assert(inv * n[0] == -1ULL, "broken inverse in Montgomery square");
for (i = 0; i < len; i++) {
int j;
@@ -3837,13 +3858,13 @@ montgomery_square(unsigned long a[], unsigned long n[],
}
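
For readers new to the algorithm, a one-word sketch may help connect the asserts above (inv * n[0] == -1ULL) to the Dusse and Kaliski reduction: inv is the negated inverse of n modulo the word size R = 2^64, which makes t + (t * inv mod R) * n divisible by R, so the division by R is just a word shift. Illustration only, not code from the patch; redc1 is a hypothetical name, and n < 2^63 is assumed so the 128-bit sum cannot overflow:

#include <intrin.h>

typedef unsigned long long u64;

// One-word Montgomery reduction with R = 2^64: returns t * R^-1 mod n
// for t = t_hi:t_lo < n * R, given odd n and inv such that inv * n == -1 (mod R).
static u64 redc1(u64 t_hi, u64 t_lo, u64 n, u64 inv) {
  u64 m = t_lo * inv;                        // m = t * inv mod R
  u64 mn_hi, mn_lo;
  mn_lo = _umul128(m, n, &mn_hi);            // m * n
  u64 discard, r;
  unsigned char c = _addcarry_u64(0, t_lo, mn_lo, &discard);  // low word becomes zero
  _addcarry_u64(c, t_hi, mn_hi, &r);         // r = (t + m*n) / R, and r < 2n
  return r >= n ? r - n : r;                 // at most one subtraction of n
}

The functions above do the same thing word by word across len words, interleaving the multiplication with the reduction and keeping the running sum in the t0/t1/t2 accumulator.
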
// Swap words in a longword.
static unsigned long swap(unsigned long x) {
static julong swap(julong x) {
return (x << 32) | (x >> 32);
}
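
A worked example of these two helpers (my own, not from the patch): swap() exchanges the 32-bit halves of a 64-bit word, and reverse_words() below applies it while reversing the array, so the most-significant-word-first jint data coming from the Java side (BigInteger stores magnitudes most significant int first) ends up as least-significant-word-first julongs for the code above.

// Illustration: the jint sequence {0, 1, 2, 3}, most significant first, is the
// value 1*2^64 + 2*2^32 + 3.  Read as julongs on a little-endian machine it is
//   s[] = { 0x0000000100000000, 0x0000000300000002 }
// and swapping each word while reversing the array yields
//   d[] = { 0x0000000200000003, 0x0000000000000001 }
// i.e. d[0] holds the low 64 bits and d[1] the high 64 bits.
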
// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
static void reverse_words(unsigned long *s, unsigned long *d, int len) {
static void reverse_words(julong *s, julong *d, int len) {
d += len;
while(len-- > 0) {
d--;
@@ -3865,24 +3886,24 @@ void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints correspond to a 16384-bit integer and
// will use here a total of 8k bytes of stack space.
int total_allocation = longwords * sizeof (unsigned long) * 4;
int total_allocation = longwords * sizeof (julong) * 4;
guarantee(total_allocation <= 8192, "must be");
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
julong *scratch = (julong *)alloca(total_allocation);
// Local scratch arrays
unsigned long
julong
*a = scratch + 0 * longwords,
*b = scratch + 1 * longwords,
*n = scratch + 2 * longwords,
*m = scratch + 3 * longwords;
reverse_words((unsigned long *)a_ints, a, longwords);
reverse_words((unsigned long *)b_ints, b, longwords);
reverse_words((unsigned long *)n_ints, n, longwords);
reverse_words((julong *)a_ints, a, longwords);
reverse_words((julong *)b_ints, b, longwords);
reverse_words((julong *)n_ints, n, longwords);
::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
reverse_words(m, (unsigned long *)m_ints, longwords);
reverse_words(m, (julong *)m_ints, longwords);
}
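
A quick check of the stack budget in the comment above (my arithmetic, not from the patch): the 8 KB figure for 512 jints implies longwords is half the jint count, so 512 jints become 256 julongs, and 256 * 8 bytes * 4 scratch arrays = 8192 bytes, exactly the ceiling the guarantee enforces. The squaring wrapper below allocates only 3 arrays, hence its 6 KB figure (256 * 8 * 3 = 6144 bytes).
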
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
@@ -3894,30 +3915,28 @@ void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints correspond to a 16384-bit integer and
// will use here a total of 6k bytes of stack space.
int total_allocation = longwords * sizeof (unsigned long) * 3;
int total_allocation = longwords * sizeof (julong) * 3;
guarantee(total_allocation <= 8192, "must be");
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
julong *scratch = (julong *)alloca(total_allocation);
// Local scratch arrays
unsigned long
julong
*a = scratch + 0 * longwords,
*n = scratch + 1 * longwords,
*m = scratch + 2 * longwords;
reverse_words((unsigned long *)a_ints, a, longwords);
reverse_words((unsigned long *)n_ints, n, longwords);
reverse_words((julong *)a_ints, a, longwords);
reverse_words((julong *)n_ints, n, longwords);
if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
::montgomery_square(a, n, m, (unsigned long)inv, longwords);
::montgomery_square(a, n, m, (julong)inv, longwords);
} else {
::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
}
reverse_words(m, (unsigned long *)m_ints, longwords);
reverse_words(m, (julong *)m_ints, longwords);
}
#endif // WINDOWS
#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//


@@ -6566,7 +6566,6 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
}
#ifndef _WINDOWS
if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
@@ -6575,7 +6574,6 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
}
#endif // WINDOWS
#endif // COMPILER2
if (UseVectorizedMismatchIntrinsic) {