8243114: Implement montgomery{Multiply,Square}intrinsics on Windows
Reviewed-by: dholmes, andrew
This commit is contained in:
parent
0f2ac2021b
commit
47e465cf1b
@ -3679,14 +3679,11 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
|
||||
|
||||
#ifndef _WINDOWS
|
||||
|
||||
#define ASM_SUBTRACT
|
||||
|
||||
#ifdef ASM_SUBTRACT
|
||||
// Subtract 0:b from carry:a. Return carry.
|
||||
static unsigned long
|
||||
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
|
||||
long i = 0, cnt = len;
|
||||
unsigned long tmp;
|
||||
static julong
|
||||
sub(julong a[], julong b[], julong carry, long len) {
|
||||
long long i = 0, cnt = len;
|
||||
julong tmp;
|
||||
asm volatile("clc; "
|
||||
"0: ; "
|
||||
"mov (%[b], %[i], 8), %[tmp]; "
|
||||
@ -3699,24 +3696,6 @@ sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
|
||||
: "memory");
|
||||
return tmp;
|
||||
}
|
||||
#else // ASM_SUBTRACT
|
||||
typedef int __attribute__((mode(TI))) int128;
|
||||
|
||||
// Subtract 0:b from carry:a. Return carry.
|
||||
static unsigned long
|
||||
sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
|
||||
int128 tmp = 0;
|
||||
int i;
|
||||
for (i = 0; i < len; i++) {
|
||||
tmp += a[i];
|
||||
tmp -= b[i];
|
||||
a[i] = tmp;
|
||||
tmp >>= 64;
|
||||
assert(-1 <= tmp && tmp <= 0, "invariant");
|
||||
}
|
||||
return tmp + carry;
|
||||
}
|
||||
#endif // ! ASM_SUBTRACT
|
||||
|
||||
// Multiply (unsigned) Long A by Long B, accumulating the double-
|
||||
// length result into the accumulator formed of T0, T1, and T2.
|
||||
@ -3739,17 +3718,59 @@ do { \
|
||||
: "r"(A), "a"(B) : "cc"); \
|
||||
} while(0)
|
||||
|
||||
#else //_WINDOWS
|
||||
|
||||
static julong
|
||||
sub(julong a[], julong b[], julong carry, long len) {
|
||||
long i;
|
||||
julong tmp;
|
||||
unsigned char c = 1;
|
||||
for (i = 0; i < len; i++) {
|
||||
c = _addcarry_u64(c, a[i], ~b[i], &tmp);
|
||||
a[i] = tmp;
|
||||
}
|
||||
c = _addcarry_u64(c, carry, ~0, &tmp);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
// Multiply (unsigned) Long A by Long B, accumulating the double-
|
||||
// length result into the accumulator formed of T0, T1, and T2.
|
||||
#define MACC(A, B, T0, T1, T2) \
|
||||
do { \
|
||||
julong hi, lo; \
|
||||
lo = _umul128(A, B, &hi); \
|
||||
unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
|
||||
c = _addcarry_u64(c, hi, T1, &T1); \
|
||||
_addcarry_u64(c, T2, 0, &T2); \
|
||||
} while(0)
|
||||
|
||||
// As above, but add twice the double-length result into the
|
||||
// accumulator.
|
||||
#define MACC2(A, B, T0, T1, T2) \
|
||||
do { \
|
||||
julong hi, lo; \
|
||||
lo = _umul128(A, B, &hi); \
|
||||
unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
|
||||
c = _addcarry_u64(c, hi, T1, &T1); \
|
||||
_addcarry_u64(c, T2, 0, &T2); \
|
||||
c = _addcarry_u64(0, lo, T0, &T0); \
|
||||
c = _addcarry_u64(c, hi, T1, &T1); \
|
||||
_addcarry_u64(c, T2, 0, &T2); \
|
||||
} while(0)
|
||||
|
||||
#endif //_WINDOWS
|
||||
|
||||
// Fast Montgomery multiplication. The derivation of the algorithm is
|
||||
// in A Cryptographic Library for the Motorola DSP56000,
|
||||
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
|
||||
|
||||
static void __attribute__((noinline))
|
||||
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
|
||||
unsigned long m[], unsigned long inv, int len) {
|
||||
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
|
||||
static void NOINLINE
|
||||
montgomery_multiply(julong a[], julong b[], julong n[],
|
||||
julong m[], julong inv, int len) {
|
||||
julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
|
||||
int i;
|
||||
|
||||
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
|
||||
assert(inv * n[0] == -1ULL, "broken inverse in Montgomery multiply");
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
int j;
|
||||
@ -3785,13 +3806,13 @@ montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
|
||||
// multiplication. However, its loop control is more complex and it
|
||||
// may actually run slower on some machines.
|
||||
|
||||
static void __attribute__((noinline))
|
||||
montgomery_square(unsigned long a[], unsigned long n[],
|
||||
unsigned long m[], unsigned long inv, int len) {
|
||||
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
|
||||
static void NOINLINE
|
||||
montgomery_square(julong a[], julong n[],
|
||||
julong m[], julong inv, int len) {
|
||||
julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
|
||||
int i;
|
||||
|
||||
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
|
||||
assert(inv * n[0] == -1ULL, "broken inverse in Montgomery square");
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
int j;
|
||||
@ -3837,13 +3858,13 @@ montgomery_square(unsigned long a[], unsigned long n[],
|
||||
}
|
||||
|
||||
// Swap words in a longword.
|
||||
static unsigned long swap(unsigned long x) {
|
||||
static julong swap(julong x) {
|
||||
return (x << 32) | (x >> 32);
|
||||
}
|
||||
|
||||
// Copy len longwords from s to d, word-swapping as we go. The
|
||||
// destination array is reversed.
|
||||
static void reverse_words(unsigned long *s, unsigned long *d, int len) {
|
||||
static void reverse_words(julong *s, julong *d, int len) {
|
||||
d += len;
|
||||
while(len-- > 0) {
|
||||
d--;
|
||||
@ -3865,24 +3886,24 @@ void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints
|
||||
// Make very sure we don't use so much space that the stack might
|
||||
// overflow. 512 jints corresponds to an 16384-bit integer and
|
||||
// will use here a total of 8k bytes of stack space.
|
||||
int total_allocation = longwords * sizeof (unsigned long) * 4;
|
||||
int total_allocation = longwords * sizeof (julong) * 4;
|
||||
guarantee(total_allocation <= 8192, "must be");
|
||||
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
|
||||
julong *scratch = (julong *)alloca(total_allocation);
|
||||
|
||||
// Local scratch arrays
|
||||
unsigned long
|
||||
julong
|
||||
*a = scratch + 0 * longwords,
|
||||
*b = scratch + 1 * longwords,
|
||||
*n = scratch + 2 * longwords,
|
||||
*m = scratch + 3 * longwords;
|
||||
|
||||
reverse_words((unsigned long *)a_ints, a, longwords);
|
||||
reverse_words((unsigned long *)b_ints, b, longwords);
|
||||
reverse_words((unsigned long *)n_ints, n, longwords);
|
||||
reverse_words((julong *)a_ints, a, longwords);
|
||||
reverse_words((julong *)b_ints, b, longwords);
|
||||
reverse_words((julong *)n_ints, n, longwords);
|
||||
|
||||
::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
|
||||
::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
|
||||
|
||||
reverse_words(m, (unsigned long *)m_ints, longwords);
|
||||
reverse_words(m, (julong *)m_ints, longwords);
|
||||
}
|
||||
|
||||
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
|
||||
@ -3894,30 +3915,28 @@ void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
|
||||
// Make very sure we don't use so much space that the stack might
|
||||
// overflow. 512 jints corresponds to an 16384-bit integer and
|
||||
// will use here a total of 6k bytes of stack space.
|
||||
int total_allocation = longwords * sizeof (unsigned long) * 3;
|
||||
int total_allocation = longwords * sizeof (julong) * 3;
|
||||
guarantee(total_allocation <= 8192, "must be");
|
||||
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
|
||||
julong *scratch = (julong *)alloca(total_allocation);
|
||||
|
||||
// Local scratch arrays
|
||||
unsigned long
|
||||
julong
|
||||
*a = scratch + 0 * longwords,
|
||||
*n = scratch + 1 * longwords,
|
||||
*m = scratch + 2 * longwords;
|
||||
|
||||
reverse_words((unsigned long *)a_ints, a, longwords);
|
||||
reverse_words((unsigned long *)n_ints, n, longwords);
|
||||
reverse_words((julong *)a_ints, a, longwords);
|
||||
reverse_words((julong *)n_ints, n, longwords);
|
||||
|
||||
if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
|
||||
::montgomery_square(a, n, m, (unsigned long)inv, longwords);
|
||||
::montgomery_square(a, n, m, (julong)inv, longwords);
|
||||
} else {
|
||||
::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
|
||||
::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
|
||||
}
|
||||
|
||||
reverse_words(m, (unsigned long *)m_ints, longwords);
|
||||
reverse_words(m, (julong *)m_ints, longwords);
|
||||
}
|
||||
|
||||
#endif // WINDOWS
|
||||
|
||||
#ifdef COMPILER2
|
||||
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
|
||||
//
|
||||
|
@ -6566,7 +6566,6 @@ address generate_avx_ghash_processBlocks() {
|
||||
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
|
||||
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
|
||||
}
|
||||
#ifndef _WINDOWS
|
||||
if (UseMontgomeryMultiplyIntrinsic) {
|
||||
StubRoutines::_montgomeryMultiply
|
||||
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
|
||||
@ -6575,7 +6574,6 @@ address generate_avx_ghash_processBlocks() {
|
||||
StubRoutines::_montgomerySquare
|
||||
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
|
||||
}
|
||||
#endif // WINDOWS
|
||||
#endif // COMPILER2
|
||||
|
||||
if (UseVectorizedMismatchIntrinsic) {
|
||||
|
Loading…
Reference in New Issue
Block a user