8167065: Add intrinsic support for double precision shifting on x86_64

Reviewed-by: kvn
Smita Kamath 2019-12-23 14:42:21 -08:00
parent f4af0eadb6
commit 995da6eb2a
22 changed files with 628 additions and 50 deletions
src
hotspot
java.base/share/classes/java/math
jdk.aot/share/classes/jdk.tools.jaotc.binformat/src/jdk/tools/jaotc/binformat
jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.hotspot.test/src/org/graalvm/compiler/hotspot/test
test
hotspot/jtreg/compiler/intrinsics/bigInteger
micro/org/openjdk/bench/java/math

@ -4257,8 +4257,8 @@ void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
void Assembler::vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len) {
assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
vector_len == AVX_256bit? VM_Version::supports_avx2() :
0, "");
assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
(vector_len == AVX_256bit? VM_Version::supports_avx2() :
(vector_len == AVX_512bit? VM_Version::supports_evex() : 0)), "");
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@ -4737,6 +4737,36 @@ void Assembler::shrl(Register dst) {
emit_int8((unsigned char)(0xE8 | encode));
}
void Assembler::shldl(Register dst, Register src) {
int encode = prefix_and_encode(src->encoding(), dst->encoding());
emit_int8(0x0F);
emit_int8((unsigned char)0xA5);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::shldl(Register dst, Register src, int8_t imm8) {
int encode = prefix_and_encode(src->encoding(), dst->encoding());
emit_int8(0x0F);
emit_int8((unsigned char)0xA4);
emit_int8((unsigned char)(0xC0 | encode));
emit_int8(imm8);
}
void Assembler::shrdl(Register dst, Register src) {
int encode = prefix_and_encode(src->encoding(), dst->encoding());
emit_int8(0x0F);
emit_int8((unsigned char)0xAD);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::shrdl(Register dst, Register src, int8_t imm8) {
int encode = prefix_and_encode(src->encoding(), dst->encoding());
emit_int8(0x0F);
emit_int8((unsigned char)0xAC);
emit_int8((unsigned char)(0xC0 | encode));
emit_int8(imm8);
}
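The four emitters above encode the x86 SHLD/SHRD double-precision shifts: a 32-bit shift whose vacated bits are filled from a second register. A minimal Java sketch of the semantics, assuming 0 < n < 32 (class and method names are illustrative, not part of the patch):

// Illustrative model of x86 SHLD/SHRD on 32-bit operands.
final class DoubleShift {
    // SHLD dst, src, n: shift dst left by n, filling the low bits from the top of src.
    static int shld(int dst, int src, int n) {
        return (dst << n) | (src >>> (32 - n));
    }
    // SHRD dst, src, n: shift dst right by n, filling the high bits from the bottom of src.
    static int shrd(int dst, int src, int n) {
        return (dst >>> n) | (src << (32 - n));
    }
}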
// copies a single word from [esi] to [edi]
void Assembler::smovl() {
emit_int8((unsigned char)0xA5);
@ -6513,6 +6543,23 @@ void Assembler::vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::vpshldvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
assert(VM_Version::supports_vbmi2(), "requires vbmi2");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x71);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::vpshrdvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
assert(VM_Version::supports_vbmi2(), "requires vbmi2");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x73);
emit_int8((unsigned char)(0xC0 | encode));
}
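vpshldvd and vpshrdvd are the AVX-512 VBMI2 concatenate-and-shift instructions the stubs below build on; each 32-bit lane is shifted by its own count. A per-lane Java model, based on my reading of the instruction semantics (names illustrative):

// Per-lane model of VPSHLDVD / VPSHRDVD; the count is taken modulo 32.
final class Vbmi2Lane {
    // VPSHLDVD: left-shift the 64-bit pair dst:src, keep the high 32 bits.
    static int vpshldvd(int dst, int src, int count) {
        long pair = ((long) dst << 32) | (src & 0xFFFFFFFFL);
        return (int) ((pair << (count & 31)) >>> 32);
    }
    // VPSHRDVD: right-shift the 64-bit pair src:dst, keep the low 32 bits.
    static int vpshrdvd(int dst, int src, int count) {
        long pair = ((long) src << 32) | (dst & 0xFFFFFFFFL);
        return (int) (pair >>> (count & 31));
    }
}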
void Assembler::pandn(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@ -8109,26 +8156,6 @@ void Assembler::set_byte_if_not_zero(Register dst) {
emit_int8((unsigned char)(0xE0 | dst->encoding()));
}
void Assembler::shldl(Register dst, Register src) {
emit_int8(0x0F);
emit_int8((unsigned char)0xA5);
emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
}
// 0F A4 / r ib
void Assembler::shldl(Register dst, Register src, int8_t imm8) {
emit_int8(0x0F);
emit_int8((unsigned char)0xA4);
emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
emit_int8(imm8);
}
void Assembler::shrdl(Register dst, Register src) {
emit_int8(0x0F);
emit_int8((unsigned char)0xAD);
emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
}
#else // LP64
void Assembler::set_byte_if_not_zero(Register dst) {

@ -1838,6 +1838,8 @@ private:
void shldl(Register dst, Register src);
void shldl(Register dst, Register src, int8_t imm8);
void shrdl(Register dst, Register src);
void shrdl(Register dst, Register src, int8_t imm8);
void shll(Register dst, int imm8);
void shll(Register dst);
@ -1845,8 +1847,6 @@ private:
void shlq(Register dst, int imm8);
void shlq(Register dst);
void shrdl(Register dst, Register src);
void shrl(Register dst, int imm8);
void shrl(Register dst);
@ -2140,6 +2140,9 @@ private:
void evpsraq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void evpsraq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpshldvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpshrdvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
// And packed integers
void pand(XMMRegister dst, XMMRegister src);
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

@ -5694,6 +5694,247 @@ address generate_avx_ghash_processBlocks() {
return start;
}
address generate_bigIntegerRightShift() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
address start = __ pc();
Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
// For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
const Register newArr = rdi;
const Register oldArr = rsi;
const Register newIdx = rdx;
const Register shiftCount = rcx; // shiftCount is intentionally placed in rcx, since shift instructions use it (cl) implicitly.
const Register totalNumIter = r8;
// On Windows, r9 and r10 are used as temps to save rdi and rsi, so we cannot allocate them as our temps.
// Everywhere else, we prefer r9 and r10, since they need not be saved before use.
const Register tmp1 = r11; // Caller save.
const Register tmp2 = rax; // Caller save.
const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9); // Windows: Callee save. Linux: Caller save.
const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10); // Windows: Callee save. Linux: Caller save.
const Register tmp5 = r14; // Callee save.
const Register tmp6 = r15;
const XMMRegister x0 = xmm0;
const XMMRegister x1 = xmm1;
const XMMRegister x2 = xmm2;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WINDOWS
setup_arg_regs(4);
// On Windows the last argument is passed on the stack, so move it into the appropriate register.
__ movl(totalNumIter, Address(rsp, 6 * wordSize));
// Save callee save registers.
__ push(tmp3);
__ push(tmp4);
#endif
__ push(tmp5);
// Rename temps used throughout the code.
const Register idx = tmp1;
const Register nIdx = tmp2;
__ xorl(idx, idx);
// Start right shift from end of the array.
// For example, if #iteration = 4 and newIdx = 1
// then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
// if #iteration = 4 and newIdx = 0
// then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
__ movl(idx, totalNumIter);
__ movl(nIdx, idx);
__ addl(nIdx, newIdx);
// If vectorization is enabled, check if the number of iterations is at least 64
// If not, then go to ShiftTwo, which processes two elements per iteration
if (VM_Version::supports_vbmi2()) {
__ cmpptr(totalNumIter, (AVX3Threshold/64));
__ jcc(Assembler::less, ShiftTwo);
if (AVX3Threshold < 16 * 64) {
__ cmpl(totalNumIter, 16);
__ jcc(Assembler::less, ShiftTwo);
}
__ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
__ subl(idx, 16);
__ subl(nIdx, 16);
__ BIND(Shift512Loop);
__ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
__ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
__ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
__ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
__ subl(nIdx, 16);
__ subl(idx, 16);
__ jcc(Assembler::greaterEqual, Shift512Loop);
__ addl(idx, 16);
__ addl(nIdx, 16);
}
__ BIND(ShiftTwo);
__ cmpl(idx, 2);
__ jcc(Assembler::less, ShiftOne);
__ subl(idx, 2);
__ subl(nIdx, 2);
__ BIND(ShiftTwoLoop);
__ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
__ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
__ movl(tmp3, Address(oldArr, idx, Address::times_4));
__ shrdl(tmp5, tmp4);
__ shrdl(tmp4, tmp3);
__ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
__ movl(Address(newArr, nIdx, Address::times_4), tmp4);
__ subl(nIdx, 2);
__ subl(idx, 2);
__ jcc(Assembler::greaterEqual, ShiftTwoLoop);
__ addl(idx, 2);
__ addl(nIdx, 2);
// Do the last iteration
__ BIND(ShiftOne);
__ cmpl(idx, 1);
__ jcc(Assembler::less, Exit);
__ subl(idx, 1);
__ subl(nIdx, 1);
__ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
__ movl(tmp3, Address(oldArr, idx, Address::times_4));
__ shrdl(tmp4, tmp3);
__ movl(Address(newArr, nIdx, Address::times_4), tmp4);
__ BIND(Exit);
// Restore callee save registers.
__ pop(tmp5);
#ifdef _WINDOWS
__ pop(tmp4);
__ pop(tmp3);
restore_arg_regs();
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
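Whichever path executes (16-lane VBMI2 loop, two-at-a-time scalar loop, or the final single iteration), the stub computes the same words as the Java fallback. A scalar restatement of the net effect, for the newIdx values actually passed (0 or 1); this is a sketch of my reading of the loops, not the generated code:

// Reference model of bigIntegerRightShiftWorker (illustrative).
static void rightShiftModel(int[] newArr, int[] oldArr, int newIdx,
                            int shiftCount, int numIter) {
    // Iterating downward keeps the in-place case correct
    // (newArr == oldArr with newIdx == 1, as used by primitiveRightShift).
    for (int i = numIter - 1; i >= 0; i--) {
        newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
                           | (oldArr[i] << (32 - shiftCount));
    }
}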
/**
* Arguments:
*
* Input:
* c_rarg0 - newArr address
* c_rarg1 - oldArr address
* c_rarg2 - newIdx
* c_rarg3 - shiftCount
* not Win64
* c_rarg4 - numIter
* Win64
rsp + 40 - numIter
*/
address generate_bigIntegerLeftShift() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
address start = __ pc();
Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
// For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
const Register newArr = rdi;
const Register oldArr = rsi;
const Register newIdx = rdx;
const Register shiftCount = rcx; // shiftCount is intentionally placed in rcx, since shift instructions use it (cl) implicitly.
const Register totalNumIter = r8;
// On Windows, r9 and r10 are used as temps to save rdi and rsi, so we cannot allocate them as our temps.
// Everywhere else, we prefer r9 and r10, since they need not be saved before use.
const Register tmp1 = r11; // Caller save.
const Register tmp2 = rax; // Caller save.
const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9); // Windows: Callee save. Linux: Caller save.
const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10); // Windows: Callee save. Linux: Caller save.
const Register tmp5 = r14; // Callee save.
const XMMRegister x0 = xmm0;
const XMMRegister x1 = xmm1;
const XMMRegister x2 = xmm2;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WINDOWS
setup_arg_regs(4);
// On Windows the last argument is passed on the stack, so move it into the appropriate register.
__ movl(totalNumIter, Address(rsp, 6 * wordSize));
// Save callee save registers.
__ push(tmp3);
__ push(tmp4);
#endif
__ push(tmp5);
// Rename temps used throughout the code
const Register idx = tmp1;
const Register numIterTmp = tmp2;
// Start idx from zero.
__ xorl(idx, idx);
// Compute interior pointer for the new array, so that the same index can be used for both old and new arrays.
__ lea(newArr, Address(newArr, newIdx, Address::times_4));
__ movl(numIterTmp, totalNumIter);
// If vectorization is enabled, check if the number of iterations is at least 64
// If not, then go to ShiftTwo, shifting two numbers at a time
if (VM_Version::supports_vbmi2()) {
__ cmpl(totalNumIter, (AVX3Threshold/64));
__ jcc(Assembler::less, ShiftTwo);
if (AVX3Threshold < 16 * 64) {
__ cmpl(totalNumIter, 16);
__ jcc(Assembler::less, ShiftTwo);
}
__ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
__ subl(numIterTmp, 16);
__ BIND(Shift512Loop);
__ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
__ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
__ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
__ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
__ addl(idx, 16);
__ subl(numIterTmp, 16);
__ jcc(Assembler::greaterEqual, Shift512Loop);
__ addl(numIterTmp, 16);
}
__ BIND(ShiftTwo);
__ cmpl(totalNumIter, 1);
__ jcc(Assembler::less, Exit);
__ movl(tmp3, Address(oldArr, idx, Address::times_4));
__ subl(numIterTmp, 2);
__ jcc(Assembler::less, ShiftOne);
__ BIND(ShiftTwoLoop);
__ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
__ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
__ shldl(tmp3, tmp4);
__ shldl(tmp4, tmp5);
__ movl(Address(newArr, idx, Address::times_4), tmp3);
__ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
__ movl(tmp3, tmp5);
__ addl(idx, 2);
__ subl(numIterTmp, 2);
__ jcc(Assembler::greaterEqual, ShiftTwoLoop);
// Do the last iteration
__ BIND(ShiftOne);
__ addl(numIterTmp, 2);
__ cmpl(numIterTmp, 1);
__ jcc(Assembler::less, Exit);
__ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
__ shldl(tmp3, tmp4);
__ movl(Address(newArr, idx, Address::times_4), tmp3);
__ BIND(Exit);
// Restore callee save registers.
__ pop(tmp5);
#ifdef _WINDOWS
__ pop(tmp4);
__ pop(tmp3);
restore_arg_regs();
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
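The left-shift stub mirrors the right-shift one, but walks the arrays upward and carries the last loaded word in tmp3 across iterations so each source word is loaded only once. The net effect, as a scalar sketch (again my restatement, not the generated code):

// Reference model of bigIntegerLeftShiftWorker (illustrative).
static void leftShiftModel(int[] newArr, int[] oldArr, int newIdx,
                           int shiftCount, int numIter) {
    // Iterating upward keeps the in-place case correct
    // (newArr == oldArr with newIdx == 0, as used by primitiveLeftShift).
    for (int i = 0; i < numIter; i++) {
        newArr[newIdx + i] = (oldArr[i] << shiftCount)
                           | (oldArr[i + 1] >>> (32 - shiftCount));
    }
}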
address generate_libmExp() {
StubCodeMark mark(this, "StubRoutines", "libmExp");
@ -6314,6 +6555,10 @@ address generate_avx_ghash_processBlocks() {
if (UseMulAddIntrinsic) {
StubRoutines::_mulAdd = generate_mulAdd();
}
if (VM_Version::supports_vbmi2()) {
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
}
#ifndef _WINDOWS
if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply

@ -694,6 +694,7 @@ void VM_Version::get_processor_features() {
_features &= ~CPU_AVX512_VPCLMULQDQ;
_features &= ~CPU_VAES;
_features &= ~CPU_VNNI;
_features &= ~CPU_VBMI2;
}
if (UseAVX < 2)
@ -716,7 +717,7 @@ void VM_Version::get_processor_features() {
}
char buf[256];
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""),
@ -749,7 +750,10 @@ void VM_Version::get_processor_features() {
(supports_adx() ? ", adx" : ""),
(supports_evex() ? ", evex" : ""),
(supports_sha() ? ", sha" : ""),
(supports_fma() ? ", fma" : ""));
(supports_fma() ? ", fma" : ""),
(supports_vbmi2() ? ", vbmi2" : ""),
(supports_vaes() ? ", vaes" : ""),
(supports_vnni() ? ", vnni" : ""));
_features_string = os::strdup(buf);
// UseSSE is set to the smaller of what hardware supports and what

@ -345,6 +345,8 @@ protected:
#define CPU_FLUSH ((uint64_t)UCONST64(0x20000000000)) // flush instruction
#define CPU_FLUSHOPT ((uint64_t)UCONST64(0x40000000000)) // flushopt instruction
#define CPU_CLWB ((uint64_t)UCONST64(0x80000000000)) // clwb instruction
#define CPU_VBMI2 ((uint64_t)UCONST64(0x100000000000)) // VBMI2 shift left double instructions
enum Extended_Family {
// AMD
@ -567,6 +569,8 @@ enum Extended_Family {
result |= CPU_VAES;
if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vnni != 0)
result |= CPU_VNNI;
if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vbmi2 != 0)
result |= CPU_VBMI2;
}
}
if (_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@ -858,6 +862,7 @@ public:
static bool supports_avx512_vpclmulqdq() { return (_features & CPU_AVX512_VPCLMULQDQ) != 0; }
static bool supports_vaes() { return (_features & CPU_VAES) != 0; }
static bool supports_vnni() { return (_features & CPU_VNNI) != 0; }
static bool supports_vbmi2() { return (_features & CPU_VBMI2) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&

@ -555,6 +555,8 @@ void AOTCodeHeap::link_stub_routines_symbols() {
SET_AOT_GLOBAL_SYMBOL_VALUE("_aot_stub_routines_montgomeryMultiply", address, StubRoutines::_montgomeryMultiply);
SET_AOT_GLOBAL_SYMBOL_VALUE("_aot_stub_routines_montgomerySquare", address, StubRoutines::_montgomerySquare);
SET_AOT_GLOBAL_SYMBOL_VALUE("_aot_stub_routines_vectorizedMismatch", address, StubRoutines::_vectorizedMismatch);
SET_AOT_GLOBAL_SYMBOL_VALUE("_aot_stub_routines_bigIntegerRightShiftWorker", address, StubRoutines::_bigIntegerRightShiftWorker);
SET_AOT_GLOBAL_SYMBOL_VALUE("_aot_stub_routines_bigIntegerLeftShiftWorker", address, StubRoutines::_bigIntegerLeftShiftWorker);
SET_AOT_GLOBAL_SYMBOL_VALUE("_aot_stub_routines_throw_delayed_StackOverflowError_entry", address, StubRoutines::_throw_delayed_StackOverflowError_entry);

@ -837,6 +837,9 @@ bool vmIntrinsics::is_disabled_by_flags(vmIntrinsics::ID id) {
case vmIntrinsics::_montgomerySquare:
if (!UseMontgomerySquareIntrinsic) return true;
break;
case vmIntrinsics::_bigIntegerRightShiftWorker:
case vmIntrinsics::_bigIntegerLeftShiftWorker:
break;
case vmIntrinsics::_addExactI:
case vmIntrinsics::_addExactL:
case vmIntrinsics::_decrementExactI:

@ -565,6 +565,7 @@
template(char_StringBuffer_signature, "(C)Ljava/lang/StringBuffer;") \
template(int_String_signature, "(I)Ljava/lang/String;") \
template(boolean_boolean_int_signature, "(ZZ)I") \
template(big_integer_shift_worker_signature, "([I[IIII)V") \
template(reflect_method_signature, "Ljava/lang/reflect/Method;") \
/* signature symbols needed by intrinsics */ \
VM_INTRINSICS_DO(VM_INTRINSIC_IGNORE, VM_SYMBOL_IGNORE, VM_SYMBOL_IGNORE, template, VM_ALIAS_IGNORE) \
@ -1007,6 +1008,12 @@
do_name( montgomerySquare_name, "implMontgomerySquare") \
do_signature(montgomerySquare_signature, "([I[IIJ[I)[I") \
\
do_intrinsic(_bigIntegerRightShiftWorker, java_math_BigInteger, rightShift_name, big_integer_shift_worker_signature, F_S) \
do_name( rightShift_name, "shiftRightImplWorker") \
\
do_intrinsic(_bigIntegerLeftShiftWorker, java_math_BigInteger, leftShift_name, big_integer_shift_worker_signature, F_S) \
do_name( leftShift_name, "shiftLeftImplWorker") \
\
do_class(jdk_internal_util_ArraysSupport, "jdk/internal/util/ArraysSupport") \
do_intrinsic(_vectorizedMismatch, jdk_internal_util_ArraysSupport, vectorizedMismatch_name, vectorizedMismatch_signature, F_S)\
do_name(vectorizedMismatch_name, "vectorizedMismatch") \

@ -322,6 +322,8 @@
static_field(StubRoutines, _montgomeryMultiply, address) \
static_field(StubRoutines, _montgomerySquare, address) \
static_field(StubRoutines, _vectorizedMismatch, address) \
static_field(StubRoutines, _bigIntegerRightShiftWorker, address) \
static_field(StubRoutines, _bigIntegerLeftShiftWorker, address) \
\
nonstatic_field(Thread, _tlab, ThreadLocalAllocBuffer) \
nonstatic_field(Thread, _allocated_bytes, jlong) \

@ -628,6 +628,8 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
case vmIntrinsics::_mulAdd:
case vmIntrinsics::_montgomeryMultiply:
case vmIntrinsics::_montgomerySquare:
case vmIntrinsics::_bigIntegerRightShiftWorker:
case vmIntrinsics::_bigIntegerLeftShiftWorker:
case vmIntrinsics::_vectorizedMismatch:
case vmIntrinsics::_ghash_processBlocks:
case vmIntrinsics::_base64_encodeBlock:

@ -1006,6 +1006,8 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 ||
strcmp(call->as_CallLeaf()->_name, "bigIntegerRightShiftWorker") == 0 ||
strcmp(call->as_CallLeaf()->_name, "bigIntegerLeftShiftWorker") == 0 ||
strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0)
))) {
call->dump();

@ -327,6 +327,7 @@ class LibraryCallKit : public GraphKit {
bool inline_mulAdd();
bool inline_montgomeryMultiply();
bool inline_montgomerySquare();
bool inline_bigIntegerShift(bool isRightShift);
bool inline_vectorizedMismatch();
bool inline_fma(vmIntrinsics::ID id);
bool inline_character_compare(vmIntrinsics::ID id);
@ -845,6 +846,11 @@ bool LibraryCallKit::try_to_inline(int predicate) {
case vmIntrinsics::_montgomerySquare:
return inline_montgomerySquare();
case vmIntrinsics::_bigIntegerRightShiftWorker:
return inline_bigIntegerShift(true);
case vmIntrinsics::_bigIntegerLeftShiftWorker:
return inline_bigIntegerShift(false);
case vmIntrinsics::_vectorizedMismatch:
return inline_vectorizedMismatch();
@ -5253,6 +5259,60 @@ bool LibraryCallKit::inline_montgomerySquare() {
return true;
}
bool LibraryCallKit::inline_bigIntegerShift(bool isRightShift) {
address stubAddr = NULL;
const char* stubName = NULL;
stubAddr = isRightShift ? StubRoutines::bigIntegerRightShift() : StubRoutines::bigIntegerLeftShift();
if (stubAddr == NULL) {
return false; // Intrinsic's stub is not implemented on this platform
}
stubName = isRightShift ? "bigIntegerRightShiftWorker" : "bigIntegerLeftShiftWorker";
assert(callee()->signature()->size() == 5, "expected 5 arguments");
Node* newArr = argument(0);
Node* oldArr = argument(1);
Node* newIdx = argument(2);
Node* shiftCount = argument(3);
Node* numIter = argument(4);
const Type* newArr_type = newArr->Value(&_gvn);
const TypeAryPtr* top_newArr = newArr_type->isa_aryptr();
const Type* oldArr_type = oldArr->Value(&_gvn);
const TypeAryPtr* top_oldArr = oldArr_type->isa_aryptr();
if (top_newArr == NULL || top_newArr->klass() == NULL || top_oldArr == NULL
|| top_oldArr->klass() == NULL) {
return false;
}
BasicType newArr_elem = newArr_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
BasicType oldArr_elem = oldArr_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
if (newArr_elem != T_INT || oldArr_elem != T_INT) {
return false;
}
// Make the call
{
Node* newArr_start = array_element_address(newArr, intcon(0), newArr_elem);
Node* oldArr_start = array_element_address(oldArr, intcon(0), oldArr_elem);
Node* call = make_runtime_call(RC_LEAF,
OptoRuntime::bigIntegerShift_Type(),
stubAddr,
stubName,
TypePtr::BOTTOM,
newArr_start,
oldArr_start,
newIdx,
shiftCount,
numIter);
}
return true;
}
//-------------inline_vectorizedMismatch------------------------------
bool LibraryCallKit::inline_vectorizedMismatch() {
assert(UseVectorizedMismatchIntrinsic, "not implemented on this platform");

@ -1111,6 +1111,25 @@ const TypeFunc* OptoRuntime::montgomerySquare_Type() {
return TypeFunc::make(domain, range);
}
const TypeFunc * OptoRuntime::bigIntegerShift_Type() {
int argcnt = 5;
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypePtr::NOTNULL; // newArr
fields[argp++] = TypePtr::NOTNULL; // oldArr
fields[argp++] = TypeInt::INT; // newIdx
fields[argp++] = TypeInt::INT; // shiftCount
fields[argp++] = TypeInt::INT; // numIter
assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
// no result type needed
fields = TypeTuple::fields(1);
fields[TypeFunc::Parms + 0] = NULL;
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
return TypeFunc::make(domain, range);
}
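The domain built above matches, one for one, the Java workers this stub replaces: two non-null array pointers (passed as raw element addresses by the inliner) and three ints, with a void return, hence the empty range. A hedged Java-side statement of that shape (the interface is illustrative only):

// Shape of the call described by bigIntegerShift_Type (illustrative).
interface BigIntegerShiftStub {
    void apply(int[] newArr, int[] oldArr, int newIdx, int shiftCount, int numIter);
}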
const TypeFunc* OptoRuntime::vectorizedMismatch_Type() {
// create input type (domain)
int num_args = 4;

@ -289,6 +289,8 @@ private:
static const TypeFunc* mulAdd_Type();
static const TypeFunc* bigIntegerShift_Type();
static const TypeFunc* vectorizedMismatch_Type();
static const TypeFunc* ghash_processBlocks_Type();

@ -157,6 +157,8 @@ address StubRoutines::_squareToLen = NULL;
address StubRoutines::_mulAdd = NULL;
address StubRoutines::_montgomeryMultiply = NULL;
address StubRoutines::_montgomerySquare = NULL;
address StubRoutines::_bigIntegerRightShiftWorker = NULL;
address StubRoutines::_bigIntegerLeftShiftWorker = NULL;
address StubRoutines::_vectorizedMismatch = NULL;

@ -239,6 +239,8 @@ class StubRoutines: AllStatic {
static address _mulAdd;
static address _montgomeryMultiply;
static address _montgomerySquare;
static address _bigIntegerRightShiftWorker;
static address _bigIntegerLeftShiftWorker;
static address _vectorizedMismatch;
@ -414,6 +416,8 @@ class StubRoutines: AllStatic {
static address mulAdd() { return _mulAdd; }
static address montgomeryMultiply() { return _montgomeryMultiply; }
static address montgomerySquare() { return _montgomerySquare; }
static address bigIntegerRightShift() { return _bigIntegerRightShiftWorker; }
static address bigIntegerLeftShift() { return _bigIntegerLeftShiftWorker; }
static address vectorizedMismatch() { return _vectorizedMismatch; }

@ -602,6 +602,8 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
static_field(StubRoutines, _updateBytesCRC32C, address) \
static_field(StubRoutines, _multiplyToLen, address) \
static_field(StubRoutines, _squareToLen, address) \
static_field(StubRoutines, _bigIntegerRightShiftWorker, address) \
static_field(StubRoutines, _bigIntegerLeftShiftWorker, address) \
static_field(StubRoutines, _mulAdd, address) \
static_field(StubRoutines, _dexp, address) \
static_field(StubRoutines, _dlog, address) \

@ -42,6 +42,7 @@ import jdk.internal.math.DoubleConsts;
import jdk.internal.math.FloatConsts;
import jdk.internal.HotSpotIntrinsicCandidate;
import jdk.internal.vm.annotation.Stable;
import jdk.internal.vm.annotation.ForceInline;
/**
* Immutable arbitrary-precision integers. All operations behave as if
@ -2621,12 +2622,8 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
// shifts a up to len right n bits; assumes no leading zeros, 0<n<32
static void primitiveRightShift(int[] a, int len, int n) {
int n2 = 32 - n;
for (int i=len-1, c=a[i]; i > 0; i--) {
int b = c;
c = a[i-1];
a[i] = (c << n2) | (b >>> n);
}
Objects.checkFromToIndex(0, len, a.length);
shiftRightImplWorker(a, a, 1, n, len-1);
a[0] >>>= n;
}
@ -2634,13 +2631,8 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
static void primitiveLeftShift(int[] a, int len, int n) {
if (len == 0 || n == 0)
return;
int n2 = 32 - n;
for (int i=0, c=a[i], m=i+len-1; i < m; i++) {
int b = c;
c = a[i+1];
a[i] = (b << n) | (c >>> n2);
}
Objects.checkFromToIndex(0, len, a.length);
shiftLeftImplWorker(a, a, 0, n, len-1);
a[len-1] <<= n;
}
@ -3353,14 +3345,25 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
} else {
newMag = new int[magLen + nInts];
}
int j=0;
while (j < magLen-1)
newMag[i++] = mag[j++] << nBits | mag[j] >>> nBits2;
newMag[i] = mag[j] << nBits;
int numIter = magLen - 1;
Objects.checkFromToIndex(0, numIter + 1, mag.length);
Objects.checkFromToIndex(i, numIter + i + 1, newMag.length);
shiftLeftImplWorker(newMag, mag, i, nBits, numIter);
newMag[numIter + i] = mag[numIter] << nBits;
}
return newMag;
}
@ForceInline
@HotSpotIntrinsicCandidate
private static void shiftLeftImplWorker(int[] newArr, int[] oldArr, int newIdx, int shiftCount, int numIter) {
int shiftCountRight = 32 - shiftCount;
int oldIdx = 0;
while (oldIdx < numIter) {
newArr[newIdx++] = (oldArr[oldIdx++] << shiftCount) | (oldArr[oldIdx] >>> shiftCountRight);
}
}
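A worked example of the worker, with assumed values: mag = {0x00000001, 0x80000000} and nBits = 4. Here mag[0] >>> 28 == 0, so no extra high word is allocated, i == 0 and numIter == 1:

// Worked example for shiftLeftImplWorker (values assumed for illustration).
public class LeftShiftExample {
    public static void main(String[] args) {
        int[] mag = {0x00000001, 0x80000000};
        int nBits = 4, numIter = 1, i = 0;
        int[] newMag = new int[2];
        // One iteration of the worker loop:
        newMag[0] = (mag[0] << nBits) | (mag[1] >>> (32 - nBits)); // 0x10 | 0x8 = 0x18
        // The caller then writes the tail word:
        newMag[numIter + i] = mag[numIter] << nBits;               // 0x00000000
        System.out.printf("%08x %08x%n", newMag[0], newMag[1]);    // 00000018 00000000
    }
}

This is 0x180000000 << 4 == 0x1800000000, as expected.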
/**
* Returns a BigInteger whose value is {@code (this >> n)}. Sign
* extension is performed. The shift distance, {@code n}, may be
@ -3415,11 +3418,10 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
} else {
newMag = new int[magLen - nInts -1];
}
int nBits2 = 32 - nBits;
int j=0;
while (j < magLen - nInts - 1)
newMag[i++] = (mag[j++] << nBits2) | (mag[j] >>> nBits);
int numIter = magLen - nInts - 1;
Objects.checkFromToIndex(0, numIter + 1, mag.length);
Objects.checkFromToIndex(i, numIter + i, newMag.length);
shiftRightImplWorker(newMag, mag, i, nBits, numIter);
}
if (signum < 0) {
@ -3437,6 +3439,17 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
return new BigInteger(newMag, signum);
}
@ForceInline
@HotSpotIntrinsicCandidate
private static void shiftRightImplWorker(int[] newArr, int[] oldArr, int newIdx, int shiftCount, int numIter) {
int shiftCountLeft = 32 - shiftCount;
int idx = numIter;
int nidx = (newIdx == 0) ? numIter - 1 : numIter;
while (nidx >= newIdx) {
newArr[nidx--] = (oldArr[idx--] >>> shiftCount) | (oldArr[idx] << shiftCountLeft);
}
}
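The nidx start value handles the newIdx == 0 boundary: when all of mag[0]'s high bits shift out, the caller allocates one less word, so the highest destination slot is numIter - 1 rather than numIter. A worked example with assumed values, mag = {0x0000000F, 0xF0000000} and nBits = 8 (so mag[0] >>> 8 == 0 and newIdx == 0):

// Worked example for shiftRightImplWorker's newIdx == 0 case (values assumed).
public class RightShiftExample {
    public static void main(String[] args) {
        int[] mag = {0x0000000F, 0xF0000000};
        int shiftCount = 8;
        // Single iteration: nidx == 0, idx == 1.
        int newMag0 = (mag[1] >>> shiftCount) | (mag[0] << (32 - shiftCount));
        System.out.printf("%08x%n", newMag0); // 0ff00000, i.e. 0xFF0000000 >> 8
    }
}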
int[] javaIncrement(int[] val) {
int lastSum = 0;
for (int i=val.length-1; i >= 0 && lastSum == 0; i--)

@ -229,6 +229,8 @@ public final class BinaryContainer implements SymbolTable {
{"StubRoutines::_montgomeryMultiply", "_aot_stub_routines_montgomeryMultiply" },
{"StubRoutines::_montgomerySquare", "_aot_stub_routines_montgomerySquare" },
{"StubRoutines::_vectorizedMismatch", "_aot_stub_routines_vectorizedMismatch" },
{"StubRoutines::_bigIntegerRightShiftWorker", "_aot_stub_routines_bigIntegerRightShiftWorker" },
{"StubRoutines::_bigIntegerLeftShiftWorker", "_aot_stub_routines_bigIntegerLeftShiftWorker" },
{"StubRoutines::_throw_delayed_StackOverflowError_entry", "_aot_stub_routines_throw_delayed_StackOverflowError_entry" },

@ -416,7 +416,9 @@ public class CheckGraalIntrinsics extends GraalTest {
if (isJDK14OrHigher()) {
add(toBeInvestigated,
"com/sun/crypto/provider/ElectronicCodeBook.implECBDecrypt([BII[BI)I",
"com/sun/crypto/provider/ElectronicCodeBook.implECBEncrypt([BII[BI)I");
"com/sun/crypto/provider/ElectronicCodeBook.implECBEncrypt([BII[BI)I",
"java/math/BigInteger.shiftLeftImplWorker([I[IIII)V",
"java/math/BigInteger.shiftRightImplWorker([I[IIII)V");
}
if (!config.inlineNotify()) {

@ -0,0 +1,130 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8234692
* @summary Add C2 x86 intrinsic for BigInteger::shiftLeft() and BigInteger::shiftRight() method
* @requires vm.compiler2.enabled
*
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
* -XX:CompileCommand=exclude,compiler.intrinsics.bigInteger.TestShift::main
* -XX:CompileCommand=option,compiler.intrinsics.bigInteger.TestShift::base_left_shift,ccstr,DisableIntrinsic,_bigIntegerLeftShiftWorker
* -XX:CompileCommand=option,compiler.intrinsics.bigInteger.TestShift::base_right_shift,ccstr,DisableIntrinsic,_bigIntegerRightShiftWorker
* -XX:CompileCommand=inline,java.math.BigInteger::shiftLeft
* -XX:CompileCommand=inline,java.math.BigInteger::shiftRight
* compiler.intrinsics.bigInteger.TestShift
*
* @run main/othervm/timeout=600
* -XX:CompileCommand=exclude,compiler.intrinsics.bigInteger.TestShift::main
* -XX:CompileCommand=option,compiler.intrinsics.bigInteger.TestShift::base_left_shift,ccstr,DisableIntrinsic,_bigIntegerLeftShiftWorker
* -XX:CompileCommand=option,compiler.intrinsics.bigInteger.TestShift::base_right_shift,ccstr,DisableIntrinsic,_bigIntegerRightShiftWorker
* -XX:CompileCommand=inline,java.math.BigInteger::shiftLeft
* -XX:CompileCommand=inline,java.math.BigInteger::shiftRight
* compiler.intrinsics.bigInteger.TestShift
*
*/
package compiler.intrinsics.bigInteger;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Random;
public class TestShift {
public static BigInteger base_left_shift(BigInteger op1, int shift) {
return op1.shiftLeft(shift);
}
public static BigInteger new_left_shift(BigInteger op1, int shift) {
return op1.shiftLeft(shift);
}
public static BigInteger base_right_shift(BigInteger op1, int shift) {
return op1.shiftRight(shift);
}
public static BigInteger new_right_shift(BigInteger op1, int shift) {
return op1.shiftRight(shift);
}
public static boolean bytecompare(BigInteger b1, BigInteger b2) {
byte[] data1 = b1.toByteArray();
byte[] data2 = b2.toByteArray();
if (data1.length != data2.length)
return false;
for (int i = 0; i < data1.length; i++) {
if (data1[i] != data2[i])
return false;
}
return true;
}
public static String stringify(BigInteger b) {
String strout = "";
byte[] data = b.toByteArray();
for (int i = 0; i < data.length; i++) {
strout += (String.format("%02x",data[i]) + " ");
}
return strout;
}
public static void main(String args[]) throws Exception {
BigInteger [] inputbuffer = new BigInteger[10];
BigInteger [] oldLeftShiftResult = new BigInteger[10];
BigInteger [] newLeftShiftResult = new BigInteger[10];
BigInteger [] oldRightShiftResult = new BigInteger[10];
BigInteger [] newRightShiftResult = new BigInteger[10];
Random rand = new Random();
long seed = System.nanoTime();
rand.setSeed(seed);
int shiftCount = rand.nextInt(30) + 1;
for(int i = 0; i < inputbuffer.length; i++) {
int numbits = rand.nextInt(4096)+32;
inputbuffer[i] = new BigInteger(numbits, rand);
}
for (int j = 0; j < 100000; j++) {
for(int i = 0; i < inputbuffer.length; i++) {
oldLeftShiftResult[i] = base_left_shift(inputbuffer[i], shiftCount);
newLeftShiftResult[i] = new_left_shift(inputbuffer[i], shiftCount);
if (!bytecompare(oldLeftShiftResult[i], newLeftShiftResult[i])) {
System.out.println("mismatch for input:" + stringify(inputbuffer[i]) + "\n" + "expected left shift result:" + stringify(oldLeftShiftResult[i]) + "\n" +
"calculated left shift result:" + stringify(newLeftShiftResult[i]));
throw new Exception("Failed");
}
oldRightShiftResult[i] = base_right_shift(inputbuffer[i], shiftCount);
newRightShiftResult[i] = new_right_shift(inputbuffer[i], shiftCount);
if (!bytecompare(oldRightShiftResult[i], newRightShiftResult[i])) {
System.out.println("mismatch for input:" + stringify(inputbuffer[i]) + "\n" + "expected right shift result:" + stringify(oldRightShiftResult[i]) + "\n" +
"calculated right shift result:" + stringify(newRightShiftResult[i]));
throw new Exception("Failed");
}
}
}
}
}

@ -45,7 +45,7 @@ import java.util.concurrent.TimeUnit;
@State(Scope.Thread)
public class BigIntegers {
private BigInteger[] hugeArray, largeArray, smallArray;
private BigInteger[] hugeArray, largeArray, smallArray, shiftArray;
public String[] dummyStringArray;
public Object[] dummyArr;
private static final int TESTSIZE = 1000;
@ -53,6 +53,7 @@ public class BigIntegers {
@Setup
public void setup() {
Random r = new Random(1123);
int numbits = r.nextInt(16384);
hugeArray = new BigInteger[TESTSIZE]; /*
* Huge numbers larger than
@ -67,6 +68,10 @@ public class BigIntegers {
* Small number less than
* MAX_INT
*/
shiftArray = new BigInteger[TESTSIZE]; /*
* Each array entry is at most 16k bits
* in size
*/
dummyStringArray = new String[TESTSIZE];
dummyArr = new Object[TESTSIZE];
@ -78,6 +83,7 @@ public class BigIntegers {
+ ((long) value + (long) Integer.MAX_VALUE));
largeArray[i] = new BigInteger("" + ((long) value + (long) Integer.MAX_VALUE));
smallArray[i] = new BigInteger("" + ((long) value / 1000));
shiftArray[i] = new BigInteger(numbits, r);
}
}
@ -137,4 +143,38 @@ public class BigIntegers {
}
bh.consume(tmp);
}
/** Invokes the shiftLeft method of BigInteger with different values. */
@Benchmark
@OperationsPerInvocation(TESTSIZE)
public void testLeftShift(Blackhole bh) {
Random rand = new Random();
int shift = rand.nextInt(30) + 1;
BigInteger tmp = null;
for (BigInteger s : shiftArray) {
if (tmp == null) {
tmp = s;
continue;
}
tmp = tmp.shiftLeft(shift);
}
bh.consume(tmp);
}
/** Invokes the shiftRight method of BigInteger with different values. */
@Benchmark
@OperationsPerInvocation(TESTSIZE)
public void testRightShift(Blackhole bh) {
Random rand = new Random();
int shift = rand.nextInt(30) + 1;
BigInteger tmp = null;
for (BigInteger s : shiftArray) {
if (tmp == null) {
tmp = s;
continue;
}
tmp = tmp.shiftRight(shift);
}
bh.consume(tmp);
}
}