From b73ef8ebc6a059e4f200d7ca20c5ebb720ebdb18 Mon Sep 17 00:00:00 2001 From: Ed Nevill Date: Fri, 17 Jul 2015 07:50:36 +0000 Subject: [PATCH 1/3] 8131362: aarch64: C2 does not handle large stack offsets Change spill code to allow large offsets Reviewed-by: kvn, aph --- hotspot/src/cpu/aarch64/vm/aarch64.ad | 401 ++++-------------- .../cpu/aarch64/vm/macroAssembler_aarch64.cpp | 22 + .../cpu/aarch64/vm/macroAssembler_aarch64.hpp | 44 ++ 3 files changed, 159 insertions(+), 308 deletions(-) diff --git a/hotspot/src/cpu/aarch64/vm/aarch64.ad b/hotspot/src/cpu/aarch64/vm/aarch64.ad index 5ece17d1c6d..bef849a2050 100644 --- a/hotspot/src/cpu/aarch64/vm/aarch64.ad +++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad @@ -2167,8 +2167,12 @@ uint MachSpillCopyNode::implementation(CodeBuffer *cbuf, PhaseRegAlloc *ra_, boo return 0; // Self copy, no move. } + bool is64 = (src_lo & 1) == 0 && src_lo + 1 == src_hi && + (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi; + int src_offset = ra_->reg2offset(src_lo); + int dst_offset = ra_->reg2offset(dst_lo); + if (bottom_type()->isa_vect() != NULL) { - uint len = 4; uint ireg = ideal_reg(); assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector"); if (cbuf) { @@ -2176,334 +2180,115 @@ uint MachSpillCopyNode::implementation(CodeBuffer *cbuf, PhaseRegAlloc *ra_, boo assert((src_lo_rc != rc_int && dst_lo_rc != rc_int), "sanity"); if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { // stack->stack - int src_offset = ra_->reg2offset(src_lo); - int dst_offset = ra_->reg2offset(dst_lo); assert((src_offset & 7) && (dst_offset & 7), "unaligned stack offset"); - len = 8; if (ireg == Op_VecD) { - __ ldr(rscratch1, Address(sp, src_offset)); - __ str(rscratch1, Address(sp, dst_offset)); + __ unspill(rscratch1, true, src_offset); + __ spill(rscratch1, true, dst_offset); } else { - if (src_offset < 512) { - __ ldp(rscratch1, rscratch2, Address(sp, src_offset)); - } else { - __ ldr(rscratch1, Address(sp, src_offset)); - __ ldr(rscratch2, 
Address(sp, src_offset+4)); - len += 4; - } - if (dst_offset < 512) { - __ stp(rscratch1, rscratch2, Address(sp, dst_offset)); - } else { - __ str(rscratch1, Address(sp, dst_offset)); - __ str(rscratch2, Address(sp, dst_offset+4)); - len += 4; - } + __ spill_copy128(src_offset, dst_offset); } } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) { - __ orr(as_FloatRegister(Matcher::_regEncode[dst_lo]), + __ mov(as_FloatRegister(Matcher::_regEncode[dst_lo]), ireg == Op_VecD ? __ T8B : __ T16B, - as_FloatRegister(Matcher::_regEncode[src_lo]), as_FloatRegister(Matcher::_regEncode[src_lo])); } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) { - __ str(as_FloatRegister(Matcher::_regEncode[src_lo]), - ireg == Op_VecD ? __ D : __ Q, - Address(sp, ra_->reg2offset(dst_lo))); + __ spill(as_FloatRegister(Matcher::_regEncode[src_lo]), + ireg == Op_VecD ? __ D : __ Q, + ra_->reg2offset(dst_lo)); } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) { - __ ldr(as_FloatRegister(Matcher::_regEncode[dst_lo]), - ireg == Op_VecD ? __ D : __ Q, - Address(sp, ra_->reg2offset(src_lo))); + __ unspill(as_FloatRegister(Matcher::_regEncode[dst_lo]), + ireg == Op_VecD ? 
__ D : __ Q, + ra_->reg2offset(src_lo)); } else { ShouldNotReachHere(); } - } else if (st) { - if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { - // stack->stack - int src_offset = ra_->reg2offset(src_lo); - int dst_offset = ra_->reg2offset(dst_lo); - if (ireg == Op_VecD) { - st->print("ldr rscratch1, [sp, #%d]", src_offset); - st->print("str rscratch1, [sp, #%d]", dst_offset); + } + } else if (cbuf) { + MacroAssembler _masm(cbuf); + switch (src_lo_rc) { + case rc_int: + if (dst_lo_rc == rc_int) { // gpr --> gpr copy + if (is64) { + __ mov(as_Register(Matcher::_regEncode[dst_lo]), + as_Register(Matcher::_regEncode[src_lo])); } else { - if (src_offset < 512) { - st->print("ldp rscratch1, rscratch2, [sp, #%d]", src_offset); - } else { - st->print("ldr rscratch1, [sp, #%d]", src_offset); - st->print("\nldr rscratch2, [sp, #%d]", src_offset+4); - } - if (dst_offset < 512) { - st->print("\nstp rscratch1, rscratch2, [sp, #%d]", dst_offset); - } else { - st->print("\nstr rscratch1, [sp, #%d]", dst_offset); - st->print("\nstr rscratch2, [sp, #%d]", dst_offset+4); - } + MacroAssembler _masm(cbuf); + __ movw(as_Register(Matcher::_regEncode[dst_lo]), + as_Register(Matcher::_regEncode[src_lo])); } - st->print("\t# vector spill, stack to stack"); - } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) { - st->print("mov %s, %s\t# vector spill, reg to reg", - Matcher::regName[dst_lo], Matcher::regName[src_lo]); - } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) { - st->print("str %s, [sp, #%d]\t# vector spill, reg to stack", - Matcher::regName[src_lo], ra_->reg2offset(dst_lo)); - } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) { - st->print("ldr %s, [sp, #%d]\t# vector spill, stack to reg", - Matcher::regName[dst_lo], ra_->reg2offset(src_lo)); + } else if (dst_lo_rc == rc_float) { // gpr --> fpr copy + if (is64) { + __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]), + as_Register(Matcher::_regEncode[src_lo])); + } else { + __ 
fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]), + as_Register(Matcher::_regEncode[src_lo])); + } + } else { // gpr --> stack spill + assert(dst_lo_rc == rc_stack, "spill to bad register class"); + __ spill(as_Register(Matcher::_regEncode[src_lo]), is64, dst_offset); } - } - return len; - } - - switch (src_lo_rc) { - case rc_int: - if (dst_lo_rc == rc_int) { // gpr --> gpr copy - if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) && - (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) { - // 64 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ mov(as_Register(Matcher::_regEncode[dst_lo]), - as_Register(Matcher::_regEncode[src_lo])); - } else if (st) { - st->print("mov %s, %s\t# shuffle", - Matcher::regName[dst_lo], - Matcher::regName[src_lo]); + break; + case rc_float: + if (dst_lo_rc == rc_int) { // fpr --> gpr copy + if (is64) { + __ fmovd(as_Register(Matcher::_regEncode[dst_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo])); + } else { + __ fmovs(as_Register(Matcher::_regEncode[dst_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo])); } - } else { - // 32 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ movw(as_Register(Matcher::_regEncode[dst_lo]), - as_Register(Matcher::_regEncode[src_lo])); - } else if (st) { - st->print("movw %s, %s\t# shuffle", - Matcher::regName[dst_lo], - Matcher::regName[src_lo]); + } else if (dst_lo_rc == rc_float) { // fpr --> fpr copy + if (is64) { + __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo])); + } else { + __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo])); } + } else { // fpr --> stack spill + assert(dst_lo_rc == rc_stack, "spill to bad register class"); + __ spill(as_FloatRegister(Matcher::_regEncode[src_lo]), + is64 ? 
__ D : __ S, dst_offset); } - } else if (dst_lo_rc == rc_float) { // gpr --> fpr copy - if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) && - (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) { - // 64 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]), - as_Register(Matcher::_regEncode[src_lo])); - } else if (st) { - st->print("fmovd %s, %s\t# shuffle", - Matcher::regName[dst_lo], - Matcher::regName[src_lo]); - } - } else { - // 32 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]), - as_Register(Matcher::_regEncode[src_lo])); - } else if (st) { - st->print("fmovs %s, %s\t# shuffle", - Matcher::regName[dst_lo], - Matcher::regName[src_lo]); - } + break; + case rc_stack: + if (dst_lo_rc == rc_int) { // stack --> gpr load + __ unspill(as_Register(Matcher::_regEncode[dst_lo]), is64, src_offset); + } else if (dst_lo_rc == rc_float) { // stack --> fpr load + __ unspill(as_FloatRegister(Matcher::_regEncode[dst_lo]), + is64 ? 
__ D : __ S, src_offset); + } else { // stack --> stack copy + assert(dst_lo_rc == rc_stack, "spill to bad register class"); + __ unspill(rscratch1, is64, src_offset); + __ spill(rscratch1, is64, dst_offset); } - } else { // gpr --> stack spill - assert(dst_lo_rc == rc_stack, "spill to bad register class"); - int dst_offset = ra_->reg2offset(dst_lo); - if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) && - (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) { - // 64 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ str(as_Register(Matcher::_regEncode[src_lo]), - Address(sp, dst_offset)); - } else if (st) { - st->print("str %s, [sp, #%d]\t# spill", - Matcher::regName[src_lo], - dst_offset); - } - } else { - // 32 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ strw(as_Register(Matcher::_regEncode[src_lo]), - Address(sp, dst_offset)); - } else if (st) { - st->print("strw %s, [sp, #%d]\t# spill", - Matcher::regName[src_lo], - dst_offset); - } - } - } - return 4; - case rc_float: - if (dst_lo_rc == rc_int) { // fpr --> gpr copy - if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) && - (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) { - // 64 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ fmovd(as_Register(Matcher::_regEncode[dst_lo]), - as_FloatRegister(Matcher::_regEncode[src_lo])); - } else if (st) { - st->print("fmovd %s, %s\t# shuffle", - Matcher::regName[dst_lo], - Matcher::regName[src_lo]); - } - } else { - // 32 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ fmovs(as_Register(Matcher::_regEncode[dst_lo]), - as_FloatRegister(Matcher::_regEncode[src_lo])); - } else if (st) { - st->print("fmovs %s, %s\t# shuffle", - Matcher::regName[dst_lo], - Matcher::regName[src_lo]); - } - } - } else if (dst_lo_rc == rc_float) { // fpr --> fpr copy - if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) && - (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) { - // 64 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]), - 
as_FloatRegister(Matcher::_regEncode[src_lo])); - } else if (st) { - st->print("fmovd %s, %s\t# shuffle", - Matcher::regName[dst_lo], - Matcher::regName[src_lo]); - } - } else { - // 32 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]), - as_FloatRegister(Matcher::_regEncode[src_lo])); - } else if (st) { - st->print("fmovs %s, %s\t# shuffle", - Matcher::regName[dst_lo], - Matcher::regName[src_lo]); - } - } - } else { // fpr --> stack spill - assert(dst_lo_rc == rc_stack, "spill to bad register class"); - int dst_offset = ra_->reg2offset(dst_lo); - if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) && - (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) { - // 64 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ strd(as_FloatRegister(Matcher::_regEncode[src_lo]), - Address(sp, dst_offset)); - } else if (st) { - st->print("strd %s, [sp, #%d]\t# spill", - Matcher::regName[src_lo], - dst_offset); - } - } else { - // 32 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ strs(as_FloatRegister(Matcher::_regEncode[src_lo]), - Address(sp, dst_offset)); - } else if (st) { - st->print("strs %s, [sp, #%d]\t# spill", - Matcher::regName[src_lo], - dst_offset); - } - } - } - return 4; - case rc_stack: - int src_offset = ra_->reg2offset(src_lo); - if (dst_lo_rc == rc_int) { // stack --> gpr load - if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) && - (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) { - // 64 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ ldr(as_Register(Matcher::_regEncode[dst_lo]), - Address(sp, src_offset)); - } else if (st) { - st->print("ldr %s, [sp, %d]\t# restore", - Matcher::regName[dst_lo], - src_offset); - } - } else { - // 32 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ ldrw(as_Register(Matcher::_regEncode[dst_lo]), - Address(sp, src_offset)); - } else if (st) { - st->print("ldr %s, [sp, %d]\t# restore", - Matcher::regName[dst_lo], - src_offset); - } - } - return 4; - } else if (dst_lo_rc == 
rc_float) { // stack --> fpr load - if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) && - (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) { - // 64 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ ldrd(as_FloatRegister(Matcher::_regEncode[dst_lo]), - Address(sp, src_offset)); - } else if (st) { - st->print("ldrd %s, [sp, %d]\t# restore", - Matcher::regName[dst_lo], - src_offset); - } - } else { - // 32 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ ldrs(as_FloatRegister(Matcher::_regEncode[dst_lo]), - Address(sp, src_offset)); - } else if (st) { - st->print("ldrs %s, [sp, %d]\t# restore", - Matcher::regName[dst_lo], - src_offset); - } - } - return 4; - } else { // stack --> stack copy - assert(dst_lo_rc == rc_stack, "spill to bad register class"); - int dst_offset = ra_->reg2offset(dst_lo); - if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) && - (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) { - // 64 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ ldr(rscratch1, Address(sp, src_offset)); - __ str(rscratch1, Address(sp, dst_offset)); - } else if (st) { - st->print("ldr rscratch1, [sp, %d]\t# mem-mem spill", - src_offset); - st->print("\n\t"); - st->print("str rscratch1, [sp, %d]", - dst_offset); - } - } else { - // 32 bit - if (cbuf) { - MacroAssembler _masm(cbuf); - __ ldrw(rscratch1, Address(sp, src_offset)); - __ strw(rscratch1, Address(sp, dst_offset)); - } else if (st) { - st->print("ldrw rscratch1, [sp, %d]\t# mem-mem spill", - src_offset); - st->print("\n\t"); - st->print("strw rscratch1, [sp, %d]", - dst_offset); - } - } - return 8; + break; + default: + assert(false, "bad rc_class for spill"); + ShouldNotReachHere(); + } + } + + if (st) { + st->print("spill "); + if (src_lo_rc == rc_stack) { + st->print("[sp, #%d] -> ", ra_->reg2offset(src_lo)); + } else { + st->print("%s -> ", Matcher::regName[src_lo]); + } + if (dst_lo_rc == rc_stack) { + st->print("[sp, #%d]", ra_->reg2offset(dst_lo)); + } else { + st->print("%s", Matcher::regName[dst_lo]); + } + if 
(bottom_type()->isa_vect() != NULL) { + st->print("\t# vector spill size = %d", ideal_reg()==Op_VecD ? 64:128); + } else { + st->print("\t# spill size = %d", is64 ? 64:32); } } - assert(false," bad rc_class for spill "); - Unimplemented(); return 0; } @@ -2522,7 +2307,7 @@ void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { } uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const { - return implementation(NULL, ra_, true, NULL); + return MachNode::size(ra_); } //============================================================================= diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp index 4fe853e1f86..309191390ea 100644 --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp @@ -2306,6 +2306,28 @@ Address MacroAssembler::offsetted_address(Register r, Register r1, } } +Address MacroAssembler::spill_address(int size, int offset, Register tmp) +{ + assert(offset >= 0, "spill to negative address?"); + // Offset reachable ? + // Not aligned - 9 bits signed offset + // Aligned - 12 bits unsigned offset shifted + Register base = sp; + if ((offset & (size-1)) && offset >= (1<<8)) { + add(tmp, base, offset & ((1<<12)-1)); + base = tmp; + offset &= -1<<12; + } + + if (offset >= (1<<12) * size) { + add(tmp, base, offset & (((1<<12)-1)<<12)); + base = tmp; + offset &= ~(((1<<12)-1)<<12); + } + + return Address(base, offset); +} + /** * Multiply 64 bit by 64 bit first loop. 
*/ diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp index b3544ee625d..3a73655ac17 100644 --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp @@ -468,6 +468,10 @@ public: void mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32); + void mov(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) { + orr(Vd, T, Vn, Vn); + } + // macro instructions for accessing and updating floating point // status register // @@ -1161,6 +1165,46 @@ private: // Uses rscratch2. Address offsetted_address(Register r, Register r1, Address::extend ext, int offset, int size); + +private: + // Returns an address on the stack which is reachable with a ldr/str of size + // Uses rscratch2 if the address is not directly reachable + Address spill_address(int size, int offset, Register tmp=rscratch2); + +public: + void spill(Register Rx, bool is64, int offset) { + if (is64) { + str(Rx, spill_address(8, offset)); + } else { + strw(Rx, spill_address(4, offset)); + } + } + void spill(FloatRegister Vx, SIMD_RegVariant T, int offset) { + str(Vx, T, spill_address(1 << (int)T, offset)); + } + void unspill(Register Rx, bool is64, int offset) { + if (is64) { + ldr(Rx, spill_address(8, offset)); + } else { + ldrw(Rx, spill_address(4, offset)); + } + } + void unspill(FloatRegister Vx, SIMD_RegVariant T, int offset) { + ldr(Vx, T, spill_address(1 << (int)T, offset)); + } + void spill_copy128(int src_offset, int dst_offset, + Register tmp1=rscratch1, Register tmp2=rscratch2) { + if (src_offset < 512 && (src_offset & 7) == 0 && + dst_offset < 512 && (dst_offset & 7) == 0) { + ldp(tmp1, tmp2, Address(sp, src_offset)); + stp(tmp1, tmp2, Address(sp, dst_offset)); + } else { + unspill(tmp1, true, src_offset); + spill(tmp1, true, dst_offset); + unspill(tmp1, true, src_offset+8); + spill(tmp1, true, dst_offset+8); + } + } }; #ifdef ASSERT From 
244435704bce329e44815b7d9248e351d7a12442 Mon Sep 17 00:00:00 2001 From: Andrew Haley Date: Mon, 20 Jul 2015 11:41:34 +0100 Subject: [PATCH 2/3] 8131779: AARCH64: add Montgomery multiply intrinsic Add Montgomery multiply intrinsic for AArch64. Reviewed-by: kvn --- .../cpu/aarch64/vm/macroAssembler_aarch64.cpp | 8 + .../cpu/aarch64/vm/macroAssembler_aarch64.hpp | 8 + .../cpu/aarch64/vm/stubGenerator_aarch64.cpp | 858 +++++++++++++++++- .../src/cpu/aarch64/vm/vm_version_aarch64.cpp | 7 + 4 files changed, 869 insertions(+), 12 deletions(-) diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp index b47d57f1cdb..93b951081fd 100644 --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp @@ -2008,6 +2008,14 @@ void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment } } +void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { + if (decrement.is_register()) { + sub(Rd, Rn, decrement.as_register()); + } else { + sub(Rd, Rn, decrement.as_constant()); + } +} + void MacroAssembler::reinit_heapbase() { if (UseCompressedOops) { diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp index b3544ee625d..a2d0a93771a 100644 --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp @@ -464,6 +464,13 @@ public: mov(dst, (long)i); } + void mov(Register dst, RegisterOrConstant src) { + if (src.is_register()) + mov(dst, src.as_register()); + else + mov(dst, src.as_constant()); + } + void movptr(Register r, uintptr_t imm64); void mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32); @@ -1045,6 +1052,7 @@ public: void add(Register Rd, Register Rn, RegisterOrConstant increment); void addw(Register Rd, Register Rn, RegisterOrConstant increment); + void sub(Register Rd, Register 
Rn, RegisterOrConstant decrement); void adrp(Register reg1, const Address &dest, unsigned long &byte_offset); diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp index a7fcaca9788..ec45397fd26 100644 --- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp @@ -120,10 +120,8 @@ class StubGenerator: public StubCodeGenerator { // we save r19-r28 which Java uses as scratch registers and C // expects to be callee-save // - // we don't save any FP registers since only v8-v15 are callee-save - // (strictly only the f and d components) and Java uses them as - // callee-save. v0-v7 are arg registers and C treats v16-v31 as - // volatile (as does Java?) + // we save the bottom 64 bits of each value stored in v8-v15; it is + // the responsibility of the caller to preserve larger values. // // so the stub frame looks like this when we enter Java code // @@ -131,14 +129,14 @@ class StubGenerator: public StubCodeGenerator { // [ argument word n ] // ... 
// -27 [ argument word 1 ] - // -26 [ saved d15 ] <--- sp_after_call - // -25 [ saved d14 ] - // -24 [ saved d13 ] - // -23 [ saved d12 ] - // -22 [ saved d11 ] - // -21 [ saved d10 ] - // -20 [ saved d9 ] - // -19 [ saved d8 ] + // -26 [ saved v15 ] <--- sp_after_call + // -25 [ saved v14 ] + // -24 [ saved v13 ] + // -23 [ saved v12 ] + // -22 [ saved v11 ] + // -21 [ saved v10 ] + // -20 [ saved v9 ] + // -19 [ saved v8 ] // -18 [ saved r28 ] // -17 [ saved r27 ] // -16 [ saved r26 ] @@ -2544,6 +2542,828 @@ class StubGenerator: public StubCodeGenerator { return stub->entry_point(); } + class MontgomeryMultiplyGenerator : public MacroAssembler { + + Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, + Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; + + RegSet _toSave; + bool _squaring; + + public: + MontgomeryMultiplyGenerator (Assembler *as, bool squaring) + : MacroAssembler(as->code()), _squaring(squaring) { + + // Register allocation + + Register reg = c_rarg0; + Pa_base = reg; // Argument registers + if (squaring) + Pb_base = Pa_base; + else + Pb_base = ++reg; + Pn_base = ++reg; + Rlen= ++reg; + inv = ++reg; + Pm_base = ++reg; + + // Working registers: + Ra = ++reg; // The current digit of a, b, n, and m. + Rb = ++reg; + Rm = ++reg; + Rn = ++reg; + + Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. + Pb = ++reg; + Pm = ++reg; + Pn = ++reg; + + t0 = ++reg; // Three registers which form a + t1 = ++reg; // triple-precision accumulator. + t2 = ++reg; + + Ri = ++reg; // Inner and outer loop indexes. + Rj = ++reg; + + Rhi_ab = ++reg; // Product registers: low and high parts + Rlo_ab = ++reg; // of a*b and m*n. + Rhi_mn = ++reg; + Rlo_mn = ++reg; + + // r19 and up are callee-saved. 
+ _toSave = RegSet::range(r19, reg) + Pm_base; + } + + private: + void save_regs() { + push(_toSave, sp); + } + + void restore_regs() { + pop(_toSave, sp); + } + + template <typename T> + void unroll_2(Register count, T block) { + Label loop, end, odd; + tbnz(count, 0, odd); + cbz(count, end); + align(16); + bind(loop); + (this->*block)(); + bind(odd); + (this->*block)(); + subs(count, count, 2); + br(Assembler::GT, loop); + bind(end); + } + + template <typename T> + void unroll_2(Register count, T block, Register d, Register s, Register tmp) { + Label loop, end, odd; + tbnz(count, 0, odd); + cbz(count, end); + align(16); + bind(loop); + (this->*block)(d, s, tmp); + bind(odd); + (this->*block)(d, s, tmp); + subs(count, count, 2); + br(Assembler::GT, loop); + bind(end); + } + + void pre1(RegisterOrConstant i) { + block_comment("pre1"); + // Pa = Pa_base; + // Pb = Pb_base + i; + // Pm = Pm_base; + // Pn = Pn_base + i; + // Ra = *Pa; + // Rb = *Pb; + // Rm = *Pm; + // Rn = *Pn; + ldr(Ra, Address(Pa_base)); + ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); + ldr(Rm, Address(Pm_base)); + ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); + lea(Pa, Address(Pa_base)); + lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); + lea(Pm, Address(Pm_base)); + lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); + + // Zero the m*n result. + mov(Rhi_mn, zr); + mov(Rlo_mn, zr); + } + + // The core multiply-accumulate step of a Montgomery + // multiplication. The idea is to schedule operations as a + // pipeline so that instructions with long latencies (loads and + // multiplies) have time to complete before their results are + // used. This most benefits in-order implementations of the + // architecture but out-of-order ones also benefit. 
+ void step() { + block_comment("step"); + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + umulh(Rhi_ab, Ra, Rb); + mul(Rlo_ab, Ra, Rb); + ldr(Ra, pre(Pa, wordSize)); + ldr(Rb, pre(Pb, -wordSize)); + acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the + // previous iteration. + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + umulh(Rhi_mn, Rm, Rn); + mul(Rlo_mn, Rm, Rn); + ldr(Rm, pre(Pm, wordSize)); + ldr(Rn, pre(Pn, -wordSize)); + acc(Rhi_ab, Rlo_ab, t0, t1, t2); + } + + void post1() { + block_comment("post1"); + + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + umulh(Rhi_ab, Ra, Rb); + mul(Rlo_ab, Ra, Rb); + acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n + acc(Rhi_ab, Rlo_ab, t0, t1, t2); + + // *Pm = Rm = t0 * inv; + mul(Rm, t0, inv); + str(Rm, Address(Pm)); + + // MACC(Rm, Rn, t0, t1, t2); + // t0 = t1; t1 = t2; t2 = 0; + umulh(Rhi_mn, Rm, Rn); + +#ifndef PRODUCT + // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); + { + mul(Rlo_mn, Rm, Rn); + add(Rlo_mn, t0, Rlo_mn); + Label ok; + cbz(Rlo_mn, ok); { + stop("broken Montgomery multiply"); + } bind(ok); + } +#endif + // We have very carefully set things up so that + // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate + // the lower half of Rm * Rn because we know the result already: + // it must be -t0. t0 + (-t0) must generate a carry iff + // t0 != 0. So, rather than do a mul and an adds we just set + // the carry flag iff t0 is nonzero. 
+ // + // mul(Rlo_mn, Rm, Rn); + // adds(zr, t0, Rlo_mn); + subs(zr, t0, 1); // Set carry iff t0 is nonzero + adcs(t0, t1, Rhi_mn); + adc(t1, t2, zr); + mov(t2, zr); + } + + void pre2(RegisterOrConstant i, RegisterOrConstant len) { + block_comment("pre2"); + // Pa = Pa_base + i-len; + // Pb = Pb_base + len; + // Pm = Pm_base + i-len; + // Pn = Pn_base + len; + + if (i.is_register()) { + sub(Rj, i.as_register(), len); + } else { + mov(Rj, i.as_constant()); + sub(Rj, Rj, len); + } + // Rj == i-len + + lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); + lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); + lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); + lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); + + // Ra = *++Pa; + // Rb = *--Pb; + // Rm = *++Pm; + // Rn = *--Pn; + ldr(Ra, pre(Pa, wordSize)); + ldr(Rb, pre(Pb, -wordSize)); + ldr(Rm, pre(Pm, wordSize)); + ldr(Rn, pre(Pn, -wordSize)); + + mov(Rhi_mn, zr); + mov(Rlo_mn, zr); + } + + void post2(RegisterOrConstant i, RegisterOrConstant len) { + block_comment("post2"); + if (i.is_constant()) { + mov(Rj, i.as_constant()-len.as_constant()); + } else { + sub(Rj, i.as_register(), len); + } + + adds(t0, t0, Rlo_mn); // The pending m*n, low part + + // As soon as we know the least significant digit of our result, + // store it. + // Pm_base[i-len] = t0; + str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); + + // t0 = t1; t1 = t2; t2 = 0; + adcs(t0, t1, Rhi_mn); // The pending m*n, high part + adc(t1, t2, zr); + mov(t2, zr); + } + + // A carry in t0 after Montgomery multiplication means that we + // should subtract multiples of n from our result in m. We'll + // keep doing that until there is no carry. 
+ void normalize(RegisterOrConstant len) { + block_comment("normalize"); + // while (t0) + // t0 = sub(Pm_base, Pn_base, t0, len); + Label loop, post, again; + Register cnt = t1, i = t2; // Re-use registers; we're done with them now + cbz(t0, post); { + bind(again); { + mov(i, zr); + mov(cnt, len); + ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); + ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); + subs(zr, zr, zr); // set carry flag, i.e. no borrow + align(16); + bind(loop); { + sbcs(Rm, Rm, Rn); + str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); + add(i, i, 1); + ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); + ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); + sub(cnt, cnt, 1); + } cbnz(cnt, loop); + sbc(t0, t0, zr); + } cbnz(t0, again); + } bind(post); + } + + // Move memory at s to d, reversing words. + // Increments d to end of copied memory + // Destroys tmp1, tmp2 + // Preserves len + // Leaves s pointing to the address which was in d at start + void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { + assert(tmp1 < r19 && tmp2 < r19, "register corruption"); + + lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); + mov(tmp1, len); + unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); + sub(s, d, len, ext::uxtw, LogBytesPerWord); + } + // where + void reverse1(Register d, Register s, Register tmp) { + ldr(tmp, pre(s, -wordSize)); + ror(tmp, tmp, 32); + str(tmp, post(d, wordSize)); + } + + void step_squaring() { + // An extra ACC + step(); + acc(Rhi_ab, Rlo_ab, t0, t1, t2); + } + + void last_squaring(RegisterOrConstant i) { + Label dont; + // if ((i & 1) == 0) { + tbnz(i.as_register(), 0, dont); { + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + umulh(Rhi_ab, Ra, Rb); + mul(Rlo_ab, Ra, Rb); + acc(Rhi_ab, Rlo_ab, t0, t1, t2); + } bind(dont); + } + + void extra_step_squaring() { + acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending 
m*n + + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + umulh(Rhi_mn, Rm, Rn); + mul(Rlo_mn, Rm, Rn); + ldr(Rm, pre(Pm, wordSize)); + ldr(Rn, pre(Pn, -wordSize)); + } + + void post1_squaring() { + acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n + + // *Pm = Rm = t0 * inv; + mul(Rm, t0, inv); + str(Rm, Address(Pm)); + + // MACC(Rm, Rn, t0, t1, t2); + // t0 = t1; t1 = t2; t2 = 0; + umulh(Rhi_mn, Rm, Rn); + +#ifndef PRODUCT + // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); + { + mul(Rlo_mn, Rm, Rn); + add(Rlo_mn, t0, Rlo_mn); + Label ok; + cbz(Rlo_mn, ok); { + stop("broken Montgomery multiply"); + } bind(ok); + } +#endif + // We have very carefully set things up so that + // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate + // the lower half of Rm * Rn because we know the result already: + // it must be -t0. t0 + (-t0) must generate a carry iff + // t0 != 0. So, rather than do a mul and an adds we just set + // the carry flag iff t0 is nonzero. + // + // mul(Rlo_mn, Rm, Rn); + // adds(zr, t0, Rlo_mn); + subs(zr, t0, 1); // Set carry iff t0 is nonzero + adcs(t0, t1, Rhi_mn); + adc(t1, t2, zr); + mov(t2, zr); + } + + void acc(Register Rhi, Register Rlo, + Register t0, Register t1, Register t2) { + adds(t0, t0, Rlo); + adcs(t1, t1, Rhi); + adc(t2, t2, zr); + } + + public: + /** + * Fast Montgomery multiplication. The derivation of the + * algorithm is in A Cryptographic Library for the Motorola + * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
+     *
+     * Arguments:
+     *
+     * Inputs for multiplication:
+     *   c_rarg0   - int array elements a
+     *   c_rarg1   - int array elements b
+     *   c_rarg2   - int array elements n (the modulus)
+     *   c_rarg3   - int length
+     *   c_rarg4   - int inv
+     *   c_rarg5   - int array elements m (the result)
+     *
+     * Inputs for squaring:
+     *   c_rarg0   - int array elements a
+     *   c_rarg1   - int array elements n (the modulus)
+     *   c_rarg2   - int length
+     *   c_rarg3   - int inv
+     *   c_rarg4   - int array elements m (the result)
+     *
+     * Returns the address of the generated stub's entry point.  The
+     * same generator serves both cases: the b operand is only copied
+     * when _squaring is false.
+     *
+     */
+    address generate_multiply() {
+      // Failure path for an over-long input.  It is emitted ahead of
+      // the aligned entry point and is reached from the size check
+      // below.
+      Label argh, nothing;
+      bind(argh);
+      stop("MontgomeryMultiply total_allocation must be <= 8192");
+
+      align(CodeEntryAlignment);
+      address entry = pc();
+
+      // Zero length: branch straight to the final ret, skipping the
+      // enter()/leave() pair and everything in between.
+      cbzw(Rlen, nothing);
+
+      enter();
+
+      // Make room.
+      // Rlen has not been halved yet, so it is still the length in
+      // ints.  The area carved out below sp is
+      // Rlen << exact_log2(4 * sizeof (jint)) bytes; lengths above 512
+      // are sent to argh instead.
+      cmpw(Rlen, 512);
+      br(Assembler::HI, argh);
+      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
+      // Clear the low bits so that sp is a multiple of 2 * wordSize.
+      andr(sp, Ra, -2 * wordSize);
+
+      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
+
+      {
+        // Copy input args, reversing as we go.  We use Ra as a
+        // temporary variable.
+        reverse(Ra, Pa_base, Rlen, t0, t1);
+        if (!_squaring)
+          reverse(Ra, Pb_base, Rlen, t0, t1);
+        reverse(Ra, Pn_base, Rlen, t0, t1);
+      }
+
+      // Push all call-saved registers and also Pm_base which we'll need
+      // at the end.
+      save_regs();
+
+#ifndef PRODUCT
+      // Debug-only check (compiled out when PRODUCT is defined).
+      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+      {
+        ldr(Rn, Address(Pn_base, 0));
+        mul(Rlo_mn, Rn, inv);
+        cmp(Rlo_mn, -1);
+        Label ok;
+        br(EQ, ok); {
+          stop("broken inverse in Montgomery multiply");
+        } bind(ok);
+      }
+#endif
+
+      // Pm_base takes over the value left in Ra; the caller's Pm_base
+      // was pushed by save_regs() and comes back in restore_regs().
+      mov(Pm_base, Ra);
+
+      // Clear the triple-precision accumulator.
+      mov(t0, zr);
+      mov(t1, zr);
+      mov(t2, zr);
+
+      block_comment("for (int i = 0; i < len; i++) {");
+      mov(Ri, zr); {
+        Label loop, end;
+        // Guard the first iteration; the loop itself tests at the
+        // bottom.
+        cmpw(Ri, Rlen);
+        br(Assembler::GE, end);
+
+        bind(loop);
+        pre1(Ri);
+
+        block_comment(" for (j = i; j; j--) {"); {
+          movw(Rj, Ri);
+          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
+        } block_comment(" } // j");
+
+        post1();
+        addw(Ri, Ri, 1);
+        cmpw(Ri, Rlen);
+        br(Assembler::LT, loop);
+        bind(end);
+        block_comment("} // i");
+      }
+
+      block_comment("for (int i = len; i < 2*len; i++) {");
+      mov(Ri, Rlen); {
+        Label loop, end;
+        // Rlen shifted left by one is the 2*len bound of this loop.
+        cmpw(Ri, Rlen, Assembler::LSL, 1);
+        br(Assembler::GE, end);
+
+        bind(loop);
+        pre2(Ri, Rlen);
+
+        block_comment(" for (j = len*2-i-1; j; j--) {"); {
+          lslw(Rj, Rlen, 1);
+          subw(Rj, Rj, Ri);
+          subw(Rj, Rj, 1);
+          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
+        } block_comment(" } // j");
+
+        post2(Ri, Rlen);
+        addw(Ri, Ri, 1);
+        cmpw(Ri, Rlen, Assembler::LSL, 1);
+        br(Assembler::LT, loop);
+        bind(end);
+      }
+      block_comment("} // i");
+
+      normalize(Rlen);
+
+      mov(Ra, Pm_base);  // Save Pm_base in Ra
+      restore_regs();  // Restore caller's Pm_base
+
+      // Copy our result into caller's Pm_base
+      reverse(Pm_base, Ra, Rlen, t0, t1);
+
+      leave();
+      // Target of the zero-length exit taken before enter().
+      bind(nothing);
+      ret(lr);
+
+      return entry;
+    }
+    // In C, approximately:
+
+    // void
+    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
+    //                     unsigned long Pn_base[], unsigned long Pm_base[],
+    //                     unsigned long inv, int len) {
+    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+    //   unsigned long *Pa, *Pb, *Pn, *Pm;
+    //   unsigned long Ra, Rb, Rn, Rm;
+
+    //   int i;
+
+    //   assert(inv * Pn_base[0] == -1UL, "broken
inverse in Montgomery multiply"); + + // for (i = 0; i < len; i++) { + // int j; + + // Pa = Pa_base; + // Pb = Pb_base + i; + // Pm = Pm_base; + // Pn = Pn_base + i; + + // Ra = *Pa; + // Rb = *Pb; + // Rm = *Pm; + // Rn = *Pn; + + // int iters = i; + // for (j = 0; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + + // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); + // MACC(Ra, Rb, t0, t1, t2); + // *Pm = Rm = t0 * inv; + // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + + // assert(t0 == 0, "broken Montgomery multiply"); + + // t0 = t1; t1 = t2; t2 = 0; + // } + + // for (i = len; i < 2*len; i++) { + // int j; + + // Pa = Pa_base + i-len; + // Pb = Pb_base + len; + // Pm = Pm_base + i-len; + // Pn = Pn_base + len; + + // Ra = *++Pa; + // Rb = *--Pb; + // Rm = *++Pm; + // Rn = *--Pn; + + // int iters = len*2-i-1; + // for (j = i-len+1; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + + // Pm_base[i-len] = t0; + // t0 = t1; t1 = t2; t2 = 0; + // } + + // while (t0) + // t0 = sub(Pm_base, Pn_base, t0, len); + // } + + /** + * Fast Montgomery squaring. This uses asymptotically 25% fewer + * multiplies than Montgomery multiplication so it should be up to + * 25% faster. However, its loop control is more complex and it + * may actually run slower on some machines. 
+     *
+     * Arguments:
+     *
+     * Inputs:
+     *   c_rarg0   - int array elements a
+     *   c_rarg1   - int array elements n (the modulus)
+     *   c_rarg2   - int length
+     *   c_rarg3   - int inv
+     *   c_rarg4   - int array elements m (the result)
+     *
+     * Returns the address of the generated stub's entry point.
+     *
+     * NOTE(review): unlike generate_multiply() there is no early
+     * return for a zero length here; confirm that reverse() and
+     * normalize() tolerate Rlen == 0 before relying on this path.
+     *
+     */
+    address generate_square() {
+      // Failure path for an over-long input.  It is emitted ahead of
+      // the aligned entry point and is reached from the size check
+      // below.
+      Label argh;
+      bind(argh);
+      stop("MontgomeryMultiply total_allocation must be <= 8192");
+
+      align(CodeEntryAlignment);
+      address entry = pc();
+
+      enter();
+
+      // Make room.
+      // Rlen has not been halved yet, so it is still the length in
+      // ints.  The area carved out below sp is
+      // Rlen << exact_log2(4 * sizeof (jint)) bytes; lengths above 512
+      // are sent to argh instead.
+      cmpw(Rlen, 512);
+      br(Assembler::HI, argh);
+      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
+      // Clear the low bits so that sp is a multiple of 2 * wordSize.
+      andr(sp, Ra, -2 * wordSize);
+
+      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
+
+      {
+        // Copy input args, reversing as we go.  We use Ra as a
+        // temporary variable.
+        reverse(Ra, Pa_base, Rlen, t0, t1);
+        reverse(Ra, Pn_base, Rlen, t0, t1);
+      }
+
+      // Push all call-saved registers and also Pm_base which we'll need
+      // at the end.
+      save_regs();
+
+      // Pm_base takes over the value left in Ra; the caller's Pm_base
+      // was pushed by save_regs() and comes back in restore_regs().
+      mov(Pm_base, Ra);
+
+      // Clear the triple-precision accumulator.
+      mov(t0, zr);
+      mov(t1, zr);
+      mov(t2, zr);
+
+      block_comment("for (int i = 0; i < len; i++) {");
+      mov(Ri, zr); {
+        Label loop, end;
+        // The loop label is bound ahead of the bounds check, so the
+        // branch back at the bottom re-runs this check as well.
+        bind(loop);
+        cmp(Ri, Rlen);
+        br(Assembler::GE, end);
+
+        pre1(Ri);
+
+        block_comment("for (j = (i+1)/2; j; j--) {"); {
+          add(Rj, Ri, 1);
+          lsr(Rj, Rj, 1);
+          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
+        } block_comment(" } // j");
+
+        last_squaring(Ri);
+
+        block_comment(" for (j = i/2; j; j--) {"); {
+          lsr(Rj, Ri, 1);
+          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
+        } block_comment(" } // j");
+
+        post1_squaring();
+        add(Ri, Ri, 1);
+        cmp(Ri, Rlen);
+        br(Assembler::LT, loop);
+
+        bind(end);
+        block_comment("} // i");
+      }
+
+      block_comment("for (int i = len; i < 2*len; i++) {");
+      mov(Ri, Rlen); {
+        Label loop, end;
+        // As above, the label precedes the check.  Rlen shifted left
+        // by one is the 2*len bound of this loop.
+        bind(loop);
+        cmp(Ri, Rlen, Assembler::LSL, 1);
+        br(Assembler::GE, end);
+
+        pre2(Ri, Rlen);
+
+        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
+          lsl(Rj, Rlen, 1);
+          sub(Rj, Rj, Ri);
+          sub(Rj, Rj, 1);
+          lsr(Rj, Rj, 1);
+          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
+        } block_comment(" } // j");
+
+        last_squaring(Ri);
+
+        block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
+          lsl(Rj, Rlen, 1);
+          sub(Rj, Rj, Ri);
+          lsr(Rj, Rj, 1);
+          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
+        } block_comment(" } // j");
+
+        post2(Ri, Rlen);
+        add(Ri, Ri, 1);
+        cmp(Ri, Rlen, Assembler::LSL, 1);
+
+        br(Assembler::LT, loop);
+        bind(end);
+        block_comment("} // i");
+      }
+
+      normalize(Rlen);
+
+      mov(Ra, Pm_base);  // Save Pm_base in Ra
+      restore_regs();  // Restore caller's Pm_base
+
+      // Copy our result into caller's Pm_base
+      reverse(Pm_base, Ra, Rlen, t0, t1);
+
+      leave();
+      ret(lr);
+
+      return entry;
+    }
+    // In C, approximately:
+
+    // void
+    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
+    //                   unsigned long Pm_base[], unsigned long inv, int len) {
+    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+    //   unsigned long *Pa, *Pb, *Pn, *Pm;
+    //   unsigned long Ra, Rb, Rn, Rm;
+
+    //   int i;
+
+    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
+
+    //   for (i = 0; i < len; i++) {
+    //     int j;
+
+    //     Pa = Pa_base;
+    //     Pb = Pa_base + i;
+    //     Pm = Pm_base;
+    //     Pn = Pn_base + i;
+
+    //     Ra = *Pa;
+    //     Rb = *Pb;
+    //     Rm = *Pm;
+    //     Rn = *Pn;
+
+    //     int iters = (i+1)/2;
+    //     for (j = 0; iters--; j++) {
+    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
+    //       MACC2(Ra, Rb, t0, t1, t2);
+    //       Ra = *++Pa;
+    //       Rb = *--Pb;
+    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
+    //       MACC(Rm, Rn, t0, t1, t2);
+    //       Rm = *++Pm;
+    //       Rn = *--Pn;
+    //     }
+    //     if ((i & 1) == 0) {
+    //       assert(Ra == Pa_base[j], "must be");
+    //       MACC(Ra, Ra, t0, t1, t2);
+    //     }
+    //     iters = i/2;
+    //     assert(iters == i-j, "must be");
+    //     for (; iters--; j++) {
+    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
+    //       MACC(Rm, Rn, t0, t1, t2);
+    //       Rm = *++Pm;
+    //       Rn = *--Pn;
+    //     }
+
+    //     *Pm = Rm = t0 * inv;
+    //
assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + + // assert(t0 == 0, "broken Montgomery multiply"); + + // t0 = t1; t1 = t2; t2 = 0; + // } + + // for (i = len; i < 2*len; i++) { + // int start = i-len+1; + // int end = start + (len - start)/2; + // int j; + + // Pa = Pa_base + i-len; + // Pb = Pa_base + len; + // Pm = Pm_base + i-len; + // Pn = Pn_base + len; + + // Ra = *++Pa; + // Rb = *--Pb; + // Rm = *++Pm; + // Rn = *--Pn; + + // int iters = (2*len-i-1)/2; + // assert(iters == end-start, "must be"); + // for (j = start; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); + // MACC2(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + // if ((i & 1) == 0) { + // assert(Ra == Pa_base[j], "must be"); + // MACC(Ra, Ra, t0, t1, t2); + // } + // iters = (2*len-i)/2; + // assert(iters == len-j, "must be"); + // for (; iters--; j++) { + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + // Pm_base[i-len] = t0; + // t0 = t1; t1 = t2; t2 = 0; + // } + + // while (t0) + // t0 = sub(Pm_base, Pn_base, t0, len); + // } + }; + // Initialization void generate_initial() { // Generate initial stubs and initializes the entry points @@ -2603,6 +3423,20 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_multiplyToLen = generate_multiplyToLen(); } + if (UseMontgomeryMultiplyIntrinsic) { + StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); + MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); + StubRoutines::_montgomeryMultiply = g.generate_multiply(); + } + + if (UseMontgomerySquareIntrinsic) { + StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); + MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); + // We use generate_multiply() rather than 
generate_square() + // because it's faster for the sizes of modulus we care about. + StubRoutines::_montgomerySquare = g.generate_multiply(); + } + #ifndef BUILTIN_SIM if (UseAESIntrinsics) { StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); diff --git a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp index 02591e639ed..76277df495b 100644 --- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp @@ -261,6 +261,13 @@ void VM_Version::get_processor_features() { UsePopCountInstruction = true; } + if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) { + UseMontgomeryMultiplyIntrinsic = true; + } + if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) { + UseMontgomerySquareIntrinsic = true; + } + #ifdef COMPILER2 if (FLAG_IS_DEFAULT(OptoScheduling)) { OptoScheduling = true; From 88a6ccaaa298a7877b786726882c73f6b394063e Mon Sep 17 00:00:00 2001 From: Aleksey Shipilev Date: Fri, 24 Jul 2015 21:29:11 -0400 Subject: [PATCH 3/3] 8131782: C1 Class.cast optimization breaks when Class is loaded from static final Change as_ValueType() to return InstanceConstant when appropriate Reviewed-by: jrose --- hotspot/src/share/vm/c1/c1_ValueType.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/hotspot/src/share/vm/c1/c1_ValueType.cpp b/hotspot/src/share/vm/c1/c1_ValueType.cpp index 0aebd036a78..5f86a8b9309 100644 --- a/hotspot/src/share/vm/c1/c1_ValueType.cpp +++ b/hotspot/src/share/vm/c1/c1_ValueType.cpp @@ -153,7 +153,19 @@ ValueType* as_ValueType(ciConstant value) { case T_FLOAT : return new FloatConstant (value.as_float ()); case T_DOUBLE : return new DoubleConstant(value.as_double()); case T_ARRAY : // fall through (ciConstant doesn't have an array accessor) - case T_OBJECT : return new ObjectConstant(value.as_object()); + case T_OBJECT : { + // TODO: Common the code with GraphBuilder::load_constant? 
+ ciObject* obj = value.as_object(); + if (obj->is_null_object()) + return objectNull; + if (obj->is_loaded()) { + if (obj->is_array()) + return new ArrayConstant(obj->as_array()); + else if (obj->is_instance()) + return new InstanceConstant(obj->as_instance()); + } + return new ObjectConstant(obj); + } } ShouldNotReachHere(); return illegalType;