8167199: Add C2 SPARC intrinsic for BigInteger::multiplyToLen() method
Preliminary support for intrinsic multiplyToLen, including generalised version based on 'mpmul' instruction (when available). Reviewed-by: kvn, neliasso
This commit is contained in:
parent 1af1d42ac4
commit b125aebb91
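For orientation: all four kernels added below compute the textbook ("schoolbook") product that BigInteger::multiplyToLen() performs over big-endian arrays of 32-bit limbs. A plain C++ reference sketch of that contract (illustrative only; the function name and the zero-initialisation convention are assumptions, not part of this commit):

```cpp
#include <cstdint>

// z must have room for xlen + ylen limbs; limbs are big-endian (most
// significant first), matching the layout the stubs below operate on.
static void multiply_to_len(const uint32_t* x, int xlen,
                            const uint32_t* y, int ylen, uint32_t* z) {
  for (int k = 0; k < xlen + ylen; k++) z[k] = 0;
  for (int j = ylen - 1; j >= 0; j--) {
    uint64_t carry = 0;
    for (int i = xlen - 1, k = i + j + 1; i >= 0; i--, k--) {
      uint64_t p = (uint64_t)x[i] * y[j] + z[k] + carry;
      z[k]  = (uint32_t)p;   // low 32 bits stay in column k
      carry = p >> 32;       // high 32 bits carry into column k-1
    }
    z[j] = (uint32_t)carry;  // final carry of row j
  }
}
```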
@@ -122,6 +122,7 @@ class Assembler : public AbstractAssembler {
fpop1_op3 = 0x34,
fpop2_op3 = 0x35,
impdep1_op3 = 0x36,
addx_op3 = 0x36,
aes3_op3 = 0x36,
sha_op3 = 0x36,
bmask_op3 = 0x36,

@@ -133,6 +134,8 @@ class Assembler : public AbstractAssembler {
fzero_op3 = 0x36,
fsrc_op3 = 0x36,
fnot_op3 = 0x36,
mpmul_op3 = 0x36,
umulx_op3 = 0x36,
xmulx_op3 = 0x36,
crc32c_op3 = 0x36,
impdep2_op3 = 0x37,

@@ -195,6 +198,9 @@ class Assembler : public AbstractAssembler {
fnegs_opf = 0x05,
fnegd_opf = 0x06,

addxc_opf = 0x11,
addxccc_opf = 0x13,
umulxhi_opf = 0x16,
alignaddr_opf = 0x18,
bmask_opf = 0x19,

@@ -240,7 +246,8 @@ class Assembler : public AbstractAssembler {
sha256_opf = 0x142,
sha512_opf = 0x143,

crc32c_opf = 0x147
crc32c_opf = 0x147,
mpmul_opf = 0x148
};

enum op5s {

@@ -380,7 +387,7 @@ class Assembler : public AbstractAssembler {
assert_signed_range(x, nbits + 2);
}

static void assert_unsigned_const(int x, int nbits) {
static void assert_unsigned_range(int x, int nbits) {
assert(juint(x) < juint(1 << nbits), "unsigned constant out of range");
}

@@ -534,6 +541,12 @@ class Assembler : public AbstractAssembler {
return x & ((1 << nbits) - 1);
}

// unsigned immediate, in low bits, at most nbits long.
static int uimm(int x, int nbits) {
assert_unsigned_range(x, nbits);
return x & ((1 << nbits) - 1);
}

// compute inverse of wdisp16
static intptr_t inv_wdisp16(int x, intptr_t pos) {
int lo = x & ((1 << 14) - 1);
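The new uimm() encoder above complements the existing signed-immediate helper: the operand is range-checked, then masked into the low nbits of the instruction word. Its first user is mpmul's 5-bit size field. A standalone behavioural sketch (assumed code, not HotSpot's):

```cpp
#include <cassert>
#include <cstdint>

static uint32_t uimm(uint32_t x, int nbits) {
  assert(x < (1u << nbits) && "unsigned constant out of range");
  return x & ((1u << nbits) - 1);  // keep only the low nbits
}

int main() {
  // mpmul's size field is 5 bits wide, so legal operands are 0..31.
  return uimm(31, 5) == 0x1F ? 0 : 1;
}
```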
@@ -631,6 +644,9 @@ class Assembler : public AbstractAssembler {
// FMAf instructions supported only on certain processors
static void fmaf_only() { assert(VM_Version::has_fmaf(), "This instruction only works on SPARC with FMAf"); }

// MPMUL instruction supported only on certain processors
static void mpmul_only() { assert(VM_Version::has_mpmul(), "This instruction only works on SPARC with MPMUL"); }

// instruction only in VIS1
static void vis1_only() { assert(VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); }

@@ -772,11 +788,12 @@ class Assembler : public AbstractAssembler {
AbstractAssembler::flush();
}

inline void emit_int32(int); // shadows AbstractAssembler::emit_int32
inline void emit_data(int);
inline void emit_data(int, RelocationHolder const &rspec);
inline void emit_data(int, relocInfo::relocType rtype);
// helper for above functions
inline void emit_int32(int32_t); // shadows AbstractAssembler::emit_int32
inline void emit_data(int32_t);
inline void emit_data(int32_t, RelocationHolder const&);
inline void emit_data(int32_t, relocInfo::relocType rtype);

// Helper for the above functions.
inline void check_delay();

@@ -960,6 +977,8 @@ class Assembler : public AbstractAssembler {
inline void ldf(FloatRegisterImpl::Width w, Register s1, int simm13a, FloatRegister d,
RelocationHolder const &rspec = RelocationHolder());

inline void ldd(Register s1, Register s2, FloatRegister d);
inline void ldd(Register s1, int simm13a, FloatRegister d);

inline void ldfsr(Register s1, Register s2);
inline void ldfsr(Register s1, int simm13a);

@@ -987,8 +1006,6 @@ class Assembler : public AbstractAssembler {
inline void lduw(Register s1, int simm13a, Register d);
inline void ldx(Register s1, Register s2, Register d);
inline void ldx(Register s1, int simm13a, Register d);
inline void ldd(Register s1, Register s2, Register d);
inline void ldd(Register s1, int simm13a, Register d);

// pp 177

@@ -1157,6 +1174,9 @@ class Assembler : public AbstractAssembler {
inline void stf(FloatRegisterImpl::Width w, FloatRegister d, Register s1, Register s2);
inline void stf(FloatRegisterImpl::Width w, FloatRegister d, Register s1, int simm13a);

inline void std(FloatRegister d, Register s1, Register s2);
inline void std(FloatRegister d, Register s1, int simm13a);

inline void stfsr(Register s1, Register s2);
inline void stfsr(Register s1, int simm13a);
inline void stxfsr(Register s1, Register s2);

@@ -1177,8 +1197,6 @@ class Assembler : public AbstractAssembler {
inline void stw(Register d, Register s1, int simm13a);
inline void stx(Register d, Register s1, Register s2);
inline void stx(Register d, Register s1, int simm13a);
inline void std(Register d, Register s1, Register s2);
inline void std(Register d, Register s1, int simm13a);

// pp 177

@@ -1267,6 +1285,9 @@ class Assembler : public AbstractAssembler {

// VIS3 instructions

inline void addxc(Register s1, Register s2, Register d);
inline void addxccc(Register s1, Register s2, Register d);

inline void movstosw(FloatRegister s, Register d);
inline void movstouw(FloatRegister s, Register d);
inline void movdtox(FloatRegister s, Register d);

@@ -1276,6 +1297,7 @@ class Assembler : public AbstractAssembler {

inline void xmulx(Register s1, Register s2, Register d);
inline void xmulxhi(Register s1, Register s2, Register d);
inline void umulxhi(Register s1, Register s2, Register d);

// Crypto SHA instructions

@@ -1287,6 +1309,10 @@ class Assembler : public AbstractAssembler {

inline void crc32c(FloatRegister s1, FloatRegister s2, FloatRegister d);

// MPMUL instruction

inline void mpmul(int uimm5);

// Creation
Assembler(CodeBuffer* code) : AbstractAssembler(code) {
#ifdef VALIDATE_PIPELINE
@@ -59,7 +59,7 @@ inline void Assembler::check_delay() {
#endif
}

inline void Assembler::emit_int32(int x) {
inline void Assembler::emit_int32(int32_t x) {
check_delay();
#ifdef VALIDATE_PIPELINE
_hazard_state = NoHazard;

@@ -67,16 +67,16 @@ inline void Assembler::emit_int32(int x) {
AbstractAssembler::emit_int32(x);
}

inline void Assembler::emit_data(int x) {
inline void Assembler::emit_data(int32_t x) {
emit_int32(x);
}

inline void Assembler::emit_data(int x, relocInfo::relocType rtype) {
inline void Assembler::emit_data(int32_t x, relocInfo::relocType rtype) {
relocate(rtype);
emit_int32(x);
}

inline void Assembler::emit_data(int x, RelocationHolder const &rspec) {
inline void Assembler::emit_data(int32_t x, RelocationHolder const &rspec) {
relocate(rspec);
emit_int32(x);
}

@@ -402,6 +402,15 @@ inline void Assembler::ldf(FloatRegisterImpl::Width w, Register s1, int simm13a,
emit_data(op(ldst_op) | fd(d, w) | alt_op3(ldf_op3, w) | rs1(s1) | immed(true) | simm(simm13a, 13), rspec);
}

inline void Assembler::ldd(Register s1, Register s2, FloatRegister d) {
assert(d->is_even(), "not even");
ldf(FloatRegisterImpl::D, s1, s2, d);
}
inline void Assembler::ldd(Register s1, int simm13a, FloatRegister d) {
assert(d->is_even(), "not even");
ldf(FloatRegisterImpl::D, s1, simm13a, d);
}

inline void Assembler::ldxfsr(Register s1, Register s2) {
emit_int32(op(ldst_op) | rd(G1) | op3(ldfsr_op3) | rs1(s1) | rs2(s2));
}

@@ -460,16 +469,6 @@ inline void Assembler::ldx(Register s1, Register s2, Register d) {
inline void Assembler::ldx(Register s1, int simm13a, Register d) {
emit_data(op(ldst_op) | rd(d) | op3(ldx_op3) | rs1(s1) | immed(true) | simm(simm13a, 13));
}
inline void Assembler::ldd(Register s1, Register s2, Register d) {
v9_dep();
assert(d->is_even(), "not even");
emit_int32(op(ldst_op) | rd(d) | op3(ldd_op3) | rs1(s1) | rs2(s2));
}
inline void Assembler::ldd(Register s1, int simm13a, Register d) {
v9_dep();
assert(d->is_even(), "not even");
emit_data(op(ldst_op) | rd(d) | op3(ldd_op3) | rs1(s1) | immed(true) | simm(simm13a, 13));
}

inline void Assembler::ldsba(Register s1, Register s2, int ia, Register d) {
emit_int32(op(ldst_op) | rd(d) | op3(ldsb_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2));

@@ -806,6 +805,15 @@ inline void Assembler::stf(FloatRegisterImpl::Width w, FloatRegister d, Register
emit_data(op(ldst_op) | fd(d, w) | alt_op3(stf_op3, w) | rs1(s1) | immed(true) | simm(simm13a, 13));
}

inline void Assembler::std(FloatRegister d, Register s1, Register s2) {
assert(d->is_even(), "not even");
stf(FloatRegisterImpl::D, d, s1, s2);
}
inline void Assembler::std(FloatRegister d, Register s1, int simm13a) {
assert(d->is_even(), "not even");
stf(FloatRegisterImpl::D, d, s1, simm13a);
}

inline void Assembler::stxfsr(Register s1, Register s2) {
emit_int32(op(ldst_op) | rd(G1) | op3(stfsr_op3) | rs1(s1) | rs2(s2));
}

@@ -848,16 +856,6 @@ inline void Assembler::stx(Register d, Register s1, Register s2) {
inline void Assembler::stx(Register d, Register s1, int simm13a) {
emit_data(op(ldst_op) | rd(d) | op3(stx_op3) | rs1(s1) | immed(true) | simm(simm13a, 13));
}
inline void Assembler::std(Register d, Register s1, Register s2) {
v9_dep();
assert(d->is_even(), "not even");
emit_int32(op(ldst_op) | rd(d) | op3(std_op3) | rs1(s1) | rs2(s2));
}
inline void Assembler::std(Register d, Register s1, int simm13a) {
v9_dep();
assert(d->is_even(), "not even");
emit_data(op(ldst_op) | rd(d) | op3(std_op3) | rs1(s1) | immed(true) | simm(simm13a, 13));
}

inline void Assembler::stba(Register d, Register s1, Register s2, int ia) {
emit_int32(op(ldst_op) | rd(d) | op3(stb_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2));

@@ -1043,6 +1041,15 @@ inline void Assembler::bshuffle(FloatRegister s1, FloatRegister s2, FloatRegiste

// VIS3 instructions

inline void Assembler::addxc(Register s1, Register s2, Register d) {
vis3_only();
emit_int32(op(arith_op) | rd(d) | op3(addx_op3) | rs1(s1) | opf(addxc_opf) | rs2(s2));
}
inline void Assembler::addxccc(Register s1, Register s2, Register d) {
vis3_only();
emit_int32(op(arith_op) | rd(d) | op3(addx_op3) | rs1(s1) | opf(addxccc_opf) | rs2(s2));
}

inline void Assembler::movstosw(FloatRegister s, Register d) {
vis3_only();
emit_int32(op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstosw_opf) | fs2(s, FloatRegisterImpl::S));

@@ -1073,6 +1080,10 @@ inline void Assembler::xmulxhi(Register s1, Register s2, Register d) {
vis3_only();
emit_int32(op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2));
}
inline void Assembler::umulxhi(Register s1, Register s2, Register d) {
vis3_only();
emit_int32(op(arith_op) | rd(d) | op3(umulx_op3) | rs1(s1) | opf(umulxhi_opf) | rs2(s2));
}

// Crypto SHA instructions

@@ -1096,4 +1107,11 @@ inline void Assembler::crc32c(FloatRegister s1, FloatRegister s2, FloatRegister
emit_int32(op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(crc32c_op3) | fs1(s1, FloatRegisterImpl::D) | opf(crc32c_opf) | fs2(s2, FloatRegisterImpl::D));
}

// MPMUL instruction

inline void Assembler::mpmul(int uimm5) {
mpmul_only();
emit_int32(op(arith_op) | rd(0) | op3(mpmul_op3) | rs1(0) | opf(mpmul_opf) | uimm(uimm5, 5));
}

#endif // CPU_SPARC_VM_ASSEMBLER_SPARC_INLINE_HPP
@@ -97,12 +97,15 @@ define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
writeable) \
\
product(intx, UseVIS, 99, \
"Highest supported VIS instructions set on Sparc") \
"Highest supported VIS instructions set on SPARC") \
range(0, 99) \
\
product(bool, UseCBCond, false, \
"Use compare and branch instruction on SPARC") \
\
product(bool, UseMPMUL, false, \
"Use multi-precision multiply instruction (mpmul) on SPARC") \
\
product(bool, UseBlockZeroing, false, \
"Use special cpu instructions for block zeroing") \
\
@@ -1574,29 +1574,39 @@ void MacroAssembler::br_null_short(Register s1, Predict p, Label& L) {
assert_not_delayed();
if (use_cbcond(L)) {
Assembler::cbcond(zero, ptr_cc, s1, 0, L);
return;
} else {
br_null(s1, false, p, L);
delayed()->nop();
}
br_null(s1, false, p, L);
delayed()->nop();
}

void MacroAssembler::br_notnull_short(Register s1, Predict p, Label& L) {
assert_not_delayed();
if (use_cbcond(L)) {
Assembler::cbcond(notZero, ptr_cc, s1, 0, L);
return;
} else {
br_notnull(s1, false, p, L);
delayed()->nop();
}
br_notnull(s1, false, p, L);
delayed()->nop();
}

// Unconditional short branch
void MacroAssembler::ba_short(Label& L) {
assert_not_delayed();
if (use_cbcond(L)) {
Assembler::cbcond(equal, icc, G0, G0, L);
return;
} else {
br(always, false, pt, L);
delayed()->nop();
}
br(always, false, pt, L);
}

// Branch if 'icc' says zero or not (i.e. icc.z == 1|0).

void MacroAssembler::br_icc_zero(bool iszero, Predict p, Label &L) {
assert_not_delayed();
Condition cf = (iszero ? Assembler::zero : Assembler::notZero);
br(cf, false, p, L);
delayed()->nop();
}
@@ -606,7 +606,7 @@ class MacroAssembler : public Assembler {
// offset. No explicit code generation is needed if the offset is within a certain
// range (0 <= offset <= page_size).
//
// %%%%%% Currently not done for SPARC
// FIXME: Currently not done for SPARC

void null_check(Register reg, int offset = -1);
static bool needs_explicit_null_check(intptr_t offset);

@@ -648,6 +648,9 @@ class MacroAssembler : public Assembler {
// unconditional short branch
void ba_short(Label& L);

// Branch on icc.z (true or not).
void br_icc_zero(bool iszero, Predict p, Label &L);

inline void bp( Condition c, bool a, CC cc, Predict p, address d, relocInfo::relocType rt = relocInfo::none );
inline void bp( Condition c, bool a, CC cc, Predict p, Label& L );

@@ -663,19 +666,19 @@ class MacroAssembler : public Assembler {
inline void fbp( Condition c, bool a, CC cc, Predict p, Label& L );

// Sparc shorthands(pp 85, V8 manual, pp 289 V9 manual)
inline void cmp( Register s1, Register s2 );
inline void cmp( Register s1, int simm13a );
inline void cmp( Register s1, Register s2 );
inline void cmp( Register s1, int simm13a );

inline void jmp( Register s1, Register s2 );
inline void jmp( Register s1, int simm13a, RelocationHolder const& rspec = RelocationHolder() );

// Check if the call target is out of wdisp30 range (relative to the code cache)
static inline bool is_far_target(address d);
inline void call( address d, relocInfo::relocType rt = relocInfo::runtime_call_type );
inline void call( address d, RelocationHolder const& rspec);
inline void call( address d, relocInfo::relocType rt = relocInfo::runtime_call_type );
inline void call( address d, RelocationHolder const& rspec);

inline void call( Label& L, relocInfo::relocType rt = relocInfo::runtime_call_type );
inline void call( Label& L, RelocationHolder const& rspec);
inline void call( Label& L, relocInfo::relocType rt = relocInfo::runtime_call_type );
inline void call( Label& L, RelocationHolder const& rspec);

inline void callr( Register s1, Register s2 );
inline void callr( Register s1, int simm13a, RelocationHolder const& rspec = RelocationHolder() );
@@ -185,7 +185,7 @@ inline void MacroAssembler::br( Condition c, bool a, Predict p, address d, reloc
}

inline void MacroAssembler::br( Condition c, bool a, Predict p, Label& L ) {
// See note[+] on 'avoid_pipeline_stalls()', in "assembler_sparc.inline.hpp".
// See note[+] on 'avoid_pipeline_stall()', in "assembler_sparc.inline.hpp".
avoid_pipeline_stall();
br(c, a, p, target(L));
}
@@ -236,7 +236,7 @@ class FloatRegisterImpl: public AbstractRegisterImpl {
inline VMReg as_VMReg( );

// accessors
int encoding() const { assert(is_valid(), "invalid register"); return value(); }
int encoding() const { assert(is_valid(), "invalid register"); return value(); }

public:
int encoding(Width w) const {

@@ -258,10 +258,12 @@ class FloatRegisterImpl: public AbstractRegisterImpl {
return -1;
}

bool is_valid() const { return 0 <= value() && value() < number_of_registers; }
bool is_valid() const { return 0 <= value() && value() < number_of_registers; }
bool is_even() const { return (encoding() & 1) == 0; }

const char* name() const;

FloatRegister successor() const { return as_FloatRegister(encoding() + 1); }
FloatRegister successor() const { return as_FloatRegister(encoding() + 1); }
};
@@ -58,7 +58,6 @@
// Note: The register L7 is used as L7_thread_cache, and may not be used
// any other way within this module.

static const Register& Lstub_temp = L2;

// -------------------------------------------------------------------------------------------------------------------------

@@ -4943,7 +4942,7 @@ class StubGenerator: public StubCodeGenerator {
return start;
}

/**
/**
* Arguments:
*
* Inputs:

@@ -4975,6 +4974,773 @@ class StubGenerator: public StubCodeGenerator {
return start;
}

/**
* Arguments:
*
* Inputs:
* I0 - int* x-addr
* I1 - int x-len
* I2 - int* y-addr
* I3 - int y-len
* I4 - int* z-addr (output vector)
* I5 - int z-len
*/
address generate_multiplyToLen() {
assert(UseMultiplyToLenIntrinsic, "need VIS3 instructions");

__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
address start = __ pc();

__ save_frame(0);

const Register xptr = I0; // input address
const Register xlen = I1; // ...and length in 32b-words
const Register yptr = I2; //
const Register ylen = I3; //
const Register zptr = I4; // output address
const Register zlen = I5; // ...and length in 32b-words

/* The minimal "limb" representation suggests that odd length vectors are as
* likely as even length dittos. This in turn suggests that we need to cope
* with odd/even length arrays and data not aligned properly for 64-bit read
* and write operations. We thus use a number of different kernels:
*
* if (is_even(x.len) && is_even(y.len))
* if (is_align64(x) && is_align64(y) && is_align64(z))
* if (x.len == y.len && 16 <= x.len && x.len <= 64)
* memv_mult_mpmul(...)
* else
* memv_mult_64x64(...)
* else
* memv_mult_64x64u(...)
* else
* memv_mult_32x32(...)
*
* Here we assume VIS3 support (for 'umulxhi', 'addxc' and 'addxccc').
* In case CBCOND instructions are supported, we will use 'cxbX'. If the
* MPMUL instruction is supported, we will generate a kernel using 'mpmul'
* (for vectors with proper characteristics).
*/
const Register tmp0 = L0;
const Register tmp1 = L1;

Label L_mult_32x32;
Label L_mult_64x64u;
Label L_mult_64x64;
Label L_exit;

if_both_even(xlen, ylen, tmp0, false, L_mult_32x32);
if_all3_aligned(xptr, yptr, zptr, tmp1, 64, false, L_mult_64x64u);

if (UseMPMUL) {
if_eq(xlen, ylen, false, L_mult_64x64);
if_in_rng(xlen, 16, 64, tmp0, tmp1, false, L_mult_64x64);

// 1. Multiply naturally aligned 64b-datums using a generic 'mpmul' kernel,
// operating on equal length vectors of size [16..64].
gen_mult_mpmul(xlen, xptr, yptr, zptr, L_exit);
}

// 2. Multiply naturally aligned 64-bit datums (64x64).
__ bind(L_mult_64x64);
gen_mult_64x64(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);

// 3. Multiply unaligned 64-bit datums (64x64).
__ bind(L_mult_64x64u);
gen_mult_64x64_unaligned(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);

// 4. Multiply naturally aligned 32-bit datums (32x32).
__ bind(L_mult_32x32);
gen_mult_32x32(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);

__ bind(L_exit);
__ ret();
__ delayed()->restore();

return start;
}

// Additional help functions used by multiplyToLen generation.

void if_both_even(Register r1, Register r2, Register tmp, bool iseven, Label &L)
{
__ or3(r1, r2, tmp);
__ andcc(tmp, 0x1, tmp);
__ br_icc_zero(iseven, Assembler::pn, L);
}

void if_all3_aligned(Register r1, Register r2, Register r3,
Register tmp, uint align, bool isalign, Label &L)
{
__ or3(r1, r2, tmp);
__ or3(r3, tmp, tmp);
__ andcc(tmp, (align - 1), tmp);
__ br_icc_zero(isalign, Assembler::pn, L);
}
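Both helpers above test several registers with a single AND by first OR-ing them into a scratch register: the lengths are jointly even, or the three pointers jointly aligned, exactly when the OR of the values has the relevant low bits clear. The same predicates in plain C++ (an illustrative rendering, not part of the commit):

```cpp
#include <cstdint>

static bool both_even(uint64_t a, uint64_t b) {
  return ((a | b) & 0x1) == 0;               // low bit clear in both
}

static bool all3_aligned(uintptr_t p, uintptr_t q, uintptr_t r,
                         uintptr_t align) {  // align: a power of two
  return ((p | q | r) & (align - 1)) == 0;   // no stray low bits anywhere
}
```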

void if_eq(Register x, Register y, bool iseq, Label &L)
{
Assembler::Condition cf = (iseq ? Assembler::equal : Assembler::notEqual);
__ cmp_and_br_short(x, y, cf, Assembler::pt, L);
}

void if_in_rng(Register x, int lb, int ub, Register t1, Register t2, bool inrng, Label &L)
{
assert(Assembler::is_simm13(lb), "Small ints only!");
assert(Assembler::is_simm13(ub), "Small ints only!");
// Compute (x - lb) * (ub - x) >= 0
// NOTE: With the local use of this routine, we rely on small integers to
// guarantee that we do not overflow in the multiplication.
__ add(G0, ub, t2);
__ sub(x, lb, t1);
__ sub(t2, x, t2);
__ mulx(t1, t2, t1);
Assembler::Condition cf = (inrng ? Assembler::greaterEqual : Assembler::less);
__ cmp_and_br_short(t1, G0, cf, Assembler::pt, L);
}
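if_in_rng() encodes the two-sided bounds check as one signed multiply: assuming the operands are small enough that the product cannot overflow, lb <= x && x <= ub holds exactly when (x - lb) * (ub - x) >= 0. A C++ statement of the same trick (illustrative):

```cpp
#include <cstdint>

static bool in_rng(int64_t x, int64_t lb, int64_t ub) {
  // Both factors are non-negative iff lb <= x <= ub; otherwise exactly one
  // factor is negative, making the product negative.
  return (x - lb) * (ub - x) >= 0;
}

// Example: in_rng(16, 16, 64) and in_rng(64, 16, 64) hold, while
// in_rng(65, 16, 64) fails since (65 - 16) * (64 - 65) = -49.
```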

void ldd_entry(Register base, Register offs, FloatRegister dest)
{
__ ldd(base, offs, dest);
__ inc(offs, 8);
}

void ldx_entry(Register base, Register offs, Register dest)
{
__ ldx(base, offs, dest);
__ inc(offs, 8);
}

void mpmul_entry(int m, Label &next)
{
__ mpmul(m);
__ cbcond(Assembler::equal, Assembler::icc, G0, G0, next);
}

void stx_entry(Label &L, Register r1, Register r2, Register base, Register offs)
{
__ bind(L);
__ stx(r1, base, offs);
__ inc(offs, 8);
__ stx(r2, base, offs);
__ inc(offs, 8);
}

void offs_entry(Label &Lbl0, Label &Lbl1)
{
assert(Lbl0.is_bound(), "must be");
assert(Lbl1.is_bound(), "must be");

int offset = Lbl0.loc_pos() - Lbl1.loc_pos();

__ emit_data(offset);
}

/* Generate the actual multiplication kernels for BigInteger vectors:
*
* 1. gen_mult_mpmul(...)
*
* 2. gen_mult_64x64(...)
*
* 3. gen_mult_64x64_unaligned(...)
*
* 4. gen_mult_32x32(...)
*/
void gen_mult_mpmul(Register len, Register xptr, Register yptr, Register zptr,
Label &L_exit)
{
const Register zero = G0;
const Register gxp = G1; // Need to use global registers across RWs.
const Register gyp = G2;
const Register gzp = G3;
const Register offs = G4;
const Register disp = G5;

__ mov(xptr, gxp);
__ mov(yptr, gyp);
__ mov(zptr, gzp);

/* Compute jump vector entry:
*
* 1. mpmul input size (0..31) x 64b
* 2. vector input size in 32b limbs (even number)
* 3. branch entries in reverse order (31..0), using two
* instructions per entry (2 * 4 bytes).
*
* displacement = byte_offset(bra_offset(len))
* = byte_offset((64 - len)/2)
* = 8 * (64 - len)/2
* = 4 * (64 - len)
*/
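The displacement arithmetic in the comment above, restated as checkable C++ (illustrative): each load-table entry below is two 4-byte instructions, entries are laid out in reverse order, and 'len' counts 32-bit limbs, so len/2 of the 32 entries must execute.

```cpp
#include <cassert>

static int dispatch_displacement(int len_u32) {
  int entries_to_skip = (64 - len_u32) / 2;  // of the 32 eight-byte entries
  return 8 * entries_to_skip;                // == 4 * (64 - len_u32) bytes
}

int main() {
  assert(dispatch_displacement(64) == 0);    // longest input: run all entries
  assert(dispatch_displacement(16) == 192);  // shortest mpmul case: skip 24
  return 0;
}
```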
Register temp = I5; // Alright to use input regs. in first batch.

__ sub(zero, len, temp);
__ add(temp, 64, temp);
__ sllx(temp, 2, disp); // disp := (64 - len) << 2

// Dispatch relative current PC, into instruction table below.
__ rdpc(temp);
__ add(temp, 16, temp);
__ jmp(temp, disp);
__ delayed()->clr(offs);

ldd_entry(gxp, offs, F22);
ldd_entry(gxp, offs, F20);
ldd_entry(gxp, offs, F18);
ldd_entry(gxp, offs, F16);
ldd_entry(gxp, offs, F14);
ldd_entry(gxp, offs, F12);
ldd_entry(gxp, offs, F10);
ldd_entry(gxp, offs, F8);
ldd_entry(gxp, offs, F6);
ldd_entry(gxp, offs, F4);
ldx_entry(gxp, offs, I5);
ldx_entry(gxp, offs, I4);
ldx_entry(gxp, offs, I3);
ldx_entry(gxp, offs, I2);
ldx_entry(gxp, offs, I1);
ldx_entry(gxp, offs, I0);
ldx_entry(gxp, offs, L7);
ldx_entry(gxp, offs, L6);
ldx_entry(gxp, offs, L5);
ldx_entry(gxp, offs, L4);
ldx_entry(gxp, offs, L3);
ldx_entry(gxp, offs, L2);
ldx_entry(gxp, offs, L1);
ldx_entry(gxp, offs, L0);
ldd_entry(gxp, offs, F2);
ldd_entry(gxp, offs, F0);
ldx_entry(gxp, offs, O5);
ldx_entry(gxp, offs, O4);
ldx_entry(gxp, offs, O3);
ldx_entry(gxp, offs, O2);
ldx_entry(gxp, offs, O1);
ldx_entry(gxp, offs, O0);

__ save(SP, -176, SP);

const Register addr = gxp; // Alright to reuse 'gxp'.

// Dispatch relative current PC, into instruction table below.
__ rdpc(addr);
__ add(addr, 16, addr);
__ jmp(addr, disp);
__ delayed()->clr(offs);

ldd_entry(gyp, offs, F58);
ldd_entry(gyp, offs, F56);
ldd_entry(gyp, offs, F54);
ldd_entry(gyp, offs, F52);
ldd_entry(gyp, offs, F50);
ldd_entry(gyp, offs, F48);
ldd_entry(gyp, offs, F46);
ldd_entry(gyp, offs, F44);
ldd_entry(gyp, offs, F42);
ldd_entry(gyp, offs, F40);
ldd_entry(gyp, offs, F38);
ldd_entry(gyp, offs, F36);
ldd_entry(gyp, offs, F34);
ldd_entry(gyp, offs, F32);
ldd_entry(gyp, offs, F30);
ldd_entry(gyp, offs, F28);
ldd_entry(gyp, offs, F26);
ldd_entry(gyp, offs, F24);
ldx_entry(gyp, offs, O5);
ldx_entry(gyp, offs, O4);
ldx_entry(gyp, offs, O3);
ldx_entry(gyp, offs, O2);
ldx_entry(gyp, offs, O1);
ldx_entry(gyp, offs, O0);
ldx_entry(gyp, offs, L7);
ldx_entry(gyp, offs, L6);
ldx_entry(gyp, offs, L5);
ldx_entry(gyp, offs, L4);
ldx_entry(gyp, offs, L3);
ldx_entry(gyp, offs, L2);
ldx_entry(gyp, offs, L1);
ldx_entry(gyp, offs, L0);

__ save(SP, -176, SP);
__ save(SP, -176, SP);
__ save(SP, -176, SP);
__ save(SP, -176, SP);
__ save(SP, -176, SP);

Label L_mpmul_restore_4, L_mpmul_restore_3, L_mpmul_restore_2;
Label L_mpmul_restore_1, L_mpmul_restore_0;

// Dispatch relative current PC, into instruction table below.
__ rdpc(addr);
__ add(addr, 16, addr);
__ jmp(addr, disp);
__ delayed()->clr(offs);

mpmul_entry(31, L_mpmul_restore_0);
mpmul_entry(30, L_mpmul_restore_0);
mpmul_entry(29, L_mpmul_restore_0);
mpmul_entry(28, L_mpmul_restore_0);
mpmul_entry(27, L_mpmul_restore_1);
mpmul_entry(26, L_mpmul_restore_1);
mpmul_entry(25, L_mpmul_restore_1);
mpmul_entry(24, L_mpmul_restore_1);
mpmul_entry(23, L_mpmul_restore_1);
mpmul_entry(22, L_mpmul_restore_1);
mpmul_entry(21, L_mpmul_restore_1);
mpmul_entry(20, L_mpmul_restore_2);
mpmul_entry(19, L_mpmul_restore_2);
mpmul_entry(18, L_mpmul_restore_2);
mpmul_entry(17, L_mpmul_restore_2);
mpmul_entry(16, L_mpmul_restore_2);
mpmul_entry(15, L_mpmul_restore_2);
mpmul_entry(14, L_mpmul_restore_2);
mpmul_entry(13, L_mpmul_restore_3);
mpmul_entry(12, L_mpmul_restore_3);
mpmul_entry(11, L_mpmul_restore_3);
mpmul_entry(10, L_mpmul_restore_3);
mpmul_entry( 9, L_mpmul_restore_3);
mpmul_entry( 8, L_mpmul_restore_3);
mpmul_entry( 7, L_mpmul_restore_3);
mpmul_entry( 6, L_mpmul_restore_4);
mpmul_entry( 5, L_mpmul_restore_4);
mpmul_entry( 4, L_mpmul_restore_4);
mpmul_entry( 3, L_mpmul_restore_4);
mpmul_entry( 2, L_mpmul_restore_4);
mpmul_entry( 1, L_mpmul_restore_4);
mpmul_entry( 0, L_mpmul_restore_4);

Label L_z31, L_z30, L_z29, L_z28, L_z27, L_z26, L_z25, L_z24;
Label L_z23, L_z22, L_z21, L_z20, L_z19, L_z18, L_z17, L_z16;
Label L_z15, L_z14, L_z13, L_z12, L_z11, L_z10, L_z09, L_z08;
Label L_z07, L_z06, L_z05, L_z04, L_z03, L_z02, L_z01, L_z00;

Label L_zst_base; // Store sequence base address.
__ bind(L_zst_base);

stx_entry(L_z31, L7, L6, gzp, offs);
stx_entry(L_z30, L5, L4, gzp, offs);
stx_entry(L_z29, L3, L2, gzp, offs);
stx_entry(L_z28, L1, L0, gzp, offs);
__ restore();
stx_entry(L_z27, O5, O4, gzp, offs);
stx_entry(L_z26, O3, O2, gzp, offs);
stx_entry(L_z25, O1, O0, gzp, offs);
stx_entry(L_z24, L7, L6, gzp, offs);
stx_entry(L_z23, L5, L4, gzp, offs);
stx_entry(L_z22, L3, L2, gzp, offs);
stx_entry(L_z21, L1, L0, gzp, offs);
__ restore();
stx_entry(L_z20, O5, O4, gzp, offs);
stx_entry(L_z19, O3, O2, gzp, offs);
stx_entry(L_z18, O1, O0, gzp, offs);
stx_entry(L_z17, L7, L6, gzp, offs);
stx_entry(L_z16, L5, L4, gzp, offs);
stx_entry(L_z15, L3, L2, gzp, offs);
stx_entry(L_z14, L1, L0, gzp, offs);
__ restore();
stx_entry(L_z13, O5, O4, gzp, offs);
stx_entry(L_z12, O3, O2, gzp, offs);
stx_entry(L_z11, O1, O0, gzp, offs);
stx_entry(L_z10, L7, L6, gzp, offs);
stx_entry(L_z09, L5, L4, gzp, offs);
stx_entry(L_z08, L3, L2, gzp, offs);
stx_entry(L_z07, L1, L0, gzp, offs);
__ restore();
stx_entry(L_z06, O5, O4, gzp, offs);
stx_entry(L_z05, O3, O2, gzp, offs);
stx_entry(L_z04, O1, O0, gzp, offs);
stx_entry(L_z03, L7, L6, gzp, offs);
stx_entry(L_z02, L5, L4, gzp, offs);
stx_entry(L_z01, L3, L2, gzp, offs);
stx_entry(L_z00, L1, L0, gzp, offs);

__ restore();
__ restore();
// Exit out of 'mpmul' routine, back to multiplyToLen.
__ ba_short(L_exit);

Label L_zst_offs;
__ bind(L_zst_offs);

offs_entry(L_z31, L_zst_base); // index 31: 2048x2048
offs_entry(L_z30, L_zst_base);
offs_entry(L_z29, L_zst_base);
offs_entry(L_z28, L_zst_base);
offs_entry(L_z27, L_zst_base);
offs_entry(L_z26, L_zst_base);
offs_entry(L_z25, L_zst_base);
offs_entry(L_z24, L_zst_base);
offs_entry(L_z23, L_zst_base);
offs_entry(L_z22, L_zst_base);
offs_entry(L_z21, L_zst_base);
offs_entry(L_z20, L_zst_base);
offs_entry(L_z19, L_zst_base);
offs_entry(L_z18, L_zst_base);
offs_entry(L_z17, L_zst_base);
offs_entry(L_z16, L_zst_base);
offs_entry(L_z15, L_zst_base);
offs_entry(L_z14, L_zst_base);
offs_entry(L_z13, L_zst_base);
offs_entry(L_z12, L_zst_base);
offs_entry(L_z11, L_zst_base);
offs_entry(L_z10, L_zst_base);
offs_entry(L_z09, L_zst_base);
offs_entry(L_z08, L_zst_base);
offs_entry(L_z07, L_zst_base);
offs_entry(L_z06, L_zst_base);
offs_entry(L_z05, L_zst_base);
offs_entry(L_z04, L_zst_base);
offs_entry(L_z03, L_zst_base);
offs_entry(L_z02, L_zst_base);
offs_entry(L_z01, L_zst_base);
offs_entry(L_z00, L_zst_base); // index 0: 64x64

__ bind(L_mpmul_restore_4);
__ restore();
__ bind(L_mpmul_restore_3);
__ restore();
__ bind(L_mpmul_restore_2);
__ restore();
__ bind(L_mpmul_restore_1);
__ restore();
__ bind(L_mpmul_restore_0);

// Dispatch via offset vector entry, into z-store sequence.
Label L_zst_rdpc;
__ bind(L_zst_rdpc);

assert(L_zst_base.is_bound(), "must be");
assert(L_zst_offs.is_bound(), "must be");
assert(L_zst_rdpc.is_bound(), "must be");

int dbase = L_zst_rdpc.loc_pos() - L_zst_base.loc_pos();
int doffs = L_zst_rdpc.loc_pos() - L_zst_offs.loc_pos();

temp = gyp; // Alright to reuse 'gyp'.

__ rdpc(addr);
__ sub(addr, doffs, temp);
__ srlx(disp, 1, disp);
__ lduw(temp, disp, offs);
__ sub(addr, dbase, temp);
__ jmp(temp, offs);
__ delayed()->clr(offs);
}
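The z-store dispatch at the end of gen_mult_mpmul() works differently from the load dispatch: instead of jumping a computed distance into an instruction table, it reads a byte offset from the data table emitted via offs_entry() (label position minus L_zst_base) and jumps to base plus that offset; 'disp' is halved because the 4-byte offset-table entries are half the size of the 8-byte load-table entries. A C++ model of that indirection (illustrative only, assumed types):

```cpp
#include <cstdint>

struct StoreDispatch {
  const int32_t* offs;  // one entry per L_z31..L_z00, relative to base
  const uint8_t* base;  // code address bound at L_zst_base
};

static const uint8_t* store_entry(const StoreDispatch& d, int index) {
  return d.base + d.offs[index];  // target of the final 'jmp(temp, offs)'
}
```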

void gen_mult_64x64(Register xp, Register xn,
Register yp, Register yn,
Register zp, Register zn, Label &L_exit)
{
// Assuming that a stack frame has already been created, i.e. local and
// output registers are available for immediate use.

const Register ri = L0; // Outer loop index, xv[i]
const Register rj = L1; // Inner loop index, yv[j]
const Register rk = L2; // Output loop index, zv[k]
const Register rx = L4; // x-vector datum [i]
const Register ry = L5; // y-vector datum [j]
const Register rz = L6; // z-vector datum [k]
const Register rc = L7; // carry over (to z-vector datum [k-1])

const Register lop = O0; // lo-64b product
const Register hip = O1; // hi-64b product

const Register zero = G0;

Label L_loop_i, L_exit_loop_i;
Label L_loop_j;
Label L_loop_i2, L_exit_loop_i2;

__ srlx(xn, 1, xn); // index for u32 to u64 ditto
__ srlx(yn, 1, yn); // index for u32 to u64 ditto
__ srlx(zn, 1, zn); // index for u32 to u64 ditto
__ dec(xn); // Adjust [0..(N/2)-1]
__ dec(yn);
__ dec(zn);
__ clr(rc); // u64 c = 0
__ sllx(xn, 3, ri); // int i = xn (byte offset i = 8*xn)
__ sllx(yn, 3, rj); // int j = yn (byte offset j = 8*yn)
__ sllx(zn, 3, rk); // int k = zn (byte offset k = 8*zn)
__ ldx(yp, rj, ry); // u64 y = yp[yn]

// for (int i = xn; i >= 0; i--)
__ bind(L_loop_i);

__ cmp_and_br_short(ri, 0, // i >= 0
Assembler::less, Assembler::pn, L_exit_loop_i);
__ ldx(xp, ri, rx); // x = xp[i]
__ mulx(rx, ry, lop); // lo-64b-part of result 64x64
__ umulxhi(rx, ry, hip); // hi-64b-part of result 64x64
__ addcc(rc, lop, lop); // Accumulate lower order bits (producing carry)
__ addxc(hip, zero, rc); // carry over to next datum [k-1]
__ stx(lop, zp, rk); // z[k] = lop
__ dec(rk, 8); // k--
__ dec(ri, 8); // i--
__ ba_short(L_loop_i);

__ bind(L_exit_loop_i);
__ stx(rc, zp, rk); // z[k] = c

// for (int j = yn - 1; j >= 0; j--)
__ sllx(yn, 3, rj); // int j = yn - 1 (byte offset j = 8*yn)
__ dec(rj, 8);

__ bind(L_loop_j);

__ cmp_and_br_short(rj, 0, // j >= 0
Assembler::less, Assembler::pn, L_exit);
__ clr(rc); // u64 c = 0
__ ldx(yp, rj, ry); // u64 y = yp[j]

// for (int i = xn, k = --zn; i >= 0; i--)
__ dec(zn); // --zn
__ sllx(xn, 3, ri); // int i = xn (byte offset i = 8*xn)
__ sllx(zn, 3, rk); // int k = zn (byte offset k = 8*zn)

__ bind(L_loop_i2);

__ cmp_and_br_short(ri, 0, // i >= 0
Assembler::less, Assembler::pn, L_exit_loop_i2);
__ ldx(xp, ri, rx); // x = xp[i]
__ ldx(zp, rk, rz); // z = zp[k], accumulator
__ mulx(rx, ry, lop); // lo-64b-part of result 64x64
__ umulxhi(rx, ry, hip); // hi-64b-part of result 64x64
__ addcc(rz, rc, rz); // Accumulate lower order bits,
__ addxc(hip, zero, rc); // Accumulate higher order bits to carry
__ addcc(rz, lop, rz); // z += lo(p) + c
__ addxc(rc, zero, rc);
__ stx(rz, zp, rk); // zp[k] = z
__ dec(rk, 8); // k--
__ dec(ri, 8); // i--
__ ba_short(L_loop_i2);

__ bind(L_exit_loop_i2);
__ stx(rc, zp, rk); // z[k] = c
__ dec(rj, 8); // j--
__ ba_short(L_loop_j);
}
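A behavioural C++ sketch of one accumulating row of gen_mult_64x64() (illustrative; it models mulx/umulxhi with the common GCC/Clang unsigned __int128 extension, which is an assumption, not HotSpot code):

```cpp
#include <cstdint>

static void mult_64x64_row(const uint64_t* x, int xn, uint64_t y,
                           uint64_t* z, int k) {
  uint64_t c = 0;                        // carry into the next column
  for (int i = xn - 1; i >= 0; i--, k--) {
    unsigned __int128 p = (unsigned __int128)x[i] * y;      // mulx + umulxhi
    unsigned __int128 s = (unsigned __int128)z[k] + (uint64_t)p + c;
    z[k] = (uint64_t)s;                  // low 64 bits stay in column k
    c = (uint64_t)(p >> 64) + (uint64_t)(s >> 64);          // addxc chain
  }
  z[k] = c;                              // final carry of the row
}
```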

void gen_mult_64x64_unaligned(Register xp, Register xn,
Register yp, Register yn,
Register zp, Register zn, Label &L_exit)
{
// Assuming that a stack frame has already been created, i.e. local and
// output registers are available for use.

const Register xpc = L0; // Outer loop cursor, xp[i]
const Register ypc = L1; // Inner loop cursor, yp[j]
const Register zpc = L2; // Output loop cursor, zp[k]
const Register rx = L4; // x-vector datum [i]
const Register ry = L5; // y-vector datum [j]
const Register rz = L6; // z-vector datum [k]
const Register rc = L7; // carry over (to z-vector datum [k-1])
const Register rt = O2;

const Register lop = O0; // lo-64b product
const Register hip = O1; // hi-64b product

const Register zero = G0;

Label L_loop_i, L_exit_loop_i;
Label L_loop_j;
Label L_loop_i2, L_exit_loop_i2;

__ srlx(xn, 1, xn); // index for u32 to u64 ditto
__ srlx(yn, 1, yn); // index for u32 to u64 ditto
__ srlx(zn, 1, zn); // index for u32 to u64 ditto
__ dec(xn); // Adjust [0..(N/2)-1]
__ dec(yn);
__ dec(zn);
__ clr(rc); // u64 c = 0
__ sllx(xn, 3, xpc); // u32* xpc = &xp[xn] (byte offset 8*xn)
__ add(xp, xpc, xpc);
__ sllx(yn, 3, ypc); // u32* ypc = &yp[yn] (byte offset 8*yn)
__ add(yp, ypc, ypc);
__ sllx(zn, 3, zpc); // u32* zpc = &zp[zn] (byte offset 8*zn)
__ add(zp, zpc, zpc);
__ lduw(ypc, 0, rt); // u64 y = yp[yn]
__ lduw(ypc, 4, ry); // ...
__ sllx(rt, 32, rt);
__ or3(rt, ry, ry);

// for (int i = xn; i >= 0; i--)
__ bind(L_loop_i);

__ cmp_and_br_short(xpc, xp, // i >= 0
Assembler::less, Assembler::pn, L_exit_loop_i);
__ lduw(xpc, 0, rt); // u64 x = xp[i]
__ lduw(xpc, 4, rx); // ...
__ sllx(rt, 32, rt);
__ or3(rt, rx, rx);
__ mulx(rx, ry, lop); // lo-64b-part of result 64x64
__ umulxhi(rx, ry, hip); // hi-64b-part of result 64x64
__ addcc(rc, lop, lop); // Accumulate lower order bits (producing carry)
__ addxc(hip, zero, rc); // carry over to next datum [k-1]
__ srlx(lop, 32, rt);
__ stw(rt, zpc, 0); // z[k] = lop
__ stw(lop, zpc, 4); // ...
__ dec(zpc, 8); // k-- (zpc--)
__ dec(xpc, 8); // i-- (xpc--)
__ ba_short(L_loop_i);

__ bind(L_exit_loop_i);
__ srlx(rc, 32, rt);
__ stw(rt, zpc, 0); // z[k] = c
__ stw(rc, zpc, 4);

// for (int j = yn - 1; j >= 0; j--)
__ sllx(yn, 3, ypc); // u32* ypc = &yp[yn] (byte offset 8*yn)
__ add(yp, ypc, ypc);
__ dec(ypc, 8); // yn - 1 (ypc--)

__ bind(L_loop_j);

__ cmp_and_br_short(ypc, yp, // j >= 0
Assembler::less, Assembler::pn, L_exit);
__ clr(rc); // u64 c = 0
__ lduw(ypc, 0, rt); // u64 y = yp[j] (= *ypc)
__ lduw(ypc, 4, ry); // ...
__ sllx(rt, 32, rt);
__ or3(rt, ry, ry);

// for (int i = xn, k = --zn; i >= 0; i--)
__ sllx(xn, 3, xpc); // u32* xpc = &xp[xn] (byte offset 8*xn)
__ add(xp, xpc, xpc);
__ dec(zn); // --zn
__ sllx(zn, 3, zpc); // u32* zpc = &zp[zn] (byte offset 8*zn)
__ add(zp, zpc, zpc);

__ bind(L_loop_i2);

__ cmp_and_br_short(xpc, xp, // i >= 0
Assembler::less, Assembler::pn, L_exit_loop_i2);
__ lduw(xpc, 0, rt); // u64 x = xp[i] (= *xpc)
__ lduw(xpc, 4, rx); // ...
__ sllx(rt, 32, rt);
__ or3(rt, rx, rx);

__ lduw(zpc, 0, rt); // u64 z = zp[k] (= *zpc)
__ lduw(zpc, 4, rz); // ...
__ sllx(rt, 32, rt);
__ or3(rt, rz, rz);

__ mulx(rx, ry, lop); // lo-64b-part of result 64x64
__ umulxhi(rx, ry, hip); // hi-64b-part of result 64x64
__ addcc(rz, rc, rz); // Accumulate lower order bits...
__ addxc(hip, zero, rc); // Accumulate higher order bits to carry
__ addcc(rz, lop, rz); // ... z += lo(p) + c
__ addxccc(rc, zero, rc);
__ srlx(rz, 32, rt);
__ stw(rt, zpc, 0); // zp[k] = z (*zpc = z)
__ stw(rz, zpc, 4);
__ dec(zpc, 8); // k-- (zpc--)
__ dec(xpc, 8); // i-- (xpc--)
__ ba_short(L_loop_i2);

__ bind(L_exit_loop_i2);
__ srlx(rc, 32, rt);
__ stw(rt, zpc, 0); // z[k] = c
__ stw(rc, zpc, 4);
__ dec(ypc, 8); // j-- (ypc--)
__ ba_short(L_loop_j);
}
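The unaligned variant never issues a 64-bit memory access: each u64 limb is assembled from two aligned 32-bit loads (lduw, sllx, or3) and split back into two 32-bit stores (srlx, stw, stw). The same pattern in C++ (illustrative):

```cpp
#include <cstdint>

static uint64_t load_u64_from_u32(const uint32_t* p) {
  return ((uint64_t)p[0] << 32) | p[1];  // big-endian limb pair, as lduw x2
}

static void store_u64_as_u32(uint32_t* p, uint64_t v) {
  p[0] = (uint32_t)(v >> 32);            // stw of the high half
  p[1] = (uint32_t)v;                    // stw of the low half
}
```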

void gen_mult_32x32(Register xp, Register xn,
Register yp, Register yn,
Register zp, Register zn, Label &L_exit)
{
// Assuming that a stack frame has already been created, i.e. local and
// output registers are available for use.

const Register ri = L0; // Outer loop index, xv[i]
const Register rj = L1; // Inner loop index, yv[j]
const Register rk = L2; // Output loop index, zv[k]
const Register rx = L4; // x-vector datum [i]
const Register ry = L5; // y-vector datum [j]
const Register rz = L6; // z-vector datum [k]
const Register rc = L7; // carry over (to z-vector datum [k-1])

const Register p64 = O0; // 64b product
const Register z65 = O1; // carry+64b accumulator
const Register c65 = O2; // carry at bit 65
const Register c33 = O2; // carry at bit 33 (after shift)

const Register zero = G0;

Label L_loop_i, L_exit_loop_i;
Label L_loop_j;
Label L_loop_i2, L_exit_loop_i2;

__ dec(xn); // Adjust [0..N-1]
__ dec(yn);
__ dec(zn);
__ clr(rc); // u32 c = 0
__ sllx(xn, 2, ri); // int i = xn (byte offset i = 4*xn)
__ sllx(yn, 2, rj); // int j = yn (byte offset j = 4*yn)
__ sllx(zn, 2, rk); // int k = zn (byte offset k = 4*zn)
__ lduw(yp, rj, ry); // u32 y = yp[yn]

// for (int i = xn; i >= 0; i--)
__ bind(L_loop_i);

__ cmp_and_br_short(ri, 0, // i >= 0
Assembler::less, Assembler::pn, L_exit_loop_i);
__ lduw(xp, ri, rx); // x = xp[i]
__ mulx(rx, ry, p64); // 64b result of 32x32
__ addcc(rc, p64, z65); // Accumulate to 65 bits (producing carry)
__ addxc(zero, zero, c65); // Materialise carry (in bit 65) into lsb,
__ sllx(c65, 32, c33); // and shift into bit 33
__ srlx(z65, 32, rc); // carry = c33 | hi(z65) >> 32
__ add(c33, rc, rc); // carry over to next datum [k-1]
__ stw(z65, zp, rk); // z[k] = lo(z65)
__ dec(rk, 4); // k--
__ dec(ri, 4); // i--
__ ba_short(L_loop_i);

__ bind(L_exit_loop_i);
__ stw(rc, zp, rk); // z[k] = c

// for (int j = yn - 1; j >= 0; j--)
__ sllx(yn, 2, rj); // int j = yn - 1 (byte offset j = 4*yn)
__ dec(rj, 4);

__ bind(L_loop_j);

__ cmp_and_br_short(rj, 0, // j >= 0
Assembler::less, Assembler::pn, L_exit);
__ clr(rc); // u32 c = 0
__ lduw(yp, rj, ry); // u32 y = yp[j]

// for (int i = xn, k = --zn; i >= 0; i--)
__ dec(zn); // --zn
__ sllx(xn, 2, ri); // int i = xn (byte offset i = 4*xn)
__ sllx(zn, 2, rk); // int k = zn (byte offset k = 4*zn)

__ bind(L_loop_i2);

__ cmp_and_br_short(ri, 0, // i >= 0
Assembler::less, Assembler::pn, L_exit_loop_i2);
__ lduw(xp, ri, rx); // x = xp[i]
__ lduw(zp, rk, rz); // z = zp[k], accumulator
__ mulx(rx, ry, p64); // 64b result of 32x32
__ add(rz, rc, rz); // Accumulate lower order bits,
__ addcc(rz, p64, z65); // z += lo(p64) + c
__ addxc(zero, zero, c65); // Materialise carry (in bit 65) into lsb,
__ sllx(c65, 32, c33); // and shift into bit 33
__ srlx(z65, 32, rc); // carry = c33 | hi(z65) >> 32
__ add(c33, rc, rc); // carry over to next datum [k-1]
__ stw(z65, zp, rk); // zp[k] = lo(z65)
__ dec(rk, 4); // k--
__ dec(ri, 4); // i--
__ ba_short(L_loop_i2);

__ bind(L_exit_loop_i2);
__ stw(rc, zp, rk); // z[k] = c
__ dec(rj, 4); // j--
__ ba_short(L_loop_j);
}
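In the 32x32 kernel the sum of a 64-bit product, an accumulator word and the incoming carry can reach 65 bits, so the carry-out is materialised (addxc of two zeros) and folded into bit 32 of the next column's carry. A C++ model of the accumulating inner loop (illustrative only):

```cpp
#include <cstdint>

static void mult_32x32_row(const uint32_t* x, int xn, uint32_t y,
                           uint32_t* z, int k) {
  uint64_t c = 0;                        // carry, up to 33 significant bits
  for (int i = xn - 1; i >= 0; i--, k--) {
    uint64_t p64 = (uint64_t)x[i] * y;   // mulx: full 64-bit product
    uint64_t z65 = p64 + z[k] + c;       // may wrap: that wrap is "bit 65"
    uint64_t c65 = (z65 < p64) ? 1 : 0;  // materialised carry-out (addxc)
    c = (c65 << 32) | (z65 >> 32);       // c33 merged with hi(z65)
    z[k] = (uint32_t)z65;                // low 32 bits stay in column k
  }
  z[k] = (uint32_t)c;                    // final carry of the row
}
```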

void generate_initial() {
// Generates all stubs and initializes the entry points

@@ -5073,8 +5839,14 @@ class StubGenerator: public StubCodeGenerator {
if (UseAdler32Intrinsics) {
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
}
}

#ifdef COMPILER2
// Intrinsics supported by C2 only:
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
#endif // COMPILER2
}

public:
StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
@@ -41,7 +41,7 @@ static bool returns_to_call_stub(address return_pc) {
enum /* platform_dependent_constants */ {
// %%%%%%%% May be able to shrink this a lot
code_size1 = 20000, // simply increase if too small (assembler will crash if too small)
code_size2 = 27000 // simply increase if too small (assembler will crash if too small)
code_size2 = 29000 // simply increase if too small (assembler will crash if too small)
};

class Sparc {
@@ -168,6 +168,16 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseCBCond, false);
}

// Use 'mpmul' instruction if available.
if (has_mpmul()) {
if (FLAG_IS_DEFAULT(UseMPMUL)) {
FLAG_SET_DEFAULT(UseMPMUL, true);
}
} else if (UseMPMUL) {
warning("MPMUL instruction is not available on this CPU");
FLAG_SET_DEFAULT(UseMPMUL, false);
}

assert(BlockZeroingLowLimit > 0, "invalid value");

if (has_blk_zeroing() && cache_line_size > 0) {

@@ -409,6 +419,15 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
}

if (UseVIS > 2) {
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, true);
}
} else if (UseMultiplyToLenIntrinsic) {
warning("SPARC multiplyToLen intrinsics require VIS3 instructions support. Intrinsics will be disabled");
FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
}

if (UseVectorizedMismatchIntrinsic) {
warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);