8282541: AArch64: Auto-vectorize Math.round API

Reviewed-by: njian, ngasson, adinn
This commit is contained in:
Andrew Haley 2022-04-26 13:43:12 +00:00
parent 8de3c65545
commit a7b5157375
20 changed files with 1197 additions and 611 deletions

@ -15141,6 +15141,30 @@ instruct convL2D_reg_reg(vRegD dst, iRegL src) %{
ins_pipe(fp_l2d);
%}
// Scalar intrinsic for java.lang.Math.round(double): result is the
// closest long, with ties rounding toward positive infinity.
// TEMP_DEF dst: dst is written before src is dead.  ftmp holds the 0.5
// addend used on the small-negative-input path.  KILL cr: the emitted
// sequence performs a compare (see MacroAssembler::java_round_double).
instruct round_double_reg(iRegLNoSp dst, vRegD src, vRegD ftmp, rFlagsReg cr)
%{
  match(Set dst (RoundD src));
  effect(TEMP_DEF dst, TEMP ftmp, KILL cr);
  format %{ "java_round_double $dst,$src"%}
  ins_encode %{
    __ java_round_double($dst$$Register, as_FloatRegister($src$$reg),
                         as_FloatRegister($ftmp$$reg));
  %}
  ins_pipe(pipe_slow);
%}
// Scalar intrinsic for java.lang.Math.round(float): result is the
// closest int, with ties rounding toward positive infinity.
// Same temp/flag requirements as round_double_reg above; delegates to
// MacroAssembler::java_round_float.
instruct round_float_reg(iRegINoSp dst, vRegF src, vRegF ftmp, rFlagsReg cr)
%{
  match(Set dst (RoundF src));
  effect(TEMP_DEF dst, TEMP ftmp, KILL cr);
  format %{ "java_round_float $dst,$src"%}
  ins_encode %{
    __ java_round_float($dst$$Register, as_FloatRegister($src$$reg),
                        as_FloatRegister($ftmp$$reg));
  %}
  ins_pipe(pipe_slow);
%}
// stack <-> reg and reg <-> reg shuffles with no conversion
instruct MoveF2I_stack_reg(iRegINoSp dst, stackSlotF src) %{

@ -570,6 +570,52 @@ instruct vcvt2Dto2F(vecD dst, vecX src)
ins_pipe(pipe_class_default);
%}
// Vector Math.round: 2 floats -> 2 ints in a 64-bit (D) vector.
// NEON-only form (UseSVE == 0); the three temps are consumed by
// C2_MacroAssembler::vector_round_neon (0.5 addend, threshold
// broadcast, and per-lane selection mask).
instruct vroundvecD2Fto2I(vecD dst, vecD src, vecD tmp1, vecD tmp2, vecD tmp3)
%{
  predicate(UseSVE == 0 &&
            n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (RoundVF src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vround  $dst, T2S, $src\t# round vecD 2F to 2I vector" %}
  ins_encode %{
    __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                         as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                         as_FloatRegister($tmp3$$reg), __ T2S);
  %}
  ins_pipe(pipe_class_default);
%}
// Vector Math.round: 4 floats -> 4 ints in a 128-bit (X) vector.
// NEON-only form (UseSVE == 0); see vector_round_neon for the temps.
instruct vroundvecX4Fto4I(vecX dst, vecX src, vecX tmp1, vecX tmp2, vecX tmp3)
%{
  predicate(UseSVE == 0 &&
            n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (RoundVF src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vround  $dst, T4S, $src\t# round vecX 4F to 4I vector" %}
  ins_encode %{
    __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                         as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                         as_FloatRegister($tmp3$$reg), __ T4S);
  %}
  ins_pipe(pipe_class_default);
%}
// Vector Math.round: 2 doubles -> 2 longs in a 128-bit (X) vector.
// NEON-only form (UseSVE == 0); see vector_round_neon for the temps.
instruct vroundvecX2Dto2L(vecX dst, vecX src, vecX tmp1, vecX tmp2, vecX tmp3)
%{
  predicate(UseSVE == 0 &&
            n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (RoundVD src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vround  $dst, T2D, $src\t# round vecX 2D to 2L vector" %}
  ins_encode %{
    __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                         as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                         as_FloatRegister($tmp3$$reg), __ T2D);
  %}
  ins_pipe(pipe_class_default);
%}
// ------------------------------ Reduction -------------------------------
instruct reduce_add8B(iRegINoSp dst, iRegIorL2I isrc, vecD vsrc, vecD tmp)

@ -349,6 +349,25 @@ VECTOR_CAST_F2F(F, D, X, D, fcvtl, 2S, 2D)
VECTOR_CAST_F2F(D, F, D, X, fcvtn, 2D, 2S)
dnl
dnl VECTOR_JAVA_FROUND($1,     $2,      $3,       $4,          $5,    $6,  $7     )
dnl   $1 = F|D (source op kind, selects RoundVF/RoundVD)
dnl   $2 = source vector shorthand (2F/4F/2D)   $3 = dest element (I/L)
dnl   $4 = SIMD arrangement (T2S/T4S/T2D)       $5 = lane count
dnl   $6 = dest basic type (INT/LONG)           $7 = register class (vecD/vecX)
dnl Expands to a NEON-only (UseSVE == 0) Math.round vector instruct that
dnl delegates to C2_MacroAssembler::vector_round_neon.
define(`VECTOR_JAVA_FROUND', `
instruct vround$7$2to$5$3($7 dst, $7 src, $7 tmp1, $7 tmp2, $7 tmp3)
%{
  predicate(UseSVE == 0 &&
            n->as_Vector()->length() == $5 && n->bottom_type()->is_vect()->element_basic_type() == T_$6);
  match(Set dst (RoundV$1 src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vround  $dst, $4, $src\t# round $7 $2 to $5$3 vector" %}
  ins_encode %{
    __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                         as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                         as_FloatRegister($tmp3$$reg), __ $4);
  %}
  ins_pipe(pipe_class_default);
%}')dnl                $1 $2  $3 $4   $5 $6    $7
VECTOR_JAVA_FROUND(F, 2F, I, T2S, 2, INT, vecD)
VECTOR_JAVA_FROUND(F, 4F, I, T4S, 4, INT, vecX)
VECTOR_JAVA_FROUND(D, 2D, L, T2D, 2, LONG, vecX)
// ------------------------------ Reduction -------------------------------
dnl
define(`REDUCE_ADD_BORS', `

@ -162,7 +162,6 @@ source %{
}
return op_sve_supported(opcode, vlen, bt);
}
%}
definitions %{
@ -3277,6 +3276,54 @@ instruct vroundD(vReg dst, vReg src, immI rmode) %{
ins_pipe(pipe_slow);
%}
// Vector Math.round(float) -> int when SVE is available.  For vectors
// wider than 16 bytes the full SVE sequence is used (vector_round_sve,
// which needs a governing predicate temp); 8/16-byte vectors fall back
// to the cheaper NEON sequence with an arrangement derived from the
// actual vector length.
instruct vroundFtoI(vReg dst, vReg src, vReg tmp1, vReg tmp2, vReg tmp3, pRegGov ptmp)
%{
  predicate(UseSVE > 0);
  match(Set dst (RoundVF src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp);
  format %{ "sve_vround  $dst, S, $src\t# round F to I vector" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int vlen = Matcher::vector_length_in_bytes(this);
    if (vlen > 16) {
      __ vector_round_sve(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                          as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                          as_PRegister($ptmp$$reg), __ S);
    } else {
      __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                           as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                           as_FloatRegister($tmp3$$reg),
                           __ esize2arrangement(type2aelembytes(bt),
                                                /*isQ*/ vlen == 16));
    }
  %}
  ins_pipe(pipe_class_default);
%}
// Vector Math.round(double) -> long when SVE is available.  Mirrors
// vroundFtoI: SVE sequence for vectors wider than 16 bytes, NEON
// sequence otherwise.
instruct vroundDtoL(vReg dst, vReg src, vReg tmp1, vReg tmp2, vReg tmp3, pRegGov ptmp)
%{
  predicate(UseSVE > 0);
  match(Set dst (RoundVD src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp);
  format %{ "sve_vround  $dst, D, $src\t# round D to L vector" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int vlen = Matcher::vector_length_in_bytes(this);
    if (vlen > 16) {
      __ vector_round_sve(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                          as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                          as_PRegister($ptmp$$reg), __ D);
    } else {
      __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                           as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                           as_FloatRegister($tmp3$$reg),
                           __ esize2arrangement(type2aelembytes(bt),
                                                /*isQ*/ vlen == 16));
    }
  %}
  ins_pipe(pipe_class_default);
%}
// vector replicate
instruct replicateB(vReg dst, iRegIorL2I src) %{

@ -157,7 +157,6 @@ source %{
}
return op_sve_supported(opcode, vlen, bt);
}
%}
definitions %{
@ -1793,6 +1792,32 @@ instruct vroundD(vReg dst, vReg src, immI rmode) %{
%}
ins_pipe(pipe_slow);
%}
dnl VECTOR_JAVA_FROUND($1, $2, $3, $4, $5, $6, $7)
dnl   $1 = F|D (selects RoundVF/RoundVD)  $3 = dest element (I/L)
dnl   $4 = SVE register variant (S/D)     $7 = register class (vReg)
dnl   ($2, $5, $6 are kept for symmetry with the NEON macro; only
dnl    $1/$3/$4/$7 are expanded here.)
dnl Expands to an SVE (UseSVE > 0) Math.round vector instruct that picks
dnl the SVE or NEON helper based on the actual vector length in bytes.
define(`VECTOR_JAVA_FROUND', `
instruct vround$1to$3($7 dst, $7 src, $7 tmp1, $7 tmp2, $7 tmp3, pRegGov ptmp)
%{
  predicate(UseSVE > 0);
  match(Set dst (RoundV$1 src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp);
  format %{ "sve_vround  $dst, $4, $src\t# round $1 to $3 vector" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int vlen = Matcher::vector_length_in_bytes(this);
    if (vlen > 16) {
      __ vector_round_sve(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                          as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                          as_PRegister($ptmp$$reg), __ $4);
    } else {
      __ vector_round_neon(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
                           as_FloatRegister($tmp1$$reg), as_FloatRegister($tmp2$$reg),
                           as_FloatRegister($tmp3$$reg),
                           __ esize2arrangement(type2aelembytes(bt),
                                                /*isQ*/ vlen == 16));
    }
  %}
  ins_pipe(pipe_class_default);
%}')dnl             $1 $2  $3 $4 $5 $6    $7
VECTOR_JAVA_FROUND(F, 8F, I, S, 8, INT, vReg)
VECTOR_JAVA_FROUND(D, 4D, L, D, 4, LONG, vReg)
dnl
dnl REPLICATE($1, $2, $3, $4, $5 )
dnl REPLICATE(insn_name, op_name, reg_src, size, min_vec_len)

@ -306,14 +306,6 @@ public:
assert_cond((bits & mask) == mask);
return (insn & mask) >> lsb;
}
void fixed(unsigned value, unsigned mask) {
assert_cond ((mask & bits) == 0);
#ifdef ASSERT
bits |= mask;
#endif
insn |= value;
}
};
#define starti Instruction_aarch64 current_insn(this);
@ -698,7 +690,6 @@ public:
#define zrf current_insn.zrf
#define prf current_insn.prf
#define pgrf current_insn.pgrf
#define fixed current_insn.fixed
typedef void (Assembler::* uncond_branch_insn)(address dest);
typedef void (Assembler::* compare_and_branch_insn)(Register Rt, address dest);
@ -1085,7 +1076,7 @@ public:
// A more convenient access to dmb for our purposes
enum Membar_mask_bits {
// We can use ISH for a barrier because the ARM ARM says "This
// We can use ISH for a barrier because the Arm ARM says "This
// architecture assumes that all Processing Elements that use the
// same operating system or hypervisor are in the same Inner
// Shareable shareability domain."
@ -2082,46 +2073,55 @@ public:
#undef INSN
// Floating-point<->integer conversions
void float_int_convert(unsigned op31, unsigned type,
void float_int_convert(unsigned sflag, unsigned ftype,
unsigned rmode, unsigned opcode,
Register Rd, Register Rn) {
starti;
f(op31, 31, 29);
f(sflag, 31);
f(0b00, 30, 29);
f(0b11110, 28, 24);
f(type, 23, 22), f(1, 21), f(rmode, 20, 19);
f(ftype, 23, 22), f(1, 21), f(rmode, 20, 19);
f(opcode, 18, 16), f(0b000000, 15, 10);
zrf(Rn, 5), zrf(Rd, 0);
}
#define INSN(NAME, op31, type, rmode, opcode) \
void NAME(Register Rd, FloatRegister Vn) { \
float_int_convert(op31, type, rmode, opcode, Rd, as_Register(Vn)); \
#define INSN(NAME, sflag, ftype, rmode, opcode) \
void NAME(Register Rd, FloatRegister Vn) { \
float_int_convert(sflag, ftype, rmode, opcode, Rd, as_Register(Vn)); \
}
INSN(fcvtzsw, 0b000, 0b00, 0b11, 0b000);
INSN(fcvtzs, 0b100, 0b00, 0b11, 0b000);
INSN(fcvtzdw, 0b000, 0b01, 0b11, 0b000);
INSN(fcvtzd, 0b100, 0b01, 0b11, 0b000);
INSN(fcvtzsw, 0b0, 0b00, 0b11, 0b000);
INSN(fcvtzs, 0b1, 0b00, 0b11, 0b000);
INSN(fcvtzdw, 0b0, 0b01, 0b11, 0b000);
INSN(fcvtzd, 0b1, 0b01, 0b11, 0b000);
INSN(fmovs, 0b000, 0b00, 0b00, 0b110);
INSN(fmovd, 0b100, 0b01, 0b00, 0b110);
// RoundToNearestTiesAway
INSN(fcvtassw, 0b0, 0b00, 0b00, 0b100); // float -> signed word
INSN(fcvtasd, 0b1, 0b01, 0b00, 0b100); // double -> signed xword
INSN(fmovhid, 0b100, 0b10, 0b01, 0b110);
// RoundTowardsNegative
INSN(fcvtmssw, 0b0, 0b00, 0b10, 0b000); // float -> signed word
INSN(fcvtmsd, 0b1, 0b01, 0b10, 0b000); // double -> signed xword
INSN(fmovs, 0b0, 0b00, 0b00, 0b110);
INSN(fmovd, 0b1, 0b01, 0b00, 0b110);
INSN(fmovhid, 0b1, 0b10, 0b01, 0b110);
#undef INSN
#define INSN(NAME, op31, type, rmode, opcode) \
#define INSN(NAME, sflag, type, rmode, opcode) \
void NAME(FloatRegister Vd, Register Rn) { \
float_int_convert(op31, type, rmode, opcode, as_Register(Vd), Rn); \
float_int_convert(sflag, type, rmode, opcode, as_Register(Vd), Rn); \
}
INSN(fmovs, 0b000, 0b00, 0b00, 0b111);
INSN(fmovd, 0b100, 0b01, 0b00, 0b111);
INSN(fmovs, 0b0, 0b00, 0b00, 0b111);
INSN(fmovd, 0b1, 0b01, 0b00, 0b111);
INSN(scvtfws, 0b000, 0b00, 0b00, 0b010);
INSN(scvtfs, 0b100, 0b00, 0b00, 0b010);
INSN(scvtfwd, 0b000, 0b01, 0b00, 0b010);
INSN(scvtfd, 0b100, 0b01, 0b00, 0b010);
INSN(scvtfws, 0b0, 0b00, 0b00, 0b010);
INSN(scvtfs, 0b1, 0b00, 0b00, 0b010);
INSN(scvtfwd, 0b0, 0b01, 0b00, 0b010);
INSN(scvtfd, 0b1, 0b01, 0b00, 0b010);
// INSN(fmovhid, 0b100, 0b10, 0b01, 0b111);
@ -2510,6 +2510,7 @@ public:
#undef INSN
// Advanced SIMD modified immediate
#define INSN(NAME, op0, cmode0) \
void NAME(FloatRegister Vd, SIMD_Arrangement T, unsigned imm8, unsigned lsl = 0) { \
unsigned cmode = cmode0; \
@ -2537,7 +2538,22 @@ public:
#undef INSN
#define INSN(NAME, op1, op2, op3) \
// Advanced SIMD FMOV (vector, immediate): broadcast a floating-point
// constant to every lane of Vd.  The constant must be representable as
// an 8-bit AdvSIMD FP immediate -- pack(imm) produces that encoding,
// split across bits 18:16 and 9:5.
#define INSN(NAME, op, cmode)                                           \
  void NAME(FloatRegister Vd, SIMD_Arrangement T, double imm) {         \
    unsigned imm8 = pack(imm);                                          \
    starti;                                                             \
    f(0, 31), f((int)T & 1, 30), f(op, 29), f(0b0111100000, 28, 19);    \
    f(imm8 >> 5, 18, 16), f(cmode, 15, 12), f(0x01, 11, 10), f(imm8 & 0b11111, 9, 5); \
    rf(Vd, 0);                                                          \
  }

  INSN(fmovs, 0, 0b1111);
  INSN(fmovd, 1, 0b1111);

#undef INSN
// Advanced SIMD three same
#define INSN(NAME, op1, op2, op3) \
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
starti; \
assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); \
@ -2984,7 +3000,9 @@ public:
INSN(frintn, 0, 0b00, 0b01, 0b11000);
INSN(frintm, 0, 0b00, 0b01, 0b11001);
INSN(frintp, 0, 0b10, 0b01, 0b11000);
INSN(fcvtas, 0, 0b00, 0b01, 0b11100);
INSN(fcvtzs, 0, 0b10, 0b01, 0b11011);
INSN(fcvtms, 0, 0b00, 0b01, 0b11011);
#undef ASSERTION
#define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H || T == T2S || T == T4S)
@ -3154,6 +3172,7 @@ public:
INSN(sve_fneg, 0b00000100, 0b011101101);
INSN(sve_frintm, 0b01100101, 0b000010101); // floating-point round to integral value, toward minus infinity
INSN(sve_frintn, 0b01100101, 0b000000101); // floating-point round to integral value, nearest with ties to even
INSN(sve_frinta, 0b01100101, 0b000100101); // floating-point round to integral value, nearest with ties to away
INSN(sve_frintp, 0b01100101, 0b000001101); // floating-point round to integral value, toward plus infinity
INSN(sve_fsqrt, 0b01100101, 0b001101101);
INSN(sve_fsub, 0b01100101, 0b000001100);
@ -3449,8 +3468,9 @@ public:
pgrf(Pg, 10), srf(Rn, 5), rf(Zd, 0);
}
// SVE copy signed integer immediate to vector elements (predicated)
void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, int imm8, bool isMerge) {
private:
void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, int imm8,
bool isMerge, bool isFloat) {
starti;
assert(T != Q, "invalid size");
int sh = 0;
@ -3464,7 +3484,17 @@ public:
}
int m = isMerge ? 1 : 0;
f(0b00000101, 31, 24), f(T, 23, 22), f(0b01, 21, 20);
prf(Pg, 16), f(0b0, 15), f(m, 14), f(sh, 13), sf(imm8, 12, 5), rf(Zd, 0);
prf(Pg, 16), f(isFloat ? 1 : 0, 15), f(m, 14), f(sh, 13), sf(imm8, 12, 5), rf(Zd, 0);
}
public:
  // SVE copy signed integer immediate to vector elements (predicated)
  void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, int imm8, bool isMerge) {
    sve_cpy(Zd, T, Pg, imm8, isMerge, /*isFloat*/false);
  }

  // SVE copy floating-point immediate to vector elements (predicated).
  // Always the merging form.  pack(d) encodes d as an 8-bit FP
  // immediate -- presumably only encodable constants are legal here;
  // TODO confirm pack() rejects others.
  void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, double d) {
    sve_cpy(Zd, T, Pg, checked_cast<int8_t>(pack(d)), /*isMerge*/true, /*isFloat*/true);
  }
// SVE conditionally select elements from two vectors
@ -3528,6 +3558,29 @@ void sve_cmp(Condition cond, PRegister Pd, SIMD_RegVariant T,
f(cond_op & 0x1, 4), prf(Pd, 0);
}
  // SVE Floating-point compare vector with zero.
  // Sets Pd[i] for each active (Pg) element of Zn satisfying
  // "Zn[i] <cond> 0.0".  Only a zero immediate is supported by the
  // encoding, hence the guarantee on d.
  void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
               PRegister Pg, FloatRegister Zn, double d) {
    starti;
    assert(T != Q, "invalid size");
    guarantee(d == 0.0, "invalid immediate");
    // 3-bit comparison opcode, split across instruction fields below:
    // bits 2:1 go to 17:16, bit 0 goes to bit 4.
    int cond_op;
    switch(cond) {
      case EQ: cond_op = 0b100; break;
      case GT: cond_op = 0b001; break;
      case GE: cond_op = 0b000; break;
      case LT: cond_op = 0b010; break;
      case LE: cond_op = 0b011; break;
      case NE: cond_op = 0b110; break;
      default:
        ShouldNotReachHere();
    }
    f(0b01100101, 31, 24), f(T, 23, 22), f(0b0100, 21, 18),
    f((cond_op >> 1) & 0x3, 17, 16), f(0b001, 15, 13),
    pgrf(Pg, 10), rf(Zn, 5);
    f(cond_op & 0x1, 4), prf(Pd, 0);
  }
// SVE unpack vector elements
#define INSN(NAME, op) \
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn) { \

@ -1267,3 +1267,74 @@ void C2_MacroAssembler::sve_ptrue_lanecnt(PRegister dst, SIMD_RegVariant size, i
ShouldNotReachHere();
}
}
// java.lang.Math::round intrinsics
// Vector (NEON) counterpart of MacroAssembler::java_round_float/_double.
// Per lane: result = fcvtas(src) (round to nearest, ties away from
// zero), except for "small" negative inputs (|src| < 2^23 for float,
// < 2^52 for double) which instead get floor(src + 0.5) so that ties
// round toward positive infinity, as Java requires.
// Clobbers rscratch1.
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);

  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // 2^23: smallest float magnitude at which no fraction can remain
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      // 2^52: smallest double magnitude at which no fraction can remain
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)
  fcvtas(dst, T, src);
  // dst = round(src), ties to away
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  // Unsigned compare of the -src bit patterns against the 2^23/2^52
  // pattern: a lane's flag bits end up all-ones when src >= 0 or
  // |src| >= the threshold -- exactly the lanes for which the fcvtas
  // (ties-away) result is already correct.
  cmhs(tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags
  // Keep dst bits where tmp3 is set; take floor(src + 0.5) bits (tmp1)
  // where it is clear.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
// SVE variant of vector_round_neon above: round to nearest with ties
// toward positive infinity for each active lane.
// Strategy: take frinta (round, ties away) as the default, then for
// lanes holding small-magnitude negative values (|src| < 2^23 / 2^52,
// selected via predicate ptmp) recompute as floor(src + 0.5).
// Clobbers rscratch1; the final fcvtzs converts the already-integral
// value to a signed integer lane.
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister ptmp, SIMD_RegVariant T) {
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // 2^23: smallest float magnitude with no fractional part
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      // 2^52: smallest double magnitude with no fractional part
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid arrangement");
  }
  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  // ptmp selects lanes where -src is in [0, threshold), i.e. small
  // negative inputs needing the round-half-up correction.
  sve_cmp(HS, ptmp, T, ptrue, tmp2, tmp1);
  // EQ here means the compare set no lanes -- skip the fix-up entirely.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, ptmp, 0.5);
    sve_fadd(tmp1, T, ptmp, src);
    sve_frintm(dst, T, ptmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

@ -103,4 +103,12 @@
sve_lastb(dst, size, pg, src);
}
// java.lang.Math::round intrinsics
void vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, FloatRegister tmp3,
SIMD_Arrangement T);
void vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, PRegister ptmp,
SIMD_RegVariant T);
#endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP

@ -5178,6 +5178,56 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
csel(res, res, zr, EQ);
}
// java.lang.Math.round(double a)
// Returns the closest long to the argument, with ties rounding to
// positive infinity. This requires some fiddling for corner
// cases. We take care to avoid double rounding in e.g. (jlong)(a + 0.5).
// Clobbers rscratch1, rscratch2 and the condition flags; ftmp is a
// scratch FP register for the 0.5 addend.
void MacroAssembler::java_round_double(Register dst, FloatRegister src,
                                       FloatRegister ftmp) {
  Label DONE;
  BLOCK_COMMENT("java_round_double: { ");
  fmovd(rscratch1, src);
  // Use RoundToNearestTiesAway unless src small and -ve.
  fcvtasd(dst, src);
  // Test if src >= 0 || abs(src) >= 0x1.0p52
  // Flipping the sign bit folds both tests into one unsigned compare:
  // non-negative inputs get their MSB set (always >= rscratch2), while
  // negative inputs are compared by magnitude against the 2^52 pattern.
  eor(rscratch1, rscratch1, UCONST64(1) << 63); // flip sign bit
  mov(rscratch2, julong_cast(0x1.0p52));
  cmp(rscratch1, rscratch2);
  br(HS, DONE); {
    // src < 0 && abs(src) < 0x1.0p52
    // src may have a fractional part, so add 0.5
    fmovd(ftmp, 0.5);
    faddd(ftmp, src, ftmp);
    // Convert double to jlong, use RoundTowardsNegative
    fcvtmsd(dst, ftmp);
  }
  bind(DONE);
  BLOCK_COMMENT("} java_round_double");
}
// java.lang.Math.round(float a): closest int, ties toward positive
// infinity.  Same structure as java_round_double above, with the
// float no-fraction threshold 2^23.  Clobbers rscratch1, rscratch2 and
// the condition flags.
void MacroAssembler::java_round_float(Register dst, FloatRegister src,
                                      FloatRegister ftmp) {
  Label DONE;
  BLOCK_COMMENT("java_round_float: { ");
  fmovs(rscratch1, src);
  // Use RoundToNearestTiesAway unless src small and -ve.
  fcvtassw(dst, src);
  // Test if src >= 0 || abs(src) >= 0x1.0p23
  // See java_round_double: flipping the sign bit reduces both tests
  // to a single unsigned compare.
  eor(rscratch1, rscratch1, 0x80000000); // flip sign bit
  mov(rscratch2, jint_cast(0x1.0p23f));
  cmp(rscratch1, rscratch2);
  br(HS, DONE); {
    // src < 0 && |src| < 0x1.0p23
    // src may have a fractional part, so add 0.5
    fmovs(ftmp, 0.5f);
    fadds(ftmp, src, ftmp);
    // Convert float to jint, use RoundTowardsNegative
    fcvtmssw(dst, ftmp);
  }
  bind(DONE);
  BLOCK_COMMENT("} java_round_float");
}
// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,

@ -877,6 +877,10 @@ public:
// Round up to a power of two
void round_to(Register reg, int modulus);
// java.lang.Math::round intrinsics
void java_round_double(Register dst, FloatRegister src, FloatRegister ftmp);
void java_round_float(Register dst, FloatRegister src, FloatRegister ftmp);
// allocation
void eden_allocate(
Register obj, // result: pointer to object after successful allocation

@ -165,8 +165,23 @@
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 15;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
  // Math.round (RoundF/RoundD) expands to a multi-instruction
  // sequence on aarch64; report a non-trivial size so C2's
  // pre-selection heuristics account for it.  Everything else is
  // assumed cheap.
  if (vopc == Op_RoundF || vopc == Op_RoundD) {
    return 15;
  }
  return 0;
}
#endif // CPU_AARCH64_MATCHER_AARCH64_HPP

@ -155,9 +155,25 @@
// Implements a variant of EncodeISOArrayNode that encode ASCII only
static const bool supports_encode_ascii_array = false;
// Returns pre-selection estimated cost of a vector operation.
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 30;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
  // Scalar Math.round (RoundF/RoundD) is given a large size estimate
  // (30) on this platform; all other scalar ops are treated as cheap.
  static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
    switch(vopc) {
      default: return 0;
      case Op_RoundF: // fall through
      case Op_RoundD: {
        return 30;
      }
    }
  }
#endif // CPU_ARM_MATCHER_ARM_HPP

@ -164,10 +164,25 @@
// Implements a variant of EncodeISOArrayNode that encode ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated cost of a vector operation.
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 30;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
  // Scalar Math.round (RoundF/RoundD) is given a large size estimate
  // (30) on this platform; all other scalar ops are treated as cheap.
  static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
    switch(vopc) {
      default: return 0;
      case Op_RoundF: // fall through
      case Op_RoundD: {
        return 30;
      }
    }
  }
#endif // CPU_PPC_MATCHER_PPC_HPP

@ -163,7 +163,23 @@
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 30;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
  // Scalar Math.round (RoundF/RoundD) is given a large size estimate
  // (30) on this platform; all other scalar ops are treated as cheap.
  static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
    switch(vopc) {
      default: return 0;
      case Op_RoundF: // fall through
      case Op_RoundD: {
        return 30;
      }
    }
  }
#endif // CPU_RISCV_MATCHER_RISCV_HPP

@ -153,9 +153,25 @@
// Implements a variant of EncodeISOArrayNode that encode ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated cost of a vector operation.
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
return 0;
switch(vopc) {
default: return 0;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 30;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
  // Scalar Math.round (RoundF/RoundD) is given a large size estimate
  // (30) on this platform; all other scalar ops are treated as cheap.
  static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
    switch(vopc) {
      default: return 0;
      case Op_RoundF: // fall through
      case Op_RoundD: {
        return 30;
      }
    }
  }
#endif // CPU_S390_MATCHER_S390_HPP

@ -183,12 +183,26 @@
// Implements a variant of EncodeISOArrayNode that encode ASCII only
static const bool supports_encode_ascii_array = true;
// Returns pre-selection estimated cost of a vector operation.
// Returns pre-selection estimated size of a vector operation.
  // Pre-selection size estimates for vector ops on x86.  Population
  // count is expensive without AVX-512 VPOPCNTDQ; vector Math.round
  // (RoundVF/RoundVD) always expands to a long sequence.
  static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
    switch(vopc) {
      default: return 0;
      case Op_PopCountVI: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 50;
      case Op_PopCountVL: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 40;
      case Op_RoundVF: // fall through
      case Op_RoundVD: {
        return 30;
      }
    }
  }
// Returns pre-selection estimated size of a scalar operation.
static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
switch(vopc) {
default: return 0;
case Op_RoundF: // fall through
case Op_RoundD: {
return 30;
}
}
}

@ -970,10 +970,12 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
case Op_ModL: body_size += 30; break;
case Op_DivL: body_size += 30; break;
case Op_MulL: body_size += 10; break;
case Op_RoundF: body_size += 30; break;
case Op_RoundD: body_size += 30; break;
case Op_RoundVF: body_size += 30; break;
case Op_RoundVD: body_size += 30; break;
case Op_RoundF:
case Op_RoundD: {
body_size += Matcher::scalar_op_pre_select_sz_estimate(n->Opcode(), n->bottom_type()->basic_type());
} break;
case Op_RoundVF:
case Op_RoundVD:
case Op_PopCountVI:
case Op_PopCountVL: {
const TypeVect* vt = n->bottom_type()->is_vect();

@ -462,6 +462,29 @@ class SVEBinaryImmOp(Instruction):
return (formatStr
% tuple([Instruction.astr(self)] + Regs + [self.immed]))
class SVEComparisonWithZero(Instruction):
    """Golden-test generator for SVE FP compare-with-zero (fcm<cond>).

    For a given condition mnemonic it produces a matched pair of
    strings: the C++ call into the assembler (cstr) and the expected
    disassembly (astr).
    """

    def __init__(self, arg):
        # arg is the condition name, e.g. "EQ", "GT" -- used both as the
        # Assembler::<cond> enum and (lower-cased) as the mnemonic suffix.
        Instruction.__init__(self, "fcm")
        self.condition = arg
        self.dest = OperandFactory.create('p').generate()
        self.reg = SVEVectorRegister().generate()
        # Element width restricted to S/D (variants 2..3).
        self._width = RegVariant(2, 3)
        self.preg = OperandFactory.create('P').generate()

    def generate(self):
        return Instruction.generate(self)

    def cstr(self):
        # e.g.  __ sve_fcm(Assembler::EQ, p0, __ S, p1, z2, 0.0);
        return ("%s(%s, %s, %s, %s, %s, 0.0);"
                % ("__ sve_" + self._name, "Assembler::" + self.condition,
                   str(self.dest), self._width.cstr(), str(self.preg), str(self.reg)))

    def astr(self):
        # e.g.  fcmeq  p0.s, p1/z, z2.s, #0.0
        val = ("%s%s\t%s%s, %s/z, %s%s, #0.0"
               % (self._name, self.condition.lower(), str(self.dest), self._width.astr(),
                  str(self.preg), str(self.reg), self._width.astr()))
        return val
class MultiOp():
def multipleForms(self):
@ -1444,6 +1467,8 @@ generate(FloatConvertOp, [["fcvtzsw", "fcvtzs", "ws"], ["fcvtzs", "fcvtzs", "xs"
["fcvtzdw", "fcvtzs", "wd"], ["fcvtzd", "fcvtzs", "xd"],
["scvtfws", "scvtf", "sw"], ["scvtfs", "scvtf", "sx"],
["scvtfwd", "scvtf", "dw"], ["scvtfd", "scvtf", "dx"],
["fcvtassw", "fcvtas", "ws"], ["fcvtasd", "fcvtas", "xd"],
["fcvtmssw", "fcvtms", "ws"], ["fcvtmsd", "fcvtms", "xd"],
["fmovs", "fmov", "ws"], ["fmovd", "fmov", "xd"],
["fmovs", "fmov", "sw"], ["fmovd", "fmov", "dx"]])
@ -1590,6 +1615,8 @@ generate(ThreeRegNEONOp,
["fcmge", "fcmge", "2D"],
])
generate(SVEComparisonWithZero, ["EQ", "GT", "GE", "LT", "LE", "NE"])
generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);", "ccmn\txzr, xzr, #3, LE"],
["ccmnw", "__ ccmnw(zr, zr, 5u, Assembler::EQ);", "ccmn\twzr, wzr, #5, EQ"],
["ccmp", "__ ccmp(zr, 1, 4u, Assembler::NE);", "ccmp\txzr, 1, #4, NE"],
@ -1613,8 +1640,12 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);",
["umov", "__ umov(r0, v1, __ H, 2);", "umov\tw0, v1.h[2]"],
["umov", "__ umov(r0, v1, __ B, 3);", "umov\tw0, v1.b[3]"],
["fmov", "__ fmovhid(r0, v1);", "fmov\tx0, v1.d[1]"],
["fmov", "__ fmovs(v9, __ T2S, 0.5f);", "fmov\tv9.2s, 0.5"],
["fmov", "__ fmovd(v14, __ T2D, 0.5f);", "fmov\tv14.2d, 0.5"],
["ld1", "__ ld1(v31, v0, __ T2D, Address(__ post(r1, r0)));", "ld1\t{v31.2d, v0.2d}, [x1], x0"],
["fcvtzs", "__ fcvtzs(v0, __ T4S, v1);", "fcvtzs\tv0.4s, v1.4s"],
["fcvtzs", "__ fcvtzs(v0, __ T2S, v1);", "fcvtzs\tv0.2s, v1.2s"],
["fcvtas", "__ fcvtas(v2, __ T4S, v3);", "fcvtas\tv2.4s, v3.4s"],
["fcvtms", "__ fcvtms(v4, __ T2D, v5);", "fcvtms\tv4.2d, v5.2d"],
# SVE instructions
["cpy", "__ sve_cpy(z0, __ S, p0, v1);", "mov\tz0.s, p0/m, s1"],
["cpy", "__ sve_cpy(z0, __ B, p0, 127, true);", "mov\tz0.b, p0/m, 127"],

File diff suppressed because it is too large Load Diff

@ -0,0 +1,94 @@
/*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8282541
* @summary Auto-vectorize Math.round API
* @requires vm.compiler2.enabled
* @requires os.simpleArch == "aarch64"
* @library /test/lib /
* @run driver compiler.vectorization.TestRoundVectAArch64
*/
package compiler.vectorization;
import compiler.lib.ir_framework.*;
/**
 * IR-framework test checking that loops over {@code Math.round} on
 * float/double arrays are auto-vectorized on aarch64: the compiled IR
 * must contain RoundVF / RoundVD nodes.
 */
public class TestRoundVectAArch64 {
  // Array length and iteration count chosen so the kernels reach C2.
  private static final int ARRLEN = 1024;
  private static final int ITERS  = 11000;

  private static double [] dinp;
  private static long   [] lout;
  private static float  [] finp;
  private static int    [] iout;

  public static void main(String args[]) {
    // Only meaningful on aarch64; elsewhere the run is skipped.
    if (System.getProperty("os.arch").equals("aarch64")) {
      TestFramework.runWithFlags("-XX:-TieredCompilation",
                                 "-XX:CompileThresholdScaling=0.3");
    }
    System.out.println("PASSED");
  }

  // Kernel under test: must be compiled with a RoundVD vector node.
  @Test
  @IR(counts = {"RoundVD" , " > 0 "})
  public void test_round_double(long[] lout, double[] dinp) {
    for (int i = 0; i < lout.length; i+=1) {
      lout[i] = Math.round(dinp[i]);
    }
  }

  // Driver: fills the inputs and invokes the kernel enough times to
  // trigger compilation.
  @Run(test = {"test_round_double"}, mode = RunMode.STANDALONE)
  public void kernel_test_round_double() {
    dinp = new double[ARRLEN];
    lout = new long[ARRLEN];
    for(int i = 0 ; i < ARRLEN; i++) {
      dinp[i] = (double)i*1.4;
    }
    for (int i = 0; i < ITERS; i++) {
      test_round_double(lout , dinp);
    }
  }

  // Kernel under test: must be compiled with a RoundVF vector node.
  @Test
  @IR(counts = {"RoundVF" , " > 0 "})
  public void test_round_float(int[] iout, float[] finp) {
    for (int i = 0; i < finp.length; i+=1) {
      iout[i] = Math.round(finp[i]);
    }
  }

  // Driver for the float kernel.
  @Run(test = {"test_round_float"}, mode = RunMode.STANDALONE)
  public void kernel_test_round() {
    finp = new float[ARRLEN];
    iout = new int[ARRLEN];
    for(int i = 0 ; i < ARRLEN; i++) {
      finp[i] = (float)i*1.4f;
    }
    for (int i = 0; i < ITERS; i++) {
      test_round_float(iout , finp);
    }
  }
}