8285790: AArch64: Merge C2 NEON and SVE matching rules

Co-authored-by: Ningsheng Jian <njian@openjdk.org>
Co-authored-by: Eric Liu <eliu@openjdk.org>
Reviewed-by: adinn, aph, xgong
Author: Hao Sun, 2022-08-17 03:51:46 +00:00; committed by Ningsheng Jian
parent da477b1366
commit 0cc66aeae8
19 changed files with 11741 additions and 18945 deletions


@ -142,8 +142,7 @@ ifeq ($(call check-jvm-feature, compiler2), true)
ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64)
AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \
$d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_neon.ad \
$d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_sve.ad \
$d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_vector.ad \
)))
endif

(Seven file diffs suppressed because they are too large.)


@ -1004,7 +1004,7 @@ void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType
// Repeat on higher bytes and join the results.
// Compress 8 bytes in each iteration.
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
bytemask_compress(rscratch1);
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
}
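For reference, a scalar sketch of what this loop computes (illustrative only, not part of the patch; assumes lane_cnt is a multiple of 8): each group of 8 boolean lanes is compressed into 8 mask bits and OR-ed into the result at bit offset idx * 8, which is what the orr with LSL (idx << 3) does.

#include <cstddef>
#include <cstdint>

// Hypothetical model of the byte-group compression in sve_vmask_tolong.
static uint64_t vmask_tolong_model(const uint8_t* lanes, size_t lane_cnt) {
  uint64_t dst = 0;
  for (size_t idx = 0; idx < lane_cnt / 8; idx++) {
    uint64_t group = 0;
    for (size_t b = 0; b < 8; b++) {
      group |= (uint64_t)(lanes[idx * 8 + b] != 0) << b;  // bytemask_compress
    }
    dst |= group << (idx * 8);                            // orr with LSL idx*8
  }
  return dst;
}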
@ -1108,6 +1108,7 @@ void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicTyp
sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}
// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
FloatRegister zn, FloatRegister zm, int cond) {
assert(pg->is_governing(), "This register has to be a governing predicate register");
@ -1145,6 +1146,61 @@ void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister
subw(dst, rscratch1, dst);
}
// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
FloatRegister src, BasicType src_bt) {
if (src_bt == T_BYTE) {
if (dst_bt == T_SHORT) {
// 4B/8B to 4S/8S
assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
sxtl(dst, T8H, src, T8B);
} else {
// 4B to 4I
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
sxtl(dst, T8H, src, T8B);
sxtl(dst, T4S, dst, T4H);
}
} else if (src_bt == T_SHORT) {
// 4S to 4I
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
sxtl(dst, T4S, src, T4H);
} else if (src_bt == T_INT) {
// 2I to 2L
assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
sxtl(dst, T2D, src, T2S);
} else {
ShouldNotReachHere();
}
}
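A scalar sketch of the byte-to-int path above (illustrative, not from the patch): the two sxtl calls correspond to two sign-extending widening steps.

#include <cstdint>

// Hypothetical model: widen four signed bytes to four ints, as 4B -> 4S -> 4I.
static void extend_4B_to_4I_model(const int8_t src[4], int32_t dst[4]) {
  for (int i = 0; i < 4; i++) {
    int16_t half = (int16_t)src[i];  // first sxtl: B -> H
    dst[i] = (int32_t)half;          // second sxtl: H -> S
  }
}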
// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
if (src_bt == T_SHORT) {
// 4S/8S to 4B/8B
assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
assert(dst_bt == T_BYTE, "unsupported");
xtn(dst, T8B, src, T8H);
} else if (src_bt == T_INT) {
// 4I to 4B/4S
assert(src_vlen_in_bytes == 16, "unsupported");
assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
xtn(dst, T4H, src, T4S);
if (dst_bt == T_BYTE) {
xtn(dst, T8B, dst, T8H);
}
} else if (src_bt == T_LONG) {
// 2L to 2I
assert(src_vlen_in_bytes == 16, "unsupported");
assert(dst_bt == T_INT, "unsupported");
xtn(dst, T2S, src, T2D);
} else {
ShouldNotReachHere();
}
}
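The narrowing direction is the mirror image; a scalar sketch (illustrative): xtn keeps the low half of each lane, so int-to-byte narrowing is a per-lane truncation done in two steps.

#include <cstdint>

// Hypothetical model: narrow four ints to four bytes, as 4I -> 4S -> 4B.
static void narrow_4I_to_4B_model(const int32_t src[4], int8_t dst[4]) {
  for (int i = 0; i < 4; i++) {
    int16_t half = (int16_t)src[i];  // first xtn: S -> H (keep low 16 bits)
    dst[i] = (int8_t)half;           // second xtn: H -> B (keep low 8 bits)
  }
}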
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
FloatRegister src, SIMD_RegVariant src_size) {
assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
@ -1257,6 +1313,275 @@ void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
}
}
// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp) {
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
assert_different_registers(dst, isrc);
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_add_integral {");
switch(bt) {
case T_BYTE:
addv(vtmp, isQ ? T16B : T8B, vsrc);
smov(dst, vtmp, B, 0);
addw(dst, dst, isrc, ext::sxtb);
break;
case T_SHORT:
addv(vtmp, isQ ? T8H : T4H, vsrc);
smov(dst, vtmp, H, 0);
addw(dst, dst, isrc, ext::sxth);
break;
case T_INT:
isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
umov(dst, vtmp, S, 0);
addw(dst, dst, isrc);
break;
case T_LONG:
assert(isQ, "unsupported");
addpd(vtmp, vsrc);
umov(dst, vtmp, D, 0);
add(dst, dst, isrc);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_add_integral");
}
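A scalar sketch of the T_INT case (illustrative, not part of the patch): the lanes are summed by addv/addpv and the scalar input isrc is folded in with one final 32-bit add.

#include <cstdint>

// Hypothetical model of the int add reduction; 32-bit wrap-around is intentional.
static int32_t reduce_add_int_model(int32_t isrc, const int32_t* lanes, int lane_cnt) {
  uint32_t sum = 0;
  for (int i = 0; i < lane_cnt; i++) {
    sum += (uint32_t)lanes[i];  // addv / addpv over the vector lanes
  }
  return (int32_t)(sum + (uint32_t)isrc);  // final addw with isrc
}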
// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp1, FloatRegister vtmp2) {
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_mul_integral {");
switch(bt) {
case T_BYTE:
if (isQ) {
// Multiply the lower half and higher half of vector iteratively.
// vtmp1 = vsrc[8:15]
ins(vtmp1, D, vsrc, 0, 1);
// vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
mulv(vtmp1, T8B, vtmp1, vsrc);
// vtmp2 = vtmp1[4:7]
ins(vtmp2, S, vtmp1, 0, 1);
// vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
mulv(vtmp1, T8B, vtmp2, vtmp1);
} else {
ins(vtmp1, S, vsrc, 0, 1);
mulv(vtmp1, T8B, vtmp1, vsrc);
}
// vtmp2 = vtmp1[2:3]
ins(vtmp2, H, vtmp1, 0, 1);
// vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
mulv(vtmp2, T8B, vtmp2, vtmp1);
// dst = vtmp2[0] * isrc * vtmp2[1]
umov(rscratch1, vtmp2, B, 0);
mulw(dst, rscratch1, isrc);
sxtb(dst, dst);
umov(rscratch1, vtmp2, B, 1);
mulw(dst, rscratch1, dst);
sxtb(dst, dst);
break;
case T_SHORT:
if (isQ) {
ins(vtmp2, D, vsrc, 0, 1);
mulv(vtmp2, T4H, vtmp2, vsrc);
ins(vtmp1, S, vtmp2, 0, 1);
mulv(vtmp1, T4H, vtmp1, vtmp2);
} else {
ins(vtmp1, S, vsrc, 0, 1);
mulv(vtmp1, T4H, vtmp1, vsrc);
}
umov(rscratch1, vtmp1, H, 0);
mulw(dst, rscratch1, isrc);
sxth(dst, dst);
umov(rscratch1, vtmp1, H, 1);
mulw(dst, rscratch1, dst);
sxth(dst, dst);
break;
case T_INT:
if (isQ) {
ins(vtmp1, D, vsrc, 0, 1);
mulv(vtmp1, T2S, vtmp1, vsrc);
} else {
vtmp1 = vsrc;
}
umov(rscratch1, vtmp1, S, 0);
mul(dst, rscratch1, isrc);
umov(rscratch1, vtmp1, S, 1);
mul(dst, rscratch1, dst);
break;
case T_LONG:
umov(rscratch1, vsrc, D, 0);
mul(dst, isrc, rscratch1);
umov(rscratch1, vsrc, D, 1);
mul(dst, dst, rscratch1);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_mul_integral");
}
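A scalar sketch of the T_BYTE case (illustrative): the halving multiplies produce the same product modulo 2^8 as a plain left-to-right product, and the trailing sxtb keeps the running value in Java byte range before it is widened to int.

#include <cstdint>

// Hypothetical model of the byte multiply reduction.
static int32_t reduce_mul_byte_model(int8_t isrc, const int8_t* lanes, int lane_cnt) {
  int8_t prod = 1;
  for (int i = 0; i < lane_cnt; i++) {
    prod = (int8_t)(prod * lanes[i]);     // mulv over iteratively halved vectors
  }
  return (int32_t)(int8_t)(prod * isrc);  // mulw with isrc, then sxtb
}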
// Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
FloatRegister fsrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp) {
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_mul_fp {");
switch(bt) {
case T_FLOAT:
fmuls(dst, fsrc, vsrc);
ins(vtmp, S, vsrc, 0, 1);
fmuls(dst, dst, vtmp);
if (isQ) {
ins(vtmp, S, vsrc, 0, 2);
fmuls(dst, dst, vtmp);
ins(vtmp, S, vsrc, 0, 3);
fmuls(dst, dst, vtmp);
}
break;
case T_DOUBLE:
assert(isQ, "unsupported");
fmuld(dst, fsrc, vsrc);
ins(vtmp, D, vsrc, 0, 1);
fmuld(dst, dst, vtmp);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_mul_fp");
}
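A scalar sketch of the T_FLOAT case (illustrative): the lanes are folded strictly in lane order starting from fsrc, since floating-point multiplication is not associative and the result must not depend on an arbitrary reduction tree.

// Hypothetical model of the float multiply reduction (128-bit vector, 4 lanes).
static float reduce_mul_float_model(float fsrc, const float lanes[4]) {
  float dst = fsrc * lanes[0];  // fmuls(dst, fsrc, vsrc)
  dst *= lanes[1];              // ins + fmuls for lane 1
  dst *= lanes[2];              // lanes 2 and 3 only for the 128-bit case
  dst *= lanes[3];
  return dst;
}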
// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
Register Rn, Register Rm,
enum shift_kind kind, unsigned shift) {
switch(opc) {
case Op_AndReductionV:
is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
break;
case Op_OrReductionV:
is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
break;
case Op_XorReductionV:
is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
}
// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes) {
assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
"unsupported");
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
assert_different_registers(dst, isrc);
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_logical {");
umov(rscratch1, vsrc, isQ ? D : S, 0);
umov(dst, vsrc, isQ ? D : S, 1);
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
switch(bt) {
case T_BYTE:
if (isQ) {
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
}
neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
sxtb(dst, dst);
break;
case T_SHORT:
if (isQ) {
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
}
neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
sxth(dst, dst);
break;
case T_INT:
if (isQ) {
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
}
neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
break;
case T_LONG:
assert(isQ, "unsupported");
neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_logical");
}
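A scalar sketch of an 8-byte T_BYTE And reduction (illustrative): the 64-bit value holding all lanes is folded onto itself by halving shifts until one byte remains, then combined with isrc and sign-extended.

#include <cstdint>

// Hypothetical model of the logical (And) reduction over byte lanes.
static int32_t reduce_and_byte_model(int32_t isrc, uint64_t lanes_bits) {
  uint64_t v = lanes_bits;
  v &= v >> 32;  // fold 8 bytes -> 4
  v &= v >> 16;  // fold 4 bytes -> 2
  v &= v >> 8;   // fold 2 bytes -> 1
  return (int32_t)(int8_t)((uint8_t)v & (uint8_t)isrc);  // and with isrc, sxtb
}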
// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp) {
assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
assert_different_registers(dst, isrc);
bool isQ = vector_length_in_bytes == 16;
bool is_min = opc == Op_MinReductionV;
BLOCK_COMMENT("neon_reduce_minmax_integral {");
if (bt == T_LONG) {
assert(vtmp == fnoreg, "should be");
assert(isQ, "should be");
umov(rscratch1, vsrc, D, 0);
cmp(isrc, rscratch1);
csel(dst, isrc, rscratch1, is_min ? LT : GT);
umov(rscratch1, vsrc, D, 1);
cmp(dst, rscratch1);
csel(dst, dst, rscratch1, is_min ? LT : GT);
} else {
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
if (size == T2S) {
is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
} else {
is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
}
if (bt == T_INT) {
umov(dst, vtmp, S, 0);
} else {
smov(dst, vtmp, elemType_to_regVariant(bt), 0);
}
cmpw(dst, isrc);
cselw(dst, dst, isrc, is_min ? LT : GT);
}
BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
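A scalar sketch of the T_LONG case (illustrative): each lane is compared and selected against the running value with cmp + csel, starting from the scalar input.

#include <cstdint>

// Hypothetical model of the long min reduction over a 2-lane vector.
static int64_t reduce_min_long_model(int64_t isrc, const int64_t lanes[2]) {
  int64_t dst = (isrc < lanes[0]) ? isrc : lanes[0];  // cmp; csel(..., LT)
  return (dst < lanes[1]) ? dst : lanes[1];           // cmp; csel(..., LT)
}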
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
FloatRegister src2, PRegister pg, FloatRegister tmp) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
@ -1267,12 +1592,14 @@ void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt,
switch (opc) {
case Op_AddReductionVI: {
sve_uaddv(tmp, size, pg, src2);
smov(dst, tmp, size, 0);
if (bt == T_BYTE) {
smov(dst, tmp, size, 0);
addw(dst, src1, dst, ext::sxtb);
} else if (bt == T_SHORT) {
smov(dst, tmp, size, 0);
addw(dst, src1, dst, ext::sxth);
} else {
umov(dst, tmp, size, 0);
addw(dst, dst, src1);
}
break;
@ -1285,45 +1612,57 @@ void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt,
}
case Op_AndReductionV: {
sve_andv(tmp, size, pg, src2);
if (bt == T_LONG) {
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
andr(dst, dst, src1);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
andr(dst, dst, src1);
} else {
andw(dst, dst, src1);
}
break;
}
case Op_OrReductionV: {
sve_orv(tmp, size, pg, src2);
if (bt == T_LONG) {
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
orr(dst, dst, src1);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
orr(dst, dst, src1);
} else {
orrw(dst, dst, src1);
}
break;
}
case Op_XorReductionV: {
sve_eorv(tmp, size, pg, src2);
if (bt == T_LONG) {
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
eor(dst, dst, src1);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
eor(dst, dst, src1);
} else {
eorw(dst, dst, src1);
}
break;
}
case Op_MaxReductionV: {
sve_smaxv(tmp, size, pg, src2);
if (bt == T_LONG) {
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
cmp(dst, src1);
csel(dst, dst, src1, Assembler::GT);
} else {
smov(dst, tmp, size, 0);
cmpw(dst, src1);
cselw(dst, dst, src1, Assembler::GT);
}
@ -1331,12 +1670,15 @@ void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt,
}
case Op_MinReductionV: {
sve_sminv(tmp, size, pg, src2);
if (bt == T_LONG) {
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
cmp(dst, src1);
csel(dst, dst, src1, Assembler::LT);
} else {
smov(dst, tmp, size, 0);
cmpw(dst, src1);
cselw(dst, dst, src1, Assembler::LT);
}
@ -1573,23 +1915,32 @@ void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src,
// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
bool is_signed, FloatRegister vtmp) {
assert(UseSVE > 0 && size != Q, "unsupported");
assert(!(is_signed && size == D), "signed extract (D) not supported.");
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
int idx, FloatRegister vtmp) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
is_signed ? smov(dst, src, size, idx) : umov(dst, src, size, idx);
if (bt == T_INT || bt == T_LONG) {
umov(dst, src, size, idx);
} else {
smov(dst, src, size, idx);
}
} else {
sve_orr(vtmp, src, src);
sve_ext(vtmp, vtmp, idx << size);
is_signed ? smov(dst, vtmp, size, 0) : umov(dst, vtmp, size, 0);
if (bt == T_INT || bt == T_LONG) {
umov(dst, vtmp, size, 0);
} else {
smov(dst, vtmp, size, 0);
}
}
}
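A scalar sketch of the fast-path test above (illustrative): a lane can be read directly with a NEON smov/umov only if it lies entirely within the low 128 bits of the SVE register; otherwise the vector is rotated down with sve_ext first.

// Hypothetical helper mirroring the condition regVariant_to_elemBits(size) * idx < 128.
static bool lane_in_neon_range(int elem_bits, int idx) {
  return elem_bits * idx < 128;
}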
// java.lang.Math::round intrinsics
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
assert_different_registers(tmp1, tmp2, tmp3, src, dst);
switch (T) {
case T2S:
@ -1620,8 +1971,10 @@ void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src,
// result in dst
}
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, PRegister ptmp, SIMD_RegVariant T) {
FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(tmp1, tmp2, src, dst);
switch (T) {
@ -1632,7 +1985,7 @@ void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, F
mov(rscratch1, julong_cast(0x1.0p52));
break;
default:
assert(T == S || T == D, "invalid arrangement");
assert(T == S || T == D, "invalid register variant");
}
sve_frinta(dst, T, ptrue, src);
@ -1642,12 +1995,12 @@ void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, F
sve_fneg(tmp1, T, ptrue, src);
sve_dup(tmp2, T, rscratch1);
sve_cmp(HS, ptmp, T, ptrue, tmp2, tmp1);
sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
br(EQ, none);
{
sve_cpy(tmp1, T, ptmp, 0.5);
sve_fadd(tmp1, T, ptmp, src);
sve_frintm(dst, T, ptmp, tmp1);
sve_cpy(tmp1, T, pgtmp, 0.5);
sve_fadd(tmp1, T, pgtmp, src);
sve_frintm(dst, T, pgtmp, tmp1);
// dst = floor(src + 0.5, ties to even)
}
bind(none);


@ -31,6 +31,9 @@
// Return true if the phase output is in the scratch emit size mode.
virtual bool in_scratch_emit_size() override;
void neon_reduce_logical_helper(int opc, bool sf, Register Rd, Register Rn, Register Rm,
enum shift_kind kind = Assembler::LSL, unsigned shift = 0);
public:
void emit_entry_barrier_stub(C2EntryBarrierStub* stub);
static int entry_barrier_stub_size();
@ -84,6 +87,13 @@
void sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp);
// Vector cast
void neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
FloatRegister src, BasicType src_bt);
void neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes);
void sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
FloatRegister src, SIMD_RegVariant src_size);
@ -96,6 +106,27 @@
void sve_vmaskcast_narrow(PRegister dst, PRegister src,
uint dst_element_length_in_bytes, uint src_element_length_in_bytes);
// Vector reduction
void neon_reduce_add_integral(Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes, FloatRegister vtmp);
void neon_reduce_mul_integral(Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp1, FloatRegister vtmp2);
void neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
FloatRegister fsrc, FloatRegister vsrc,
unsigned vector_length_in_bytes, FloatRegister vtmp);
void neon_reduce_logical(int opc, Register dst, BasicType bt, Register isrc,
FloatRegister vsrc, unsigned vector_length_in_bytes);
void neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes, FloatRegister vtmp);
void sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
FloatRegister src2, PRegister pg, FloatRegister tmp);
@ -107,15 +138,15 @@
// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
bool is_signed, FloatRegister vtmp);
void sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
int idx, FloatRegister vtmp);
// java.lang.Math::round intrinsics
void vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, FloatRegister tmp3,
SIMD_Arrangement T);
void vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, PRegister ptmp,
FloatRegister tmp2, PRegister pgtmp,
SIMD_RegVariant T);
// Pack active elements of src, under the control of mask, into the


@ -52,8 +52,8 @@
// the cpu only look at the lower 5/6 bits anyway?
static const bool need_masked_shift_count = false;
// No support for generic vector operands.
static const bool supports_generic_vector_operands = false;
// aarch64 supports generic vector operands: vReg.
static const bool supports_generic_vector_operands = true;
static constexpr bool isSimpleConstant64(jlong value) {
// Will one (StoreL ConL) be cheaper than two (StoreI ConI)?.


@ -137,7 +137,7 @@ class FloatRegisterImpl: public AbstractRegisterImpl {
public:
enum {
number_of_registers = 32,
max_slots_per_register = 8,
max_slots_per_register = 4,
save_slots_per_register = 2,
slots_per_neon_register = 4,
extra_save_slots_per_neon_register = slots_per_neon_register - save_slots_per_register


@ -167,6 +167,12 @@ public:
static void initialize_cpu_information(void);
static bool use_rop_protection() { return _rop_protection; }
// For common 64/128-bit unpredicated vector operations, we may prefer
// emitting NEON instructions rather than the corresponding SVE instructions.
static bool use_neon_for_vector(int vector_length_in_bytes) {
return vector_length_in_bytes <= 16;
}
};
#endif // CPU_AARCH64_VM_VERSION_AARCH64_HPP
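The suppressed .ad diffs presumably key their NEON-versus-SVE selection on this helper; below is a minimal standalone model with a hypothetical usage comment (illustrative only, since the actual matching rules are not visible on this page).

// Standalone model: 8- and 16-byte vectors fit a NEON D/Q register, so
// unpredicated operations on them need no SVE encoding even when SVE is present.
static bool use_neon_for_vector_model(int vector_length_in_bytes) {
  return vector_length_in_bytes <= 16;
}
// Hypothetical rule predicate (names assumed, not confirmed by this diff):
//   predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));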


@ -1,5 +1,5 @@
/*
* Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -2274,6 +2274,9 @@ private:
#if defined(PPC64)
if (strcmp(rep_var,"$VectorRegister") == 0) return "as_VectorRegister";
if (strcmp(rep_var,"$VectorSRegister") == 0) return "as_VectorSRegister";
#endif
#if defined(AARCH64)
if (strcmp(rep_var,"$PRegister") == 0) return "as_PRegister";
#endif
return NULL;
}


@ -135,6 +135,14 @@ public:
return ::as_VectorSRegister(reg(ra_, node, idx));
}
#endif
#if defined(AARCH64)
PRegister as_PRegister(PhaseRegAlloc* ra_, const Node* node) const {
return ::as_PRegister(reg(ra_, node));
}
PRegister as_PRegister(PhaseRegAlloc* ra_, const Node* node, int idx) const {
return ::as_PRegister(reg(ra_, node, idx));
}
#endif
virtual intptr_t constant() const;
virtual relocInfo::relocType constant_reloc() const;


@ -99,7 +99,7 @@ class RegMask {
// requirement is internal to the allocator, and independent of any
// particular platform.
enum { SlotsPerLong = 2,
SlotsPerVecA = RISCV_ONLY(4) NOT_RISCV(8),
SlotsPerVecA = 4,
SlotsPerVecS = 1,
SlotsPerVecD = 2,
SlotsPerVecX = 4,


@ -84,7 +84,7 @@ public class AllBitsSetVectorMatchRuleTest {
@Test
@Warmup(10000)
@IR(counts = { "bic", " >= 1" })
@IR(counts = { "vand_notI", " >= 1" })
public static void testAllBitsSetVector() {
IntVector av = IntVector.fromArray(I_SPECIES, ia, 0);
IntVector bv = IntVector.fromArray(I_SPECIES, ib, 0);
@ -98,7 +98,7 @@ public class AllBitsSetVectorMatchRuleTest {
@Test
@Warmup(10000)
@IR(counts = { "bic", " >= 1" })
@IR(counts = { "and_notL", " >= 1" })
public static void testAllBitsSetMask() {
VectorMask<Long> avm = VectorMask.fromArray(L_SPECIES, ma, 0);
VectorMask<Long> bvm = VectorMask.fromArray(L_SPECIES, mb, 0);


@ -224,7 +224,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_mla", ">= 1" })
@IR(counts = { "vmla_masked", ">= 1" })
public static void testByteMultiplyAddMasked() {
VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += B_SPECIES.length()) {
@ -237,7 +237,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_mls", ">= 1" })
@IR(counts = { "vmls_masked", ">= 1" })
public static void testByteMultiplySubMasked() {
VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += B_SPECIES.length()) {
@ -250,7 +250,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_mla", ">= 1" })
@IR(counts = { "vmla_masked", ">= 1" })
public static void testShortMultiplyAddMasked() {
VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
@ -263,7 +263,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_mls", ">= 1" })
@IR(counts = { "vmls_masked", ">= 1" })
public static void testShortMultiplySubMasked() {
VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
@ -276,7 +276,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_mla", ">= 1" })
@IR(counts = { "vmla_masked", ">= 1" })
public static void testIntMultiplyAddMasked() {
VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
@ -289,7 +289,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_mls", ">= 1" })
@IR(counts = { "vmls_masked", ">= 1" })
public static void testIntMultiplySubMasked() {
VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
@ -302,7 +302,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_mla", ">= 1" })
@IR(counts = { "vmla_masked", ">= 1" })
public static void testLongMultiplyAddMasked() {
VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
@ -315,7 +315,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_mls", ">= 1" })
@IR(counts = { "vmls_masked", ">= 1" })
public static void testLongMultiplySubMasked() {
VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
@ -328,7 +328,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_fmsb", ">= 1" })
@IR(counts = { "vfmsb_masked", ">= 1" })
public static void testFloatMultiplySubMasked() {
VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
@ -341,7 +341,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_fnmad", ">= 1" })
@IR(counts = { "vfnmad_masked", ">= 1" })
public static void testFloatNegatedMultiplyAddMasked() {
VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
@ -354,7 +354,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_fnmsb", ">= 1" })
@IR(counts = { "vfnmsb_masked", ">= 1" })
public static void testFloatNegatedMultiplySubMasked() {
VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
@ -367,7 +367,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_fmsb", ">= 1" })
@IR(counts = { "vfmsb_masked", ">= 1" })
public static void testDoubleMultiplySubMasked() {
VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
@ -380,7 +380,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_fnmad", ">= 1" })
@IR(counts = { "vfnmad_masked", ">= 1" })
public static void testDoubleNegatedMultiplyAddMasked() {
VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
@ -393,7 +393,7 @@ public class VectorFusedMultiplyAddSubTest {
}
@Test
@IR(counts = { "sve_fnmsb", ">= 1" })
@IR(counts = { "vfnmsb_masked", ">= 1" })
public static void testDoubleNegatedMultiplySubMasked() {
VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {


@ -77,7 +77,7 @@ public class VectorMaskedNotTest {
@Test
@Warmup(10000)
@IR(counts = { "sve_not", ">= 1" })
@IR(counts = { "vnotI_masked", ">= 1" })
public static void testIntNotMasked() {
VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
IntVector av = IntVector.fromArray(I_SPECIES, ia, 0);
@ -95,7 +95,7 @@ public class VectorMaskedNotTest {
@Test
@Warmup(10000)
@IR(counts = { "sve_not", ">= 1" })
@IR(counts = { "vnotL_masked", ">= 1" })
public static void testLongNotMasked() {
VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
LongVector av = LongVector.fromArray(L_SPECIES, la, 0);