8285790: AArch64: Merge C2 NEON and SVE matching rules
Co-authored-by: Ningsheng Jian <njian@openjdk.org>
Co-authored-by: Eric Liu <eliu@openjdk.org>
Reviewed-by: adinn, aph, xgong
parent da477b1366
commit 0cc66aeae8
@@ -142,8 +142,7 @@ ifeq ($(call check-jvm-feature, compiler2), true)
  ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64)
    AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \
      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_neon.ad \
      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_sve.ad \
      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_vector.ad \
      )))
  endif
(Several file diffs suppressed because they are too large.)

6361  src/hotspot/cpu/aarch64/aarch64_vector.ad     (new file; diff suppressed because it is too large)
4701  src/hotspot/cpu/aarch64/aarch64_vector_ad.m4  (new file; diff suppressed because it is too large)
@@ -1004,7 +1004,7 @@ void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType
    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
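As an aside, the loop above reads more easily against a scalar sketch (illustrative only, not part of the patch): each 8-lane group is first compressed to 8 bits and then OR-ed into the result at byte offset idx, which is what the orr with LSL #(idx * 8) does. The helper name below is hypothetical.

#include <cstdint>

// Hypothetical scalar model: join already-compressed 8-bit lane groups into
// one 64-bit mask word; group idx lands in bits [8*idx, 8*idx + 7], mirroring
// orr(dst, dst, rscratch1, Assembler::LSL, idx << 3) in the loop above.
uint64_t join_mask_groups(const uint8_t* group_bits, int groups) {
  uint64_t dst = group_bits[0];
  for (int idx = 1; idx < groups; idx++) {
    dst |= static_cast<uint64_t>(group_bits[idx]) << (idx * 8);
  }
  return dst;
}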
@@ -1108,6 +1108,7 @@ void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicTyp
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, int cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
@@ -1145,6 +1146,61 @@ void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
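For reference, the lane-wise semantics that neon_vector_extend and neon_vector_narrow implement with sxtl/xtn can be sketched in plain scalar code (illustrative only, not HotSpot code): widening sign-extends each lane, narrowing keeps each lane's low bits.

#include <cstdint>
#include <vector>

// Scalar model of a 4B -> 4I extend: sign-extend every byte lane to an int
// lane (what the chained sxtl instructions do lane-wise).
std::vector<int32_t> extend_bytes_to_ints(const std::vector<int8_t>& src) {
  std::vector<int32_t> dst(src.size());
  for (size_t i = 0; i < src.size(); i++) {
    dst[i] = static_cast<int32_t>(src[i]);
  }
  return dst;
}

// Scalar model of a 4I -> 4B narrow: keep the low 8 bits of every int lane
// (what the xtn instructions do lane-wise).
std::vector<int8_t> narrow_ints_to_bytes(const std::vector<int32_t>& src) {
  std::vector<int8_t> dst(src.size());
  for (size_t i = 0; i < src.size(); i++) {
    dst[i] = static_cast<int8_t>(src[i]);
  }
  return dst;
}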
@@ -1257,6 +1313,275 @@ void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
  }
}

// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}

// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1]
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}

// Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    case T_FLOAT:
      fmuls(dst, fsrc, vsrc);
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);
      if (isQ) {
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);
      }
      break;
    case T_DOUBLE:
      assert(isQ, "unsupported");
      fmuld(dst, fsrc, vsrc);
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, is_min ? LT : GT);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, is_min ? LT : GT);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
    } else {
      is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
    }
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, is_min ? LT : GT);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
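The T_BYTE case of neon_reduce_mul_integral is the least obvious one; a scalar sketch of the intended result (not part of the patch) is the product of isrc and every lane, reduced modulo 2^8 and sign-extended, which is why the pairwise mulv tree may combine lanes in any order and the trailing sxtb steps re-normalize the scalar.

#include <cstdint>

// Scalar model of the byte multiply-reduction: fold every lane into the
// initial value, wrapping to 8 bits and sign-extending, matching the
// wrapping 8-bit mulv lanes plus the final sxtb steps above.
int32_t reduce_mul_bytes(int32_t isrc, const int8_t* lanes, int lane_cnt) {
  int32_t acc = isrc;
  for (int i = 0; i < lane_cnt; i++) {
    acc = static_cast<int8_t>(acc * lanes[i]);  // wrap mod 2^8, sign-extend
  }
  return acc;
}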
@@ -1267,12 +1592,14 @@ void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt,
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      smov(dst, tmp, size, 0);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
@@ -1285,45 +1612,57 @@ void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt,
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
        andr(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
        orr(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
        eor(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        smov(dst, tmp, size, 0);
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
@@ -1331,12 +1670,15 @@ void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt,
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_LONG) {
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        smov(dst, tmp, size, 0);
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
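For the Max/Min cases the net effect is simply the maximum or minimum over the scalar input and all vector lanes; the only subtlety in the assembly above is choosing umov vs. smov so that sub-long lanes are extended correctly before the compare. A scalar sketch for long lanes (illustrative only, not part of the patch):

#include <algorithm>
#include <cstdint>

// Scalar model of Op_MaxReductionV on long lanes: fold max over the initial
// scalar (src1) and each lane of the vector (src2).
int64_t reduce_max_longs(int64_t src1, const int64_t* lanes, int lane_cnt) {
  int64_t dst = src1;
  for (int i = 0; i < lane_cnt; i++) {
    dst = std::max(dst, lanes[i]);
  }
  return dst;
}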
@@ -1573,23 +1915,32 @@ void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src,

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
                                             bool is_signed, FloatRegister vtmp) {
  assert(UseSVE > 0 && size != Q, "unsupported");
  assert(!(is_signed && size == D), "signed extract (D) not supported.");
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    is_signed ? smov(dst, src, size, idx) : umov(dst, src, size, idx);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    is_signed ? smov(dst, vtmp, size, 0) : umov(dst, vtmp, size, 0);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
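The signature change above replaces the explicit is_signed flag with the element type: byte and short lanes come out sign-extended (smov), while int and long lanes are moved out whole (umov). A scalar sketch of that convention (illustrative only, with a hypothetical helper name):

#include <cstdint>

// Hypothetical scalar model of the extraction convention: sub-int lanes are
// sign-extended, int/long lanes are copied as-is (the zero-filled upper bits
// of an int are irrelevant to the 32-bit result).
int64_t extract_lane(const void* lanes, int idx, int elem_bytes) {
  switch (elem_bytes) {
    case 1:  return static_cast<const int8_t*>(lanes)[idx];   // like smov B
    case 2:  return static_cast<const int16_t*>(lanes)[idx];  // like smov H
    case 4:  return static_cast<const uint32_t*>(lanes)[idx]; // like umov S
    default: return static_cast<const int64_t*>(lanes)[idx];  // like umov D
  }
}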
@@ -1620,8 +1971,10 @@ void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src,
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister ptmp, SIMD_RegVariant T) {
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
@@ -1632,7 +1985,7 @@ void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, F
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid arrangement");
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
@@ -1642,12 +1995,12 @@ void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, F

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, ptmp, T, ptrue, tmp2, tmp1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, ptmp, 0.5);
    sve_fadd(tmp1, T, ptmp, src);
    sve_frintm(dst, T, ptmp, tmp1);
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);
@@ -31,6 +31,9 @@
  // Return true if the phase output is in the scratch emit size mode.
  virtual bool in_scratch_emit_size() override;

  void neon_reduce_logical_helper(int opc, bool sf, Register Rd, Register Rn, Register Rm,
                                  enum shift_kind kind = Assembler::LSL, unsigned shift = 0);

 public:
  void emit_entry_barrier_stub(C2EntryBarrierStub* stub);
  static int entry_barrier_stub_size();
@@ -84,6 +87,13 @@

  void sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp);

  // Vector cast
  void neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                          FloatRegister src, BasicType src_bt);

  void neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                          FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes);

  void sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                         FloatRegister src, SIMD_RegVariant src_size);

@@ -96,6 +106,27 @@
  void sve_vmaskcast_narrow(PRegister dst, PRegister src,
                            uint dst_element_length_in_bytes, uint src_element_lenght_in_bytes);

  // Vector reduction
  void neon_reduce_add_integral(Register dst, BasicType bt,
                                Register isrc, FloatRegister vsrc,
                                unsigned vector_length_in_bytes, FloatRegister vtmp);

  void neon_reduce_mul_integral(Register dst, BasicType bt,
                                Register isrc, FloatRegister vsrc,
                                unsigned vector_length_in_bytes,
                                FloatRegister vtmp1, FloatRegister vtmp2);

  void neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                          FloatRegister fsrc, FloatRegister vsrc,
                          unsigned vector_length_in_bytes, FloatRegister vtmp);

  void neon_reduce_logical(int opc, Register dst, BasicType bt, Register isrc,
                           FloatRegister vsrc, unsigned vector_length_in_bytes);

  void neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                   Register isrc, FloatRegister vsrc,
                                   unsigned vector_length_in_bytes, FloatRegister vtmp);

  void sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                           FloatRegister src2, PRegister pg, FloatRegister tmp);

@@ -107,15 +138,15 @@

  // Extract a scalar element from an sve vector at position 'idx'.
  // The input elements in src are expected to be of integral type.
  void sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
                            bool is_signed, FloatRegister vtmp);
  void sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                            int idx, FloatRegister vtmp);

  // java.lang.Math::round intrinsics
  void vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                         FloatRegister tmp2, FloatRegister tmp3,
                         SIMD_Arrangement T);
  void vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                        FloatRegister tmp2, PRegister ptmp,
                        FloatRegister tmp2, PRegister pgtmp,
                        SIMD_RegVariant T);

  // Pack active elements of src, under the control of mask, into the
@@ -52,8 +52,8 @@
  // the cpu only look at the lower 5/6 bits anyway?
  static const bool need_masked_shift_count = false;

  // No support for generic vector operands.
  static const bool supports_generic_vector_operands = false;
  // aarch64 supports generic vector operands: vReg.
  static const bool supports_generic_vector_operands = true;

  static constexpr bool isSimpleConstant64(jlong value) {
    // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?.
@@ -137,7 +137,7 @@ class FloatRegisterImpl: public AbstractRegisterImpl {
 public:
  enum {
    number_of_registers = 32,
    max_slots_per_register = 8,
    max_slots_per_register = 4,
    save_slots_per_register = 2,
    slots_per_neon_register = 4,
    extra_save_slots_per_neon_register = slots_per_neon_register - save_slots_per_register
@@ -167,6 +167,12 @@ public:
  static void initialize_cpu_information(void);

  static bool use_rop_protection() { return _rop_protection; }

  // For common 64/128-bit unpredicated vector operations, we may prefer
  // emitting NEON instructions rather than the corresponding SVE instructions.
  static bool use_neon_for_vector(int vector_length_in_bytes) {
    return vector_length_in_bytes <= 16;
  }
};

#endif // CPU_AARCH64_VM_VERSION_AARCH64_HPP
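The new helper above encodes the policy that drives the merged rules: NEON for common 64/128-bit vectors, SVE for anything longer. A self-contained sketch of that policy (the driver below is purely illustrative):

#include <cstdio>

// Same predicate as VM_Version::use_neon_for_vector above: vectors of at
// most 16 bytes are emitted with NEON instructions, longer ones with SVE.
static bool use_neon_for_vector(int vector_length_in_bytes) {
  return vector_length_in_bytes <= 16;
}

int main() {
  const int lengths[] = {8, 16, 32, 64};
  for (int len : lengths) {
    std::printf("%2d-byte vector -> %s\n", len,
                use_neon_for_vector(len) ? "NEON" : "SVE");
  }
  return 0;
}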
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1998, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -2274,6 +2274,9 @@ private:
#if defined(PPC64)
    if (strcmp(rep_var,"$VectorRegister") == 0) return "as_VectorRegister";
    if (strcmp(rep_var,"$VectorSRegister") == 0) return "as_VectorSRegister";
#endif
#if defined(AARCH64)
    if (strcmp(rep_var,"$PRegister") == 0) return "as_PRegister";
#endif
    return NULL;
  }
@@ -135,6 +135,14 @@ public:
    return ::as_VectorSRegister(reg(ra_, node, idx));
  }
#endif
#if defined(AARCH64)
  PRegister as_PRegister(PhaseRegAlloc* ra_, const Node* node) const {
    return ::as_PRegister(reg(ra_, node));
  }
  PRegister as_PRegister(PhaseRegAlloc* ra_, const Node* node, int idx) const {
    return ::as_PRegister(reg(ra_, node, idx));
  }
#endif

  virtual intptr_t constant() const;
  virtual relocInfo::relocType constant_reloc() const;
@@ -99,7 +99,7 @@ class RegMask {
  // requirement is internal to the allocator, and independent of any
  // particular platform.
  enum { SlotsPerLong = 2,
         SlotsPerVecA = RISCV_ONLY(4) NOT_RISCV(8),
         SlotsPerVecA = 4,
         SlotsPerVecS = 1,
         SlotsPerVecD = 2,
         SlotsPerVecX = 4,
@@ -84,7 +84,7 @@ public class AllBitsSetVectorMatchRuleTest {

    @Test
    @Warmup(10000)
    @IR(counts = { "bic", " >= 1" })
    @IR(counts = { "vand_notI", " >= 1" })
    public static void testAllBitsSetVector() {
        IntVector av = IntVector.fromArray(I_SPECIES, ia, 0);
        IntVector bv = IntVector.fromArray(I_SPECIES, ib, 0);
@@ -98,7 +98,7 @@ public class AllBitsSetVectorMatchRuleTest {

    @Test
    @Warmup(10000)
    @IR(counts = { "bic", " >= 1" })
    @IR(counts = { "and_notL", " >= 1" })
    public static void testAllBitsSetMask() {
        VectorMask<Long> avm = VectorMask.fromArray(L_SPECIES, ma, 0);
        VectorMask<Long> bvm = VectorMask.fromArray(L_SPECIES, mb, 0);
@@ -224,7 +224,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mla", ">= 1" })
    @IR(counts = { "vmla_masked", ">= 1" })
    public static void testByteMultiplyAddMasked() {
        VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += B_SPECIES.length()) {
@@ -237,7 +237,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mls", ">= 1" })
    @IR(counts = { "vmls_masked", ">= 1" })
    public static void testByteMultiplySubMasked() {
        VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += B_SPECIES.length()) {
@@ -250,7 +250,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mla", ">= 1" })
    @IR(counts = { "vmla_masked", ">= 1" })
    public static void testShortMultiplyAddMasked() {
        VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
@@ -263,7 +263,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mls", ">= 1" })
    @IR(counts = { "vmls_masked", ">= 1" })
    public static void testShortMultiplySubMasked() {
        VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
@@ -276,7 +276,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mla", ">= 1" })
    @IR(counts = { "vmla_masked", ">= 1" })
    public static void testIntMultiplyAddMasked() {
        VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
@@ -289,7 +289,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mls", ">= 1" })
    @IR(counts = { "vmls_masked", ">= 1" })
    public static void testIntMultiplySubMasked() {
        VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
@@ -302,7 +302,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mla", ">= 1" })
    @IR(counts = { "vmla_masked", ">= 1" })
    public static void testLongMultiplyAddMasked() {
        VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
@@ -315,7 +315,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_mls", ">= 1" })
    @IR(counts = { "vmls_masked", ">= 1" })
    public static void testLongMultiplySubMasked() {
        VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
@@ -328,7 +328,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fmsb", ">= 1" })
    @IR(counts = { "vfmsb_masked", ">= 1" })
    public static void testFloatMultiplySubMasked() {
        VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
@@ -341,7 +341,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fnmad", ">= 1" })
    @IR(counts = { "vfnmad_masked", ">= 1" })
    public static void testFloatNegatedMultiplyAddMasked() {
        VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
@@ -354,7 +354,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fnmsb", ">= 1" })
    @IR(counts = { "vfnmsb_masked", ">= 1" })
    public static void testFloatNegatedMultiplySubMasked() {
        VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
@@ -367,7 +367,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fmsb", ">= 1" })
    @IR(counts = { "vfmsb_masked", ">= 1" })
    public static void testDoubleMultiplySubMasked() {
        VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
@@ -380,7 +380,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fnmad", ">= 1" })
    @IR(counts = { "vfnmad_masked", ">= 1" })
    public static void testDoubleNegatedMultiplyAddMasked() {
        VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
@@ -393,7 +393,7 @@ public class VectorFusedMultiplyAddSubTest {
    }

    @Test
    @IR(counts = { "sve_fnmsb", ">= 1" })
    @IR(counts = { "vfnmsb_masked", ">= 1" })
    public static void testDoubleNegatedMultiplySubMasked() {
        VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
        for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
@@ -77,7 +77,7 @@ public class VectorMaskedNotTest {

    @Test
    @Warmup(10000)
    @IR(counts = { "sve_not", ">= 1" })
    @IR(counts = { "vnotI_masked", ">= 1" })
    public static void testIntNotMasked() {
        VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
        IntVector av = IntVector.fromArray(I_SPECIES, ia, 0);
@@ -95,7 +95,7 @@ public class VectorMaskedNotTest {

    @Test
    @Warmup(10000)
    @IR(counts = { "sve_not", ">= 1" })
    @IR(counts = { "vnotL_masked", ">= 1" })
    public static void testLongNotMasked() {
        VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
        LongVector av = LongVector.fromArray(L_SPECIES, la, 0);