8267356: AArch64: Vector API SVE codegen support

Co-authored-by: Xiaohong Gong <xgong@openjdk.org>
Co-authored-by: Wang Huang <whuang@openjdk.org>
Co-authored-by: Ningsheng Jian <njian@openjdk.org>
Co-authored-by: Xuejin He <xhe@openjdk.org>
Co-authored-by: Ai Jiaming <aijiaming1@huawei.com>
Co-authored-by: Eric Liu <eliu@openjdk.org>
Reviewed-by: aph, ngasson
Ningsheng Jian 2021-09-23 02:58:59 +00:00
parent 603138895f
commit 9d3379b975
13 changed files with 5060 additions and 679 deletions


@@ -1902,7 +1902,7 @@ void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
__ bind(L_skip_barrier);
}
if (C->max_vector_size() >= 16) {
if (C->max_vector_size() > 0) {
__ reinitialize_ptrue();
}
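Note: the guard is relaxed from >= 16 to > 0 because this patch enables SVE code generation for vector sizes below 16 bytes as well, so the governing predicate must be restored whenever any vectors are in use. For reference, a minimal sketch of the helper being called (assuming it keeps its pre-existing shape in macroAssembler_aarch64.cpp):

  // Sketch: restore the all-true governing predicate that C2-generated SVE
  // code assumes is live across calls.
  void MacroAssembler::reinitialize_ptrue() {
    if (UseSVE > 0) {
      sve_ptrue(ptrue, B);  // set every lane of ptrue (p7) to true
    }
  }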
@@ -2388,7 +2388,7 @@ const bool Matcher::match_rule_supported(int opcode) {
// Identify extra cases that we might want to provide match rules for vector nodes and
// other intrinsics guarded with vector length (vlen) and element type (bt).
const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
if (!match_rule_supported(opcode) || !vector_size_supported(bt, vlen)) {
if (!match_rule_supported(opcode)) {
return false;
}
int bit_size = vlen * type2aelembytes(bt) * 8;
@@ -2396,7 +2396,7 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false;
}
if (UseSVE > 0) {
return op_sve_supported(opcode);
return op_sve_supported(opcode, vlen, bt);
} else { // NEON
// Special cases
switch (opcode) {
@@ -2438,11 +2438,14 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false;
}
break;
case Op_LoadVectorGather:
case Op_StoreVectorScatter:
return false;
default:
break;
}
}
return true; // Per default match rules are supported.
return vector_size_supported(bt, vlen);
}
const RegMask* Matcher::predicate_reg_mask(void) {
@@ -2488,24 +2491,20 @@ const int Matcher::vector_width_in_bytes(BasicType bt) {
const int Matcher::max_vector_size(const BasicType bt) {
return vector_width_in_bytes(bt)/type2aelembytes(bt);
}
const int Matcher::min_vector_size(const BasicType bt) {
int max_size = max_vector_size(bt);
if ((UseSVE > 0) && (MaxVectorSize >= 16)) {
// Currently vector length less than SVE vector register size is not supported.
return max_size;
} else { // NEON
// Limit the vector size to 8 bytes
int size = 8 / type2aelembytes(bt);
if (bt == T_BYTE) {
// To support vector api shuffle/rearrange.
size = 4;
} else if (bt == T_BOOLEAN) {
// To support vector api load/store mask.
size = 2;
}
if (size < 2) size = 2;
return MIN2(size,max_size);
// Limit the min vector size to 8 bytes.
int size = 8 / type2aelembytes(bt);
if (bt == T_BYTE) {
// To support vector api shuffle/rearrange.
size = 4;
} else if (bt == T_BOOLEAN) {
// To support vector api load/store mask.
size = 2;
}
if (size < 2) size = 2;
return MIN2(size, max_size);
}
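Note: a worked reading of the new bounds (the asserts are illustrative only and assume max_vector_size() is large enough not to clamp):

  // Default floor is 8 bytes, i.e. 8 / type2aelembytes(bt) elements.
  assert(Matcher::min_vector_size(T_INT)     == 2, "8 bytes / 4-byte lanes");
  assert(Matcher::min_vector_size(T_LONG)    == 2, "one lane, raised to the 2-lane minimum");
  assert(Matcher::min_vector_size(T_BYTE)    == 4, "lowered so shuffle/rearrange can match");
  assert(Matcher::min_vector_size(T_BOOLEAN) == 2, "lowered so load/store mask can match");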
// Actual max scalable vector register length.
@@ -2515,7 +2514,7 @@ const int Matcher::scalable_vector_reg_size(const BasicType bt) {
// Vector ideal reg.
const uint Matcher::vector_ideal_reg(int len) {
if (UseSVE > 0 && 16 <= len && len <= 256) {
if (UseSVE > 0 && 2 <= len && len <= 256) {
return Op_VecA;
}
switch(len) {
@@ -3720,7 +3719,7 @@ encode %{
}
// Only non uncommon_trap calls need to reinitialize ptrue.
if (Compile::current()->max_vector_size() >= 16 && uncommon_trap_request() == 0) {
if (Compile::current()->max_vector_size() > 0 && uncommon_trap_request() == 0) {
__ reinitialize_ptrue();
}
%}
@@ -3732,7 +3731,7 @@ encode %{
if (call == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
return;
} else if (Compile::current()->max_vector_size() >= 16) {
} else if (Compile::current()->max_vector_size() > 0) {
__ reinitialize_ptrue();
}
%}
@@ -3770,7 +3769,7 @@ encode %{
__ bind(retaddr);
__ add(sp, sp, 2 * wordSize);
}
if (Compile::current()->max_vector_size() >= 16) {
if (Compile::current()->max_vector_size() > 0) {
__ reinitialize_ptrue();
}
%}
@@ -3783,7 +3782,7 @@ encode %{
enc_class aarch64_enc_ret() %{
C2_MacroAssembler _masm(&cbuf);
#ifdef ASSERT
if (Compile::current()->max_vector_size() >= 16) {
if (Compile::current()->max_vector_size() > 0) {
__ verify_ptrue();
}
#endif
@@ -4156,6 +4155,16 @@ operand immIExt()
interface(CONST_INTER);
%}
operand immI_gt_1()
%{
predicate(n->get_int() > 1);
match(ConI);
op_cost(0);
format %{ %}
interface(CONST_INTER);
%}
operand immI_le_4()
%{
predicate(n->get_int() <= 4);


@@ -33,7 +33,7 @@
// Load Vector (16 bits)
instruct loadV2(vecD dst, vmem2 mem)
%{
predicate(n->as_LoadVector()->memory_size() == 2);
predicate(UseSVE == 0 && n->as_LoadVector()->memory_size() == 2);
match(Set dst (LoadVector mem));
ins_cost(4 * INSN_COST);
format %{ "ldrh $dst,$mem\t# vector (16 bits)" %}
@@ -44,7 +44,7 @@ instruct loadV2(vecD dst, vmem2 mem)
// Load Vector (32 bits)
instruct loadV4(vecD dst, vmem4 mem)
%{
predicate(n->as_LoadVector()->memory_size() == 4);
predicate(UseSVE == 0 && n->as_LoadVector()->memory_size() == 4);
match(Set dst (LoadVector mem));
ins_cost(4 * INSN_COST);
format %{ "ldrs $dst,$mem\t# vector (32 bits)" %}
@@ -55,7 +55,7 @@ instruct loadV4(vecD dst, vmem4 mem)
// Load Vector (64 bits)
instruct loadV8(vecD dst, vmem8 mem)
%{
predicate(n->as_LoadVector()->memory_size() == 8);
predicate(UseSVE == 0 && n->as_LoadVector()->memory_size() == 8);
match(Set dst (LoadVector mem));
ins_cost(4 * INSN_COST);
format %{ "ldrd $dst,$mem\t# vector (64 bits)" %}
@@ -2473,9 +2473,10 @@ instruct vmaskcastX(vecX dst)
instruct loadcon8B(vecD dst, immI0 src)
%{
predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4 ||
n->as_Vector()->length() == 8) &&
n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
predicate(UseSVE == 0 &&
(n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4 ||
n->as_Vector()->length() == 8) &&
n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
match(Set dst (VectorLoadConst src));
ins_cost(INSN_COST);
format %{ "ldr $dst, CONSTANT_MEMORY\t# load iota indices" %}
@@ -2488,7 +2489,7 @@ instruct loadcon8B(vecD dst, immI0 src)
instruct loadcon16B(vecX dst, immI0 src)
%{
predicate(n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
predicate(UseSVE == 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
match(Set dst (VectorLoadConst src));
ins_cost(INSN_COST);
format %{ "ldr $dst, CONSTANT_MEMORY\t# load iota indices" %}
@@ -2945,8 +2946,8 @@ instruct vabd2D(vecX dst, vecX src1, vecX src2)
instruct replicate8B(vecD dst, iRegIorL2I src)
%{
predicate(n->as_Vector()->length() == 4 ||
n->as_Vector()->length() == 8);
predicate(UseSVE == 0 && (n->as_Vector()->length() == 8 ||
n->as_Vector()->length() == 4));
match(Set dst (ReplicateB src));
ins_cost(INSN_COST);
format %{ "dup $dst, $src\t# vector (8B)" %}
@@ -2970,8 +2971,8 @@ instruct replicate16B(vecX dst, iRegIorL2I src)
instruct replicate8B_imm(vecD dst, immI con)
%{
predicate(n->as_Vector()->length() == 4 ||
n->as_Vector()->length() == 8);
predicate(UseSVE == 0 && (n->as_Vector()->length() == 8 ||
n->as_Vector()->length() == 4));
match(Set dst (ReplicateB con));
ins_cost(INSN_COST);
format %{ "movi $dst, $con\t# vector (8B)" %}
@@ -2995,8 +2996,8 @@ instruct replicate16B_imm(vecX dst, immI con)
instruct replicate4S(vecD dst, iRegIorL2I src)
%{
predicate(n->as_Vector()->length() == 2 ||
n->as_Vector()->length() == 4);
predicate(UseSVE == 0 && (n->as_Vector()->length() == 4 ||
n->as_Vector()->length() == 2));
match(Set dst (ReplicateS src));
ins_cost(INSN_COST);
format %{ "dup $dst, $src\t# vector (4S)" %}
@@ -3020,8 +3021,8 @@ instruct replicate8S(vecX dst, iRegIorL2I src)
instruct replicate4S_imm(vecD dst, immI con)
%{
predicate(n->as_Vector()->length() == 2 ||
n->as_Vector()->length() == 4);
predicate(UseSVE == 0 && (n->as_Vector()->length() == 4 ||
n->as_Vector()->length() == 2));
match(Set dst (ReplicateS con));
ins_cost(INSN_COST);
format %{ "movi $dst, $con\t# vector (4H)" %}
@@ -3045,7 +3046,7 @@ instruct replicate8S_imm(vecX dst, immI con)
instruct replicate2I(vecD dst, iRegIorL2I src)
%{
predicate(n->as_Vector()->length() == 2);
predicate(UseSVE == 0 && n->as_Vector()->length() == 2);
match(Set dst (ReplicateI src));
ins_cost(INSN_COST);
format %{ "dup $dst, $src\t# vector (2I)" %}
@@ -3069,7 +3070,7 @@ instruct replicate4I(vecX dst, iRegIorL2I src)
instruct replicate2I_imm(vecD dst, immI con)
%{
predicate(n->as_Vector()->length() == 2);
predicate(UseSVE == 0 && n->as_Vector()->length() == 2);
match(Set dst (ReplicateI con));
ins_cost(INSN_COST);
format %{ "movi $dst, $con\t# vector (2I)" %}
@@ -3119,7 +3120,7 @@ instruct replicate2L_zero(vecX dst, immI0 zero)
instruct replicate2F(vecD dst, vRegF src)
%{
predicate(n->as_Vector()->length() == 2);
predicate(UseSVE == 0 && n->as_Vector()->length() == 2);
match(Set dst (ReplicateF src));
ins_cost(INSN_COST);
format %{ "dup $dst, $src\t# vector (2F)" %}
@@ -4249,8 +4250,8 @@ instruct vxor16B(vecX dst, vecX src1, vecX src2)
// ------------------------------ Shift ---------------------------------------
instruct vshiftcnt8B(vecD dst, iRegIorL2I cnt) %{
predicate(n->as_Vector()->length_in_bytes() == 4 ||
n->as_Vector()->length_in_bytes() == 8);
predicate(UseSVE == 0 && (n->as_Vector()->length_in_bytes() == 4 ||
n->as_Vector()->length_in_bytes() == 8));
match(Set dst (LShiftCntV cnt));
match(Set dst (RShiftCntV cnt));
format %{ "dup $dst, $cnt\t# shift count vector (8B)" %}
@@ -4261,7 +4262,7 @@ instruct vshiftcnt8B(vecD dst, iRegIorL2I cnt) %{
%}
instruct vshiftcnt16B(vecX dst, iRegIorL2I cnt) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
predicate(UseSVE == 0 && (n->as_Vector()->length_in_bytes() == 16));
match(Set dst (LShiftCntV cnt));
match(Set dst (RShiftCntV cnt));
format %{ "dup $dst, $cnt\t# shift count vector (16B)" %}


@@ -69,9 +69,9 @@ instruct $3V$4`'(vec$5 $7, vmem$4 mem)
ins_pipe(v$3`_reg_mem'ifelse(eval($4 * 8), 128, 128, 64));
%}')dnl
dnl $1 $2 $3 $4 $5 $6 $7 $8
VLoadStore(ldrh, H, load, 2, D, 16, dst, )
VLoadStore(ldrs, S, load, 4, D, 32, dst, )
VLoadStore(ldrd, D, load, 8, D, 64, dst, )
VLoadStore(ldrh, H, load, 2, D, 16, dst, UseSVE == 0 && )
VLoadStore(ldrs, S, load, 4, D, 32, dst, UseSVE == 0 && )
VLoadStore(ldrd, D, load, 8, D, 64, dst, UseSVE == 0 && )
VLoadStore(ldrq, Q, load, 16, X, 128, dst, UseSVE == 0 && )
VLoadStore(strh, H, store, 2, D, 16, src, )
VLoadStore(strs, S, store, 4, D, 32, src, )
@@ -1196,10 +1196,11 @@ dnl
//-------------------------------- LOAD_IOTA_INDICES----------------------------------
dnl
define(`PREDICATE', `ifelse($1, 8,
`predicate((n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4 ||
n->as_Vector()->length() == 8) &&
n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);',
`predicate(n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);')')dnl
`predicate(UseSVE == 0 &&
(n->as_Vector()->length() == 2 || n->as_Vector()->length() == 4 ||
n->as_Vector()->length() == 8) &&
n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);',
`predicate(UseSVE == 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);')')dnl
dnl
define(`VECTOR_LOAD_CON', `
instruct loadcon$1B`'(vec$2 dst, immI0 src)
@@ -1466,9 +1467,10 @@ dnl
define(`VREPLICATE', `
instruct replicate$3$4$5`'(vec$6 dst, $7 ifelse($7, immI0, zero, $7, immI, con, src))
%{
predicate(ifelse($8, UseSVE == 0 && , $8,
$8, , , $8`
')n->as_Vector()->length() == $3);
predicate(UseSVE == 0 && ifelse($8, `',
n->as_Vector()->length() == $3,
(n->as_Vector()->length() == $3 ||`
'n->as_Vector()->length() == $8)));
match(Set dst (Replicate`'ifelse($7, immI0, I, $4) ifelse($7, immI0, zero, $7, immI, con, $7, zero, I, src)));
ins_cost(INSN_COST);
format %{ "$1 $dst, $ifelse($7, immI0, zero, $7, immI, con, src)`\t# vector ('ifelse($4$7, SimmI, $3H, $2, eor, 4I, $3$4)`)"' %}
@@ -1494,24 +1496,24 @@ instruct replicate$3$4$5`'(vec$6 dst, $7 ifelse($7, immI0, zero, $7, immI, con,
$7, iRegL, vdup_reg_reg,
$4, F, vdup_reg_freg, vdup_reg_dreg)`'ifelse($6, X, 128, 64));
%}')dnl
dnl $1 $2 $3 $4 $5 $6 $7 $8 $9
VREPLICATE(dup, dup, 8, B, , D, iRegIorL2I, n->as_Vector()->length() == 4 ||, B)
VREPLICATE(dup, dup, 16, B, , X, iRegIorL2I, UseSVE == 0 && , B)
VREPLICATE(movi, mov, 8, B, _imm, D, immI, n->as_Vector()->length() == 4 ||, B)
VREPLICATE(movi, mov, 16, B, _imm, X, immI, UseSVE == 0 && , B)
VREPLICATE(dup, dup, 4, S, , D, iRegIorL2I, n->as_Vector()->length() == 2 ||, H)
VREPLICATE(dup, dup, 8, S, , X, iRegIorL2I, UseSVE == 0 && , H)
VREPLICATE(movi, mov, 4, S, _imm, D, immI, n->as_Vector()->length() == 2 ||, H)
VREPLICATE(movi, mov, 8, S, _imm, X, immI, UseSVE == 0 && , H)
VREPLICATE(dup, dup, 2, I, , D, iRegIorL2I, , S)
VREPLICATE(dup, dup, 4, I, , X, iRegIorL2I, UseSVE == 0 && , S)
VREPLICATE(movi, mov, 2, I, _imm, D, immI, , S)
VREPLICATE(movi, mov, 4, I, _imm, X, immI, UseSVE == 0 && , S)
VREPLICATE(dup, dup, 2, L, , X, iRegL, UseSVE == 0 && , D)
VREPLICATE(movi, eor, 2, L, _zero, X, immI0, UseSVE == 0 && , D)
VREPLICATE(dup, dup, 2, F, , D, vRegF, , S)
VREPLICATE(dup, dup, 4, F, , X, vRegF, UseSVE == 0 && , S)
VREPLICATE(dup, dup, 2, D, , X, vRegD, UseSVE == 0 && , D)
dnl $1 $2 $3 $4 $5 $6 $7 $8 $9
VREPLICATE(dup, dup, 8, B, , D, iRegIorL2I, 4, B)
VREPLICATE(dup, dup, 16, B, , X, iRegIorL2I, , B)
VREPLICATE(movi, mov, 8, B, _imm, D, immI, 4, B)
VREPLICATE(movi, mov, 16, B, _imm, X, immI, , B)
VREPLICATE(dup, dup, 4, S, , D, iRegIorL2I, 2, H)
VREPLICATE(dup, dup, 8, S, , X, iRegIorL2I, , H)
VREPLICATE(movi, mov, 4, S, _imm, D, immI, 2, H)
VREPLICATE(movi, mov, 8, S, _imm, X, immI, , H)
VREPLICATE(dup, dup, 2, I, , D, iRegIorL2I, , S)
VREPLICATE(dup, dup, 4, I, , X, iRegIorL2I, , S)
VREPLICATE(movi, mov, 2, I, _imm, D, immI, , S)
VREPLICATE(movi, mov, 4, I, _imm, X, immI, , S)
VREPLICATE(dup, dup, 2, L, , X, iRegL, , D)
VREPLICATE(movi, eor, 2, L, _zero, X, immI0, , D)
VREPLICATE(dup, dup, 2, F, , D, vRegF, , S)
VREPLICATE(dup, dup, 4, F, , X, vRegF, , S)
VREPLICATE(dup, dup, 2, D, , X, vRegD, , D)
dnl
// ====================REDUCTION ARITHMETIC====================================
@@ -1884,8 +1886,8 @@ VLOGICAL(xor, eor, xor, Xor, 16, B, X)
dnl
define(`VSHIFTCNT', `
instruct vshiftcnt$3$4`'(vec$5 dst, iRegIorL2I cnt) %{
predicate(ifelse($3, 8, n->as_Vector()->length_in_bytes() == 4 ||`
')n->as_Vector()->length_in_bytes() == $3);
predicate(UseSVE == 0 && (ifelse($3, 8, n->as_Vector()->length_in_bytes() == 4 ||`
')n->as_Vector()->length_in_bytes() == $3));
match(Set dst (LShiftCntV cnt));
match(Set dst (RShiftCntV cnt));
format %{ "$1 $dst, $cnt\t# shift count vector ($3$4)" %}

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -54,10 +54,32 @@ Assembler::SIMD_Arrangement Assembler::_esize2arrangement_table[9][2] = {
/* 8 */ {T1D, T2D}
};
Assembler::SIMD_RegVariant Assembler::_esize2regvariant[9] = {
INVALID,
B,
H,
INVALID,
S,
INVALID,
INVALID,
INVALID,
D,
};
Assembler::SIMD_Arrangement Assembler::esize2arrangement(int esize, bool isQ) {
guarantee(esize == 1 || esize == 2 || esize == 4 || esize == 8, "unsupported element size");
return _esize2arrangement_table[esize][isQ];
Assembler::SIMD_Arrangement Assembler::esize2arrangement(unsigned esize, bool isQ) {
guarantee(esize < ARRAY_SIZE(_esize2arrangement_table) &&
_esize2arrangement_table[esize][isQ] != INVALID_ARRANGEMENT, "unsupported element size");
return _esize2arrangement_table[esize][isQ];
}
Assembler::SIMD_RegVariant Assembler::elemBytes_to_regVariant(unsigned esize) {
guarantee(esize < ARRAY_SIZE(_esize2regvariant) && _esize2regvariant[esize] != INVALID,
"unsupported element size");
return _esize2regvariant[esize];
}
Assembler::SIMD_RegVariant Assembler::elemType_to_regVariant(BasicType bt) {
return elemBytes_to_regVariant(type2aelembytes(bt));
}
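Note: a usage sketch of the new lookups; the mappings read straight off the _esize2regvariant initializer (1 -> B, 2 -> H, 4 -> S, 8 -> D, anything else trips the guarantee):

  Assembler::SIMD_RegVariant v;
  v = Assembler::elemBytes_to_regVariant(4);       // S
  v = Assembler::elemType_to_regVariant(T_SHORT);  // H (type2aelembytes == 2)
  v = Assembler::elemType_to_regVariant(T_DOUBLE); // D (type2aelembytes == 8)
  // elemBytes_to_regVariant(3) would fail the "unsupported element size" guarantee.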
void Assembler::emit_data64(jlong data,


@@ -1502,17 +1502,20 @@ public:
T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D, T1Q, INVALID_ARRANGEMENT
};
enum SIMD_RegVariant {
B, H, S, D, Q, INVALID
};
private:
static SIMD_Arrangement _esize2arrangement_table[9][2];
static SIMD_RegVariant _esize2regvariant[9];
public:
static SIMD_Arrangement esize2arrangement(int esize, bool isQ);
enum SIMD_RegVariant {
B, H, S, D, Q, INVALID
};
static SIMD_Arrangement esize2arrangement(unsigned esize, bool isQ);
static SIMD_RegVariant elemType_to_regVariant(BasicType bt);
static SIMD_RegVariant elemBytes_to_regVariant(unsigned esize);
enum shift_kind { LSL, LSR, ASR, ROR };
@@ -2927,7 +2930,7 @@ public:
f(0, 10), rf(Vn, 5), rf(Vd, 0);
}
// SVE arithmetics - unpredicated
// SVE arithmetic - unpredicated
#define INSN(NAME, opcode) \
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \
starti; \
@@ -2964,7 +2967,7 @@ private:
public:
// SVE integer arithmetics - predicate
// SVE integer arithmetic - predicate
#define INSN(NAME, op1, op2) \
void NAME(FloatRegister Zdn_or_Zd_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Znm_or_Vn) { \
assert(T != Q, "invalid register variant"); \
@@ -2992,7 +2995,7 @@ public:
INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar
#undef INSN
// SVE floating-point arithmetics - predicate
// SVE floating-point arithmetic - predicate
#define INSN(NAME, op1, op2) \
void NAME(FloatRegister Zd_or_Zdn_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn_or_Zm) { \
assert(T == S || T == D, "invalid register variant"); \
@@ -3121,7 +3124,7 @@ private:
public:
// SVE load/store - predicated
// SVE contiguous load/store
#define INSN(NAME, op1, type, imm_op2, scalar_op2) \
void NAME(FloatRegister Zt, SIMD_RegVariant T, PRegister Pg, const Address &a) { \
assert(T != Q, "invalid register variant"); \
@@ -3138,6 +3141,23 @@ public:
INSN(sve_st1d, 0b1110010, 0b11, 0b111, 0b010);
#undef INSN
// Gather/scatter load/store (SVE) - scalar plus vector
#define INSN(NAME, op1, type, op2, op3) \
void NAME(FloatRegister Zt, PRegister Pg, Register Xn, FloatRegister Zm) { \
starti; \
f(op1, 31, 25), f(type, 24, 23), f(op2, 22, 21), rf(Zm, 16); \
f(op3, 15, 13), pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); \
}
// SVE 32-bit gather load words (scalar plus 32-bit scaled offsets)
INSN(sve_ld1w_gather, 0b1000010, 0b10, 0b01, 0b010);
// SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets)
INSN(sve_ld1d_gather, 0b1100010, 0b11, 0b01, 0b010);
// SVE 32-bit scatter store (scalar plus 32-bit scaled offsets)
INSN(sve_st1w_scatter, 0b1110010, 0b10, 0b11, 0b100);
// SVE 64-bit scatter store (scalar plus unpacked 32-bit scaled offsets)
INSN(sve_st1d_scatter, 0b1110010, 0b11, 0b01, 0b100);
#undef INSN
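Note: a usage sketch for the new gather/scatter forms, mirroring the asmtest expectations added later in this commit; Xn is the scalar base and Zm supplies the per-lane offsets:

  __ sve_ld1w_gather(z15, p0, r5, z16);  // ld1w {z15.s}, p0/z, [x5, z16.s, uxtw #2]
  __ sve_ld1d_gather(z15, p0, r5, z16);  // ld1d {z15.d}, p0/z, [x5, z16.d, uxtw #3]
  __ sve_st1w_scatter(z15, p0, r5, z16); // st1w {z15.s}, p0, [x5, z16.s, uxtw #2]
  __ sve_st1d_scatter(z15, p0, r5, z16); // st1d {z15.d}, p0, [x5, z16.d, uxtw #3]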
// SVE load/store - unpredicated
#define INSN(NAME, op1) \
void NAME(FloatRegister Zt, const Address &a) { \
@@ -3151,6 +3171,7 @@ public:
INSN(sve_str, 0b111); // STR (vector)
#undef INSN
// SVE stack frame adjustment
#define INSN(NAME, op) \
void NAME(Register Xd, Register Xn, int imm6) { \
starti; \
@@ -3158,8 +3179,8 @@ public:
srf(Xn, 16), f(0b01010, 15, 11), sf(imm6, 10, 5), srf(Xd, 0); \
}
INSN(sve_addvl, 0b01);
INSN(sve_addpl, 0b11);
INSN(sve_addvl, 0b01); // Add multiple of vector register size to scalar register
INSN(sve_addpl, 0b11); // Add multiple of predicate register size to scalar register
#undef INSN
// SVE inc/dec register by element count
@ -3175,15 +3196,15 @@ public:
INSN(sve_dec, 1);
#undef INSN
// SVE predicate count
void sve_cntp(Register Xd, SIMD_RegVariant T, PRegister Pg, PRegister Pn) {
// SVE increment register by predicate count
void sve_incp(const Register rd, SIMD_RegVariant T, PRegister pg) {
starti;
assert(T != Q, "invalid size");
f(0b00100101, 31, 24), f(T, 23, 22), f(0b10000010, 21, 14);
prf(Pg, 10), f(0, 9), prf(Pn, 5), rf(Xd, 0);
f(0b00100101, 31, 24), f(T, 23, 22), f(0b1011001000100, 21, 9),
prf(pg, 5), rf(rd, 0);
}
// SVE dup scalar
// SVE broadcast general-purpose register to vector elements (unpredicated)
void sve_dup(FloatRegister Zd, SIMD_RegVariant T, Register Rn) {
starti;
assert(T != Q, "invalid size");
@ -3191,7 +3212,7 @@ public:
srf(Rn, 5), rf(Zd, 0);
}
// SVE dup imm
// SVE broadcast signed immediate to vector elements (unpredicated)
void sve_dup(FloatRegister Zd, SIMD_RegVariant T, int imm8) {
starti;
assert(T != Q, "invalid size");
@@ -3214,19 +3235,119 @@ public:
f(pattern, 9, 5), f(0b0, 4), prf(pd, 0);
}
// Integer comparisons (SVE)
#define INSN(NAME, cond) \
void NAME(PRegister Pd, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn, FloatRegister Zm) { \
starti; \
assert(T != Q, "invalid size"); \
f(0b00100100, 31, 24), f(T, 23, 22), f(0, 21), rf(Zm, 16), f((cond >> 1) & 7, 15, 13); \
pgrf(Pg, 10), rf(Zn, 5), f(cond & 1, 4), prf(Pd, 0); \
// SVE copy general-purpose register to vector elements (predicated)
void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, Register Rn) {
starti;
assert(T != Q, "invalid size");
f(0b00000101, 31, 24), f(T, 23, 22), f(0b101000101, 21, 13);
pgrf(Pg, 10), srf(Rn, 5), rf(Zd, 0);
}
INSN(sve_cmpeq, 0b1010); // Compare signed equal to vector
INSN(sve_cmpne, 0b1011); // Compare not equal to vector
INSN(sve_cmpge, 0b1000); // Compare signed greater than or equal to vector
INSN(sve_cmpgt, 0b1001); // Compare signed greater than vector
// SVE copy signed integer immediate to vector elements (predicated)
void sve_cpy(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, int imm8, bool isMerge) {
starti;
assert(T != Q, "invalid size");
int sh = 0;
if (imm8 <= 127 && imm8 >= -128) {
sh = 0;
} else if (T != B && imm8 <= 32512 && imm8 >= -32768 && (imm8 & 0xff) == 0) {
sh = 1;
imm8 = (imm8 >> 8);
} else {
guarantee(false, "invalid immediate");
}
int m = isMerge ? 1 : 0;
f(0b00000101, 31, 24), f(T, 23, 22), f(0b01, 21, 20);
prf(Pg, 16), f(0b0, 15), f(m, 14), f(sh, 13), sf(imm8, 12, 5), rf(Zd, 0);
}
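Note: a worked example of the immediate encoding above; a value outside the signed 8-bit range is accepted only when its low byte is zero, in which case the shifted form (sh = 1) is emitted:

  __ sve_cpy(z1, __ H, p0, -128, true);  // fits in 8 bits: sh = 0, imm8 = -128
  __ sve_cpy(z2, __ S, p0, 32512, true); // 32512 == 127 << 8: sh = 1, imm8 = 127
  // sve_cpy(z0, __ B, p0, 300, true) would hit the "invalid immediate" guarantee.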
// SVE conditionally select elements from two vectors
void sve_sel(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg,
FloatRegister Zn, FloatRegister Zm) {
starti;
assert(T != Q, "invalid size");
f(0b00000101, 31, 24), f(T, 23, 22), f(0b1, 21), rf(Zm, 16);
f(0b11, 15, 14), prf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
}
// SVE Integer/Floating-Point Compare - Vectors
#define INSN(NAME, op1, op2, fp) \
void NAME(Condition cond, PRegister Pd, SIMD_RegVariant T, PRegister Pg, \
FloatRegister Zn, FloatRegister Zm) { \
starti; \
if (fp == 0) { \
assert(T != Q, "invalid size"); \
} else { \
assert(T != B && T != Q, "invalid size"); \
assert(cond != HI && cond != HS, "invalid condition for fcm"); \
} \
int cond_op; \
switch(cond) { \
case EQ: cond_op = (op2 << 2) | 0b10; break; \
case NE: cond_op = (op2 << 2) | 0b11; break; \
case GE: cond_op = (op2 << 2) | 0b00; break; \
case GT: cond_op = (op2 << 2) | 0b01; break; \
case HI: cond_op = 0b0001; break; \
case HS: cond_op = 0b0000; break; \
default: \
ShouldNotReachHere(); \
} \
f(op1, 31, 24), f(T, 23, 22), f(0, 21), rf(Zm, 16), f((cond_op >> 1) & 7, 15, 13); \
pgrf(Pg, 10), rf(Zn, 5), f(cond_op & 1, 4), prf(Pd, 0); \
}
INSN(sve_cmp, 0b00100100, 0b10, 0);
INSN(sve_fcm, 0b01100101, 0b01, 1);
#undef INSN
// SVE Integer Compare - Signed Immediate
void sve_cmp(Condition cond, PRegister Pd, SIMD_RegVariant T,
PRegister Pg, FloatRegister Zn, int imm5) {
starti;
assert(T != Q, "invalid size");
guarantee(-16 <= imm5 && imm5 <= 15, "invalid immediate");
int cond_op;
switch(cond) {
case EQ: cond_op = 0b1000; break;
case NE: cond_op = 0b1001; break;
case GE: cond_op = 0b0000; break;
case GT: cond_op = 0b0001; break;
case LE: cond_op = 0b0011; break;
case LT: cond_op = 0b0010; break;
default:
ShouldNotReachHere();
}
f(0b00100101, 31, 24), f(T, 23, 22), f(0b0, 21), sf(imm5, 20, 16),
f((cond_op >> 1) & 0x7, 15, 13), pgrf(Pg, 10), rf(Zn, 5);
f(cond_op & 0x1, 4), prf(Pd, 0);
}
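Note: a usage sketch of the two compare forms; the immediate variant takes a 5-bit signed immediate, and the unsigned conditions (HI/HS) exist only in the vector-vector form:

  __ sve_cmp(Assembler::GT, p1, __ D, p3, z6, z7); // cmpgt p1.d, p3/z, z6.d, z7.d
  __ sve_cmp(Assembler::HS, p1, __ D, p3, z6, z7); // cmphs p1.d, p3/z, z6.d, z7.d
  __ sve_cmp(Assembler::EQ, p1, __ B, p4, z0, 15); // cmpeq p1.b, p4/z, z0.b, #15
  // imm5 outside [-16, 15], or HI/HS with an immediate, is rejected.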
// SVE unpack vector elements
#define INSN(NAME, op) \
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn) { \
starti; \
assert(T != B && T != Q, "invalid size"); \
f(0b00000101, 31, 24), f(T, 23, 22), f(0b1100, 21, 18); \
f(op, 17, 16), f(0b001110, 15, 10), rf(Zn, 5), rf(Zd, 0); \
}
INSN(sve_uunpkhi, 0b11); // Unsigned unpack and extend half of vector - high half
INSN(sve_uunpklo, 0b10); // Unsigned unpack and extend half of vector - low half
INSN(sve_sunpkhi, 0b01); // Signed unpack and extend half of vector - high half
INSN(sve_sunpklo, 0b00); // Signed unpack and extend half of vector - low half
#undef INSN
// SVE permute vector elements
#define INSN(NAME, op) \
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \
starti; \
assert(T != Q, "invalid size"); \
f(0b00000101, 31, 24), f(T, 23, 22), f(0b1, 21), rf(Zm, 16); \
f(0b01101, 15, 11), f(op, 10), rf(Zn, 5), rf(Zd, 0); \
}
INSN(sve_uzp1, 0b0); // Concatenate even elements from two vectors
INSN(sve_uzp2, 0b1); // Concatenate odd elements from two vectors
#undef INSN
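Note: semantics sketch (per the SVE specification): uzp1/uzp2 keep the even/odd-numbered elements of the two sources, which is what narrowing vector casts need. With 4-lane inputs z24 = {a0..a3} and z5 = {b0..b3}:

  __ sve_uzp1(z21, __ S, z24, z5); // z21 = {a0, a2, b0, b2}
  __ sve_uzp2(z21, __ S, z24, z5); // z21 = {a1, a3, b1, b3}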
// Predicate counted loop (SVE) (32-bit variants are not included)
@@ -3245,15 +3366,26 @@ public:
INSN(sve_whilels, 0b111); // While incrementing unsigned scalar lower than or the same as scalar
#undef INSN
// Predicate scan (SVE)
// Break after the first true condition
void sve_brka(PRegister pd, PRegister pg, PRegister pn, bool isMerge) {
// SVE predicate reverse
void sve_rev(PRegister Pd, SIMD_RegVariant T, PRegister Pn) {
starti;
f(0b00100101, 31, 24), f(0b00, 23, 22), f(0b01000001, 21, 14),
prf(pg, 10), f(0b0, 9), prf(pn, 5), f(isMerge ? 1 : 0, 4), prf(pd, 0);
assert(T != Q, "invalid size");
f(0b00000101, 31, 24), f(T, 23, 22), f(0b1101000100000, 21, 9);
prf(Pn, 5), f(0, 4), prf(Pd, 0);
}
// SVE partition break condition
#define INSN(NAME, op) \
void NAME(PRegister Pd, PRegister Pg, PRegister Pn, bool isMerge) { \
starti; \
f(0b00100101, 31, 24), f(op, 23, 22), f(0b01000001, 21, 14); \
prf(Pg, 10), f(0b0, 9), prf(Pn, 5), f(isMerge ? 1 : 0, 4), prf(Pd, 0); \
}
INSN(sve_brka, 0b00); // Break after first true condition
INSN(sve_brkb, 0b10); // Break before first true condition
#undef INSN
// Element count and increment scalar (SVE)
#define INSN(NAME, TYPE) \
void NAME(Register Xdn, unsigned imm4 = 1, int pattern = 0b11111) { \
@@ -3268,14 +3400,122 @@ public:
INSN(sve_cntd, D); // Set scalar to multiple of 64-bit predicate constraint element count
#undef INSN
// Predicate count and increment scalar (SVE)
// Set scalar to the number of Active predicate elements that are TRUE
void sve_incp(const Register rd, SIMD_RegVariant T, PRegister pg) {
// Set scalar to active predicate element count
void sve_cntp(Register Xd, SIMD_RegVariant T, PRegister Pg, PRegister Pn) {
starti;
assert(T != Q, "invalid size");
f(0b00100101, 31, 24), f(T, 23, 22), f(0b1011001000100, 21, 9),
prf(pg, 5), rf(rd, 0);
f(0b00100101, 31, 24), f(T, 23, 22), f(0b10000010, 21, 14);
prf(Pg, 10), f(0, 9), prf(Pn, 5), rf(Xd, 0);
}
// SVE convert signed integer to floating-point (predicated)
void sve_scvtf(FloatRegister Zd, SIMD_RegVariant T_dst, PRegister Pg,
FloatRegister Zn, SIMD_RegVariant T_src) {
starti;
assert(T_src != B && T_dst != B && T_src != Q && T_dst != Q &&
(T_src != H || T_dst == T_src), "invalid register variant");
int opc = T_dst;
int opc2 = T_src;
// In most cases we can treat T_dst, T_src as opc, opc2,
// except for the following two combinations.
// +-----+------+---+------------------------------------+
// | opc | opc2 | U | Instruction Details |
// +-----+------+---+------------------------------------+
// | 11 | 00 | 0 | SCVTF - 32-bit to double-precision |
// | 11 | 10 | 0 | SCVTF - 64-bit to single-precision |
// +-----+------+---+------------------------------------+
if (T_src == S && T_dst == D) {
opc = 0b11;
opc2 = 0b00;
} else if (T_src == D && T_dst == S) {
opc = 0b11;
opc2 = 0b10;
}
f(0b01100101, 31, 24), f(opc, 23, 22), f(0b010, 21, 19);
f(opc2, 18, 17), f(0b0101, 16, 13);
pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
}
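Note: concrete instances of the two special rows, taken from the asmtest cases added below; every other combination passes T_dst/T_src through unchanged:

  __ sve_scvtf(z1, __ D, p0, z0, __ S); // S->D: opc = 0b11, opc2 = 0b00; scvtf z1.d, p0/m, z0.s
  __ sve_scvtf(z6, __ S, p2, z1, __ D); // D->S: opc = 0b11, opc2 = 0b10; scvtf z6.s, p2/m, z1.d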
// SVE floating-point convert to signed integer, rounding toward zero (predicated)
void sve_fcvtzs(FloatRegister Zd, SIMD_RegVariant T_dst, PRegister Pg,
FloatRegister Zn, SIMD_RegVariant T_src) {
starti;
assert(T_src != B && T_dst != B && T_src != Q && T_dst != Q &&
(T_dst != H || T_src == H), "invalid register variant");
int opc = T_src;
int opc2 = T_dst;
// In most cases we can treat T_src, T_dst as opc, opc2,
// except for the following two combinations.
// +-----+------+---+-------------------------------------+
// | opc | opc2 | U | Instruction Details |
// +-----+------+---+-------------------------------------+
// | 11 | 10 | 0 | FCVTZS - single-precision to 64-bit |
// | 11 | 00 | 0 | FCVTZS - double-precision to 32-bit |
// +-----+------+---+-------------------------------------+
if (T_src == S && T_dst == D) {
opc = 0b11;
opc2 = 0b10;
} else if (T_src == D && T_dst == S) {
opc = 0b11;
opc2 = 0b00;
}
f(0b01100101, 31, 24), f(opc, 23, 22), f(0b011, 21, 19);
f(opc2, 18, 17), f(0b0101, 16, 13);
pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
}
// SVE floating-point convert precision (predicated)
void sve_fcvt(FloatRegister Zd, SIMD_RegVariant T_dst, PRegister Pg,
FloatRegister Zn, SIMD_RegVariant T_src) {
starti;
assert(T_src != B && T_dst != B && T_src != Q && T_dst != Q &&
T_src != T_dst, "invalid register variant");
guarantee(T_src != H && T_dst != H, "half-precision unsupported");
f(0b01100101, 31, 24), f(0b11, 23, 22), f(0b0010, 21, 18);
f(T_dst, 17, 16), f(0b101, 15, 13);
pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
}
// SVE extract element to general-purpose register
#define INSN(NAME, before) \
void NAME(Register Rd, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn) { \
starti; \
f(0b00000101, 31, 24), f(T, 23, 22), f(0b10000, 21, 17); \
f(before, 16), f(0b101, 15, 13); \
pgrf(Pg, 10), rf(Zn, 5), rf(Rd, 0); \
}
INSN(sve_lasta, 0b0);
INSN(sve_lastb, 0b1);
#undef INSN
// SVE extract element to SIMD&FP scalar register
#define INSN(NAME, before) \
void NAME(FloatRegister Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn) { \
starti; \
f(0b00000101, 31, 24), f(T, 23, 22), f(0b10001, 21, 17); \
f(before, 16), f(0b100, 15, 13); \
pgrf(Pg, 10), rf(Zn, 5), rf(Vd, 0); \
}
INSN(sve_lasta, 0b0);
INSN(sve_lastb, 0b1);
#undef INSN
// SVE create index starting from and incremented by immediate
void sve_index(FloatRegister Zd, SIMD_RegVariant T, int imm1, int imm2) {
starti;
f(0b00000100, 31, 24), f(T, 23, 22), f(0b1, 21);
sf(imm2, 20, 16), f(0b010000, 15, 10);
sf(imm1, 9, 5), rf(Zd, 0);
}
// SVE programmable table lookup/permute using vector of element indices
void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) {
starti;
assert(T != Q, "invalid size");
f(0b00000101, 31, 24), f(T, 23, 22), f(0b1, 21), rf(Zm, 16);
f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0);
}
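Note: usage sketch; each lane of Zm holds an element index into Zn, and (per the SVE specification) out-of-range indices produce zero lanes, which suits the Vector API rearrange lowering:

  __ sve_tbl(z16, __ S, z17, z18); // tbl z16.s, {z17.s}, z18.s
  // z16[i] = (z18[i] < lane_count) ? z17[z18[i]] : 0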
Assembler(CodeBuffer* code) : AbstractAssembler(code) {


@@ -584,7 +584,7 @@ void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
// Perform the comparison. An element of the destination predicate is set
// to active if the particular char is matched.
sve_cmpeq(tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
// Branch if the particular char is found.
br(NE, MATCH);
@@ -905,7 +905,7 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2,
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
FloatRegister src2, int cond, bool isQ) {
SIMD_Arrangement size = esize2arrangement(type2aelembytes(bt), isQ);
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
if (bt == T_FLOAT || bt == T_DOUBLE) {
switch (cond) {
case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
@@ -944,3 +944,56 @@ void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegis
}
}
}
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
FloatRegister zn, FloatRegister zm, int cond) {
assert(pg->is_governing(), "This register has to be a governing predicate register");
FloatRegister z1 = zn, z2 = zm;
// Convert the original BoolTest condition to Assembler::condition.
Condition condition;
switch (cond) {
case BoolTest::eq: condition = Assembler::EQ; break;
case BoolTest::ne: condition = Assembler::NE; break;
case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break;
case BoolTest::ge: condition = Assembler::GE; break;
case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break;
case BoolTest::gt: condition = Assembler::GT; break;
default:
assert(false, "unsupported compare condition");
ShouldNotReachHere();
}
SIMD_RegVariant size = elemType_to_regVariant(bt);
if (bt == T_FLOAT || bt == T_DOUBLE) {
sve_fcm(condition, pd, size, pg, z1, z2);
} else {
assert(is_integral_type(bt), "unsupported element type");
sve_cmp(condition, pd, size, pg, z1, z2);
}
}
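Note: lt/le are lowered by swapping the operands, since x < y is the same test as y > x, so only the EQ/NE/GE/GT encodings are needed. An illustrative pair (hypothetical registers):

  __ sve_compare(p1, T_INT, p0, z4, z5, BoolTest::lt); // emits cmpgt p1.s, p0/z, z5.s, z4.s
  __ sve_compare(p1, T_INT, p0, z5, z4, BoolTest::gt); // emits the same instruction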
void C2_MacroAssembler::sve_vmask_reduction(int opc, Register dst, SIMD_RegVariant size, FloatRegister src,
PRegister pg, PRegister pn, int length) {
assert(pg->is_governing(), "This register has to be a governing predicate register");
// The conditional flags will be clobbered by this function
sve_cmp(Assembler::NE, pn, size, pg, src, 0);
switch (opc) {
case Op_VectorMaskTrueCount:
sve_cntp(dst, size, ptrue, pn);
break;
case Op_VectorMaskFirstTrue:
sve_brkb(pn, pg, pn, false);
sve_cntp(dst, size, ptrue, pn);
break;
case Op_VectorMaskLastTrue:
sve_rev(pn, size, pn);
sve_brkb(pn, ptrue, pn, false);
sve_cntp(dst, size, ptrue, pn);
movw(rscratch1, length - 1);
subw(dst, rscratch1, dst);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
}
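Note: a worked trace of the Op_VectorMaskLastTrue arm, assuming an 8-lane vector (lane 0 listed first): reversing the predicate moves the last active lane to a front-relative position, brkb keeps only the lanes strictly before it, and cntp counts those lanes:

  // pn after sve_cmp: 0 1 0 1 0 0 0 0   (last true lane = index 3)
  // after sve_rev:    0 0 0 0 1 0 1 0
  // after sve_brkb:   1 1 1 1 0 0 0 0   -> sve_cntp = 4
  // dst = (length - 1) - 4 = 7 - 4 = 3
  __ sve_vmask_reduction(Op_VectorMaskLastTrue, r0, __ S, z16, ptrue, p1, 8);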


@@ -58,4 +58,30 @@
void neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
FloatRegister src2, int cond, bool isQ);
void sve_compare(PRegister pd, BasicType bt, PRegister pg,
FloatRegister zn, FloatRegister zm, int cond);
void sve_vmask_reduction(int opc, Register dst, SIMD_RegVariant size, FloatRegister src,
PRegister pg, PRegister pn, int length = MaxVectorSize);
// Generate predicate through whilelo, by comparing ZR with an unsigned
// immediate. rscratch1 will be clobbered.
inline void sve_whilelo_zr_imm(PRegister pd, SIMD_RegVariant size, uint imm) {
assert(UseSVE > 0, "not supported");
mov(rscratch1, imm);
sve_whilelo(pd, size, zr, rscratch1);
}
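Note: usage sketch; this activates the first imm lanes of pd, e.g. for a partial-vector mask:

  __ sve_whilelo_zr_imm(p0, __ S, 3);
  //   mov     rscratch1, #3
  //   whilelo p0.s, xzr, rscratch1   // lanes 0..2 active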
// Extract a scalar element from an sve vector at position 'idx'.
// rscratch1 will be clobbered.
// T could be FloatRegister or Register.
template<class T>
inline void sve_extract(T dst, SIMD_RegVariant size, PRegister pg, FloatRegister src, int idx) {
assert(UseSVE > 0, "not supported");
assert(pg->is_governing(), "This register has to be a governing predicate register");
mov(rscratch1, idx);
sve_whilele(pg, size, zr, rscratch1);
sve_lastb(dst, size, pg, src);
}
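Note: whilele (rather than whilelo) is used so that lanes 0..idx are all active, and lastb then returns exactly lane idx. A sketch:

  __ sve_extract(r0, __ S, p1, z16, 3); // w0 = lane 3 of z16
  //   mov     rscratch1, #3
  //   whilele p1.s, xzr, rscratch1    // lanes 0..3 active
  //   lastb   w0, p1, z16.s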
#endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP


@@ -2537,7 +2537,9 @@ void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve,
as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
}
if (restore_vectors) {
// We may use predicate registers and rely on ptrue with SVE,
// regardless of wide vector (> 8 bytes) used or not.
if (use_sve) {
reinitialize_ptrue();
}


@@ -2108,8 +2108,12 @@ void PhaseOutput::ScheduleAndBundle() {
return;
// Scheduling code works only with pairs (8 bytes) maximum.
if (C->max_vector_size() > 8)
// And when the scalable vector register is used, we may spill/unspill
// the whole reg regardless of the max vector size.
if (C->max_vector_size() > 8 ||
(C->max_vector_size() > 0 && Matcher::supports_scalable_vector())) {
return;
}
Compile::TracePhase tp("isched", &timers[_t_instrSched]);


@@ -1536,52 +1536,117 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);",
["fmov", "__ fmovhid(r0, v1);", "fmov\tx0, v1.d[1]"],
["ld1", "__ ld1(v31, v0, __ T2D, Address(__ post(r1, r0)));", "ld1\t{v31.2d, v0.2d}, [x1], x0"],
# SVE instructions
["cpy", "__ sve_cpy(z0, __ S, p0, v1);", "mov\tz0.s, p0/m, s1"],
["inc", "__ sve_inc(r0, __ S);", "incw\tx0"],
["dec", "__ sve_dec(r1, __ H);", "dech\tx1"],
["lsl", "__ sve_lsl(z0, __ B, z1, 7);", "lsl\tz0.b, z1.b, #7"],
["lsl", "__ sve_lsl(z21, __ H, z1, 15);", "lsl\tz21.h, z1.h, #15"],
["lsl", "__ sve_lsl(z0, __ S, z1, 31);", "lsl\tz0.s, z1.s, #31"],
["lsl", "__ sve_lsl(z0, __ D, z1, 63);", "lsl\tz0.d, z1.d, #63"],
["lsr", "__ sve_lsr(z0, __ B, z1, 7);", "lsr\tz0.b, z1.b, #7"],
["asr", "__ sve_asr(z0, __ H, z11, 15);", "asr\tz0.h, z11.h, #15"],
["lsr", "__ sve_lsr(z30, __ S, z1, 31);", "lsr\tz30.s, z1.s, #31"],
["asr", "__ sve_asr(z0, __ D, z1, 63);", "asr\tz0.d, z1.d, #63"],
["addvl", "__ sve_addvl(sp, r0, 31);", "addvl\tsp, x0, #31"],
["addpl", "__ sve_addpl(r1, sp, -32);", "addpl\tx1, sp, -32"],
["cntp", "__ sve_cntp(r8, __ B, p0, p1);", "cntp\tx8, p0, p1.b"],
["dup", "__ sve_dup(z0, __ B, 127);", "dup\tz0.b, 127"],
["dup", "__ sve_dup(z1, __ H, -128);", "dup\tz1.h, -128"],
["dup", "__ sve_dup(z2, __ S, 32512);", "dup\tz2.s, 32512"],
["dup", "__ sve_dup(z7, __ D, -32768);", "dup\tz7.d, -32768"],
["dup", "__ sve_dup(z4, __ B, r3);", "dup\tz4.b, w3"],
["dup", "__ sve_dup(z14, __ H, r22);", "dup\tz14.h, w22"],
["ld1b", "__ sve_ld1b(z0, __ B, p0, Address(sp));", "ld1b\t{z0.b}, p0/z, [sp]"],
["ld1h", "__ sve_ld1h(z10, __ H, p1, Address(sp, -8));", "ld1h\t{z10.h}, p1/z, [sp, #-8, MUL VL]"],
["ld1w", "__ sve_ld1w(z20, __ S, p2, Address(r0, 7));", "ld1w\t{z20.s}, p2/z, [x0, #7, MUL VL]"],
["ld1b", "__ sve_ld1b(z30, __ B, p3, Address(sp, r8));", "ld1b\t{z30.b}, p3/z, [sp, x8]"],
["ld1w", "__ sve_ld1w(z0, __ S, p4, Address(sp, r28));", "ld1w\t{z0.s}, p4/z, [sp, x28, LSL #2]"],
["ld1d", "__ sve_ld1d(z11, __ D, p5, Address(r0, r1));", "ld1d\t{z11.d}, p5/z, [x0, x1, LSL #3]"],
["st1b", "__ sve_st1b(z22, __ B, p6, Address(sp));", "st1b\t{z22.b}, p6, [sp]"],
["st1b", "__ sve_st1b(z31, __ B, p7, Address(sp, -8));", "st1b\t{z31.b}, p7, [sp, #-8, MUL VL]"],
["st1w", "__ sve_st1w(z0, __ S, p1, Address(r0, 7));", "st1w\t{z0.s}, p1, [x0, #7, MUL VL]"],
["st1b", "__ sve_st1b(z0, __ B, p2, Address(sp, r1));", "st1b\t{z0.b}, p2, [sp, x1]"],
["st1h", "__ sve_st1h(z0, __ H, p3, Address(sp, r8));", "st1h\t{z0.h}, p3, [sp, x8, LSL #1]"],
["st1d", "__ sve_st1d(z0, __ D, p4, Address(r0, r17));", "st1d\t{z0.d}, p4, [x0, x17, LSL #3]"],
["ldr", "__ sve_ldr(z0, Address(sp));", "ldr\tz0, [sp]"],
["ldr", "__ sve_ldr(z31, Address(sp, -256));", "ldr\tz31, [sp, #-256, MUL VL]"],
["str", "__ sve_str(z8, Address(r8, 255));", "str\tz8, [x8, #255, MUL VL]"],
["cntb", "__ sve_cntb(r9);", "cntb\tx9"],
["cnth", "__ sve_cnth(r10);", "cnth\tx10"],
["cntw", "__ sve_cntw(r11);", "cntw\tx11"],
["cntd", "__ sve_cntd(r12);", "cntd\tx12"],
["brka", "__ sve_brka(p2, p0, p2, false);", "brka\tp2.b, p0/z, p2.b"],
["brka", "__ sve_brka(p1, p2, p3, true);", "brka\tp1.b, p2/m, p3.b"],
["incp", "__ sve_incp(r0, __ B, p2);", "incp\tx0, p2.b"],
["whilelt", "__ sve_whilelt(p0, __ B, r1, r28);", "whilelt\tp0.b, x1, x28"],
["whilele", "__ sve_whilele(p2, __ H, r11, r8);", "whilele\tp2.h, x11, x8"],
["whilelo", "__ sve_whilelo(p3, __ S, r7, r2);", "whilelo\tp3.s, x7, x2"],
["whilels", "__ sve_whilels(p4, __ D, r17, r10);", "whilels\tp4.d, x17, x10"],
["cpy", "__ sve_cpy(z0, __ S, p0, v1);", "mov\tz0.s, p0/m, s1"],
["cpy", "__ sve_cpy(z0, __ B, p0, 127, true);", "mov\tz0.b, p0/m, 127"],
["cpy", "__ sve_cpy(z1, __ H, p0, -128, true);", "mov\tz1.h, p0/m, -128"],
["cpy", "__ sve_cpy(z2, __ S, p0, 32512, true);", "mov\tz2.s, p0/m, 32512"],
["cpy", "__ sve_cpy(z5, __ D, p0, -32768, false);", "mov\tz5.d, p0/z, -32768"],
["cpy", "__ sve_cpy(z10, __ B, p0, -1, false);", "mov\tz10.b, p0/z, -1"],
["cpy", "__ sve_cpy(z11, __ S, p0, -1, false);", "mov\tz11.s, p0/z, -1"],
["inc", "__ sve_inc(r0, __ S);", "incw\tx0"],
["dec", "__ sve_dec(r1, __ H);", "dech\tx1"],
["lsl", "__ sve_lsl(z0, __ B, z1, 7);", "lsl\tz0.b, z1.b, #7"],
["lsl", "__ sve_lsl(z21, __ H, z1, 15);", "lsl\tz21.h, z1.h, #15"],
["lsl", "__ sve_lsl(z0, __ S, z1, 31);", "lsl\tz0.s, z1.s, #31"],
["lsl", "__ sve_lsl(z0, __ D, z1, 63);", "lsl\tz0.d, z1.d, #63"],
["lsr", "__ sve_lsr(z0, __ B, z1, 7);", "lsr\tz0.b, z1.b, #7"],
["asr", "__ sve_asr(z0, __ H, z11, 15);", "asr\tz0.h, z11.h, #15"],
["lsr", "__ sve_lsr(z30, __ S, z1, 31);", "lsr\tz30.s, z1.s, #31"],
["asr", "__ sve_asr(z0, __ D, z1, 63);", "asr\tz0.d, z1.d, #63"],
["addvl", "__ sve_addvl(sp, r0, 31);", "addvl\tsp, x0, #31"],
["addpl", "__ sve_addpl(r1, sp, -32);", "addpl\tx1, sp, -32"],
["cntp", "__ sve_cntp(r8, __ B, p0, p1);", "cntp\tx8, p0, p1.b"],
["dup", "__ sve_dup(z0, __ B, 127);", "dup\tz0.b, 127"],
["dup", "__ sve_dup(z1, __ H, -128);", "dup\tz1.h, -128"],
["dup", "__ sve_dup(z2, __ S, 32512);", "dup\tz2.s, 32512"],
["dup", "__ sve_dup(z7, __ D, -32768);", "dup\tz7.d, -32768"],
["dup", "__ sve_dup(z10, __ B, -1);", "dup\tz10.b, -1"],
["dup", "__ sve_dup(z11, __ S, -1);", "dup\tz11.s, -1"],
["ld1b", "__ sve_ld1b(z0, __ B, p0, Address(sp));", "ld1b\t{z0.b}, p0/z, [sp]"],
["ld1b", "__ sve_ld1b(z0, __ H, p1, Address(sp));", "ld1b\t{z0.h}, p1/z, [sp]"],
["ld1b", "__ sve_ld1b(z0, __ S, p2, Address(sp, r8));", "ld1b\t{z0.s}, p2/z, [sp, x8]"],
["ld1b", "__ sve_ld1b(z0, __ D, p3, Address(sp, 7));", "ld1b\t{z0.d}, p3/z, [sp, #7, MUL VL]"],
["ld1h", "__ sve_ld1h(z10, __ H, p1, Address(sp, -8));", "ld1h\t{z10.h}, p1/z, [sp, #-8, MUL VL]"],
["ld1w", "__ sve_ld1w(z20, __ S, p2, Address(r0, 7));", "ld1w\t{z20.s}, p2/z, [x0, #7, MUL VL]"],
["ld1b", "__ sve_ld1b(z30, __ B, p3, Address(sp, r8));", "ld1b\t{z30.b}, p3/z, [sp, x8]"],
["ld1w", "__ sve_ld1w(z0, __ S, p4, Address(sp, r28));", "ld1w\t{z0.s}, p4/z, [sp, x28, LSL #2]"],
["ld1d", "__ sve_ld1d(z11, __ D, p5, Address(r0, r1));", "ld1d\t{z11.d}, p5/z, [x0, x1, LSL #3]"],
["st1b", "__ sve_st1b(z22, __ B, p6, Address(sp));", "st1b\t{z22.b}, p6, [sp]"],
["st1b", "__ sve_st1b(z31, __ B, p7, Address(sp, -8));", "st1b\t{z31.b}, p7, [sp, #-8, MUL VL]"],
["st1b", "__ sve_st1b(z0, __ H, p1, Address(sp));", "st1b\t{z0.h}, p1, [sp]"],
["st1b", "__ sve_st1b(z0, __ S, p2, Address(sp, r8));", "st1b\t{z0.s}, p2, [sp, x8]"],
["st1b", "__ sve_st1b(z0, __ D, p3, Address(sp));", "st1b\t{z0.d}, p3, [sp]"],
["st1w", "__ sve_st1w(z0, __ S, p1, Address(r0, 7));", "st1w\t{z0.s}, p1, [x0, #7, MUL VL]"],
["st1b", "__ sve_st1b(z0, __ B, p2, Address(sp, r1));", "st1b\t{z0.b}, p2, [sp, x1]"],
["st1h", "__ sve_st1h(z0, __ H, p3, Address(sp, r8));", "st1h\t{z0.h}, p3, [sp, x8, LSL #1]"],
["st1d", "__ sve_st1d(z0, __ D, p4, Address(r0, r17));", "st1d\t{z0.d}, p4, [x0, x17, LSL #3]"],
["ldr", "__ sve_ldr(z0, Address(sp));", "ldr\tz0, [sp]"],
["ldr", "__ sve_ldr(z31, Address(sp, -256));", "ldr\tz31, [sp, #-256, MUL VL]"],
["str", "__ sve_str(z8, Address(r8, 255));", "str\tz8, [x8, #255, MUL VL]"],
["cntb", "__ sve_cntb(r9);", "cntb\tx9"],
["cnth", "__ sve_cnth(r10);", "cnth\tx10"],
["cntw", "__ sve_cntw(r11);", "cntw\tx11"],
["cntd", "__ sve_cntd(r12);", "cntd\tx12"],
["brka", "__ sve_brka(p2, p0, p2, false);", "brka\tp2.b, p0/z, p2.b"],
["brka", "__ sve_brka(p1, p2, p3, true);", "brka\tp1.b, p2/m, p3.b"],
["brkb", "__ sve_brkb(p1, p2, p3, false);", "brkb\tp1.b, p2/z, p3.b"],
["brkb", "__ sve_brkb(p2, p3, p4, true);", "brkb\tp2.b, p3/m, p4.b"],
["rev", "__ sve_rev(p0, __ B, p1);", "rev\tp0.b, p1.b"],
["rev", "__ sve_rev(p1, __ H, p2);", "rev\tp1.h, p2.h"],
["rev", "__ sve_rev(p2, __ S, p3);", "rev\tp2.s, p3.s"],
["rev", "__ sve_rev(p3, __ D, p4);", "rev\tp3.d, p4.d"],
["incp", "__ sve_incp(r0, __ B, p2);", "incp\tx0, p2.b"],
["whilelt", "__ sve_whilelt(p0, __ B, r1, r28);", "whilelt\tp0.b, x1, x28"],
["whilele", "__ sve_whilele(p2, __ H, r11, r8);", "whilele\tp2.h, x11, x8"],
["whilelo", "__ sve_whilelo(p3, __ S, r7, r2);", "whilelo\tp3.s, x7, x2"],
["whilels", "__ sve_whilels(p4, __ D, r17, r10);", "whilels\tp4.d, x17, x10"],
["sel", "__ sve_sel(z0, __ B, p0, z1, z2);", "sel\tz0.b, p0, z1.b, z2.b"],
["sel", "__ sve_sel(z4, __ D, p0, z5, z6);", "sel\tz4.d, p0, z5.d, z6.d"],
["cmpeq", "__ sve_cmp(Assembler::EQ, p1, __ B, p0, z0, z1);", "cmpeq\tp1.b, p0/z, z0.b, z1.b"],
["cmpne", "__ sve_cmp(Assembler::NE, p1, __ H, p0, z2, z3);", "cmpne\tp1.h, p0/z, z2.h, z3.h"],
["cmpge", "__ sve_cmp(Assembler::GE, p1, __ S, p2, z4, z5);", "cmpge\tp1.s, p2/z, z4.s, z5.s"],
["cmpgt", "__ sve_cmp(Assembler::GT, p1, __ D, p3, z6, z7);", "cmpgt\tp1.d, p3/z, z6.d, z7.d"],
["cmphi", "__ sve_cmp(Assembler::HI, p1, __ S, p2, z4, z5);", "cmphi\tp1.s, p2/z, z4.s, z5.s"],
["cmphs", "__ sve_cmp(Assembler::HS, p1, __ D, p3, z6, z7);", "cmphs\tp1.d, p3/z, z6.d, z7.d"],
["cmpeq", "__ sve_cmp(Assembler::EQ, p1, __ B, p4, z0, 15);", "cmpeq\tp1.b, p4/z, z0.b, #15"],
["cmpne", "__ sve_cmp(Assembler::NE, p1, __ H, p0, z2, -16);", "cmpne\tp1.h, p0/z, z2.h, #-16"],
["cmple", "__ sve_cmp(Assembler::LE, p1, __ S, p1, z4, 0);", "cmple\tp1.s, p1/z, z4.s, #0"],
["cmplt", "__ sve_cmp(Assembler::LT, p1, __ D, p2, z6, -1);", "cmplt\tp1.d, p2/z, z6.d, #-1"],
["cmpge", "__ sve_cmp(Assembler::GE, p1, __ S, p3, z4, 5);", "cmpge\tp1.s, p3/z, z4.s, #5"],
["cmpgt", "__ sve_cmp(Assembler::GT, p1, __ B, p4, z6, -2);", "cmpgt\tp1.b, p4/z, z6.b, #-2"],
["fcmeq", "__ sve_fcm(Assembler::EQ, p1, __ S, p0, z0, z1);", "fcmeq\tp1.s, p0/z, z0.s, z1.s"],
["fcmne", "__ sve_fcm(Assembler::NE, p1, __ D, p0, z2, z3);", "fcmne\tp1.d, p0/z, z2.d, z3.d"],
["fcmgt", "__ sve_fcm(Assembler::GT, p1, __ S, p2, z4, z5);", "fcmgt\tp1.s, p2/z, z4.s, z5.s"],
["fcmge", "__ sve_fcm(Assembler::GE, p1, __ D, p3, z6, z7);", "fcmge\tp1.d, p3/z, z6.d, z7.d"],
["uunpkhi", "__ sve_uunpkhi(z0, __ H, z1);", "uunpkhi\tz0.h, z1.b"],
["uunpklo", "__ sve_uunpklo(z4, __ S, z5);", "uunpklo\tz4.s, z5.h"],
["sunpkhi", "__ sve_sunpkhi(z6, __ D, z7);", "sunpkhi\tz6.d, z7.s"],
["sunpklo", "__ sve_sunpklo(z10, __ H, z11);", "sunpklo\tz10.h, z11.b"],
["scvtf", "__ sve_scvtf(z1, __ D, p0, z0, __ S);", "scvtf\tz1.d, p0/m, z0.s"],
["scvtf", "__ sve_scvtf(z3, __ D, p1, z2, __ D);", "scvtf\tz3.d, p1/m, z2.d"],
["scvtf", "__ sve_scvtf(z6, __ S, p2, z1, __ D);", "scvtf\tz6.s, p2/m, z1.d"],
["scvtf", "__ sve_scvtf(z6, __ S, p3, z1, __ S);", "scvtf\tz6.s, p3/m, z1.s"],
["scvtf", "__ sve_scvtf(z6, __ H, p3, z1, __ S);", "scvtf\tz6.h, p3/m, z1.s"],
["scvtf", "__ sve_scvtf(z6, __ H, p3, z1, __ D);", "scvtf\tz6.h, p3/m, z1.d"],
["scvtf", "__ sve_scvtf(z6, __ H, p3, z1, __ H);", "scvtf\tz6.h, p3/m, z1.h"],
["fcvt", "__ sve_fcvt(z5, __ D, p3, z4, __ S);", "fcvt\tz5.d, p3/m, z4.s"],
["fcvt", "__ sve_fcvt(z1, __ S, p3, z0, __ D);", "fcvt\tz1.s, p3/m, z0.d"],
["fcvtzs", "__ sve_fcvtzs(z19, __ D, p2, z1, __ D);", "fcvtzs\tz19.d, p2/m, z1.d"],
["fcvtzs", "__ sve_fcvtzs(z9, __ S, p1, z8, __ S);", "fcvtzs\tz9.s, p1/m, z8.s"],
["fcvtzs", "__ sve_fcvtzs(z1, __ S, p2, z0, __ D);", "fcvtzs\tz1.s, p2/m, z0.d"],
["fcvtzs", "__ sve_fcvtzs(z1, __ D, p3, z0, __ S);", "fcvtzs\tz1.d, p3/m, z0.s"],
["fcvtzs", "__ sve_fcvtzs(z1, __ S, p4, z18, __ H);", "fcvtzs\tz1.s, p4/m, z18.h"],
["lasta", "__ sve_lasta(r0, __ B, p0, z15);", "lasta\tw0, p0, z15.b"],
["lastb", "__ sve_lastb(r1, __ B, p1, z16);", "lastb\tw1, p1, z16.b"],
["lasta", "__ sve_lasta(v0, __ B, p0, z15);", "lasta\tb0, p0, z15.b"],
["lastb", "__ sve_lastb(v1, __ B, p1, z16);", "lastb\tb1, p1, z16.b"],
["index", "__ sve_index(z6, __ S, 1, 1);", "index\tz6.s, #1, #1"],
["cpy", "__ sve_cpy(z7, __ H, p3, r5);", "cpy\tz7.h, p3/m, w5"],
["tbl", "__ sve_tbl(z16, __ S, z17, z18);", "tbl\tz16.s, {z17.s}, z18.s"],
["ld1w", "__ sve_ld1w_gather(z15, p0, r5, z16);", "ld1w\t{z15.s}, p0/z, [x5, z16.s, uxtw #2]"],
["ld1d", "__ sve_ld1d_gather(z15, p0, r5, z16);", "ld1d\t{z15.d}, p0/z, [x5, z16.d, uxtw #3]"],
["st1w", "__ sve_st1w_scatter(z15, p0, r5, z16);", "st1w\t{z15.s}, p0, [x5, z16.s, uxtw #2]"],
["st1d", "__ sve_st1d_scatter(z15, p0, r5, z16);", "st1d\t{z15.d}, p0, [x5, z16.d, uxtw #3]"],
])
print "\n// FloatImmediateOp"
@@ -1651,10 +1716,8 @@ generate(SVEVectorOp, [["add", "ZZZ"],
["eor", "ZZZ"],
["orr", "ZZZ"],
["bic", "ZZZ"],
["cmpeq", "PPZZ", "z"],
["cmpge", "PPZZ", "z"],
["cmpgt", "PPZZ", "z"],
["cmpne", "PPZZ", "z"],
["uzp1", "ZZZ"],
["uzp2", "ZZZ"],
])
generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],


@@ -726,6 +726,12 @@
__ fmovhid(r0, v1); // fmov x0, v1.d[1]
__ ld1(v31, v0, __ T2D, Address(__ post(r1, r0))); // ld1 {v31.2d, v0.2d}, [x1], x0
__ sve_cpy(z0, __ S, p0, v1); // mov z0.s, p0/m, s1
__ sve_cpy(z0, __ B, p0, 127, true); // mov z0.b, p0/m, 127
__ sve_cpy(z1, __ H, p0, -128, true); // mov z1.h, p0/m, -128
__ sve_cpy(z2, __ S, p0, 32512, true); // mov z2.s, p0/m, 32512
__ sve_cpy(z5, __ D, p0, -32768, false); // mov z5.d, p0/z, -32768
__ sve_cpy(z10, __ B, p0, -1, false); // mov z10.b, p0/z, -1
__ sve_cpy(z11, __ S, p0, -1, false); // mov z11.s, p0/z, -1
__ sve_inc(r0, __ S); // incw x0
__ sve_dec(r1, __ H); // dech x1
__ sve_lsl(z0, __ B, z1, 7); // lsl z0.b, z1.b, #7
@@ -743,9 +749,12 @@
__ sve_dup(z1, __ H, -128); // dup z1.h, -128
__ sve_dup(z2, __ S, 32512); // dup z2.s, 32512
__ sve_dup(z7, __ D, -32768); // dup z7.d, -32768
__ sve_dup(z4, __ B, r3); // dup z4.b, w3
__ sve_dup(z14, __ H, r22); // dup z14.h, w22
__ sve_dup(z10, __ B, -1); // dup z10.b, -1
__ sve_dup(z11, __ S, -1); // dup z11.s, -1
__ sve_ld1b(z0, __ B, p0, Address(sp)); // ld1b {z0.b}, p0/z, [sp]
__ sve_ld1b(z0, __ H, p1, Address(sp)); // ld1b {z0.h}, p1/z, [sp]
__ sve_ld1b(z0, __ S, p2, Address(sp, r8)); // ld1b {z0.s}, p2/z, [sp, x8]
__ sve_ld1b(z0, __ D, p3, Address(sp, 7)); // ld1b {z0.d}, p3/z, [sp, #7, MUL VL]
__ sve_ld1h(z10, __ H, p1, Address(sp, -8)); // ld1h {z10.h}, p1/z, [sp, #-8, MUL VL]
__ sve_ld1w(z20, __ S, p2, Address(r0, 7)); // ld1w {z20.s}, p2/z, [x0, #7, MUL VL]
__ sve_ld1b(z30, __ B, p3, Address(sp, r8)); // ld1b {z30.b}, p3/z, [sp, x8]
@@ -753,6 +762,9 @@
__ sve_ld1d(z11, __ D, p5, Address(r0, r1)); // ld1d {z11.d}, p5/z, [x0, x1, LSL #3]
__ sve_st1b(z22, __ B, p6, Address(sp)); // st1b {z22.b}, p6, [sp]
__ sve_st1b(z31, __ B, p7, Address(sp, -8)); // st1b {z31.b}, p7, [sp, #-8, MUL VL]
__ sve_st1b(z0, __ H, p1, Address(sp)); // st1b {z0.h}, p1, [sp]
__ sve_st1b(z0, __ S, p2, Address(sp, r8)); // st1b {z0.s}, p2, [sp, x8]
__ sve_st1b(z0, __ D, p3, Address(sp)); // st1b {z0.d}, p3, [sp]
__ sve_st1w(z0, __ S, p1, Address(r0, 7)); // st1w {z0.s}, p1, [x0, #7, MUL VL]
__ sve_st1b(z0, __ B, p2, Address(sp, r1)); // st1b {z0.b}, p2, [sp, x1]
__ sve_st1h(z0, __ H, p3, Address(sp, r8)); // st1h {z0.h}, p3, [sp, x8, LSL #1]
@@ -766,11 +778,64 @@
__ sve_cntd(r12); // cntd x12
__ sve_brka(p2, p0, p2, false); // brka p2.b, p0/z, p2.b
__ sve_brka(p1, p2, p3, true); // brka p1.b, p2/m, p3.b
__ sve_brkb(p1, p2, p3, false); // brkb p1.b, p2/z, p3.b
__ sve_brkb(p2, p3, p4, true); // brkb p2.b, p3/m, p4.b
__ sve_rev(p0, __ B, p1); // rev p0.b, p1.b
__ sve_rev(p1, __ H, p2); // rev p1.h, p2.h
__ sve_rev(p2, __ S, p3); // rev p2.s, p3.s
__ sve_rev(p3, __ D, p4); // rev p3.d, p4.d
__ sve_incp(r0, __ B, p2); // incp x0, p2.b
__ sve_whilelt(p0, __ B, r1, r28); // whilelt p0.b, x1, x28
__ sve_whilele(p2, __ H, r11, r8); // whilele p2.h, x11, x8
__ sve_whilelo(p3, __ S, r7, r2); // whilelo p3.s, x7, x2
__ sve_whilels(p4, __ D, r17, r10); // whilels p4.d, x17, x10
__ sve_sel(z0, __ B, p0, z1, z2); // sel z0.b, p0, z1.b, z2.b
__ sve_sel(z4, __ D, p0, z5, z6); // sel z4.d, p0, z5.d, z6.d
__ sve_cmp(Assembler::EQ, p1, __ B, p0, z0, z1); // cmpeq p1.b, p0/z, z0.b, z1.b
__ sve_cmp(Assembler::NE, p1, __ H, p0, z2, z3); // cmpne p1.h, p0/z, z2.h, z3.h
__ sve_cmp(Assembler::GE, p1, __ S, p2, z4, z5); // cmpge p1.s, p2/z, z4.s, z5.s
__ sve_cmp(Assembler::GT, p1, __ D, p3, z6, z7); // cmpgt p1.d, p3/z, z6.d, z7.d
__ sve_cmp(Assembler::HI, p1, __ S, p2, z4, z5); // cmphi p1.s, p2/z, z4.s, z5.s
__ sve_cmp(Assembler::HS, p1, __ D, p3, z6, z7); // cmphs p1.d, p3/z, z6.d, z7.d
__ sve_cmp(Assembler::EQ, p1, __ B, p4, z0, 15); // cmpeq p1.b, p4/z, z0.b, #15
__ sve_cmp(Assembler::NE, p1, __ H, p0, z2, -16); // cmpne p1.h, p0/z, z2.h, #-16
__ sve_cmp(Assembler::LE, p1, __ S, p1, z4, 0); // cmple p1.s, p1/z, z4.s, #0
__ sve_cmp(Assembler::LT, p1, __ D, p2, z6, -1); // cmplt p1.d, p2/z, z6.d, #-1
__ sve_cmp(Assembler::GE, p1, __ S, p3, z4, 5); // cmpge p1.s, p3/z, z4.s, #5
__ sve_cmp(Assembler::GT, p1, __ B, p4, z6, -2); // cmpgt p1.b, p4/z, z6.b, #-2
__ sve_fcm(Assembler::EQ, p1, __ S, p0, z0, z1); // fcmeq p1.s, p0/z, z0.s, z1.s
__ sve_fcm(Assembler::NE, p1, __ D, p0, z2, z3); // fcmne p1.d, p0/z, z2.d, z3.d
__ sve_fcm(Assembler::GT, p1, __ S, p2, z4, z5); // fcmgt p1.s, p2/z, z4.s, z5.s
__ sve_fcm(Assembler::GE, p1, __ D, p3, z6, z7); // fcmge p1.d, p3/z, z6.d, z7.d
__ sve_uunpkhi(z0, __ H, z1); // uunpkhi z0.h, z1.b
__ sve_uunpklo(z4, __ S, z5); // uunpklo z4.s, z5.h
__ sve_sunpkhi(z6, __ D, z7); // sunpkhi z6.d, z7.s
__ sve_sunpklo(z10, __ H, z11); // sunpklo z10.h, z11.b
__ sve_scvtf(z1, __ D, p0, z0, __ S); // scvtf z1.d, p0/m, z0.s
__ sve_scvtf(z3, __ D, p1, z2, __ D); // scvtf z3.d, p1/m, z2.d
__ sve_scvtf(z6, __ S, p2, z1, __ D); // scvtf z6.s, p2/m, z1.d
__ sve_scvtf(z6, __ S, p3, z1, __ S); // scvtf z6.s, p3/m, z1.s
__ sve_scvtf(z6, __ H, p3, z1, __ S); // scvtf z6.h, p3/m, z1.s
__ sve_scvtf(z6, __ H, p3, z1, __ D); // scvtf z6.h, p3/m, z1.d
__ sve_scvtf(z6, __ H, p3, z1, __ H); // scvtf z6.h, p3/m, z1.h
__ sve_fcvt(z5, __ D, p3, z4, __ S); // fcvt z5.d, p3/m, z4.s
__ sve_fcvt(z1, __ S, p3, z0, __ D); // fcvt z1.s, p3/m, z0.d
__ sve_fcvtzs(z19, __ D, p2, z1, __ D); // fcvtzs z19.d, p2/m, z1.d
__ sve_fcvtzs(z9, __ S, p1, z8, __ S); // fcvtzs z9.s, p1/m, z8.s
__ sve_fcvtzs(z1, __ S, p2, z0, __ D); // fcvtzs z1.s, p2/m, z0.d
__ sve_fcvtzs(z1, __ D, p3, z0, __ S); // fcvtzs z1.d, p3/m, z0.s
__ sve_fcvtzs(z1, __ S, p4, z18, __ H); // fcvtzs z1.s, p4/m, z18.h
__ sve_lasta(r0, __ B, p0, z15); // lasta w0, p0, z15.b
__ sve_lastb(r1, __ B, p1, z16); // lastb w1, p1, z16.b
__ sve_lasta(v0, __ B, p0, z15); // lasta b0, p0, z15.b
__ sve_lastb(v1, __ B, p1, z16); // lastb b1, p1, z16.b
__ sve_index(z6, __ S, 1, 1); // index z6.s, #1, #1
__ sve_cpy(z7, __ H, p3, r5); // cpy z7.h, p3/m, w5
__ sve_tbl(z16, __ S, z17, z18); // tbl z16.s, {z17.s}, z18.s
__ sve_ld1w_gather(z15, p0, r5, z16); // ld1w {z15.s}, p0/z, [x5, z16.s, uxtw #2]
__ sve_ld1d_gather(z15, p0, r5, z16); // ld1d {z15.d}, p0/z, [x5, z16.d, uxtw #3]
__ sve_st1w_scatter(z15, p0, r5, z16); // st1w {z15.s}, p0, [x5, z16.s, uxtw #2]
__ sve_st1d_scatter(z15, p0, r5, z16); // st1d {z15.d}, p0, [x5, z16.d, uxtw #3]
// FloatImmediateOp
__ fmovd(v0, 2.0); // fmov d0, #2.0
@@ -946,21 +1011,19 @@
__ sve_eor(z2, z11, z28); // eor z2.d, z11.d, z28.d
__ sve_orr(z7, z1, z26); // orr z7.d, z1.d, z26.d
__ sve_bic(z17, z14, z8); // bic z17.d, z14.d, z8.d
__ sve_cmpeq(p5, __ S, p6, z5, z19); // cmpeq p5.s, p6/z, z5.s, z19.s
__ sve_cmpge(p4, __ S, p5, z16, z29); // cmpge p4.s, p5/z, z16.s, z29.s
__ sve_cmpgt(p5, __ D, p0, z4, z17); // cmpgt p5.d, p0/z, z4.d, z17.d
__ sve_cmpne(p1, __ D, p5, z4, z23); // cmpne p1.d, p5/z, z4.d, z23.d
__ sve_uzp1(z21, __ S, z24, z5); // uzp1 z21.s, z24.s, z5.s
__ sve_uzp2(z21, __ S, z17, z22); // uzp2 z21.s, z17.s, z22.s
// SVEReductionOp
__ sve_andv(v19, __ H, p0, z8); // andv h19, p0, z8.h
__ sve_orv(v14, __ D, p6, z17); // orv d14, p6, z17.d
__ sve_eorv(v21, __ B, p1, z30); // eorv b21, p1, z30.b
__ sve_smaxv(v10, __ B, p5, z12); // smaxv b10, p5, z12.b
__ sve_sminv(v9, __ S, p1, z24); // sminv s9, p1, z24.s
__ sve_fminv(v4, __ S, p6, z6); // fminv s4, p6, z6.s
__ sve_fmaxv(v27, __ D, p6, z13); // fmaxv d27, p6, z13.d
__ sve_fadda(v30, __ D, p5, z22); // fadda d30, p5, d30, z22.d
__ sve_uaddv(v30, __ H, p7, z9); // uaddv d30, p7, z9.h
__ sve_andv(v29, __ B, p5, z19); // andv b29, p5, z19.b
__ sve_orv(v4, __ B, p4, z23); // orv b4, p4, z23.b
__ sve_eorv(v19, __ D, p1, z23); // eorv d19, p1, z23.d
__ sve_smaxv(v19, __ H, p0, z8); // smaxv h19, p0, z8.h
__ sve_sminv(v14, __ D, p6, z17); // sminv d14, p6, z17.d
__ sve_fminv(v21, __ S, p1, z30); // fminv s21, p1, z30.s
__ sve_fmaxv(v10, __ S, p5, z12); // fmaxv s10, p5, z12.s
__ sve_fadda(v9, __ D, p1, z24); // fadda d9, p1, d9, z24.d
__ sve_uaddv(v4, __ H, p6, z6); // uaddv d4, p6, z6.h
__ bind(forth);
@@ -979,30 +1042,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x1400030d, 0x94000000,
0x97ffffd4, 0x9400030a, 0x3400000a, 0x34fffa2a,
0x340060ea, 0x35000008, 0x35fff9c8, 0x35006088,
0xb400000b, 0xb4fff96b, 0xb400602b, 0xb500001d,
0xb5fff91d, 0xb5005fdd, 0x10000013, 0x10fff8b3,
0x10005f73, 0x90000013, 0x36300016, 0x3637f836,
0x36305ef6, 0x3758000c, 0x375ff7cc, 0x37585e8c,
0x14000000, 0x17ffffd7, 0x1400034c, 0x94000000,
0x97ffffd4, 0x94000349, 0x3400000a, 0x34fffa2a,
0x340068ca, 0x35000008, 0x35fff9c8, 0x35006868,
0xb400000b, 0xb4fff96b, 0xb400680b, 0xb500001d,
0xb5fff91d, 0xb50067bd, 0x10000013, 0x10fff8b3,
0x10006753, 0x90000013, 0x36300016, 0x3637f836,
0x363066d6, 0x3758000c, 0x375ff7cc, 0x3758666c,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x54005c60, 0x54000001, 0x54fff541, 0x54005c01,
0x54000002, 0x54fff4e2, 0x54005ba2, 0x54000002,
0x54fff482, 0x54005b42, 0x54000003, 0x54fff423,
0x54005ae3, 0x54000003, 0x54fff3c3, 0x54005a83,
0x54000004, 0x54fff364, 0x54005a24, 0x54000005,
0x54fff305, 0x540059c5, 0x54000006, 0x54fff2a6,
0x54005966, 0x54000007, 0x54fff247, 0x54005907,
0x54000008, 0x54fff1e8, 0x540058a8, 0x54000009,
0x54fff189, 0x54005849, 0x5400000a, 0x54fff12a,
0x540057ea, 0x5400000b, 0x54fff0cb, 0x5400578b,
0x5400000c, 0x54fff06c, 0x5400572c, 0x5400000d,
0x54fff00d, 0x540056cd, 0x5400000e, 0x54ffefae,
0x5400566e, 0x5400000f, 0x54ffef4f, 0x5400560f,
0x54006440, 0x54000001, 0x54fff541, 0x540063e1,
0x54000002, 0x54fff4e2, 0x54006382, 0x54000002,
0x54fff482, 0x54006322, 0x54000003, 0x54fff423,
0x540062c3, 0x54000003, 0x54fff3c3, 0x54006263,
0x54000004, 0x54fff364, 0x54006204, 0x54000005,
0x54fff305, 0x540061a5, 0x54000006, 0x54fff2a6,
0x54006146, 0x54000007, 0x54fff247, 0x540060e7,
0x54000008, 0x54fff1e8, 0x54006088, 0x54000009,
0x54fff189, 0x54006029, 0x5400000a, 0x54fff12a,
0x54005fca, 0x5400000b, 0x54fff0cb, 0x54005f6b,
0x5400000c, 0x54fff06c, 0x54005f0c, 0x5400000d,
0x54fff00d, 0x54005ead, 0x5400000e, 0x54ffefae,
0x54005e4e, 0x5400000f, 0x54ffef4f, 0x54005def,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd69f03e0, 0xd6bf03e0,
0xd5033fdf, 0xd5033e9f, 0xd50332bf, 0xd61f0200,
@ -1034,7 +1097,7 @@
0x791f226d, 0xf95aa2f3, 0xb9587bb7, 0x395f7176,
0x795d9143, 0x399e7e08, 0x799a2697, 0x79df3422,
0xb99c2624, 0xfd5c2374, 0xbd5fa1d9, 0xfd1d595a,
0xbd1b1869, 0x5800465b, 0x1800000b, 0xf8945060,
0xbd1b1869, 0x58004e3b, 0x1800000b, 0xf8945060,
0xd8000000, 0xf8ae6ba0, 0xf99a0080, 0x1a070035,
0x3a0700a8, 0x5a0e0367, 0x7a11009b, 0x9a000380,
0xba1e030c, 0xda0f0320, 0xfa030301, 0x0b340b11,
@ -1122,59 +1185,75 @@
0x4e081fe1, 0x4e0c1fe1, 0x4e0a1fe1, 0x4e071fe1,
0x4e042c20, 0x4e062c20, 0x4e052c20, 0x4e083c20,
0x0e0c3c20, 0x0e0a3c20, 0x0e073c20, 0x9eae0020,
0x4cc0ac3f, 0x05a08020, 0x04b0e3e0, 0x0470e7e1,
0x042f9c20, 0x043f9c35, 0x047f9c20, 0x04ff9c20,
0x04299420, 0x04319160, 0x0461943e, 0x04a19020,
0x042053ff, 0x047f5401, 0x25208028, 0x2538cfe0,
0x2578d001, 0x25b8efe2, 0x25f8f007, 0x05203864,
0x05603ace, 0xa400a3e0, 0xa4a8a7ea, 0xa547a814,
0xa4084ffe, 0xa55c53e0, 0xa5e1540b, 0xe400fbf6,
0xe408ffff, 0xe547e400, 0xe4014be0, 0xe4a84fe0,
0x4cc0ac3f, 0x05a08020, 0x05104fe0, 0x05505001,
0x05906fe2, 0x05d03005, 0x05101fea, 0x05901feb,
0x04b0e3e0, 0x0470e7e1, 0x042f9c20, 0x043f9c35,
0x047f9c20, 0x04ff9c20, 0x04299420, 0x04319160,
0x0461943e, 0x04a19020, 0x042053ff, 0x047f5401,
0x25208028, 0x2538cfe0, 0x2578d001, 0x25b8efe2,
0x25f8f007, 0x2538dfea, 0x25b8dfeb, 0xa400a3e0,
0xa420a7e0, 0xa4484be0, 0xa467afe0, 0xa4a8a7ea,
0xa547a814, 0xa4084ffe, 0xa55c53e0, 0xa5e1540b,
0xe400fbf6, 0xe408ffff, 0xe420e7e0, 0xe4484be0,
0xe460efe0, 0xe547e400, 0xe4014be0, 0xe4a84fe0,
0xe5f15000, 0x858043e0, 0x85a043ff, 0xe59f5d08,
0x0420e3e9, 0x0460e3ea, 0x04a0e3eb, 0x04e0e3ec,
0x25104042, 0x25104871, 0x252c8840, 0x253c1420,
0x25681572, 0x25a21ce3, 0x25ea1e34, 0x1e601000,
0x1e603000, 0x1e621000, 0x1e623000, 0x1e641000,
0x1e643000, 0x1e661000, 0x1e663000, 0x1e681000,
0x1e683000, 0x1e6a1000, 0x1e6a3000, 0x1e6c1000,
0x1e6c3000, 0x1e6e1000, 0x1e6e3000, 0x1e701000,
0x1e703000, 0x1e721000, 0x1e723000, 0x1e741000,
0x1e743000, 0x1e761000, 0x1e763000, 0x1e781000,
0x1e783000, 0x1e7a1000, 0x1e7a3000, 0x1e7c1000,
0x1e7c3000, 0x1e7e1000, 0x1e7e3000, 0xf8208193,
0xf83101b6, 0xf83c13fe, 0xf821239a, 0xf824309e,
0xf826535e, 0xf8304109, 0xf82c7280, 0xf8216058,
0xf8a08309, 0xf8ba03d0, 0xf8a312ea, 0xf8aa21e4,
0xf8a2310b, 0xf8aa522f, 0xf8a2418a, 0xf8ac71af,
0xf8a26287, 0xf8fa8090, 0xf8e20184, 0xf8f01215,
0xf8f022ab, 0xf8f7334c, 0xf8f751dc, 0xf8eb4038,
0xf8ec715f, 0xf8f06047, 0xf863826d, 0xf8710070,
0xf86113cb, 0xf86521e8, 0xf87d301e, 0xf8745287,
0xf87742bc, 0xf87b70b9, 0xf8616217, 0xb83f8185,
0xb82901fc, 0xb83d13f6, 0xb83320bf, 0xb82e33f0,
0xb830529b, 0xb830416c, 0xb82973c6, 0xb831639b,
0xb8be8147, 0xb8b4008a, 0xb8b81231, 0xb8b623a3,
0xb8af3276, 0xb8b35056, 0xb8af4186, 0xb8b071ab,
0xb8b763c1, 0xb8f38225, 0xb8e202d0, 0xb8ed12aa,
0xb8fd219b, 0xb8fb3023, 0xb8ff5278, 0xb8f14389,
0xb8fb70ef, 0xb8f563f7, 0xb87983e2, 0xb87b0150,
0xb8771073, 0xb8702320, 0xb87a3057, 0xb870508c,
0xb87c43be, 0xb87070db, 0xb86961fd, 0xce273c87,
0xce080ac9, 0xce7e8e9b, 0xce808b45, 0xce79806e,
0xce758768, 0xcec0835a, 0xce608ad8, 0x043100c4,
0x046105e3, 0x65c900a6, 0x65d60a87, 0x65c80545,
0x0416a63e, 0x04001f8b, 0x0450979a, 0x04dabe0d,
0x045381a5, 0x04918b4f, 0x049006cb, 0x0497a264,
0x045eadd1, 0x04881062, 0x040a04d7, 0x04810f71,
0x04dca450, 0x65c084c3, 0x65cd8d93, 0x65c69a68,
0x65878ae0, 0x65c29db3, 0x049da0e6, 0x6582b911,
0x65c0b6d6, 0x65c1a1e2, 0x65cda494, 0x65c18107,
0x65af1493, 0x65e52b36, 0x65ab4ed0, 0x65f06a8d,
0x0451448f, 0x049c7c86, 0x0429335d, 0x04bc3162,
0x047a3027, 0x04e831d1, 0x2493b8a5, 0x249d9604,
0x24d18095, 0x24d7b491, 0x045a2113, 0x04d83a2e,
0x041927d5, 0x0408358a, 0x048a2709, 0x658738c4,
0x65c639bb, 0x65d836de, 0x04413d3e,
0x25104042, 0x25104871, 0x25904861, 0x25904c92,
0x05344020, 0x05744041, 0x05b44062, 0x05f44083,
0x252c8840, 0x253c1420, 0x25681572, 0x25a21ce3,
0x25ea1e34, 0x0522c020, 0x05e6c0a4, 0x2401a001,
0x2443a051, 0x24858881, 0x24c78cd1, 0x24850891,
0x24c70cc1, 0x250f9001, 0x25508051, 0x25802491,
0x25df28c1, 0x25850c81, 0x251e10d1, 0x65816001,
0x65c36051, 0x65854891, 0x65c74cc1, 0x05733820,
0x05b238a4, 0x05f138e6, 0x0570396a, 0x65d0a001,
0x65d6a443, 0x65d4a826, 0x6594ac26, 0x6554ac26,
0x6556ac26, 0x6552ac26, 0x65cbac85, 0x65caac01,
0x65dea833, 0x659ca509, 0x65d8a801, 0x65dcac01,
0x655cb241, 0x0520a1e0, 0x0521a601, 0x052281e0,
0x05238601, 0x04a14026, 0x0568aca7, 0x05b23230,
0x853040af, 0xc5b040af, 0xe57080af, 0xe5b080af,
0x1e601000, 0x1e603000, 0x1e621000, 0x1e623000,
0x1e641000, 0x1e643000, 0x1e661000, 0x1e663000,
0x1e681000, 0x1e683000, 0x1e6a1000, 0x1e6a3000,
0x1e6c1000, 0x1e6c3000, 0x1e6e1000, 0x1e6e3000,
0x1e701000, 0x1e703000, 0x1e721000, 0x1e723000,
0x1e741000, 0x1e743000, 0x1e761000, 0x1e763000,
0x1e781000, 0x1e783000, 0x1e7a1000, 0x1e7a3000,
0x1e7c1000, 0x1e7c3000, 0x1e7e1000, 0x1e7e3000,
0xf8208193, 0xf83101b6, 0xf83c13fe, 0xf821239a,
0xf824309e, 0xf826535e, 0xf8304109, 0xf82c7280,
0xf8216058, 0xf8a08309, 0xf8ba03d0, 0xf8a312ea,
0xf8aa21e4, 0xf8a2310b, 0xf8aa522f, 0xf8a2418a,
0xf8ac71af, 0xf8a26287, 0xf8fa8090, 0xf8e20184,
0xf8f01215, 0xf8f022ab, 0xf8f7334c, 0xf8f751dc,
0xf8eb4038, 0xf8ec715f, 0xf8f06047, 0xf863826d,
0xf8710070, 0xf86113cb, 0xf86521e8, 0xf87d301e,
0xf8745287, 0xf87742bc, 0xf87b70b9, 0xf8616217,
0xb83f8185, 0xb82901fc, 0xb83d13f6, 0xb83320bf,
0xb82e33f0, 0xb830529b, 0xb830416c, 0xb82973c6,
0xb831639b, 0xb8be8147, 0xb8b4008a, 0xb8b81231,
0xb8b623a3, 0xb8af3276, 0xb8b35056, 0xb8af4186,
0xb8b071ab, 0xb8b763c1, 0xb8f38225, 0xb8e202d0,
0xb8ed12aa, 0xb8fd219b, 0xb8fb3023, 0xb8ff5278,
0xb8f14389, 0xb8fb70ef, 0xb8f563f7, 0xb87983e2,
0xb87b0150, 0xb8771073, 0xb8702320, 0xb87a3057,
0xb870508c, 0xb87c43be, 0xb87070db, 0xb86961fd,
0xce273c87, 0xce080ac9, 0xce7e8e9b, 0xce808b45,
0xce79806e, 0xce758768, 0xcec0835a, 0xce608ad8,
0x043100c4, 0x046105e3, 0x65c900a6, 0x65d60a87,
0x65c80545, 0x0416a63e, 0x04001f8b, 0x0450979a,
0x04dabe0d, 0x045381a5, 0x04918b4f, 0x049006cb,
0x0497a264, 0x045eadd1, 0x04881062, 0x040a04d7,
0x04810f71, 0x04dca450, 0x65c084c3, 0x65cd8d93,
0x65c69a68, 0x65878ae0, 0x65c29db3, 0x049da0e6,
0x6582b911, 0x65c0b6d6, 0x65c1a1e2, 0x65cda494,
0x65c18107, 0x65af1493, 0x65e52b36, 0x65ab4ed0,
0x65f06a8d, 0x0451448f, 0x049c7c86, 0x0429335d,
0x04bc3162, 0x047a3027, 0x04e831d1, 0x05a56b15,
0x05b66e35, 0x041a367d, 0x041832e4, 0x04d926f3,
0x04482113, 0x04ca3a2e, 0x658727d5, 0x6586358a,
0x65d82709, 0x044138c4,
};
// END Generated code -- do not edit
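
// Illustrative note (added for exposition; not part of the generated file):
// the gather/scatter lines in the listing above exercise the new SVE
// Assembler entry points this change introduces, and the insns[] table
// holds the expected encodings the test harness compares the emitted code
// against. A minimal sketch of driving these entry points directly, under
// the usual asmtest conventions where `__` expands to _masm-> and the
// z/p/r register names are in scope:
//
//   // 32-bit gather load:   ld1w {z15.s}, p0/z, [x5, z16.s, uxtw #2]
//   __ sve_ld1w_gather(z15, p0, r5, z16);
//   // 64-bit gather load:   ld1d {z15.d}, p0/z, [x5, z16.d, uxtw #3]
//   __ sve_ld1d_gather(z15, p0, r5, z16);
//   // 32-bit scatter store: st1w {z15.s}, p0, [x5, z16.s, uxtw #2]
//   __ sve_st1w_scatter(z15, p0, r5, z16);
//   // 64-bit scatter store: st1d {z15.d}, p0, [x5, z16.d, uxtw #3]
//   __ sve_st1d_scatter(z15, p0, r5, z16);
//
// In each case the general-purpose register supplies the base address and
// the vector register supplies the per-lane offsets, scaled by the element
// size; loads are predicated zeroing (p0/z), stores plainly predicated (p0).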