8287028: AArch64: [vectorapi] Backend implementation of VectorMask.fromLong with SVE2

Reviewed-by: xgong, ngasson
This commit is contained in:
Eric Liu 2022-06-14 03:38:42 +00:00 committed by Xiaohong Gong
parent fbe9266622
commit 86c9241cce
8 changed files with 133 additions and 32 deletions

@ -2471,6 +2471,7 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
case Op_CompressV:
case Op_CompressM:
case Op_ExpandV:
case Op_VectorLongToMask:
return false;
default:
break;

@ -153,6 +153,8 @@ source %{
if (UseSVE < 2 || is_subword_type(bt)) return false;
case Op_VectorMaskToLong:
if (vlen > 64) return false;
case Op_VectorLongToMask:
if (UseSVE < 2 || vlen > 64 || !VM_Version::supports_svebitperm()) return false;
default:
break;
}
@ -5675,6 +5677,20 @@ instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
ins_pipe(pipe_slow);
%}
// Build a vector mask from a long value: matches (VectorLongToMask src) and sets
// the predicate register dst. vtmp1 and vtmp2 are vector registers reserved as
// temporaries (TEMP) for the expansion. The code is emitted by the
// sve_vmask_fromlong macro-assembler routine, which receives the element type and
// the lane count of the mask produced by this node.
instruct vmask_fromlong(pRegGov dst, iRegL src, vReg vtmp1, vReg vtmp2) %{
match(Set dst (VectorLongToMask src));
effect(TEMP vtmp1, TEMP vtmp2);
ins_cost(10 * SVE_COST);
format %{ "vmask_fromlong $dst, $src\t# vector mask fromlong (sve2)" %}
ins_encode %{
__ sve_vmask_fromlong(as_PRegister($dst$$reg), as_Register($src$$reg),
Matcher::vector_element_basic_type(this),
Matcher::vector_length(this),
as_FloatRegister($vtmp1$$reg), as_FloatRegister($vtmp2$$reg));
%}
ins_pipe(pipe_slow);
%}
// ---------------------------- Vector mask generation ---------------------------
// The rules below set predicate registers. They can guarantee the high bits of dst
// are cleared with zero when the vector length is less than the full size of

@ -148,6 +148,8 @@ source %{
if (UseSVE < 2 || is_subword_type(bt)) return false;
case Op_VectorMaskToLong:
if (vlen > 64) return false;
case Op_VectorLongToMask:
if (UseSVE < 2 || vlen > 64 || !VM_Version::supports_svebitperm()) return false;
default:
break;
}
@ -3168,6 +3170,20 @@ instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
ins_pipe(pipe_slow);
%}
// Build a vector mask from a long value: matches (VectorLongToMask src) and sets
// the predicate register dst. vtmp1 and vtmp2 are vector registers reserved as
// temporaries (TEMP) for the expansion. The code is emitted by the
// sve_vmask_fromlong macro-assembler routine, which receives the element type and
// the lane count of the mask produced by this node.
instruct vmask_fromlong(pRegGov dst, iRegL src, vReg vtmp1, vReg vtmp2) %{
match(Set dst (VectorLongToMask src));
effect(TEMP vtmp1, TEMP vtmp2);
ins_cost(10 * SVE_COST);
format %{ "vmask_fromlong $dst, $src\t# vector mask fromlong (sve2)" %}
ins_encode %{
__ sve_vmask_fromlong(as_PRegister($dst$$reg), as_Register($src$$reg),
Matcher::vector_element_basic_type(this),
Matcher::vector_length(this),
as_FloatRegister($vtmp1$$reg), as_FloatRegister($vtmp2$$reg));
%}
ins_pipe(pipe_slow);
%}
// ---------------------------- Vector mask generation ---------------------------
// The rules below set predicate registers. They can guarantee the high bits of dst
// are cleared with zero when the vector length is less than the full size of

@ -3861,6 +3861,7 @@ void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
}
INSN(sve_bext, 0b00);
INSN(sve_bdep, 0b01);
#undef INSN
Assembler(CodeBuffer* code) : AbstractAssembler(code) {

@ -1030,6 +1030,66 @@ void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType
}
}
// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
//
// Parameters:
//   dst          - predicate register that receives the unpacked mask
//   src          - general purpose register holding the mask, one bit per lane
//   bt           - element type of the vector the mask applies to
//   lane_cnt     - number of lanes; must be a power of 2 and at most 64
//   vtmp1, vtmp2 - vector registers used as temporaries; both are overwritten
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
FloatRegister vtmp1, FloatRegister vtmp2) {
// Requires SVE2 with the bit-permute extension, because BDEP is used below.
assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
// Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
// Expected: dst = 0b01100101 10001101
// Put long value from general purpose register into the first lane of vector.
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
sve_dup(vtmp1, B, 0);
mov(vtmp1, D, 0, src);
// As sve_cmp generates mask value with the minimum unit in byte, we should
// transform the value in the first lane which is mask in bit now to the
// mask in byte, which can be done by SVE2's BDEP instruction.
// The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
if (lane_cnt <= 8) {
// Nothing. As only one byte exists.
} else if (lane_cnt <= 16) {
// Two mask bytes: copy byte 1 into byte 8 (the lowest byte of the second
// 8-byte group), then clear byte 1.
ins(vtmp1, B, vtmp1, 8, 1);
mov(vtmp1, B, 1, zr);
} else {
// More than two mask bytes: widen every byte to its own 8-byte element.
sve_vector_extend(vtmp1, D, vtmp1, B);
}
// The second source input of BDEP instruction, initialized with 0x01 for each byte.
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
sve_dup(vtmp2, B, 1);
// BDEP vtmp1.D, vtmp1.D, vtmp2.D
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
// ---------------------------------------
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
sve_bdep(vtmp1, D, vtmp1, vtmp2);
// Each mask bit now occupies one byte (0x00 or 0x01). Widen those bytes to the
// element size of bt so that the compare below works on one element per lane.
if (bt != T_BYTE) {
sve_vector_extend(vtmp1, size, vtmp1, B);
}
// Generate mask according to the given vector, in which the elements have been
// extended to expected type.
// dst = 0b01100101 10001101
sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
FloatRegister zn, FloatRegister zm, int cond) {
assert(pg->is_governing(), "This register has to be a governing predicate register");

@ -64,6 +64,11 @@
void sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
FloatRegister vtmp1, FloatRegister vtmp2);
// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
//   dst          - predicate register that receives the unpacked mask
//   src          - general purpose register holding the mask, one bit per lane
//   bt           - element type of the vector the mask applies to
//   lane_cnt     - number of lanes; the implementation asserts that it is a
//                  power of 2 and at most 64
//   vtmp1, vtmp2 - vector registers used as temporaries; both are overwritten
void sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
FloatRegister vtmp1, FloatRegister vtmp2);
// SIMD&FP comparison
void neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
FloatRegister src2, int cond, bool isQ);

@ -1908,6 +1908,7 @@ generate(SVEVectorOp, [["add", "ZZZ"],
["uzp2", "ZZZ"],
# SVE2 instructions
["bext", "ZZZ"],
["bdep", "ZZZ"],
])
generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],

@ -1195,17 +1195,18 @@
__ sve_uzp1(z10, __ S, z19, z11); // uzp1 z10.s, z19.s, z11.s
__ sve_uzp2(z23, __ D, z23, z8); // uzp2 z23.d, z23.d, z8.d
__ sve_bext(z17, __ S, z19, z19); // bext z17.s, z19.s, z19.s
__ sve_bdep(z4, __ D, z20, z13); // bdep z4.d, z20.d, z13.d
// SVEReductionOp
__ sve_andv(v4, __ D, p5, z13); // andv d4, p5, z13.d
__ sve_orv(v22, __ D, p7, z30); // orv d22, p7, z30.d
__ sve_eorv(v17, __ H, p4, z14); // eorv h17, p4, z14.h
__ sve_smaxv(v12, __ B, p7, z20); // smaxv b12, p7, z20.b
__ sve_sminv(v1, __ B, p3, z13); // sminv b1, p3, z13.b
__ sve_fminv(v7, __ D, p2, z11); // fminv d7, p2, z11.d
__ sve_fmaxv(v4, __ S, p6, z15); // fmaxv s4, p6, z15.s
__ sve_fadda(v3, __ D, p7, z0); // fadda d3, p7, d3, z0.d
__ sve_uaddv(v5, __ D, p5, z30); // uaddv d5, p5, z30.d
__ sve_andv(v22, __ D, p7, z30); // andv d22, p7, z30.d
__ sve_orv(v17, __ H, p4, z14); // orv h17, p4, z14.h
__ sve_eorv(v12, __ B, p7, z20); // eorv b12, p7, z20.b
__ sve_smaxv(v1, __ B, p3, z13); // smaxv b1, p3, z13.b
__ sve_sminv(v7, __ S, p2, z11); // sminv s7, p2, z11.s
__ sve_fminv(v4, __ S, p6, z15); // fminv s4, p6, z15.s
__ sve_fmaxv(v3, __ D, p7, z0); // fmaxv d3, p7, z0.d
__ sve_fadda(v5, __ D, p5, z30); // fadda d5, p5, d5, z30.d
__ sve_uaddv(v13, __ H, p3, z8); // uaddv d13, p3, z8.h
__ bind(forth);
@ -1224,30 +1225,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x140003f0, 0x94000000,
0x97ffffd4, 0x940003ed, 0x3400000a, 0x34fffa2a,
0x34007d4a, 0x35000008, 0x35fff9c8, 0x35007ce8,
0xb400000b, 0xb4fff96b, 0xb4007c8b, 0xb500001d,
0xb5fff91d, 0xb5007c3d, 0x10000013, 0x10fff8b3,
0x10007bd3, 0x90000013, 0x36300016, 0x3637f836,
0x36307b56, 0x3758000c, 0x375ff7cc, 0x37587aec,
0x14000000, 0x17ffffd7, 0x140003f1, 0x94000000,
0x97ffffd4, 0x940003ee, 0x3400000a, 0x34fffa2a,
0x34007d6a, 0x35000008, 0x35fff9c8, 0x35007d08,
0xb400000b, 0xb4fff96b, 0xb4007cab, 0xb500001d,
0xb5fff91d, 0xb5007c5d, 0x10000013, 0x10fff8b3,
0x10007bf3, 0x90000013, 0x36300016, 0x3637f836,
0x36307b76, 0x3758000c, 0x375ff7cc, 0x37587b0c,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x540078c0, 0x54000001, 0x54fff541, 0x54007861,
0x54000002, 0x54fff4e2, 0x54007802, 0x54000002,
0x54fff482, 0x540077a2, 0x54000003, 0x54fff423,
0x54007743, 0x54000003, 0x54fff3c3, 0x540076e3,
0x54000004, 0x54fff364, 0x54007684, 0x54000005,
0x54fff305, 0x54007625, 0x54000006, 0x54fff2a6,
0x540075c6, 0x54000007, 0x54fff247, 0x54007567,
0x54000008, 0x54fff1e8, 0x54007508, 0x54000009,
0x54fff189, 0x540074a9, 0x5400000a, 0x54fff12a,
0x5400744a, 0x5400000b, 0x54fff0cb, 0x540073eb,
0x5400000c, 0x54fff06c, 0x5400738c, 0x5400000d,
0x54fff00d, 0x5400732d, 0x5400000e, 0x54ffefae,
0x540072ce, 0x5400000f, 0x54ffef4f, 0x5400726f,
0x540078e0, 0x54000001, 0x54fff541, 0x54007881,
0x54000002, 0x54fff4e2, 0x54007822, 0x54000002,
0x54fff482, 0x540077c2, 0x54000003, 0x54fff423,
0x54007763, 0x54000003, 0x54fff3c3, 0x54007703,
0x54000004, 0x54fff364, 0x540076a4, 0x54000005,
0x54fff305, 0x54007645, 0x54000006, 0x54fff2a6,
0x540075e6, 0x54000007, 0x54fff247, 0x54007587,
0x54000008, 0x54fff1e8, 0x54007528, 0x54000009,
0x54fff189, 0x540074c9, 0x5400000a, 0x54fff12a,
0x5400746a, 0x5400000b, 0x54fff0cb, 0x5400740b,
0x5400000c, 0x54fff06c, 0x540073ac, 0x5400000d,
0x54fff00d, 0x5400734d, 0x5400000e, 0x54ffefae,
0x540072ee, 0x5400000f, 0x54ffef4f, 0x5400728f,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
@ -1474,8 +1475,8 @@
0x65eec81b, 0x65e3f415, 0x65fd4739, 0x65ee6191,
0x04c2422d, 0x045d76b4, 0x04203048, 0x04a032d7,
0x04773359, 0x04e132b5, 0x05ab6a6a, 0x05e86ef7,
0x4593b271, 0x04da35a4, 0x04d83fd6, 0x045931d1,
0x04083e8c, 0x040a2da1, 0x65c72967, 0x658639e4,
0x65d83c03, 0x04c137c5,
0x4593b271, 0x45cdb684, 0x04da3fd6, 0x045831d1,
0x04193e8c, 0x04082da1, 0x048a2967, 0x658739e4,
0x65c63c03, 0x65d837c5, 0x04412d0d,
};
// END Generated code -- do not edit