8282966: AArch64: Optimize VectorMask.toLong with SVE2

Reviewed-by: xgong, ngasson
This commit is contained in:
Eric Liu 2022-05-12 01:15:16 +00:00 committed by Xiaohong Gong
parent 57a7670886
commit e9f45bb270
7 changed files with 115 additions and 51 deletions

View File

@ -149,6 +149,8 @@ source %{
case Op_LoadVector:
case Op_StoreVector:
return Matcher::vector_size_supported(bt, vlen);
case Op_VectorMaskToLong:
if (vlen > 64) return false;
default:
break;
}
@ -5487,8 +5489,7 @@ instruct vmask_lasttrue(iRegINoSp dst, pReg src, pReg ptmp) %{
%}
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
predicate(UseSVE > 0 &&
n->in(1)->bottom_type()->is_vect()->length() <= 64);
predicate(UseSVE > 0);
match(Set dst (VectorMaskToLong src));
effect(TEMP vtmp1, TEMP vtmp2);
ins_cost(13 * SVE_COST);

View File

@ -144,6 +144,8 @@ source %{
case Op_LoadVector:
case Op_StoreVector:
return Matcher::vector_size_supported(bt, vlen);
case Op_VectorMaskToLong:
if (vlen > 64) return false;
default:
break;
}
@ -3055,8 +3057,7 @@ instruct vmask_lasttrue(iRegINoSp dst, pReg src, pReg ptmp) %{
%}
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
predicate(UseSVE > 0 &&
n->in(1)->bottom_type()->is_vect()->length() <= 64);
predicate(UseSVE > 0);
match(Set dst (VectorMaskToLong src));
effect(TEMP vtmp1, TEMP vtmp2);
ins_cost(13 * SVE_COST);

View File

@ -3819,6 +3819,19 @@ void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0);
}
// SVE2 bitwise permute (FEAT_BITPERM)
//
// Emits an SVE2 bit-permute instruction of the form:
//   NAME Zd.T, Zn.T, Zm.T
// Bit layout (from the fields below):
//   01000101 | size(T) | 0 | Zm | 1011 | opc | Zn | Zd
// T selects the element size; the 128-bit Q variant is not a valid
// size for these instructions (rejected by the assert).
#define INSN(NAME, opc) \
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \
starti; \
assert(T != Q, "invalid size"); \
f(0b01000101, 31, 24), f(T, 23, 22), f(0b0, 21); \
rf(Zm, 16), f(0b1011, 15, 12), f(opc, 11, 10); \
rf(Zn, 5), rf(Zd, 0); \
}
// BEXT (opc = 0b00): within each lane, gathers the bits of Zn selected
// by the set bits of the mask in Zm into the low-order bits of the
// corresponding lane of Zd (a per-lane bit compress) -- see the Arm ISA
// reference for the exact semantics.
INSN(sve_bext, 0b00);
#undef INSN
Assembler(CodeBuffer* code) : AbstractAssembler(code) {
}

View File

@ -958,7 +958,7 @@ void C2_MacroAssembler::bytemask_compress(Register dst) {
// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1
// Clobbers: rscratch1 if hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
FloatRegister vtmp1, FloatRegister vtmp2) {
assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
@ -966,24 +966,66 @@ void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType
assert_different_registers(vtmp1, vtmp2);
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
// Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
// Expected: dst = 0x658D
// Pack the mask into vector with sequential bytes.
// Convert the mask into vector with sequential bytes.
// vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
sve_cpy(vtmp1, size, src, 1, false);
if (bt != T_BYTE) {
sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
}
// Compress the lowest 8 bytes.
fmovd(dst, vtmp1);
bytemask_compress(dst);
if (lane_cnt <= 8) return;
if (UseSVE > 0 && !VM_Version::supports_svebitperm()) {
// Compress the lowest 8 bytes.
fmovd(dst, vtmp1);
bytemask_compress(dst);
if (lane_cnt <= 8) return;
// Repeat on higher bytes and join the results.
// Compress 8 bytes in each iteration.
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
bytemask_compress(rscratch1);
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
// Repeat on higher bytes and join the results.
// Compress 8 bytes in each iteration.
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
bytemask_compress(rscratch1);
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
}
} else if (UseSVE == 2 && VM_Version::supports_svebitperm()) {
// Given a vector in which each byte holds the value 0x00 or 0x01, the basic
// idea is to compress the significant bit of each byte in a cross-lane way.
// Since there is no cross-lane bit-compress instruction, we use BEXT
// (bit-compress within each lane) with the largest lane size (T = D) and
// then concatenate the per-lane results.
// The second source input of BEXT, initialized with 0x01 in each byte.
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
sve_dup(vtmp2, B, 1);
// BEXT vtmp1.D, vtmp1.D, vtmp2.D
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
// ---------------------------------------
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
sve_bext(vtmp1, D, vtmp1, vtmp2);
// Concatenate the least significant 8 bits of each 8-byte group, and extract
// the result to dst.
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
// dst = 0x658D
if (lane_cnt <= 8) {
// No need to concatenate.
umov(dst, vtmp1, B, 0);
} else if (lane_cnt <= 16) {
ins(vtmp1, B, vtmp1, 1, 8);
umov(dst, vtmp1, H, 0);
} else {
// As the lane count is 64 at most, the final expected value must be in
// the lowest 64 bits after narrowing vtmp1 from D to B.
sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
umov(dst, vtmp1, D, 0);
}
} else {
assert(false, "unsupported");
ShouldNotReachHere();
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -275,6 +275,9 @@ class PRegisterImpl: public AbstractRegisterImpl {
REGISTER_IMPL_DECLARATION(PRegister, PRegisterImpl, PRegisterImpl::number_of_registers);
// The predicate registers of SVE.
//
CONSTANT_REGISTER_DECLARATION(PRegister, pnoreg, (-1));
CONSTANT_REGISTER_DECLARATION(PRegister, p0, ( 0));
CONSTANT_REGISTER_DECLARATION(PRegister, p1, ( 1));
CONSTANT_REGISTER_DECLARATION(PRegister, p2, ( 2));

View File

@ -1894,6 +1894,8 @@ generate(SVEVectorOp, [["add", "ZZZ"],
["bic", "ZZZ"],
["uzp1", "ZZZ"],
["uzp2", "ZZZ"],
# SVE2 instructions
["bext", "ZZZ"],
])
generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],
@ -1904,8 +1906,9 @@ outfile.write("forth:\n")
outfile.close()
# compile for sve with 8.3 and sha3 because of SHA3 crypto extension.
subprocess.check_call([AARCH64_AS, "-march=armv8.3-a+sha3+sve", "aarch64ops.s", "-o", "aarch64ops.o"])
# compile for sve with armv9-a+sha3+sve2-bitperm because of SHA3 crypto extension and SVE2 bitperm instructions.
# armv9-a enables sve and sve2 by default.
subprocess.check_call([AARCH64_AS, "-march=armv9-a+sha3+sve2-bitperm", "aarch64ops.s", "-o", "aarch64ops.o"])
print
print "/*"

View File

@ -1183,17 +1183,18 @@
__ sve_bic(z8, z2, z0); // bic z8.d, z2.d, z0.d
__ sve_uzp1(z23, __ S, z22, z0); // uzp1 z23.s, z22.s, z0.s
__ sve_uzp2(z25, __ H, z26, z23); // uzp2 z25.h, z26.h, z23.h
__ sve_bext(z21, __ B, z21, z1); // bext z21.b, z21.b, z1.b
// SVEReductionOp
__ sve_andv(v21, __ B, p5, z1); // andv b21, p5, z1.b
__ sve_orv(v10, __ S, p5, z11); // orv s10, p5, z11.s
__ sve_eorv(v23, __ D, p6, z8); // eorv d23, p6, z8.d
__ sve_smaxv(v17, __ S, p5, z19); // smaxv s17, p5, z19.s
__ sve_sminv(v4, __ D, p5, z13); // sminv d4, p5, z13.d
__ sve_fminv(v22, __ D, p7, z30); // fminv d22, p7, z30.d
__ sve_fmaxv(v17, __ S, p4, z14); // fmaxv s17, p4, z14.s
__ sve_fadda(v12, __ S, p7, z20); // fadda s12, p7, s12, z20.s
__ sve_uaddv(v1, __ B, p3, z13); // uaddv d1, p3, z13.b
__ sve_andv(v10, __ S, p5, z11); // andv s10, p5, z11.s
__ sve_orv(v23, __ D, p6, z8); // orv d23, p6, z8.d
__ sve_eorv(v17, __ S, p5, z19); // eorv s17, p5, z19.s
__ sve_smaxv(v4, __ D, p5, z13); // smaxv d4, p5, z13.d
__ sve_sminv(v22, __ D, p7, z30); // sminv d22, p7, z30.d
__ sve_fminv(v17, __ S, p4, z14); // fminv s17, p4, z14.s
__ sve_fmaxv(v12, __ S, p7, z20); // fmaxv s12, p7, z20.s
__ sve_fadda(v1, __ S, p3, z13); // fadda s1, p3, s1, z13.s
__ sve_uaddv(v7, __ S, p2, z11); // uaddv d7, p2, z11.s
__ bind(forth);
@ -1212,30 +1213,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x140003e4, 0x94000000,
0x97ffffd4, 0x940003e1, 0x3400000a, 0x34fffa2a,
0x34007bca, 0x35000008, 0x35fff9c8, 0x35007b68,
0xb400000b, 0xb4fff96b, 0xb4007b0b, 0xb500001d,
0xb5fff91d, 0xb5007abd, 0x10000013, 0x10fff8b3,
0x10007a53, 0x90000013, 0x36300016, 0x3637f836,
0x363079d6, 0x3758000c, 0x375ff7cc, 0x3758796c,
0x14000000, 0x17ffffd7, 0x140003e5, 0x94000000,
0x97ffffd4, 0x940003e2, 0x3400000a, 0x34fffa2a,
0x34007bea, 0x35000008, 0x35fff9c8, 0x35007b88,
0xb400000b, 0xb4fff96b, 0xb4007b2b, 0xb500001d,
0xb5fff91d, 0xb5007add, 0x10000013, 0x10fff8b3,
0x10007a73, 0x90000013, 0x36300016, 0x3637f836,
0x363079f6, 0x3758000c, 0x375ff7cc, 0x3758798c,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x54007740, 0x54000001, 0x54fff541, 0x540076e1,
0x54000002, 0x54fff4e2, 0x54007682, 0x54000002,
0x54fff482, 0x54007622, 0x54000003, 0x54fff423,
0x540075c3, 0x54000003, 0x54fff3c3, 0x54007563,
0x54000004, 0x54fff364, 0x54007504, 0x54000005,
0x54fff305, 0x540074a5, 0x54000006, 0x54fff2a6,
0x54007446, 0x54000007, 0x54fff247, 0x540073e7,
0x54000008, 0x54fff1e8, 0x54007388, 0x54000009,
0x54fff189, 0x54007329, 0x5400000a, 0x54fff12a,
0x540072ca, 0x5400000b, 0x54fff0cb, 0x5400726b,
0x5400000c, 0x54fff06c, 0x5400720c, 0x5400000d,
0x54fff00d, 0x540071ad, 0x5400000e, 0x54ffefae,
0x5400714e, 0x5400000f, 0x54ffef4f, 0x540070ef,
0x54007760, 0x54000001, 0x54fff541, 0x54007701,
0x54000002, 0x54fff4e2, 0x540076a2, 0x54000002,
0x54fff482, 0x54007642, 0x54000003, 0x54fff423,
0x540075e3, 0x54000003, 0x54fff3c3, 0x54007583,
0x54000004, 0x54fff364, 0x54007524, 0x54000005,
0x54fff305, 0x540074c5, 0x54000006, 0x54fff2a6,
0x54007466, 0x54000007, 0x54fff247, 0x54007407,
0x54000008, 0x54fff1e8, 0x540073a8, 0x54000009,
0x54fff189, 0x54007349, 0x5400000a, 0x54fff12a,
0x540072ea, 0x5400000b, 0x54fff0cb, 0x5400728b,
0x5400000c, 0x54fff06c, 0x5400722c, 0x5400000d,
0x54fff00d, 0x540071cd, 0x5400000e, 0x54ffefae,
0x5400716e, 0x5400000f, 0x54ffef4f, 0x5400710f,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
@ -1459,8 +1460,8 @@
0x65f1af0b, 0x65eec9f1, 0x65a7fed6, 0x65aa5f65,
0x65b47aae, 0x04c55723, 0x0441723d, 0x042d33ae,
0x04be3051, 0x047d32b6, 0x04e03048, 0x05a06ad7,
0x05776f59, 0x041a3435, 0x0498356a, 0x04d93917,
0x04883671, 0x04ca35a4, 0x65c73fd6, 0x658631d1,
0x65983e8c, 0x04012da1,
0x05776f59, 0x4501b2b5, 0x049a356a, 0x04d83917,
0x04993671, 0x04c835a4, 0x04ca3fd6, 0x658731d1,
0x65863e8c, 0x65982da1, 0x04812967,
};
// END Generated code -- do not edit