8282966: AArch64: Optimize VectorMask.toLong with SVE2
Reviewed-by: xgong, ngasson
This commit is contained in:
parent
57a7670886
commit
e9f45bb270
@ -149,6 +149,8 @@ source %{
|
|||||||
case Op_LoadVector:
|
case Op_LoadVector:
|
||||||
case Op_StoreVector:
|
case Op_StoreVector:
|
||||||
return Matcher::vector_size_supported(bt, vlen);
|
return Matcher::vector_size_supported(bt, vlen);
|
||||||
|
case Op_VectorMaskToLong:
|
||||||
|
if (vlen > 64) return false;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -5487,8 +5489,7 @@ instruct vmask_lasttrue(iRegINoSp dst, pReg src, pReg ptmp) %{
|
|||||||
%}
|
%}
|
||||||
|
|
||||||
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
|
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
|
||||||
predicate(UseSVE > 0 &&
|
predicate(UseSVE > 0);
|
||||||
n->in(1)->bottom_type()->is_vect()->length() <= 64);
|
|
||||||
match(Set dst (VectorMaskToLong src));
|
match(Set dst (VectorMaskToLong src));
|
||||||
effect(TEMP vtmp1, TEMP vtmp2);
|
effect(TEMP vtmp1, TEMP vtmp2);
|
||||||
ins_cost(13 * SVE_COST);
|
ins_cost(13 * SVE_COST);
|
||||||
|
@ -144,6 +144,8 @@ source %{
|
|||||||
case Op_LoadVector:
|
case Op_LoadVector:
|
||||||
case Op_StoreVector:
|
case Op_StoreVector:
|
||||||
return Matcher::vector_size_supported(bt, vlen);
|
return Matcher::vector_size_supported(bt, vlen);
|
||||||
|
case Op_VectorMaskToLong:
|
||||||
|
if (vlen > 64) return false;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -3055,8 +3057,7 @@ instruct vmask_lasttrue(iRegINoSp dst, pReg src, pReg ptmp) %{
|
|||||||
%}
|
%}
|
||||||
|
|
||||||
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
|
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
|
||||||
predicate(UseSVE > 0 &&
|
predicate(UseSVE > 0);
|
||||||
n->in(1)->bottom_type()->is_vect()->length() <= 64);
|
|
||||||
match(Set dst (VectorMaskToLong src));
|
match(Set dst (VectorMaskToLong src));
|
||||||
effect(TEMP vtmp1, TEMP vtmp2);
|
effect(TEMP vtmp1, TEMP vtmp2);
|
||||||
ins_cost(13 * SVE_COST);
|
ins_cost(13 * SVE_COST);
|
||||||
|
@ -3819,6 +3819,19 @@ void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
|
|||||||
f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0);
|
f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SVE2 bitwise permute
|
||||||
|
#define INSN(NAME, opc) \
|
||||||
|
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \
|
||||||
|
starti; \
|
||||||
|
assert(T != Q, "invalid size"); \
|
||||||
|
f(0b01000101, 31, 24), f(T, 23, 22), f(0b0, 21); \
|
||||||
|
rf(Zm, 16), f(0b1011, 15, 12), f(opc, 11, 10); \
|
||||||
|
rf(Zn, 5), rf(Zd, 0); \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSN(sve_bext, 0b00);
|
||||||
|
#undef INSN
|
||||||
|
|
||||||
Assembler(CodeBuffer* code) : AbstractAssembler(code) {
|
Assembler(CodeBuffer* code) : AbstractAssembler(code) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -958,7 +958,7 @@ void C2_MacroAssembler::bytemask_compress(Register dst) {
|
|||||||
|
|
||||||
// Pack the lowest-numbered bit of each mask element in src into a long value
|
// Pack the lowest-numbered bit of each mask element in src into a long value
|
||||||
// in dst, at most the first 64 lane elements.
|
// in dst, at most the first 64 lane elements.
|
||||||
// Clobbers: rscratch1
|
// Clobbers: rscratch1 if hardware doesn't support FEAT_BITPERM.
|
||||||
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
|
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
|
||||||
FloatRegister vtmp1, FloatRegister vtmp2) {
|
FloatRegister vtmp1, FloatRegister vtmp2) {
|
||||||
assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
|
assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
|
||||||
@ -966,24 +966,66 @@ void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType
|
|||||||
assert_different_registers(vtmp1, vtmp2);
|
assert_different_registers(vtmp1, vtmp2);
|
||||||
|
|
||||||
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
|
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
|
||||||
|
// Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
|
||||||
|
// Expected: dst = 0x658D
|
||||||
|
|
||||||
// Pack the mask into vector with sequential bytes.
|
// Convert the mask into vector with sequential bytes.
|
||||||
|
// vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
|
||||||
sve_cpy(vtmp1, size, src, 1, false);
|
sve_cpy(vtmp1, size, src, 1, false);
|
||||||
if (bt != T_BYTE) {
|
if (bt != T_BYTE) {
|
||||||
sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
|
sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compress the lowest 8 bytes.
|
if (UseSVE > 0 && !VM_Version::supports_svebitperm()) {
|
||||||
fmovd(dst, vtmp1);
|
// Compress the lowest 8 bytes.
|
||||||
bytemask_compress(dst);
|
fmovd(dst, vtmp1);
|
||||||
if (lane_cnt <= 8) return;
|
bytemask_compress(dst);
|
||||||
|
if (lane_cnt <= 8) return;
|
||||||
|
|
||||||
// Repeat on higher bytes and join the results.
|
// Repeat on higher bytes and join the results.
|
||||||
// Compress 8 bytes in each iteration.
|
// Compress 8 bytes in each iteration.
|
||||||
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
|
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
|
||||||
sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
|
sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
|
||||||
bytemask_compress(rscratch1);
|
bytemask_compress(rscratch1);
|
||||||
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
|
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
|
||||||
|
}
|
||||||
|
} else if (UseSVE == 2 && VM_Version::supports_svebitperm()) {
|
||||||
|
// Given a vector with value 0x00 or 0x01 in each byte, the basic idea
|
||||||
|
// is to compress each significant bit of the byte in a cross-lane way. Due
|
||||||
|
// to the lack of cross-lane bit-compress instruction, here we use BEXT
|
||||||
|
// (bit-compress in each lane) with the biggest lane size (T = D) and
|
||||||
|
// then concatenate the results.
|
||||||
|
|
||||||
|
// The second source input of BEXT, initialized with 0x01 in each byte.
|
||||||
|
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
|
||||||
|
sve_dup(vtmp2, B, 1);
|
||||||
|
|
||||||
|
// BEXT vtmp1.D, vtmp1.D, vtmp2.D
|
||||||
|
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
|
||||||
|
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
|
||||||
|
// ---------------------------------------
|
||||||
|
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
|
||||||
|
sve_bext(vtmp1, D, vtmp1, vtmp2);
|
||||||
|
|
||||||
|
// Concatenate the least significant 8 bits of each 8 bytes, and extract the
|
||||||
|
// result to dst.
|
||||||
|
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
|
||||||
|
// dst = 0x658D
|
||||||
|
if (lane_cnt <= 8) {
|
||||||
|
// No need to concatenate.
|
||||||
|
umov(dst, vtmp1, B, 0);
|
||||||
|
} else if (lane_cnt <= 16) {
|
||||||
|
ins(vtmp1, B, vtmp1, 1, 8);
|
||||||
|
umov(dst, vtmp1, H, 0);
|
||||||
|
} else {
|
||||||
|
// As the lane count is 64 at most, the final expected value must be in
|
||||||
|
// the lowest 64 bits after narrowing vtmp1 from D to B.
|
||||||
|
sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
|
||||||
|
umov(dst, vtmp1, D, 0);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
assert(false, "unsupported");
|
||||||
|
ShouldNotReachHere();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2000, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
|
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
*
|
*
|
||||||
@ -275,6 +275,9 @@ class PRegisterImpl: public AbstractRegisterImpl {
|
|||||||
REGISTER_IMPL_DECLARATION(PRegister, PRegisterImpl, PRegisterImpl::number_of_registers);
|
REGISTER_IMPL_DECLARATION(PRegister, PRegisterImpl, PRegisterImpl::number_of_registers);
|
||||||
|
|
||||||
// The predicate registers of SVE.
|
// The predicate registers of SVE.
|
||||||
|
//
|
||||||
|
CONSTANT_REGISTER_DECLARATION(PRegister, pnoreg, (-1));
|
||||||
|
|
||||||
CONSTANT_REGISTER_DECLARATION(PRegister, p0, ( 0));
|
CONSTANT_REGISTER_DECLARATION(PRegister, p0, ( 0));
|
||||||
CONSTANT_REGISTER_DECLARATION(PRegister, p1, ( 1));
|
CONSTANT_REGISTER_DECLARATION(PRegister, p1, ( 1));
|
||||||
CONSTANT_REGISTER_DECLARATION(PRegister, p2, ( 2));
|
CONSTANT_REGISTER_DECLARATION(PRegister, p2, ( 2));
|
||||||
|
@ -1894,6 +1894,8 @@ generate(SVEVectorOp, [["add", "ZZZ"],
|
|||||||
["bic", "ZZZ"],
|
["bic", "ZZZ"],
|
||||||
["uzp1", "ZZZ"],
|
["uzp1", "ZZZ"],
|
||||||
["uzp2", "ZZZ"],
|
["uzp2", "ZZZ"],
|
||||||
|
# SVE2 instructions
|
||||||
|
["bext", "ZZZ"],
|
||||||
])
|
])
|
||||||
|
|
||||||
generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],
|
generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],
|
||||||
@ -1904,8 +1906,9 @@ outfile.write("forth:\n")
|
|||||||
|
|
||||||
outfile.close()
|
outfile.close()
|
||||||
|
|
||||||
# compile for sve with 8.3 and sha3 because of SHA3 crypto extension.
|
# compile for sve with armv9-a+sha3+sve2-bitperm because of SHA3 crypto extension and SVE2 bitperm instructions.
|
||||||
subprocess.check_call([AARCH64_AS, "-march=armv8.3-a+sha3+sve", "aarch64ops.s", "-o", "aarch64ops.o"])
|
# armv9-a enables sve and sve2 by default.
|
||||||
|
subprocess.check_call([AARCH64_AS, "-march=armv9-a+sha3+sve2-bitperm", "aarch64ops.s", "-o", "aarch64ops.o"])
|
||||||
|
|
||||||
print
|
print
|
||||||
print "/*"
|
print "/*"
|
||||||
|
@ -1183,17 +1183,18 @@
|
|||||||
__ sve_bic(z8, z2, z0); // bic z8.d, z2.d, z0.d
|
__ sve_bic(z8, z2, z0); // bic z8.d, z2.d, z0.d
|
||||||
__ sve_uzp1(z23, __ S, z22, z0); // uzp1 z23.s, z22.s, z0.s
|
__ sve_uzp1(z23, __ S, z22, z0); // uzp1 z23.s, z22.s, z0.s
|
||||||
__ sve_uzp2(z25, __ H, z26, z23); // uzp2 z25.h, z26.h, z23.h
|
__ sve_uzp2(z25, __ H, z26, z23); // uzp2 z25.h, z26.h, z23.h
|
||||||
|
__ sve_bext(z21, __ B, z21, z1); // bext z21.b, z21.b, z1.b
|
||||||
|
|
||||||
// SVEReductionOp
|
// SVEReductionOp
|
||||||
__ sve_andv(v21, __ B, p5, z1); // andv b21, p5, z1.b
|
__ sve_andv(v10, __ S, p5, z11); // andv s10, p5, z11.s
|
||||||
__ sve_orv(v10, __ S, p5, z11); // orv s10, p5, z11.s
|
__ sve_orv(v23, __ D, p6, z8); // orv d23, p6, z8.d
|
||||||
__ sve_eorv(v23, __ D, p6, z8); // eorv d23, p6, z8.d
|
__ sve_eorv(v17, __ S, p5, z19); // eorv s17, p5, z19.s
|
||||||
__ sve_smaxv(v17, __ S, p5, z19); // smaxv s17, p5, z19.s
|
__ sve_smaxv(v4, __ D, p5, z13); // smaxv d4, p5, z13.d
|
||||||
__ sve_sminv(v4, __ D, p5, z13); // sminv d4, p5, z13.d
|
__ sve_sminv(v22, __ D, p7, z30); // sminv d22, p7, z30.d
|
||||||
__ sve_fminv(v22, __ D, p7, z30); // fminv d22, p7, z30.d
|
__ sve_fminv(v17, __ S, p4, z14); // fminv s17, p4, z14.s
|
||||||
__ sve_fmaxv(v17, __ S, p4, z14); // fmaxv s17, p4, z14.s
|
__ sve_fmaxv(v12, __ S, p7, z20); // fmaxv s12, p7, z20.s
|
||||||
__ sve_fadda(v12, __ S, p7, z20); // fadda s12, p7, s12, z20.s
|
__ sve_fadda(v1, __ S, p3, z13); // fadda s1, p3, s1, z13.s
|
||||||
__ sve_uaddv(v1, __ B, p3, z13); // uaddv d1, p3, z13.b
|
__ sve_uaddv(v7, __ S, p2, z11); // uaddv d7, p2, z11.s
|
||||||
|
|
||||||
__ bind(forth);
|
__ bind(forth);
|
||||||
|
|
||||||
@ -1212,30 +1213,30 @@
|
|||||||
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
|
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
|
||||||
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
|
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
|
||||||
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
|
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
|
||||||
0x14000000, 0x17ffffd7, 0x140003e4, 0x94000000,
|
0x14000000, 0x17ffffd7, 0x140003e5, 0x94000000,
|
||||||
0x97ffffd4, 0x940003e1, 0x3400000a, 0x34fffa2a,
|
0x97ffffd4, 0x940003e2, 0x3400000a, 0x34fffa2a,
|
||||||
0x34007bca, 0x35000008, 0x35fff9c8, 0x35007b68,
|
0x34007bea, 0x35000008, 0x35fff9c8, 0x35007b88,
|
||||||
0xb400000b, 0xb4fff96b, 0xb4007b0b, 0xb500001d,
|
0xb400000b, 0xb4fff96b, 0xb4007b2b, 0xb500001d,
|
||||||
0xb5fff91d, 0xb5007abd, 0x10000013, 0x10fff8b3,
|
0xb5fff91d, 0xb5007add, 0x10000013, 0x10fff8b3,
|
||||||
0x10007a53, 0x90000013, 0x36300016, 0x3637f836,
|
0x10007a73, 0x90000013, 0x36300016, 0x3637f836,
|
||||||
0x363079d6, 0x3758000c, 0x375ff7cc, 0x3758796c,
|
0x363079f6, 0x3758000c, 0x375ff7cc, 0x3758798c,
|
||||||
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
|
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
|
||||||
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
|
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
|
||||||
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
|
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
|
||||||
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
|
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
|
||||||
0x54007740, 0x54000001, 0x54fff541, 0x540076e1,
|
0x54007760, 0x54000001, 0x54fff541, 0x54007701,
|
||||||
0x54000002, 0x54fff4e2, 0x54007682, 0x54000002,
|
0x54000002, 0x54fff4e2, 0x540076a2, 0x54000002,
|
||||||
0x54fff482, 0x54007622, 0x54000003, 0x54fff423,
|
0x54fff482, 0x54007642, 0x54000003, 0x54fff423,
|
||||||
0x540075c3, 0x54000003, 0x54fff3c3, 0x54007563,
|
0x540075e3, 0x54000003, 0x54fff3c3, 0x54007583,
|
||||||
0x54000004, 0x54fff364, 0x54007504, 0x54000005,
|
0x54000004, 0x54fff364, 0x54007524, 0x54000005,
|
||||||
0x54fff305, 0x540074a5, 0x54000006, 0x54fff2a6,
|
0x54fff305, 0x540074c5, 0x54000006, 0x54fff2a6,
|
||||||
0x54007446, 0x54000007, 0x54fff247, 0x540073e7,
|
0x54007466, 0x54000007, 0x54fff247, 0x54007407,
|
||||||
0x54000008, 0x54fff1e8, 0x54007388, 0x54000009,
|
0x54000008, 0x54fff1e8, 0x540073a8, 0x54000009,
|
||||||
0x54fff189, 0x54007329, 0x5400000a, 0x54fff12a,
|
0x54fff189, 0x54007349, 0x5400000a, 0x54fff12a,
|
||||||
0x540072ca, 0x5400000b, 0x54fff0cb, 0x5400726b,
|
0x540072ea, 0x5400000b, 0x54fff0cb, 0x5400728b,
|
||||||
0x5400000c, 0x54fff06c, 0x5400720c, 0x5400000d,
|
0x5400000c, 0x54fff06c, 0x5400722c, 0x5400000d,
|
||||||
0x54fff00d, 0x540071ad, 0x5400000e, 0x54ffefae,
|
0x54fff00d, 0x540071cd, 0x5400000e, 0x54ffefae,
|
||||||
0x5400714e, 0x5400000f, 0x54ffef4f, 0x540070ef,
|
0x5400716e, 0x5400000f, 0x54ffef4f, 0x5400710f,
|
||||||
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
|
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
|
||||||
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
|
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
|
||||||
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
|
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
|
||||||
@ -1459,8 +1460,8 @@
|
|||||||
0x65f1af0b, 0x65eec9f1, 0x65a7fed6, 0x65aa5f65,
|
0x65f1af0b, 0x65eec9f1, 0x65a7fed6, 0x65aa5f65,
|
||||||
0x65b47aae, 0x04c55723, 0x0441723d, 0x042d33ae,
|
0x65b47aae, 0x04c55723, 0x0441723d, 0x042d33ae,
|
||||||
0x04be3051, 0x047d32b6, 0x04e03048, 0x05a06ad7,
|
0x04be3051, 0x047d32b6, 0x04e03048, 0x05a06ad7,
|
||||||
0x05776f59, 0x041a3435, 0x0498356a, 0x04d93917,
|
0x05776f59, 0x4501b2b5, 0x049a356a, 0x04d83917,
|
||||||
0x04883671, 0x04ca35a4, 0x65c73fd6, 0x658631d1,
|
0x04993671, 0x04c835a4, 0x04ca3fd6, 0x658731d1,
|
||||||
0x65983e8c, 0x04012da1,
|
0x65863e8c, 0x65982da1, 0x04812967,
|
||||||
};
|
};
|
||||||
// END Generated code -- do not edit
|
// END Generated code -- do not edit
|
||||||
|
Loading…
Reference in New Issue
Block a user