8282966: AArch64: Optimize VectorMask.toLong with SVE2

Reviewed-by: xgong, ngasson
This commit is contained in:
Eric Liu 2022-05-12 01:15:16 +00:00 committed by Xiaohong Gong
parent 57a7670886
commit e9f45bb270
7 changed files with 115 additions and 51 deletions

View File

@ -149,6 +149,8 @@ source %{
case Op_LoadVector:
case Op_StoreVector:
return Matcher::vector_size_supported(bt, vlen);
case Op_VectorMaskToLong:
if (vlen > 64) return false;
default:
break;
}
@ -5487,8 +5489,7 @@ instruct vmask_lasttrue(iRegINoSp dst, pReg src, pReg ptmp) %{
%}
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
predicate(UseSVE > 0 &&
n->in(1)->bottom_type()->is_vect()->length() <= 64);
predicate(UseSVE > 0);
match(Set dst (VectorMaskToLong src));
effect(TEMP vtmp1, TEMP vtmp2);
ins_cost(13 * SVE_COST);

View File

@ -144,6 +144,8 @@ source %{
case Op_LoadVector:
case Op_StoreVector:
return Matcher::vector_size_supported(bt, vlen);
case Op_VectorMaskToLong:
if (vlen > 64) return false;
default:
break;
}
@ -3055,8 +3057,7 @@ instruct vmask_lasttrue(iRegINoSp dst, pReg src, pReg ptmp) %{
%}
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
predicate(UseSVE > 0 &&
n->in(1)->bottom_type()->is_vect()->length() <= 64);
predicate(UseSVE > 0);
match(Set dst (VectorMaskToLong src));
effect(TEMP vtmp1, TEMP vtmp2);
ins_cost(13 * SVE_COST);

View File

@ -3819,6 +3819,19 @@ void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0);
}
// SVE2 bitwise permute (FEAT_BITPERM)
//
// Emits an SVE2 bit-permute instruction of the form:
//   NAME Zd.T, Zn.T, Zm.T
// Bit layout (from the fields below):
//   01000101 | size(T) | 0 | Zm | 1011 | opc | Zn | Zd
// T selects the element size; the 128-bit Q variant is not a valid
// size for these instructions (rejected by the assert).
#define INSN(NAME, opc) \
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \
starti; \
assert(T != Q, "invalid size"); \
f(0b01000101, 31, 24), f(T, 23, 22), f(0b0, 21); \
rf(Zm, 16), f(0b1011, 15, 12), f(opc, 11, 10); \
rf(Zn, 5), rf(Zd, 0); \
}
// BEXT (opc = 0b00): within each lane, gathers the bits of Zn selected
// by the set bits of the mask in Zm into the low-order bits of the
// corresponding lane of Zd (a per-lane bit compress) -- see the Arm ISA
// reference for the exact semantics.
INSN(sve_bext, 0b00);
#undef INSN
Assembler(CodeBuffer* code) : AbstractAssembler(code) {
}

View File

@ -958,7 +958,7 @@ void C2_MacroAssembler::bytemask_compress(Register dst) {
// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1
// Clobbers: rscratch1 if hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
FloatRegister vtmp1, FloatRegister vtmp2) {
assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
@ -966,24 +966,66 @@ void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType
assert_different_registers(vtmp1, vtmp2);
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
// Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
// Expected: dst = 0x658D
// Pack the mask into vector with sequential bytes.
// Convert the mask into vector with sequential bytes.
// vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
sve_cpy(vtmp1, size, src, 1, false);
if (bt != T_BYTE) {
sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
}
// Compress the lowest 8 bytes.
fmovd(dst, vtmp1);
bytemask_compress(dst);
if (lane_cnt <= 8) return;
if (UseSVE > 0 && !VM_Version::supports_svebitperm()) {
// Compress the lowest 8 bytes.
fmovd(dst, vtmp1);
bytemask_compress(dst);
if (lane_cnt <= 8) return;
// Repeat on higher bytes and join the results.
// Compress 8 bytes in each iteration.
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
bytemask_compress(rscratch1);
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
// Repeat on higher bytes and join the results.
// Compress 8 bytes in each iteration.
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
bytemask_compress(rscratch1);
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
}
} else if (UseSVE == 2 && VM_Version::supports_svebitperm()) {
// Given a vector in which each byte holds the value 0x00 or 0x01, the basic
// idea is to compress the significant bit of each byte in a cross-lane way.
// Since there is no cross-lane bit-compress instruction, we use BEXT
// (bit-compress within each lane) with the largest lane size (T = D) and
// then concatenate the per-lane results.
// The second source input of BEXT, initialized with 0x01 in each byte.
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
sve_dup(vtmp2, B, 1);
// BEXT vtmp1.D, vtmp1.D, vtmp2.D
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
// ---------------------------------------
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
sve_bext(vtmp1, D, vtmp1, vtmp2);
// Concatenate the least significant 8 bits of each 8-byte group, and extract
// the result to dst.
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
// dst = 0x658D
if (lane_cnt <= 8) {
// No need to concatenate.
umov(dst, vtmp1, B, 0);
} else if (lane_cnt <= 16) {
ins(vtmp1, B, vtmp1, 1, 8);
umov(dst, vtmp1, H, 0);
} else {
// As the lane count is 64 at most, the final expected value must be in
// the lowest 64 bits after narrowing vtmp1 from D to B.
sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
umov(dst, vtmp1, D, 0);
}
} else {
assert(false, "unsupported");
ShouldNotReachHere();
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -275,6 +275,9 @@ class PRegisterImpl: public AbstractRegisterImpl {
REGISTER_IMPL_DECLARATION(PRegister, PRegisterImpl, PRegisterImpl::number_of_registers);
// The predicate registers of SVE.
//
CONSTANT_REGISTER_DECLARATION(PRegister, pnoreg, (-1));
CONSTANT_REGISTER_DECLARATION(PRegister, p0, ( 0));
CONSTANT_REGISTER_DECLARATION(PRegister, p1, ( 1));
CONSTANT_REGISTER_DECLARATION(PRegister, p2, ( 2));

View File

@ -1894,6 +1894,8 @@ generate(SVEVectorOp, [["add", "ZZZ"],
["bic", "ZZZ"],
["uzp1", "ZZZ"],
["uzp2", "ZZZ"],
# SVE2 instructions
["bext", "ZZZ"],
])
generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],
@ -1904,8 +1906,9 @@ outfile.write("forth:\n")
outfile.close()
# compile for sve with 8.3 and sha3 because of SHA3 crypto extension.
subprocess.check_call([AARCH64_AS, "-march=armv8.3-a+sha3+sve", "aarch64ops.s", "-o", "aarch64ops.o"])
# compile for sve with armv9-a+sha3+sve2-bitperm because of SHA3 crypto extension and SVE2 bitperm instructions.
# armv9-a enables sve and sve2 by default.
subprocess.check_call([AARCH64_AS, "-march=armv9-a+sha3+sve2-bitperm", "aarch64ops.s", "-o", "aarch64ops.o"])
print
print "/*"

View File

@ -1183,17 +1183,18 @@
__ sve_bic(z8, z2, z0); // bic z8.d, z2.d, z0.d
__ sve_uzp1(z23, __ S, z22, z0); // uzp1 z23.s, z22.s, z0.s
__ sve_uzp2(z25, __ H, z26, z23); // uzp2 z25.h, z26.h, z23.h
__ sve_bext(z21, __ B, z21, z1); // bext z21.b, z21.b, z1.b
// SVEReductionOp
__ sve_andv(v21, __ B, p5, z1); // andv b21, p5, z1.b
__ sve_orv(v10, __ S, p5, z11); // orv s10, p5, z11.s
__ sve_eorv(v23, __ D, p6, z8); // eorv d23, p6, z8.d
__ sve_smaxv(v17, __ S, p5, z19); // smaxv s17, p5, z19.s
__ sve_sminv(v4, __ D, p5, z13); // sminv d4, p5, z13.d
__ sve_fminv(v22, __ D, p7, z30); // fminv d22, p7, z30.d
__ sve_fmaxv(v17, __ S, p4, z14); // fmaxv s17, p4, z14.s
__ sve_fadda(v12, __ S, p7, z20); // fadda s12, p7, s12, z20.s
__ sve_uaddv(v1, __ B, p3, z13); // uaddv d1, p3, z13.b
__ sve_andv(v10, __ S, p5, z11); // andv s10, p5, z11.s
__ sve_orv(v23, __ D, p6, z8); // orv d23, p6, z8.d
__ sve_eorv(v17, __ S, p5, z19); // eorv s17, p5, z19.s
__ sve_smaxv(v4, __ D, p5, z13); // smaxv d4, p5, z13.d
__ sve_sminv(v22, __ D, p7, z30); // sminv d22, p7, z30.d
__ sve_fminv(v17, __ S, p4, z14); // fminv s17, p4, z14.s
__ sve_fmaxv(v12, __ S, p7, z20); // fmaxv s12, p7, z20.s
__ sve_fadda(v1, __ S, p3, z13); // fadda s1, p3, s1, z13.s
__ sve_uaddv(v7, __ S, p2, z11); // uaddv d7, p2, z11.s
__ bind(forth);
@ -1212,30 +1213,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x140003e4, 0x94000000,
0x97ffffd4, 0x940003e1, 0x3400000a, 0x34fffa2a,
0x34007bca, 0x35000008, 0x35fff9c8, 0x35007b68,
0xb400000b, 0xb4fff96b, 0xb4007b0b, 0xb500001d,
0xb5fff91d, 0xb5007abd, 0x10000013, 0x10fff8b3,
0x10007a53, 0x90000013, 0x36300016, 0x3637f836,
0x363079d6, 0x3758000c, 0x375ff7cc, 0x3758796c,
0x14000000, 0x17ffffd7, 0x140003e5, 0x94000000,
0x97ffffd4, 0x940003e2, 0x3400000a, 0x34fffa2a,
0x34007bea, 0x35000008, 0x35fff9c8, 0x35007b88,
0xb400000b, 0xb4fff96b, 0xb4007b2b, 0xb500001d,
0xb5fff91d, 0xb5007add, 0x10000013, 0x10fff8b3,
0x10007a73, 0x90000013, 0x36300016, 0x3637f836,
0x363079f6, 0x3758000c, 0x375ff7cc, 0x3758798c,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x54007740, 0x54000001, 0x54fff541, 0x540076e1,
0x54000002, 0x54fff4e2, 0x54007682, 0x54000002,
0x54fff482, 0x54007622, 0x54000003, 0x54fff423,
0x540075c3, 0x54000003, 0x54fff3c3, 0x54007563,
0x54000004, 0x54fff364, 0x54007504, 0x54000005,
0x54fff305, 0x540074a5, 0x54000006, 0x54fff2a6,
0x54007446, 0x54000007, 0x54fff247, 0x540073e7,
0x54000008, 0x54fff1e8, 0x54007388, 0x54000009,
0x54fff189, 0x54007329, 0x5400000a, 0x54fff12a,
0x540072ca, 0x5400000b, 0x54fff0cb, 0x5400726b,
0x5400000c, 0x54fff06c, 0x5400720c, 0x5400000d,
0x54fff00d, 0x540071ad, 0x5400000e, 0x54ffefae,
0x5400714e, 0x5400000f, 0x54ffef4f, 0x540070ef,
0x54007760, 0x54000001, 0x54fff541, 0x54007701,
0x54000002, 0x54fff4e2, 0x540076a2, 0x54000002,
0x54fff482, 0x54007642, 0x54000003, 0x54fff423,
0x540075e3, 0x54000003, 0x54fff3c3, 0x54007583,
0x54000004, 0x54fff364, 0x54007524, 0x54000005,
0x54fff305, 0x540074c5, 0x54000006, 0x54fff2a6,
0x54007466, 0x54000007, 0x54fff247, 0x54007407,
0x54000008, 0x54fff1e8, 0x540073a8, 0x54000009,
0x54fff189, 0x54007349, 0x5400000a, 0x54fff12a,
0x540072ea, 0x5400000b, 0x54fff0cb, 0x5400728b,
0x5400000c, 0x54fff06c, 0x5400722c, 0x5400000d,
0x54fff00d, 0x540071cd, 0x5400000e, 0x54ffefae,
0x5400716e, 0x5400000f, 0x54ffef4f, 0x5400710f,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
@ -1459,8 +1460,8 @@
0x65f1af0b, 0x65eec9f1, 0x65a7fed6, 0x65aa5f65,
0x65b47aae, 0x04c55723, 0x0441723d, 0x042d33ae,
0x04be3051, 0x047d32b6, 0x04e03048, 0x05a06ad7,
0x05776f59, 0x041a3435, 0x0498356a, 0x04d93917,
0x04883671, 0x04ca35a4, 0x65c73fd6, 0x658631d1,
0x65983e8c, 0x04012da1,
0x05776f59, 0x4501b2b5, 0x049a356a, 0x04d83917,
0x04993671, 0x04c835a4, 0x04ca3fd6, 0x658731d1,
0x65863e8c, 0x65982da1, 0x04812967,
};
// END Generated code -- do not edit