From ea77ef83484fa72381744a1de834acd922ba9bc5 Mon Sep 17 00:00:00 2001
From: Xiaohong Gong <xgong@openjdk.org>
Date: Fri, 16 Jul 2021 01:54:10 +0000
Subject: [PATCH] 8269725: AArch64: Add VectorMask query implementation for
 NEON

Reviewed-by: aph
---
 src/hotspot/cpu/aarch64/aarch64.ad            |  26 ++-
 src/hotspot/cpu/aarch64/aarch64_neon.ad       | 169 ++++++++++++++++++
 src/hotspot/cpu/aarch64/aarch64_neon_ad.m4    | 148 +++++++++++++++
 src/hotspot/cpu/aarch64/aarch64_sve.ad        |  15 +-
 src/hotspot/cpu/aarch64/aarch64_sve_ad.m4     |  15 +-
 src/hotspot/cpu/aarch64/assembler_aarch64.hpp |   2 +-
 test/hotspot/gtest/aarch64/aarch64-asmtest.py |   1 +
 test/hotspot/gtest/aarch64/asmtest.out.h      | 152 ++++++++--------
 8 files changed, 427 insertions(+), 101 deletions(-)

diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index 34b495e62a4..94e1a1cde1b 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -1295,7 +1295,31 @@ public:
   };
 };
 
- bool is_CAS(int opcode, bool maybe_volatile);
+  static inline BasicType vector_element_basic_type(const MachNode* n) {
+    const TypeVect* vt = n->bottom_type()->is_vect();  // bottom type of n itself, viewed as a vector type
+    return vt->element_basic_type();                   // basic type of one element of that vector
+  }
+
+  static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) {
+    int def_idx = use->operand_index(opnd);  // index passed to use->in() for operand opnd
+    Node* def = use->in(def_idx);            // input node found at that index
+    const TypeVect* vt = def->bottom_type()->is_vect();
+    return vt->element_basic_type();         // element type of that input, not of use itself
+  }
+
+  static inline uint vector_length(const MachNode* n) {
+    const TypeVect* vt = n->bottom_type()->is_vect();  // bottom type of n itself, viewed as a vector type
+    return vt->length();                               // length() of that vector type
+  }
+
+  static inline uint vector_length(const MachNode* use, const MachOper* opnd) {
+    int def_idx = use->operand_index(opnd);  // index passed to use->in() for operand opnd
+    Node* def = use->in(def_idx);            // input node found at that index
+    const TypeVect* vt = def->bottom_type()->is_vect();
+    return vt->length();                     // length() of that input, not of use itself
+  }
+
+  bool is_CAS(int opcode, bool maybe_volatile);
 
   // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
 
diff --git a/src/hotspot/cpu/aarch64/aarch64_neon.ad b/src/hotspot/cpu/aarch64/aarch64_neon.ad
index 1beac317c57..90cbba9c163 100644
--- a/src/hotspot/cpu/aarch64/aarch64_neon.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_neon.ad
@@ -5296,3 +5296,172 @@ instruct vpopcount2I(vecD dst, vecD src) %{
   %}
   ins_pipe(pipe_class_default);
 %}
+
+// vector mask reductions
+
+instruct vmask_truecount8B(iRegINoSp dst, vecD src, vecD tmp) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
+  match(Set dst (VectorMaskTrueCount src));
+  effect(TEMP tmp);
+  ins_cost(2 * INSN_COST);
+  format %{ "addv $tmp, $src\n\t"
+            "umov $dst, $tmp, B, 0\t# vector (8B)" %}
+  ins_encode %{
+    // Input "src" holds one boolean per byte (0x00/0x01), so the sum of all
+    // bytes (addv), read back from byte lane 0 (umov), is the true count.
+    __ addv(as_FloatRegister($tmp$$reg), __ T8B, as_FloatRegister($src$$reg));
+    __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ B, 0);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmask_truecount16B(iRegINoSp dst, vecX src, vecX tmp) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
+  match(Set dst (VectorMaskTrueCount src));
+  effect(TEMP tmp);
+  ins_cost(2 * INSN_COST);
+  format %{ "addv $tmp, $src\n\t"
+            "umov $dst, $tmp, B, 0\t# vector (16B)" %}
+  ins_encode %{
+    // Input "src" holds one boolean per byte (0x00/0x01), so the sum of all
+    // bytes (addv), read back from byte lane 0 (umov), is the true count.
+    __ addv(as_FloatRegister($tmp$$reg), __ T16B, as_FloatRegister($src$$reg));
+    __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ B, 0);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmask_firsttrue_LT8B(iRegINoSp dst, vecD src, rFlagsReg cr) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN &&
+            n->in(1)->bottom_type()->is_vect()->length() < 8);
+  match(Set dst (VectorMaskFirstTrue src));
+  effect(KILL cr);
+  ins_cost(7 * INSN_COST);
+  format %{ "vmask_firsttrue $dst, $src\t# vector (4I/4S/2I)" %}
+  ins_encode %{
+    // Returns the index of the first active lane of the
+    // vector mask, or VLENGTH if no lane is active.
+    //
+    // Input "src" is a vector of boolean represented as
+    // bytes with 0x00/0x01 as element values.
+    //
+    // Computed by reversing the bits, counting the leading zero bits
+    // and dividing by 8, the number of bits per boolean element.
+    __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
+    __ rbit($dst$$Register, $dst$$Register);
+    __ clz($dst$$Register, $dst$$Register);
+    __ lsrw($dst$$Register, $dst$$Register, 3);
+    __ movw(rscratch1, vector_length(this, $src));
+    __ cmpw($dst$$Register, rscratch1);
+    __ cselw($dst$$Register, rscratch1, $dst$$Register, Assembler::GE);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmask_firsttrue8B(iRegINoSp dst, vecD src) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN &&
+            n->in(1)->bottom_type()->is_vect()->length() == 8);
+  match(Set dst (VectorMaskFirstTrue src));
+  ins_cost(4 * INSN_COST);
+  format %{ "vmask_firsttrue $dst, $src\t# vector (8B)" %}
+  ins_encode %{
+    // Returns the index of the first active lane of the
+    // vector mask, or VLENGTH if no lane is active.
+    //
+    // Input "src" is a vector of boolean represented as
+    // bytes with 0x00/0x01 as element values.
+    //
+    // Computed by reversing the bits, counting the leading zero bits
+    // and dividing by 8, the number of bits per boolean element.
+    __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
+    __ rbit($dst$$Register, $dst$$Register);
+    __ clz($dst$$Register, $dst$$Register);
+    __ lsrw($dst$$Register, $dst$$Register, 3);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmask_firsttrue16B(iRegINoSp dst, vecX src) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
+  match(Set dst (VectorMaskFirstTrue src));
+  ins_cost(6 * INSN_COST);
+  format %{ "vmask_firsttrue $dst, $src\t# vector (16B)" %}
+  ins_encode %{
+    // Returns the index of the first active lane of the
+    // vector mask, or 16 (VLENGTH) if no lane is active.
+    //
+    // Input "src" is a vector of boolean represented as
+    // bytes with 0x00/0x01 as element values.
+
+    Label FIRST_TRUE_INDEX;
+
+    // Try to compute the result from lower 64 bits.
+    __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
+    __ movw(rscratch1, zr);
+    __ cbnz($dst$$Register, FIRST_TRUE_INDEX);
+
+    // Compute the result from the higher 64 bits.
+    __ fmovhid($dst$$Register, as_FloatRegister($src$$reg));
+    __ movw(rscratch1, 8);
+
+    // Reverse the bits, count the leading zero bits, scale to bytes and add the base (0 or 8).
+    __ bind(FIRST_TRUE_INDEX);
+    __ rbit($dst$$Register, $dst$$Register);
+    __ clz($dst$$Register, $dst$$Register);
+    __ addw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmask_lasttrue8B(iRegINoSp dst, vecD src) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
+  match(Set dst (VectorMaskLastTrue src));
+  ins_cost(4 * INSN_COST);
+  format %{ "vmask_lasttrue $dst, $src\t# vector (8B)" %}
+  ins_encode %{
+    // Returns the index of the last active lane of the
+    // vector mask, or -1 if no lane is active.
+    //
+    // Input "src" is a vector of boolean represented as
+    // bytes with 0x00/0x01 as element values.
+    //
+    // Computed by counting the leading zero bytes and
+    // subtracting that count from 7, the index of the top byte.
+    __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
+    __ clz($dst$$Register, $dst$$Register);
+    __ movw(rscratch1, 7);
+    __ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmask_lasttrue16B(iRegINoSp dst, vecX src) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
+  match(Set dst (VectorMaskLastTrue src));
+  ins_cost(5 * INSN_COST);
+  format %{ "vmask_lasttrue $dst, $src\t# vector (16B)" %}
+  ins_encode %{
+    // Returns the index of the last active lane of the
+    // vector mask, or -1 if no lane is active.
+    //
+    // Input "src" is a vector of boolean represented as
+    // bytes with 0x00/0x01 as element values.
+
+    Label LAST_TRUE_INDEX;
+
+    // Try to compute the result from higher 64 bits.
+    __ fmovhid($dst$$Register, as_FloatRegister($src$$reg));
+    __ movw(rscratch1, 16 - 1);
+    __ cbnz($dst$$Register, LAST_TRUE_INDEX);
+
+    // Compute the result from the lower 64 bits.
+    __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
+    __ movw(rscratch1, 8 - 1);
+
+    // Count the leading zero bytes and subtract the count from the base index (15 or 7).
+    __ bind(LAST_TRUE_INDEX);
+    __ clz($dst$$Register, $dst$$Register);
+    __ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
+  %}
+  ins_pipe(pipe_slow);
+%}
diff --git a/src/hotspot/cpu/aarch64/aarch64_neon_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_neon_ad.m4
index 306cd1b56ce..076eec1c973 100644
--- a/src/hotspot/cpu/aarch64/aarch64_neon_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_neon_ad.m4
@@ -2243,3 +2243,151 @@ instruct vpopcount$1$2`'(vec$5 dst, vec$5 src) %{
 dnl       $1 $2 $3  $4 $5
 VPOPCOUNT(4, I, 16, 8, X)
 VPOPCOUNT(2, I, 8,  4, D)
+dnl
+dnl VMASK_TRUECOUNT($1,     $2 )
+dnl VMASK_TRUECOUNT(suffix, reg)   suffix: arrangement, 8B or 16B (also the name suffix); reg: operand class, vecD or vecX
+define(`VMASK_TRUECOUNT', `
+instruct vmask_truecount$1(iRegINoSp dst, $2 src, $2 tmp) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
+  match(Set dst (VectorMaskTrueCount src));
+  effect(TEMP tmp);
+  ins_cost(2 * INSN_COST);
+  format %{ "addv $tmp, $src\n\t"
+            "umov $dst, $tmp, B, 0\t# vector ($1)" %}
+  ins_encode %{
+    // Input "src" holds one boolean per byte (0x00/0x01), so the sum of all
+    // bytes (addv), read back from byte lane 0 (umov), is the true count.
+    __ addv(as_FloatRegister($tmp$$reg), __ T$1, as_FloatRegister($src$$reg));
+    __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ B, 0);
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+dnl ARGLIST(suffix): operand list of a vmask_firsttrue rule; only the _LT8B suffix adds rFlagsReg cr
+define(`ARGLIST',
+`ifelse($1, `_LT8B', `iRegINoSp dst, vecD src, rFlagsReg cr', `iRegINoSp dst, vecD src')')
+dnl
+dnl VMASK_FIRSTTRUE_D($1,     $2,   $3,   $4  )
+dnl VMASK_FIRSTTRUE_D(suffix, cond, cost, size)   cond: operator comparing the lane count with 8; size: label printed by format
+define(`VMASK_FIRSTTRUE_D', `
+instruct vmask_firsttrue$1(ARGLIST($1)) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN &&
+            n->in(1)->bottom_type()->is_vect()->length() $2 8);
+  match(Set dst (VectorMaskFirstTrue src));dnl
+ifelse($1, `_LT8B', `
+  effect(KILL cr);')
+  ins_cost($3 * INSN_COST);
+  format %{ "vmask_firsttrue $dst, $src\t# vector ($4)" %}
+  ins_encode %{
+    // Returns the index of the first active lane of the
+    // vector mask, or VLENGTH if no lane is active.
+    //
+    // Input "src" is a vector of boolean represented as
+    // bytes with 0x00/0x01 as element values.
+    //
+    // Computed by reversing the bits, counting the leading zero bits
+    // and dividing by 8, the number of bits per boolean element.
+    __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
+    __ rbit($dst$$Register, $dst$$Register);
+    __ clz($dst$$Register, $dst$$Register);
+    __ lsrw($dst$$Register, $dst$$Register, 3);dnl
+ifelse(`$1', `_LT8B', `
+    __ movw(rscratch1, vector_length(this, $src));
+    __ cmpw($dst$$Register, rscratch1);
+    __ cselw($dst$$Register, rscratch1, $dst$$Register, Assembler::GE);')
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
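+dnl NOTE(review): the name below is unquoted, so m4 expands it before undefine runs and ARGLIST appears to stay defined; the expansions further down rely on that, so do not quote it without restructuring - confirm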
+undefine(ARGLIST)dnl
+dnl
+// vector mask reductions
+VMASK_TRUECOUNT(8B,  vecD)
+VMASK_TRUECOUNT(16B, vecX)
+VMASK_FIRSTTRUE_D(_LT8B, <,  7, 4I/4S/2I)
+VMASK_FIRSTTRUE_D(8B,    ==, 4, 8B)
+
+instruct vmask_firsttrue16B(iRegINoSp dst, vecX src) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
+  match(Set dst (VectorMaskFirstTrue src));
+  ins_cost(6 * INSN_COST);
+  format %{ "vmask_firsttrue $dst, $src\t# vector (16B)" %}
+  ins_encode %{
+    // Returns the index of the first active lane of the
+    // vector mask, or 16 (VLENGTH) if no lane is active.
+    //
+    // Input "src" is a vector of boolean represented as
+    // bytes with 0x00/0x01 as element values.
+
+    Label FIRST_TRUE_INDEX;
+
+    // Try to compute the result from lower 64 bits.
+    __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
+    __ movw(rscratch1, zr);
+    __ cbnz($dst$$Register, FIRST_TRUE_INDEX);
+
+    // Compute the result from the higher 64 bits.
+    __ fmovhid($dst$$Register, as_FloatRegister($src$$reg));
+    __ movw(rscratch1, 8);
+
+    // Reverse the bits, count the leading zero bits, scale to bytes and add the base (0 or 8).
+    __ bind(FIRST_TRUE_INDEX);
+    __ rbit($dst$$Register, $dst$$Register);
+    __ clz($dst$$Register, $dst$$Register);
+    __ addw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmask_lasttrue8B(iRegINoSp dst, vecD src) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
+  match(Set dst (VectorMaskLastTrue src));
+  ins_cost(4 * INSN_COST);
+  format %{ "vmask_lasttrue $dst, $src\t# vector (8B)" %}
+  ins_encode %{
+    // Returns the index of the last active lane of the
+    // vector mask, or -1 if no lane is active.
+    //
+    // Input "src" is a vector of boolean represented as
+    // bytes with 0x00/0x01 as element values.
+    //
+    // Computed by counting the leading zero bytes and
+    // subtracting that count from 7, the index of the top byte.
+    __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
+    __ clz($dst$$Register, $dst$$Register);
+    __ movw(rscratch1, 7);
+    __ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmask_lasttrue16B(iRegINoSp dst, vecX src) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
+  match(Set dst (VectorMaskLastTrue src));
+  ins_cost(5 * INSN_COST);
+  format %{ "vmask_lasttrue $dst, $src\t# vector (16B)" %}
+  ins_encode %{
+    // Returns the index of the last active lane of the
+    // vector mask, or -1 if no lane is active.
+    //
+    // Input "src" is a vector of boolean represented as
+    // bytes with 0x00/0x01 as element values.
+
+    Label LAST_TRUE_INDEX;
+
+    // Try to compute the result from higher 64 bits.
+    __ fmovhid($dst$$Register, as_FloatRegister($src$$reg));
+    __ movw(rscratch1, 16 - 1);
+    __ cbnz($dst$$Register, LAST_TRUE_INDEX);
+
+    // Compute the result from the lower 64 bits.
+    __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
+    __ movw(rscratch1, 8 - 1);
+
+    // Count the leading zero bytes and subtract the count from the base index (15 or 7).
+    __ bind(LAST_TRUE_INDEX);
+    __ clz($dst$$Register, $dst$$Register);
+    __ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
+  %}
+  ins_pipe(pipe_slow);
+%}
diff --git a/src/hotspot/cpu/aarch64/aarch64_sve.ad b/src/hotspot/cpu/aarch64/aarch64_sve.ad
index 62354fc95da..1493bbc985a 100644
--- a/src/hotspot/cpu/aarch64/aarch64_sve.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_sve.ad
@@ -87,18 +87,6 @@ source_hpp %{
 %}
 
 source %{
-  static inline BasicType vector_element_basic_type(const MachNode* n) {
-    const TypeVect* vt = n->bottom_type()->is_vect();
-    return vt->element_basic_type();
-  }
-
-  static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) {
-    int def_idx = use->operand_index(opnd);
-    Node* def = use->in(def_idx);
-    const TypeVect* vt = def->bottom_type()->is_vect();
-    return vt->element_basic_type();
-  }
-
   static Assembler::SIMD_RegVariant elemBytes_to_regVariant(int esize) {
     switch(esize) {
       case 1:
@@ -203,6 +191,9 @@ source %{
       case Op_VectorReinterpret:
       case Op_VectorStoreMask:
       case Op_VectorTest:
+      case Op_VectorMaskTrueCount:
+      case Op_VectorMaskLastTrue:
+      case Op_VectorMaskFirstTrue:
         return false;
       default:
         return true;
diff --git a/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4
index b1d4f2696c4..bc166185570 100644
--- a/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4
@@ -74,18 +74,6 @@ source_hpp %{
 %}
 
 source %{
-  static inline BasicType vector_element_basic_type(const MachNode* n) {
-    const TypeVect* vt = n->bottom_type()->is_vect();
-    return vt->element_basic_type();
-  }
-
-  static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) {
-    int def_idx = use->operand_index(opnd);
-    Node* def = use->in(def_idx);
-    const TypeVect* vt = def->bottom_type()->is_vect();
-    return vt->element_basic_type();
-  }
-
   static Assembler::SIMD_RegVariant elemBytes_to_regVariant(int esize) {
     switch(esize) {
       case 1:
@@ -190,6 +178,9 @@ source %{
       case Op_VectorReinterpret:
       case Op_VectorStoreMask:
       case Op_VectorTest:
+      case Op_VectorMaskTrueCount:
+      case Op_VectorMaskLastTrue:
+      case Op_VectorMaskFirstTrue:
         return false;
       default:
         return true;
diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
index 71f0443a8cb..1405543b3d3 100644
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -2034,7 +2034,7 @@ public:
   INSN(fmovs, 0b000, 0b00, 0b00, 0b110);
   INSN(fmovd, 0b100, 0b01, 0b00, 0b110);
 
-  // INSN(fmovhid, 0b100, 0b10, 0b01, 0b110);
+  INSN(fmovhid, 0b100, 0b10, 0b01, 0b110);
 
 #undef INSN
 
diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
index 60f02aa102a..b8ac50fda81 100644
--- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py
+++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
@@ -1533,6 +1533,7 @@ generate(SpecialCases, [["ccmn",   "__ ccmn(zr, zr, 3u, Assembler::LE);",
                         ["umov",   "__ umov(r0, v1, __ S, 1);",                          "umov\tw0, v1.s[1]"],
                         ["umov",   "__ umov(r0, v1, __ H, 2);",                          "umov\tw0, v1.h[2]"],
                         ["umov",   "__ umov(r0, v1, __ B, 3);",                          "umov\tw0, v1.b[3]"],
+                        ["fmov",   "__ fmovhid(r0, v1);",                                "fmov\tx0, v1.d[1]"],
                         ["ld1",    "__ ld1(v31, v0, __ T2D, Address(__ post(r1, r0)));", "ld1\t{v31.2d, v0.2d}, [x1], x0"],
                         # SVE instructions
                         ["cpy",    "__ sve_cpy(z0, __ S, p0, v1);",                      "mov\tz0.s, p0/m, s1"],
diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h
index 79bef1eb35b..24fbb10eacb 100644
--- a/test/hotspot/gtest/aarch64/asmtest.out.h
+++ b/test/hotspot/gtest/aarch64/asmtest.out.h
@@ -723,6 +723,7 @@
     __ umov(r0, v1, __ S, 1);                          //       umov    w0, v1.s[1]
     __ umov(r0, v1, __ H, 2);                          //       umov    w0, v1.h[2]
     __ umov(r0, v1, __ B, 3);                          //       umov    w0, v1.b[3]
+    __ fmovhid(r0, v1);                                //       fmov    x0, v1.d[1]
     __ ld1(v31, v0, __ T2D, Address(__ post(r1, r0))); //       ld1     {v31.2d, v0.2d}, [x1], x0
     __ sve_cpy(z0, __ S, p0, v1);                      //       mov     z0.s, p0/m, s1
     __ sve_inc(r0, __ S);                              //       incw    x0
@@ -978,30 +979,30 @@
     0x9101a1a0,     0xb10a5cc8,     0xd10810aa,     0xf10fd061,
     0x120cb166,     0x321764bc,     0x52174681,     0x720c0227,
     0x9241018e,     0xb25a2969,     0xd278b411,     0xf26aad01,
-    0x14000000,     0x17ffffd7,     0x1400030c,     0x94000000,
-    0x97ffffd4,     0x94000309,     0x3400000a,     0x34fffa2a,
-    0x340060ca,     0x35000008,     0x35fff9c8,     0x35006068,
-    0xb400000b,     0xb4fff96b,     0xb400600b,     0xb500001d,
-    0xb5fff91d,     0xb5005fbd,     0x10000013,     0x10fff8b3,
-    0x10005f53,     0x90000013,     0x36300016,     0x3637f836,
-    0x36305ed6,     0x3758000c,     0x375ff7cc,     0x37585e6c,
+    0x14000000,     0x17ffffd7,     0x1400030d,     0x94000000,
+    0x97ffffd4,     0x9400030a,     0x3400000a,     0x34fffa2a,
+    0x340060ea,     0x35000008,     0x35fff9c8,     0x35006088,
+    0xb400000b,     0xb4fff96b,     0xb400602b,     0xb500001d,
+    0xb5fff91d,     0xb5005fdd,     0x10000013,     0x10fff8b3,
+    0x10005f73,     0x90000013,     0x36300016,     0x3637f836,
+    0x36305ef6,     0x3758000c,     0x375ff7cc,     0x37585e8c,
     0x128313a0,     0x528a32c7,     0x7289173b,     0x92ab3acc,
     0xd2a0bf94,     0xf2c285e8,     0x9358722f,     0x330e652f,
     0x53067f3b,     0x93577c53,     0xb34a1aac,     0xd35a4016,
     0x13946c63,     0x93c3dbc8,     0x54000000,     0x54fff5a0,
-    0x54005c40,     0x54000001,     0x54fff541,     0x54005be1,
-    0x54000002,     0x54fff4e2,     0x54005b82,     0x54000002,
-    0x54fff482,     0x54005b22,     0x54000003,     0x54fff423,
-    0x54005ac3,     0x54000003,     0x54fff3c3,     0x54005a63,
-    0x54000004,     0x54fff364,     0x54005a04,     0x54000005,
-    0x54fff305,     0x540059a5,     0x54000006,     0x54fff2a6,
-    0x54005946,     0x54000007,     0x54fff247,     0x540058e7,
-    0x54000008,     0x54fff1e8,     0x54005888,     0x54000009,
-    0x54fff189,     0x54005829,     0x5400000a,     0x54fff12a,
-    0x540057ca,     0x5400000b,     0x54fff0cb,     0x5400576b,
-    0x5400000c,     0x54fff06c,     0x5400570c,     0x5400000d,
-    0x54fff00d,     0x540056ad,     0x5400000e,     0x54ffefae,
-    0x5400564e,     0x5400000f,     0x54ffef4f,     0x540055ef,
+    0x54005c60,     0x54000001,     0x54fff541,     0x54005c01,
+    0x54000002,     0x54fff4e2,     0x54005ba2,     0x54000002,
+    0x54fff482,     0x54005b42,     0x54000003,     0x54fff423,
+    0x54005ae3,     0x54000003,     0x54fff3c3,     0x54005a83,
+    0x54000004,     0x54fff364,     0x54005a24,     0x54000005,
+    0x54fff305,     0x540059c5,     0x54000006,     0x54fff2a6,
+    0x54005966,     0x54000007,     0x54fff247,     0x54005907,
+    0x54000008,     0x54fff1e8,     0x540058a8,     0x54000009,
+    0x54fff189,     0x54005849,     0x5400000a,     0x54fff12a,
+    0x540057ea,     0x5400000b,     0x54fff0cb,     0x5400578b,
+    0x5400000c,     0x54fff06c,     0x5400572c,     0x5400000d,
+    0x54fff00d,     0x540056cd,     0x5400000e,     0x54ffefae,
+    0x5400566e,     0x5400000f,     0x54ffef4f,     0x5400560f,
     0xd40658e1,     0xd4014d22,     0xd4046543,     0xd4273f60,
     0xd44cad80,     0xd503201f,     0xd69f03e0,     0xd6bf03e0,
     0xd5033fdf,     0xd5033e9f,     0xd50332bf,     0xd61f0200,
@@ -1033,7 +1034,7 @@
     0x791f226d,     0xf95aa2f3,     0xb9587bb7,     0x395f7176,
     0x795d9143,     0x399e7e08,     0x799a2697,     0x79df3422,
     0xb99c2624,     0xfd5c2374,     0xbd5fa1d9,     0xfd1d595a,
-    0xbd1b1869,     0x5800463b,     0x1800000b,     0xf8945060,
+    0xbd1b1869,     0x5800465b,     0x1800000b,     0xf8945060,
     0xd8000000,     0xf8ae6ba0,     0xf99a0080,     0x1a070035,
     0x3a0700a8,     0x5a0e0367,     0x7a11009b,     0x9a000380,
     0xba1e030c,     0xda0f0320,     0xfa030301,     0x0b340b11,
@@ -1120,59 +1121,60 @@
     0x8822fc7f,     0xc8247cbf,     0x88267fff,     0x4e010fe0,
     0x4e081fe1,     0x4e0c1fe1,     0x4e0a1fe1,     0x4e071fe1,
     0x4e042c20,     0x4e062c20,     0x4e052c20,     0x4e083c20,
-    0x0e0c3c20,     0x0e0a3c20,     0x0e073c20,     0x4cc0ac3f,
-    0x05a08020,     0x04b0e3e0,     0x0470e7e1,     0x042f9c20,
-    0x043f9c35,     0x047f9c20,     0x04ff9c20,     0x04299420,
-    0x04319160,     0x0461943e,     0x04a19020,     0x042053ff,
-    0x047f5401,     0x25208028,     0x2538cfe0,     0x2578d001,
-    0x25b8efe2,     0x25f8f007,     0x05203864,     0x05603ace,
-    0xa400a3e0,     0xa4a8a7ea,     0xa547a814,     0xa4084ffe,
-    0xa55c53e0,     0xa5e1540b,     0xe400fbf6,     0xe408ffff,
-    0xe547e400,     0xe4014be0,     0xe4a84fe0,     0xe5f15000,
-    0x858043e0,     0x85a043ff,     0xe59f5d08,     0x0420e3e9,
-    0x0460e3ea,     0x04a0e3eb,     0x04e0e3ec,     0x25104042,
-    0x25104871,     0x252c8840,     0x253c1420,     0x25681572,
-    0x25a21ce3,     0x25ea1e34,     0x1e601000,     0x1e603000,
-    0x1e621000,     0x1e623000,     0x1e641000,     0x1e643000,
-    0x1e661000,     0x1e663000,     0x1e681000,     0x1e683000,
-    0x1e6a1000,     0x1e6a3000,     0x1e6c1000,     0x1e6c3000,
-    0x1e6e1000,     0x1e6e3000,     0x1e701000,     0x1e703000,
-    0x1e721000,     0x1e723000,     0x1e741000,     0x1e743000,
-    0x1e761000,     0x1e763000,     0x1e781000,     0x1e783000,
-    0x1e7a1000,     0x1e7a3000,     0x1e7c1000,     0x1e7c3000,
-    0x1e7e1000,     0x1e7e3000,     0xf8208193,     0xf83101b6,
-    0xf83c13fe,     0xf821239a,     0xf824309e,     0xf826535e,
-    0xf8304109,     0xf82c7280,     0xf8216058,     0xf8a08309,
-    0xf8ba03d0,     0xf8a312ea,     0xf8aa21e4,     0xf8a2310b,
-    0xf8aa522f,     0xf8a2418a,     0xf8ac71af,     0xf8a26287,
-    0xf8fa8090,     0xf8e20184,     0xf8f01215,     0xf8f022ab,
-    0xf8f7334c,     0xf8f751dc,     0xf8eb4038,     0xf8ec715f,
-    0xf8f06047,     0xf863826d,     0xf8710070,     0xf86113cb,
-    0xf86521e8,     0xf87d301e,     0xf8745287,     0xf87742bc,
-    0xf87b70b9,     0xf8616217,     0xb83f8185,     0xb82901fc,
-    0xb83d13f6,     0xb83320bf,     0xb82e33f0,     0xb830529b,
-    0xb830416c,     0xb82973c6,     0xb831639b,     0xb8be8147,
-    0xb8b4008a,     0xb8b81231,     0xb8b623a3,     0xb8af3276,
-    0xb8b35056,     0xb8af4186,     0xb8b071ab,     0xb8b763c1,
-    0xb8f38225,     0xb8e202d0,     0xb8ed12aa,     0xb8fd219b,
-    0xb8fb3023,     0xb8ff5278,     0xb8f14389,     0xb8fb70ef,
-    0xb8f563f7,     0xb87983e2,     0xb87b0150,     0xb8771073,
-    0xb8702320,     0xb87a3057,     0xb870508c,     0xb87c43be,
-    0xb87070db,     0xb86961fd,     0xce273c87,     0xce080ac9,
-    0xce7e8e9b,     0xce808b45,     0xce79806e,     0xce758768,
-    0xcec0835a,     0xce608ad8,     0x043100c4,     0x046105e3,
-    0x65c900a6,     0x65d60a87,     0x65c80545,     0x0416a63e,
-    0x04001f8b,     0x0450979a,     0x04dabe0d,     0x045381a5,
-    0x04918b4f,     0x049006cb,     0x0497a264,     0x045eadd1,
-    0x04881062,     0x040a04d7,     0x04810f71,     0x04dca450,
-    0x65c084c3,     0x65cd8d93,     0x65c69a68,     0x65878ae0,
-    0x65c29db3,     0x049da0e6,     0x6582b911,     0x65c0b6d6,
-    0x65c1a1e2,     0x65cda494,     0x65c18107,     0x65af1493,
-    0x65e52b36,     0x65ab4ed0,     0x65f06a8d,     0x0451448f,
-    0x049c7c86,     0x0429335d,     0x04bc3162,     0x047a3027,
-    0x04e831d1,     0x2493b8a5,     0x249d9604,     0x24d18095,
-    0x24d7b491,     0x045a2113,     0x04d83a2e,     0x041927d5,
-    0x0408358a,     0x048a2709,     0x658738c4,     0x65c639bb,
-    0x65d836de,     0x04413d3e,
+    0x0e0c3c20,     0x0e0a3c20,     0x0e073c20,     0x9eae0020,
+    0x4cc0ac3f,     0x05a08020,     0x04b0e3e0,     0x0470e7e1,
+    0x042f9c20,     0x043f9c35,     0x047f9c20,     0x04ff9c20,
+    0x04299420,     0x04319160,     0x0461943e,     0x04a19020,
+    0x042053ff,     0x047f5401,     0x25208028,     0x2538cfe0,
+    0x2578d001,     0x25b8efe2,     0x25f8f007,     0x05203864,
+    0x05603ace,     0xa400a3e0,     0xa4a8a7ea,     0xa547a814,
+    0xa4084ffe,     0xa55c53e0,     0xa5e1540b,     0xe400fbf6,
+    0xe408ffff,     0xe547e400,     0xe4014be0,     0xe4a84fe0,
+    0xe5f15000,     0x858043e0,     0x85a043ff,     0xe59f5d08,
+    0x0420e3e9,     0x0460e3ea,     0x04a0e3eb,     0x04e0e3ec,
+    0x25104042,     0x25104871,     0x252c8840,     0x253c1420,
+    0x25681572,     0x25a21ce3,     0x25ea1e34,     0x1e601000,
+    0x1e603000,     0x1e621000,     0x1e623000,     0x1e641000,
+    0x1e643000,     0x1e661000,     0x1e663000,     0x1e681000,
+    0x1e683000,     0x1e6a1000,     0x1e6a3000,     0x1e6c1000,
+    0x1e6c3000,     0x1e6e1000,     0x1e6e3000,     0x1e701000,
+    0x1e703000,     0x1e721000,     0x1e723000,     0x1e741000,
+    0x1e743000,     0x1e761000,     0x1e763000,     0x1e781000,
+    0x1e783000,     0x1e7a1000,     0x1e7a3000,     0x1e7c1000,
+    0x1e7c3000,     0x1e7e1000,     0x1e7e3000,     0xf8208193,
+    0xf83101b6,     0xf83c13fe,     0xf821239a,     0xf824309e,
+    0xf826535e,     0xf8304109,     0xf82c7280,     0xf8216058,
+    0xf8a08309,     0xf8ba03d0,     0xf8a312ea,     0xf8aa21e4,
+    0xf8a2310b,     0xf8aa522f,     0xf8a2418a,     0xf8ac71af,
+    0xf8a26287,     0xf8fa8090,     0xf8e20184,     0xf8f01215,
+    0xf8f022ab,     0xf8f7334c,     0xf8f751dc,     0xf8eb4038,
+    0xf8ec715f,     0xf8f06047,     0xf863826d,     0xf8710070,
+    0xf86113cb,     0xf86521e8,     0xf87d301e,     0xf8745287,
+    0xf87742bc,     0xf87b70b9,     0xf8616217,     0xb83f8185,
+    0xb82901fc,     0xb83d13f6,     0xb83320bf,     0xb82e33f0,
+    0xb830529b,     0xb830416c,     0xb82973c6,     0xb831639b,
+    0xb8be8147,     0xb8b4008a,     0xb8b81231,     0xb8b623a3,
+    0xb8af3276,     0xb8b35056,     0xb8af4186,     0xb8b071ab,
+    0xb8b763c1,     0xb8f38225,     0xb8e202d0,     0xb8ed12aa,
+    0xb8fd219b,     0xb8fb3023,     0xb8ff5278,     0xb8f14389,
+    0xb8fb70ef,     0xb8f563f7,     0xb87983e2,     0xb87b0150,
+    0xb8771073,     0xb8702320,     0xb87a3057,     0xb870508c,
+    0xb87c43be,     0xb87070db,     0xb86961fd,     0xce273c87,
+    0xce080ac9,     0xce7e8e9b,     0xce808b45,     0xce79806e,
+    0xce758768,     0xcec0835a,     0xce608ad8,     0x043100c4,
+    0x046105e3,     0x65c900a6,     0x65d60a87,     0x65c80545,
+    0x0416a63e,     0x04001f8b,     0x0450979a,     0x04dabe0d,
+    0x045381a5,     0x04918b4f,     0x049006cb,     0x0497a264,
+    0x045eadd1,     0x04881062,     0x040a04d7,     0x04810f71,
+    0x04dca450,     0x65c084c3,     0x65cd8d93,     0x65c69a68,
+    0x65878ae0,     0x65c29db3,     0x049da0e6,     0x6582b911,
+    0x65c0b6d6,     0x65c1a1e2,     0x65cda494,     0x65c18107,
+    0x65af1493,     0x65e52b36,     0x65ab4ed0,     0x65f06a8d,
+    0x0451448f,     0x049c7c86,     0x0429335d,     0x04bc3162,
+    0x047a3027,     0x04e831d1,     0x2493b8a5,     0x249d9604,
+    0x24d18095,     0x24d7b491,     0x045a2113,     0x04d83a2e,
+    0x041927d5,     0x0408358a,     0x048a2709,     0x658738c4,
+    0x65c639bb,     0x65d836de,     0x04413d3e,
   };
 // END  Generated code -- do not edit
+