From 05ea7a51e1167ff18720209f43ecb23f46eb3a17 Mon Sep 17 00:00:00 2001
From: Xiaohong Gong <xgong@openjdk.org>
Date: Fri, 1 Apr 2022 09:33:46 +0000
Subject: [PATCH] 8282431: AArch64: Add optimized rules for masked vector
 multiply-add/sub for SVE

Reviewed-by: njian, ngasson
---
 src/hotspot/cpu/aarch64/aarch64_sve.ad        | 316 +++++++++++++-
 src/hotspot/cpu/aarch64/aarch64_sve_ad.m4     | 250 +++++++++--
 src/hotspot/cpu/aarch64/assembler_aarch64.hpp |   7 +-
 test/hotspot/gtest/aarch64/aarch64-asmtest.py |   3 +
 test/hotspot/gtest/aarch64/asmtest.out.h      |  93 ++--
 .../VectorFusedMultiplyAddSubTest.java        | 412 ++++++++++++++++++
 6 files changed, 973 insertions(+), 108 deletions(-)
 create mode 100644 test/hotspot/jtreg/compiler/vectorapi/VectorFusedMultiplyAddSubTest.java

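For context, the shapes these rules target come straight from the Vector API. Two
lines lifted from the new jtreg test added at the end of this patch (vectors, mask
and arrays set up as in that test) illustrate what gets matched:

    // From testIntMultiplyAddMasked: active lanes get a[i] + b[i] * c[i],
    // inactive lanes keep a[i]. C2 can now emit a single predicated mla
    // instead of an unpredicated mul feeding a masked add.
    av.add(bv.mul(cv), mask).intoArray(ir, i);

    // From testFloatNegatedMultiplySubMasked: matched by the new
    // predicated fnmsb rule, since the destination is also a multiplicand.
    av.lanewise(VectorOperators.FMA, bv, cv.neg(), mask).intoArray(fr, i);
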
diff --git a/src/hotspot/cpu/aarch64/aarch64_sve.ad b/src/hotspot/cpu/aarch64/aarch64_sve.ad
index c5ef0b57870..165a6444f83 100644
--- a/src/hotspot/cpu/aarch64/aarch64_sve.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_sve.ad
@@ -1544,10 +1544,10 @@ instruct vfmlaD(vReg dst_src1, vReg src2, vReg src3) %{
   ins_pipe(pipe_slow);
 %}
 
-// vector fmla - predicated
+// vector fmad - predicated
 
 // dst_src1 = dst_src1 * src2 + src3
-instruct vfmlaF_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+instruct vfmadF_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
   predicate(UseFMA && UseSVE > 0);
   match(Set dst_src1 (FmaVF (Binary dst_src1 src2) (Binary src3 pg)));
   ins_cost(SVE_COST);
@@ -1560,7 +1560,7 @@ instruct vfmlaF_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
 %}
 
 // dst_src1 = dst_src1 * src2 + src3
-instruct vfmlaD_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+instruct vfmadD_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
   predicate(UseFMA && UseSVE > 0);
   match(Set dst_src1 (FmaVD (Binary dst_src1 src2) (Binary src3 pg)));
   ins_cost(SVE_COST);
@@ -1575,10 +1575,25 @@ instruct vfmlaD_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
 // vector fmls
 
 // dst_src1 = dst_src1 + -src2 * src3
-// dst_src1 = dst_src1 + src2 * -src3
-instruct vfmlsF(vReg dst_src1, vReg src2, vReg src3) %{
-  predicate(UseFMA && UseSVE > 0);
+// The NegVF must not be predicated.
+instruct vfmlsF1(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVF dst_src1 (Binary (NegVF src2) src3)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %}
+  ins_encode %{
+    __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ S,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = dst_src1 + src2 * -src3
+// The NegVF must not be predicated.
+instruct vfmlsF2(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(2)->in(2)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVF dst_src1 (Binary src2 (NegVF src3))));
   ins_cost(SVE_COST);
   format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %}
@@ -1590,10 +1605,25 @@ instruct vfmlsF(vReg dst_src1, vReg src2, vReg src3) %{
 %}
 
 // dst_src1 = dst_src1 + -src2 * src3
-// dst_src1 = dst_src1 + src2 * -src3
-instruct vfmlsD(vReg dst_src1, vReg src2, vReg src3) %{
-  predicate(UseFMA && UseSVE > 0);
+// The NegVD must not be predicated.
+instruct vfmlsD1(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVD dst_src1 (Binary (NegVD src2) src3)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %}
+  ins_encode %{
+    __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ D,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = dst_src1 + src2 * -src3
+// The NegVD must not be predicated.
+instruct vfmlsD2(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(2)->in(2)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVD dst_src1 (Binary src2 (NegVD src3))));
   ins_cost(SVE_COST);
   format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %}
@@ -1604,13 +1634,62 @@ instruct vfmlsD(vReg dst_src1, vReg src2, vReg src3) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector fmsb - predicated
+
+// dst_src1 = dst_src1 * -src2 + src3
+// The NegVF must not be predicated.
+instruct vfmsbF_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->in(2)->as_Vector()->is_predicated_vector());
+  match(Set dst_src1 (FmaVF (Binary dst_src1 (NegVF src2)) (Binary src3 pg)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fmsb $dst_src1, $pg, $src2, $src3\t # vector (sve) (S)" %}
+  ins_encode %{
+    __ sve_fmsb(as_FloatRegister($dst_src1$$reg), __ S, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = dst_src1 * -src2 + src3
+// The NegVD must not be predicated.
+instruct vfmsbD_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->in(2)->as_Vector()->is_predicated_vector());
+  match(Set dst_src1 (FmaVD (Binary dst_src1 (NegVD src2)) (Binary src3 pg)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fmsb $dst_src1, $pg, $src2, $src3\t # vector (sve) (D)" %}
+  ins_encode %{
+    __ sve_fmsb(as_FloatRegister($dst_src1$$reg), __ D, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // vector fnmla
 
 // dst_src1 = -dst_src1 + -src2 * src3
-// dst_src1 = -dst_src1 + src2 * -src3
-instruct vfnmlaF(vReg dst_src1, vReg src2, vReg src3) %{
-  predicate(UseFMA && UseSVE > 0);
+// The NegVF must not be predicated.
+instruct vfnmlaF1(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->as_Vector()->is_predicated_vector() &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary (NegVF src2) src3)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %}
+  ins_encode %{
+    __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ S,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = -dst_src1 + src2 * -src3
+// The NegVF must not be predicated.
+instruct vfnmlaF2(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->as_Vector()->is_predicated_vector() &&
+            !n->in(2)->in(2)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 (NegVF src3))));
   ins_cost(SVE_COST);
   format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %}
@@ -1622,10 +1701,27 @@ instruct vfnmlaF(vReg dst_src1, vReg src2, vReg src3) %{
 %}
 
 // dst_src1 = -dst_src1 + -src2 * src3
-// dst_src1 = -dst_src1 + src2 * -src3
-instruct vfnmlaD(vReg dst_src1, vReg src2, vReg src3) %{
-  predicate(UseFMA && UseSVE > 0);
+// The NegVD must not be predicated.
+instruct vfnmlaD1(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->as_Vector()->is_predicated_vector() &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary (NegVD src2) src3)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %}
+  ins_encode %{
+    __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ D,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = -dst_src1 + src2 * -src3
+// The NegVD must not be predicated.
+instruct vfnmlaD2(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->as_Vector()->is_predicated_vector() &&
+            !n->in(2)->in(2)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 (NegVD src3))));
   ins_cost(SVE_COST);
   format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %}
@@ -1636,11 +1732,47 @@ instruct vfnmlaD(vReg dst_src1, vReg src2, vReg src3) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector fnmad - predicated
+
+// dst_src1 = -src3 + dst_src1 * -src2
+// The NegVF must not be predicated.
+instruct vfnmadF_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->in(2)->as_Vector()->is_predicated_vector() &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
+  match(Set dst_src1 (FmaVF (Binary dst_src1 (NegVF src2)) (Binary (NegVF src3) pg)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmad $dst_src1, $pg, $src2, $src3\t # vector (sve) (S)" %}
+  ins_encode %{
+    __ sve_fnmad(as_FloatRegister($dst_src1$$reg), __ S, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = -src3 + dst_src1 * -src2
+// The NegVD must not be predicated.
+instruct vfnmadD_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->in(2)->as_Vector()->is_predicated_vector() &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
+  match(Set dst_src1 (FmaVD (Binary dst_src1 (NegVD src2)) (Binary (NegVD src3) pg)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmad $dst_src1, $pg, $src2, $src3\t # vector (sve) (D)" %}
+  ins_encode %{
+    __ sve_fnmad(as_FloatRegister($dst_src1$$reg), __ D, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // vector fnmls
 
 // dst_src1 = -dst_src1 + src2 * src3
+// The NegVF must not be predicated.
 instruct vfnmlsF(vReg dst_src1, vReg src2, vReg src3) %{
-  predicate(UseFMA && UseSVE > 0);
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 src3)));
   ins_cost(SVE_COST);
   format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %}
@@ -1652,8 +1784,10 @@ instruct vfnmlsF(vReg dst_src1, vReg src2, vReg src3) %{
 %}
 
 // dst_src1 = -dst_src1 + src2 * src3
+// The NegVD must not be predicated.
 instruct vfnmlsD(vReg dst_src1, vReg src2, vReg src3) %{
-  predicate(UseFMA && UseSVE > 0);
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 src3)));
   ins_cost(SVE_COST);
   format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %}
@@ -1664,6 +1798,38 @@ instruct vfnmlsD(vReg dst_src1, vReg src2, vReg src3) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector fnmsb - predicated
+
+// dst_src1 = -src3 + dst_src1 * src2
+// The NegVF must not be predicated.
+instruct vfnmsbF_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
+  match(Set dst_src1 (FmaVF (Binary dst_src1 src2) (Binary (NegVF src3) pg)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmsb $dst_src1, $pg, $src2, $src3\t # vector (sve) (S)" %}
+  ins_encode %{
+    __ sve_fnmsb(as_FloatRegister($dst_src1$$reg), __ S, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = -src3 + dst_src1 * src2
+// The NegVD must not be predicated.
+instruct vfnmsbD_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
+  match(Set dst_src1 (FmaVD (Binary dst_src1 src2) (Binary (NegVD src3) pg)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmsb $dst_src1, $pg, $src2, $src3\t # vector (sve) (D)" %}
+  ins_encode %{
+    __ sve_fnmsb(as_FloatRegister($dst_src1$$reg), __ D, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // vector mla
 
 // dst_src1 = dst_src1 + src2 * src3
@@ -1722,6 +1888,64 @@ instruct vmlaL(vReg dst_src1, vReg src2, vReg src3)
   ins_pipe(pipe_slow);
 %}
 
+// vector mla - predicated
+
+// dst_src1 = dst_src1 + src2 * src3
+instruct vmlaB_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (AddVB (Binary dst_src1 (MulVB src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mla $dst_src1, $pg, src2, src3\t # vector (sve) (B)" %}
+  ins_encode %{
+    __ sve_mla(as_FloatRegister($dst_src1$$reg), __ B, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = dst_src1 + src2 * src3
+instruct vmlaS_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (AddVS (Binary dst_src1 (MulVS src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mla $dst_src1, $pg, src2, src3\t # vector (sve) (H)" %}
+  ins_encode %{
+    __ sve_mla(as_FloatRegister($dst_src1$$reg), __ H, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = dst_src1 + src2 * src3
+instruct vmlaI_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (AddVI (Binary dst_src1 (MulVI src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mla $dst_src1, $pg, src2, src3\t # vector (sve) (S)" %}
+  ins_encode %{
+    __ sve_mla(as_FloatRegister($dst_src1$$reg), __ S, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = dst_src1 + src2 * src3
+instruct vmlaL_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (AddVL (Binary dst_src1 (MulVL src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mla $dst_src1, $pg, src2, src3\t # vector (sve) (D)" %}
+  ins_encode %{
+    __ sve_mla(as_FloatRegister($dst_src1$$reg), __ D, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // vector mls
 
 // dst_src1 = dst_src1 - src2 * src3
@@ -1780,6 +2004,64 @@ instruct vmlsL(vReg dst_src1, vReg src2, vReg src3)
   ins_pipe(pipe_slow);
 %}
 
+// vector mls - predicated
+
+// dst_src1 = dst_src1 - src2 * src3
+instruct vmlsB_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (SubVB (Binary dst_src1 (MulVB src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mls $dst_src1, $pg, src2, src3\t # vector (sve) (B)" %}
+  ins_encode %{
+    __ sve_mls(as_FloatRegister($dst_src1$$reg), __ B, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = dst_src1 - src2 * src3
+instruct vmlsS_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (SubVS (Binary dst_src1 (MulVS src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mls $dst_src1, $pg, src2, src3\t # vector (sve) (H)" %}
+  ins_encode %{
+    __ sve_mls(as_FloatRegister($dst_src1$$reg), __ H, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = dst_src1 - src2 * src3
+instruct vmlsI_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (SubVI (Binary dst_src1 (MulVI src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mls $dst_src1, $pg, src2, src3\t # vector (sve) (S)" %}
+  ins_encode %{
+    __ sve_mls(as_FloatRegister($dst_src1$$reg), __ S, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// dst_src1 = dst_src1 - src2 * src3
+instruct vmlsL_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (SubVL (Binary dst_src1 (MulVL src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mls $dst_src1, $pg, src2, src3\t # vector (sve) (D)" %}
+  ins_encode %{
+    __ sve_mls(as_FloatRegister($dst_src1$$reg), __ D, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // vector mul
 
 instruct vmulB(vReg dst_src1, vReg src2) %{
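A note on the new "NegV must not be predicated" checks in the predicates above:
they reject folding a negation that is itself masked, because a predicated NegV
negates only the active lanes. An illustrative Vector API contrast (the second
call is hypothetical, written with the same vector/mask names as the jtreg test
added below):

    // Unpredicated negation: every lane of cv is negated, so the whole
    // expression is safe to match with the sve_fmls/fnmsb rules.
    av.lanewise(VectorOperators.FMA, bv, cv.neg(), mask);

    // Predicated negation: inactive lanes of cv are left un-negated, so
    // folding it into a single fmls/fnmsb would wrongly flip their sign.
    av.lanewise(VectorOperators.FMA, bv, cv.lanewise(VectorOperators.NEG, mask), mask);
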
diff --git a/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4
index 8b0a56d0f30..bb757b7d152 100644
--- a/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4
@@ -907,8 +907,8 @@ VMINMAX_PREDICATE(min, MinV, sve_fmin, sve_smin)
 VMINMAX_PREDICATE(max, MaxV, sve_fmax, sve_smax)
 
 dnl
-dnl VFMLA($1           $2    $3         )
-dnl VFMLA(name_suffix, size, min_vec_len)
+dnl VFMLA($1           $2  )
+dnl VFMLA(name_suffix, size)
 define(`VFMLA', `
 // dst_src1 = dst_src1 + src2 * src3
 instruct vfmla$1(vReg dst_src1, vReg src2, vReg src3) %{
@@ -924,15 +924,15 @@ instruct vfmla$1(vReg dst_src1, vReg src2, vReg src3) %{
 %}')dnl
 dnl
 // vector fmla
-VFMLA(F, S, 4)
-VFMLA(D, D, 2)
+VFMLA(F, S)
+VFMLA(D, D)
 
 dnl
-dnl VFMLA_PREDICATE($1,   $2  )
-dnl VFMLA_PREDICATE(type, size)
-define(`VFMLA_PREDICATE', `
+dnl VFMAD_PREDICATE($1           $2  )
+dnl VFMAD_PREDICATE(name_suffix, size)
+define(`VFMAD_PREDICATE', `
 // dst_src1 = dst_src1 * src2 + src3
-instruct vfmla$1_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+instruct vfmad$1_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
   predicate(UseFMA && UseSVE > 0);
   match(Set dst_src1 (FmaV$1 (Binary dst_src1 src2) (Binary src3 pg)));
   ins_cost(SVE_COST);
@@ -944,19 +944,37 @@ instruct vfmla$1_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
   ins_pipe(pipe_slow);
 %}')dnl
 dnl
-// vector fmla - predicated
-VFMLA_PREDICATE(F, S)
-VFMLA_PREDICATE(D, D)
+// vector fmad - predicated
+VFMAD_PREDICATE(F, S)
+VFMAD_PREDICATE(D, D)
 
 dnl
-dnl VFMLS($1           $2    $3         )
-dnl VFMLS(name_suffix, size, min_vec_len)
-define(`VFMLS', `
+dnl VFMLS1($1           $2  )
+dnl VFMLS1(name_suffix, size)
+define(`VFMLS1', `
 // dst_src1 = dst_src1 + -src2 * src3
-// dst_src1 = dst_src1 + src2 * -src3
-instruct vfmls$1(vReg dst_src1, vReg src2, vReg src3) %{
-  predicate(UseFMA && UseSVE > 0);
+// The NegV$1 must not be predicated.
+instruct vfmls`$1'1(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaV$1 dst_src1 (Binary (NegV$1 src2) src3)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ $2,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+dnl VFMLS2($1           $2  )
+dnl VFMLS2(name_suffix, size)
+define(`VFMLS2', `
+// dst_src1 = dst_src1 + src2 * -src3
+// The NegV$1 must not be predicated.
+instruct vfmls`$1'2(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(2)->in(2)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaV$1 dst_src1 (Binary src2 (NegV$1 src3))));
   ins_cost(SVE_COST);
   format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %}
@@ -968,18 +986,63 @@ instruct vfmls$1(vReg dst_src1, vReg src2, vReg src3) %{
 %}')dnl
 dnl
 // vector fmls
-VFMLS(F, S, 4)
-VFMLS(D, D, 2)
+VFMLS1(F, S)
+VFMLS2(F, S)
+VFMLS1(D, D)
+VFMLS2(D, D)
 
 dnl
-dnl VFNMLA($1           $2    $3         )
-dnl VFNMLA(name_suffix, size, min_vec_len)
-define(`VFNMLA', `
+dnl VFMSB_PREDICATE($1           $2  )
+dnl VFMSB_PREDICATE(name_suffix, size)
+define(`VFMSB_PREDICATE', `
+// dst_src1 = dst_src1 * -src2 + src3
+// The NegV$1 must not be predicated.
+instruct vfmsb$1_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->in(2)->as_Vector()->is_predicated_vector());
+  match(Set dst_src1 (FmaV$1 (Binary dst_src1 (NegV$1 src2)) (Binary src3 pg)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fmsb $dst_src1, $pg, $src2, $src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_fmsb(as_FloatRegister($dst_src1$$reg), __ $2, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector fmsb - predicated
+VFMSB_PREDICATE(F, S)
+VFMSB_PREDICATE(D, D)
+
+dnl
+dnl VFNMLA1($1           $2  )
+dnl VFNMLA1(name_suffix, size)
+define(`VFNMLA1', `
 // dst_src1 = -dst_src1 + -src2 * src3
-// dst_src1 = -dst_src1 + src2 * -src3
-instruct vfnmla$1(vReg dst_src1, vReg src2, vReg src3) %{
-  predicate(UseFMA && UseSVE > 0);
+// The NegV$1 must not be predicated.
+instruct vfnmla`$1'1(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->as_Vector()->is_predicated_vector() &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary (NegV$1 src2) src3)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ $2,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+dnl VFNMLA2($1           $2  )
+dnl VFNMLA2(name_suffix, size)
+define(`VFNMLA2', `
+// dst_src1 = -dst_src1 + src2 * -src3
+// The NegV$1 must not be predicated.
+instruct vfnmla`$1'2(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->as_Vector()->is_predicated_vector() &&
+            !n->in(2)->in(2)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 (NegV$1 src3))));
   ins_cost(SVE_COST);
   format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %}
@@ -991,16 +1054,44 @@ instruct vfnmla$1(vReg dst_src1, vReg src2, vReg src3) %{
 %}')dnl
 dnl
 // vector fnmla
-VFNMLA(F, S, 4)
-VFNMLA(D, D, 2)
+VFNMLA1(F, S)
+VFNMLA2(F, S)
+VFNMLA1(D, D)
+VFNMLA2(D, D)
 
 dnl
-dnl VFNMLS($1           $2    $3         )
-dnl VFNMLS(name_suffix, size, min_vec_len)
+dnl VFNMAD_PREDICATE($1           $2  )
+dnl VFNMAD_PREDICATE(name_suffix, size)
+define(`VFNMAD_PREDICATE', `
+// dst_src1 = -src3 + dst_src1 * -src2
+// The NegV$1 must not be predicated.
+instruct vfnmad$1_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->in(2)->as_Vector()->is_predicated_vector() &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
+  match(Set dst_src1 (FmaV$1 (Binary dst_src1 (NegV$1 src2)) (Binary (NegV$1 src3) pg)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmad $dst_src1, $pg, $src2, $src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_fnmad(as_FloatRegister($dst_src1$$reg), __ $2, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector fnmad - predicated
+VFNMAD_PREDICATE(F, S)
+VFNMAD_PREDICATE(D, D)
+
+dnl
+dnl VFNMLS($1           $2  )
+dnl VFNMLS(name_suffix, size)
 define(`VFNMLS', `
 // dst_src1 = -dst_src1 + src2 * src3
+// The NegV$1 must not be predicated.
 instruct vfnmls$1(vReg dst_src1, vReg src2, vReg src3) %{
-  predicate(UseFMA && UseSVE > 0);
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(1)->as_Vector()->is_predicated_vector());
   match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 src3)));
   ins_cost(SVE_COST);
   format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %}
@@ -1012,12 +1103,35 @@ instruct vfnmls$1(vReg dst_src1, vReg src2, vReg src3) %{
 %}')dnl
 dnl
 // vector fnmls
-VFNMLS(F, S, 4)
-VFNMLS(D, D, 2)
+VFNMLS(F, S)
+VFNMLS(D, D)
 
 dnl
-dnl VMLA($1           $2    $3         )
-dnl VMLA(name_suffix, size, min_vec_len)
+dnl VFNMSB_PREDICATE($1           $2  )
+dnl VFNMSB_PREDICATE(name_suffix, size)
+define(`VFNMSB_PREDICATE', `
+// dst_src1 = -src3 + dst_src1 * src2
+// The NegV$1 must not be predicated.
+instruct vfnmsb$1_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
+  predicate(UseFMA && UseSVE > 0 &&
+            !n->in(2)->in(1)->as_Vector()->is_predicated_vector());
+  match(Set dst_src1 (FmaV$1 (Binary dst_src1 src2) (Binary (NegV$1 src3) pg)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmsb $dst_src1, $pg, $src2, $src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_fnmsb(as_FloatRegister($dst_src1$$reg), __ $2, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector fnmsb - predicated
+VFNMSB_PREDICATE(F, S)
+VFNMSB_PREDICATE(D, D)
+
+dnl
+dnl VMLA($1           $2  )
+dnl VMLA(name_suffix, size)
 define(`VMLA', `
 // dst_src1 = dst_src1 + src2 * src3
 instruct vmla$1(vReg dst_src1, vReg src2, vReg src3)
@@ -1034,14 +1148,38 @@ instruct vmla$1(vReg dst_src1, vReg src2, vReg src3)
 %}')dnl
 dnl
 // vector mla
-VMLA(B, B, 16)
-VMLA(S, H, 8)
-VMLA(I, S, 4)
-VMLA(L, D, 2)
+VMLA(B, B)
+VMLA(S, H)
+VMLA(I, S)
+VMLA(L, D)
 
 dnl
-dnl VMLS($1           $2    $3         )
-dnl VMLS(name_suffix, size, min_vec_len)
+dnl VMLA_PREDICATE($1           $2  )
+dnl VMLA_PREDICATE(name_suffix, size)
+define(`VMLA_PREDICATE', `
+// dst_src1 = dst_src1 + src2 * src3
+instruct vmla$1_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (AddV$1 (Binary dst_src1 (MulV$1 src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mla $dst_src1, $pg, src2, src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_mla(as_FloatRegister($dst_src1$$reg), __ $2, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector mla - predicated
+VMLA_PREDICATE(B, B)
+VMLA_PREDICATE(S, H)
+VMLA_PREDICATE(I, S)
+VMLA_PREDICATE(L, D)
+
+dnl
+dnl VMLS($1           $2  )
+dnl VMLS(name_suffix, size)
 define(`VMLS', `
 // dst_src1 = dst_src1 - src2 * src3
 instruct vmls$1(vReg dst_src1, vReg src2, vReg src3)
@@ -1058,10 +1196,34 @@ instruct vmls$1(vReg dst_src1, vReg src2, vReg src3)
 %}')dnl
 dnl
 // vector mls
-VMLS(B, B, 16)
-VMLS(S, H, 8)
-VMLS(I, S, 4)
-VMLS(L, D, 2)
+VMLS(B, B)
+VMLS(S, H)
+VMLS(I, S)
+VMLS(L, D)
+
+dnl
+dnl VMLS_PREDICATE($1           $2  )
+dnl VMLS_PREDICATE(name_suffix, size)
+define(`VMLS_PREDICATE', `
+// dst_src1 = dst_src1 - src2 * src3
+instruct vmls$1_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg)
+%{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (SubV$1 (Binary dst_src1 (MulV$1 src2 src3)) pg));
+  ins_cost(SVE_COST);
+  format %{ "sve_mls $dst_src1, $pg, src2, src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_mls(as_FloatRegister($dst_src1$$reg), __ $2, as_PRegister($pg$$reg),
+         as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector mls - predicated
+VMLS_PREDICATE(B, B)
+VMLS_PREDICATE(S, H)
+VMLS_PREDICATE(I, S)
+VMLS_PREDICATE(L, D)
 
 dnl
 dnl BINARY_OP_TRUE_PREDICATE($1,        $2,      $3,   $4,          $5  )
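The aarch64_sve.ad hunks above are not edited independently: the .ad file is
regenerated from these m4 macros. Assuming the usual workflow noted in the .ad
file header, something like

    m4 aarch64_sve_ad.m4 > aarch64_sve.ad

so each VMLA_PREDICATE/VMLS_PREDICATE invocation here expands to one of the
vmla*_masked/vmls*_masked instructs shown in the aarch64_sve.ad diff.
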
diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
index 10fcdaa243c..448648d8d80 100644
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -3173,8 +3173,11 @@ public:
   INSN(sve_fnmla, 0b01100101, 1, 0b010); // floating-point negated fused multiply-add: Zda = -Zda + -Zn * Zm
   INSN(sve_fnmls, 0b01100101, 1, 0b011); // floating-point negated fused multiply-subtract: Zda = -Zda + Zn * Zm
   INSN(sve_fmad,  0b01100101, 1, 0b100); // floating-point fused multiply-add, writing multiplicand: Zda = Zm + Zda * Zn
-  INSN(sve_mla,   0b00000100, 0, 0b010); // multiply-add: Zda = Zda + Zn*Zm
-  INSN(sve_mls,   0b00000100, 0, 0b011); // multiply-subtract: Zda = Zda + -Zn*Zm
+  INSN(sve_fmsb,  0b01100101, 1, 0b101); // floating-point fused multiply-subtract, writing multiplicand: Zda = Zm + -Zda * Zn
+  INSN(sve_fnmad, 0b01100101, 1, 0b110); // floating-point negated fused multiply-add, writing multiplicand: Zda = -Zm + -Zda * Zn
+  INSN(sve_fnmsb, 0b01100101, 1, 0b111); // floating-point negated fused multiply-subtract, writing multiplicand: Zda = -Zm + Zda * Zn
+  INSN(sve_mla,   0b00000100, 0, 0b010); // multiply-add, writing addend: Zda = Zda + Zn*Zm
+  INSN(sve_mls,   0b00000100, 0, 0b011); // multiply-subtract, writing addend: Zda = Zda + -Zn*Zm
 #undef INSN
 
 // SVE bitwise logical - unpredicated
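A scalar Java model of the three encodings added here, alongside the existing
sve_fmad, purely illustrative: d stands for the register that is overwritten
(Zda in the comments above), n and m for the two sources:

    static double fmad(double d, double n, double m)  { return Math.fma( d, n,  m); } // Zda =  Zm + Zda * Zn
    static double fmsb(double d, double n, double m)  { return Math.fma(-d, n,  m); } // Zda =  Zm + -Zda * Zn
    static double fnmad(double d, double n, double m) { return Math.fma(-d, n, -m); } // Zda = -Zm + -Zda * Zn
    static double fnmsb(double d, double n, double m) { return Math.fma( d, n, -m); } // Zda = -Zm + Zda * Zn
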
diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
index 40892ae0d94..34935714619 100644
--- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py
+++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
@@ -1849,6 +1849,9 @@ generate(SVEVectorOp, [["add", "ZZZ"],
                        ["fmad", "ZPZZ", "m"],
                        ["fmla", "ZPZZ", "m"],
                        ["fmls", "ZPZZ", "m"],
+                       ["fmsb", "ZPZZ", "m"],
+                       ["fnmad", "ZPZZ", "m"],
+                       ["fnmsb", "ZPZZ", "m"],
                        ["fnmla", "ZPZZ", "m"],
                        ["fnmls", "ZPZZ", "m"],
                        ["mla", "ZPZZ", "m"],
diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h
index 03147cba049..20fb391bcca 100644
--- a/test/hotspot/gtest/aarch64/asmtest.out.h
+++ b/test/hotspot/gtest/aarch64/asmtest.out.h
@@ -1153,27 +1153,30 @@
     __ sve_fmad(z17, __ D, p2, z16, z17);              //       fmad    z17.d, p2/m, z16.d, z17.d
     __ sve_fmla(z0, __ S, p1, z2, z23);                //       fmla    z0.s, p1/m, z2.s, z23.s
     __ sve_fmls(z6, __ D, p2, z20, z14);               //       fmls    z6.d, p2/m, z20.d, z14.d
-    __ sve_fnmla(z29, __ D, p3, z3, z3);               //       fnmla   z29.d, p3/m, z3.d, z3.d
-    __ sve_fnmls(z9, __ S, p0, z24, z27);              //       fnmls   z9.s, p0/m, z24.s, z27.s
-    __ sve_mla(z19, __ S, p5, z7, z25);                //       mla     z19.s, p5/m, z7.s, z25.s
-    __ sve_mls(z13, __ B, p1, z7, z25);                //       mls     z13.b, p1/m, z7.b, z25.b
-    __ sve_and(z21, z17, z17);                         //       and     z21.d, z17.d, z17.d
-    __ sve_eor(z3, z9, z19);                           //       eor     z3.d, z9.d, z19.d
-    __ sve_orr(z7, z11, z14);                          //       orr     z7.d, z11.d, z14.d
-    __ sve_bic(z17, z11, z13);                         //       bic     z17.d, z11.d, z13.d
-    __ sve_uzp1(z17, __ H, z30, z17);                  //       uzp1    z17.h, z30.h, z17.h
-    __ sve_uzp2(z15, __ S, z14, z26);                  //       uzp2    z15.s, z14.s, z26.s
+    __ sve_fmsb(z29, __ D, p3, z3, z3);                //       fmsb    z29.d, p3/m, z3.d, z3.d
+    __ sve_fnmad(z9, __ S, p0, z24, z27);              //       fnmad   z9.s, p0/m, z24.s, z27.s
+    __ sve_fnmsb(z19, __ D, p5, z7, z25);              //       fnmsb   z19.d, p5/m, z7.d, z25.d
+    __ sve_fnmla(z13, __ S, p1, z7, z25);              //       fnmla   z13.s, p1/m, z7.s, z25.s
+    __ sve_fnmls(z21, __ S, p4, z17, z0);              //       fnmls   z21.s, p4/m, z17.s, z0.s
+    __ sve_mla(z9, __ H, p5, z11, z7);                 //       mla     z9.h, p5/m, z11.h, z7.h
+    __ sve_mls(z14, __ H, p4, z17, z11);               //       mls     z14.h, p4/m, z17.h, z11.h
+    __ sve_and(z24, z17, z30);                         //       and     z24.d, z17.d, z30.d
+    __ sve_eor(z8, z15, z14);                          //       eor     z8.d, z15.d, z14.d
+    __ sve_orr(z22, z27, z22);                         //       orr     z22.d, z27.d, z22.d
+    __ sve_bic(z8, z5, z27);                           //       bic     z8.d, z5.d, z27.d
+    __ sve_uzp1(z10, __ D, z0, z14);                   //       uzp1    z10.d, z0.d, z14.d
+    __ sve_uzp2(z21, __ B, z20, z0);                   //       uzp2    z21.b, z20.b, z0.b
 
 // SVEReductionOp
-    __ sve_andv(v27, __ H, p5, z7);                    //       andv h27, p5, z7.h
-    __ sve_orv(v5, __ H, p7, z27);                     //       orv h5, p7, z27.h
-    __ sve_eorv(v0, __ S, p3, z24);                    //       eorv s0, p3, z24.s
-    __ sve_smaxv(v20, __ S, p0, z3);                   //       smaxv s20, p0, z3.s
-    __ sve_sminv(v25, __ D, p1, z25);                  //       sminv d25, p1, z25.d
-    __ sve_fminv(v17, __ S, p4, z1);                   //       fminv s17, p4, z1.s
-    __ sve_fmaxv(v14, __ S, p7, z13);                  //       fmaxv s14, p7, z13.s
-    __ sve_fadda(v17, __ D, p0, z30);                  //       fadda d17, p0, d17, z30.d
-    __ sve_uaddv(v22, __ H, p5, z29);                  //       uaddv d22, p5, z29.h
+    __ sve_andv(v22, __ D, p6, z5);                    //       andv d22, p6, z5.d
+    __ sve_orv(v29, __ B, p4, z17);                    //       orv b29, p4, z17.b
+    __ sve_eorv(v12, __ H, p3, z29);                   //       eorv h12, p3, z29.h
+    __ sve_smaxv(v0, __ D, p4, z2);                    //       smaxv d0, p4, z2.d
+    __ sve_sminv(v20, __ D, p5, z21);                  //       sminv d20, p5, z21.d
+    __ sve_fminv(v12, __ S, p2, z2);                   //       fminv s12, p2, z2.s
+    __ sve_fmaxv(v14, __ S, p5, z22);                  //       fmaxv s14, p5, z22.s
+    __ sve_fadda(v19, __ D, p6, z26);                  //       fadda d19, p6, d19, z26.d
+    __ sve_uaddv(v12, __ B, p5, z21);                  //       uaddv d12, p5, z21.b
 
     __ bind(forth);
 
@@ -1192,30 +1195,30 @@
     0x9101a1a0,     0xb10a5cc8,     0xd10810aa,     0xf10fd061,
     0x120cb166,     0x321764bc,     0x52174681,     0x720c0227,
     0x9241018e,     0xb25a2969,     0xd278b411,     0xf26aad01,
-    0x14000000,     0x17ffffd7,     0x140003d2,     0x94000000,
-    0x97ffffd4,     0x940003cf,     0x3400000a,     0x34fffa2a,
-    0x3400798a,     0x35000008,     0x35fff9c8,     0x35007928,
-    0xb400000b,     0xb4fff96b,     0xb40078cb,     0xb500001d,
-    0xb5fff91d,     0xb500787d,     0x10000013,     0x10fff8b3,
-    0x10007813,     0x90000013,     0x36300016,     0x3637f836,
-    0x36307796,     0x3758000c,     0x375ff7cc,     0x3758772c,
+    0x14000000,     0x17ffffd7,     0x140003d5,     0x94000000,
+    0x97ffffd4,     0x940003d2,     0x3400000a,     0x34fffa2a,
+    0x340079ea,     0x35000008,     0x35fff9c8,     0x35007988,
+    0xb400000b,     0xb4fff96b,     0xb400792b,     0xb500001d,
+    0xb5fff91d,     0xb50078dd,     0x10000013,     0x10fff8b3,
+    0x10007873,     0x90000013,     0x36300016,     0x3637f836,
+    0x363077f6,     0x3758000c,     0x375ff7cc,     0x3758778c,
     0x128313a0,     0x528a32c7,     0x7289173b,     0x92ab3acc,
     0xd2a0bf94,     0xf2c285e8,     0x9358722f,     0x330e652f,
     0x53067f3b,     0x93577c53,     0xb34a1aac,     0xd35a4016,
     0x13946c63,     0x93c3dbc8,     0x54000000,     0x54fff5a0,
-    0x54007500,     0x54000001,     0x54fff541,     0x540074a1,
-    0x54000002,     0x54fff4e2,     0x54007442,     0x54000002,
-    0x54fff482,     0x540073e2,     0x54000003,     0x54fff423,
-    0x54007383,     0x54000003,     0x54fff3c3,     0x54007323,
-    0x54000004,     0x54fff364,     0x540072c4,     0x54000005,
-    0x54fff305,     0x54007265,     0x54000006,     0x54fff2a6,
-    0x54007206,     0x54000007,     0x54fff247,     0x540071a7,
-    0x54000008,     0x54fff1e8,     0x54007148,     0x54000009,
-    0x54fff189,     0x540070e9,     0x5400000a,     0x54fff12a,
-    0x5400708a,     0x5400000b,     0x54fff0cb,     0x5400702b,
-    0x5400000c,     0x54fff06c,     0x54006fcc,     0x5400000d,
-    0x54fff00d,     0x54006f6d,     0x5400000e,     0x54ffefae,
-    0x54006f0e,     0x5400000f,     0x54ffef4f,     0x54006eaf,
+    0x54007560,     0x54000001,     0x54fff541,     0x54007501,
+    0x54000002,     0x54fff4e2,     0x540074a2,     0x54000002,
+    0x54fff482,     0x54007442,     0x54000003,     0x54fff423,
+    0x540073e3,     0x54000003,     0x54fff3c3,     0x54007383,
+    0x54000004,     0x54fff364,     0x54007324,     0x54000005,
+    0x54fff305,     0x540072c5,     0x54000006,     0x54fff2a6,
+    0x54007266,     0x54000007,     0x54fff247,     0x54007207,
+    0x54000008,     0x54fff1e8,     0x540071a8,     0x54000009,
+    0x54fff189,     0x54007149,     0x5400000a,     0x54fff12a,
+    0x540070ea,     0x5400000b,     0x54fff0cb,     0x5400708b,
+    0x5400000c,     0x54fff06c,     0x5400702c,     0x5400000d,
+    0x54fff00d,     0x54006fcd,     0x5400000e,     0x54ffefae,
+    0x54006f6e,     0x5400000f,     0x54ffef4f,     0x54006f0f,
     0xd40658e1,     0xd4014d22,     0xd4046543,     0xd4273f60,
     0xd44cad80,     0xd503201f,     0xd503203f,     0xd503205f,
     0xd503209f,     0xd50320bf,     0xd503219f,     0xd50323bf,
@@ -1432,11 +1435,11 @@
     0x65cd85ef,     0x65c68145,     0x6587801a,     0x65c29d53,
     0x04ddb4e3,     0x6582aebc,     0x65c0ae3a,     0x65c1ac51,
     0x658db690,     0x65c18033,     0x65f18a11,     0x65b70440,
-    0x65ee2a86,     0x65e34c7d,     0x65bb6309,     0x049954f3,
-    0x041964ed,     0x04313235,     0x04b33123,     0x046e3167,
-    0x04ed3171,     0x05716bd1,     0x05ba6dcf,     0x045a34fb,
-    0x04583f65,     0x04992f00,     0x04882074,     0x04ca2739,
-    0x65873031,     0x65863dae,     0x65d823d1,     0x044137b6,
-
+    0x65ee2a86,     0x65e3ac7d,     0x65bbc309,     0x65f9f4f3,
+    0x65b944ed,     0x65a07235,     0x04475569,     0x044b722e,
+    0x043e3238,     0x04ae31e8,     0x04763376,     0x04fb30a8,
+    0x05ee680a,     0x05206e95,     0x04da38b6,     0x0418323d,
+    0x04592fac,     0x04c83040,     0x04ca36b4,     0x6587284c,
+    0x658636ce,     0x65d83b53,     0x040136ac,
   };
 // END  Generated code -- do not edit
diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorFusedMultiplyAddSubTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorFusedMultiplyAddSubTest.java
new file mode 100644
index 00000000000..a5c5c7f422c
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/vectorapi/VectorFusedMultiplyAddSubTest.java
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2022, Arm Limited. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.ir_framework.*;
+
+import java.util.Random;
+
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.IntVector;
+import jdk.incubator.vector.LongVector;
+import jdk.incubator.vector.ShortVector;
+import jdk.incubator.vector.VectorMask;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+import jdk.test.lib.Asserts;
+import jdk.test.lib.Utils;
+
+/**
+ * @test
+ * @bug 8282431
+ * @key randomness
+ * @library /test/lib /
+ * @requires vm.cpu.features ~= ".*sve.*"
+ * @summary AArch64: Add optimized rules for masked vector multiply-add/sub for SVE
+ * @modules jdk.incubator.vector
+ *
+ * @run driver compiler.vectorapi.VectorFusedMultiplyAddSubTest
+ */
+
+public class VectorFusedMultiplyAddSubTest {
+    private static final VectorSpecies<Byte> B_SPECIES = ByteVector.SPECIES_MAX;
+    private static final VectorSpecies<Double> D_SPECIES = DoubleVector.SPECIES_MAX;
+    private static final VectorSpecies<Float> F_SPECIES = FloatVector.SPECIES_MAX;
+    private static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_MAX;
+    private static final VectorSpecies<Long> L_SPECIES = LongVector.SPECIES_MAX;
+    private static final VectorSpecies<Short> S_SPECIES = ShortVector.SPECIES_MAX;
+
+    private static final int LENGTH = 1024;
+    private static final Random RD = Utils.getRandomInstance();
+
+    private static byte[] ba;
+    private static byte[] bb;
+    private static byte[] bc;
+    private static byte[] br;
+    private static short[] sa;
+    private static short[] sb;
+    private static short[] sc;
+    private static short[] sr;
+    private static int[] ia;
+    private static int[] ib;
+    private static int[] ic;
+    private static int[] ir;
+    private static long[] la;
+    private static long[] lb;
+    private static long[] lc;
+    private static long[] lr;
+    private static float[] fa;
+    private static float[] fb;
+    private static float[] fc;
+    private static float[] fr;
+    private static double[] da;
+    private static double[] db;
+    private static double[] dc;
+    private static double[] dr;
+    private static boolean[] m;
+
+    static {
+        ba = new byte[LENGTH];
+        bb = new byte[LENGTH];
+        bc = new byte[LENGTH];
+        br = new byte[LENGTH];
+        sa = new short[LENGTH];
+        sb = new short[LENGTH];
+        sc = new short[LENGTH];
+        sr = new short[LENGTH];
+        ia = new int[LENGTH];
+        ib = new int[LENGTH];
+        ic = new int[LENGTH];
+        ir = new int[LENGTH];
+        la = new long[LENGTH];
+        lb = new long[LENGTH];
+        lc = new long[LENGTH];
+        lr = new long[LENGTH];
+        fa = new float[LENGTH];
+        fb = new float[LENGTH];
+        fc = new float[LENGTH];
+        fr = new float[LENGTH];
+        da = new double[LENGTH];
+        db = new double[LENGTH];
+        dc = new double[LENGTH];
+        dr = new double[LENGTH];
+        m = new boolean[LENGTH];
+
+        for (int i = 0; i < LENGTH; i++) {
+            ba[i] = (byte) RD.nextInt(25);
+            bb[i] = (byte) RD.nextInt(25);
+            bc[i] = (byte) RD.nextInt(25);
+            sa[i] = (short) RD.nextInt(25);
+            sb[i] = (short) RD.nextInt(25);
+            sc[i] = (short) RD.nextInt(25);
+            ia[i] = RD.nextInt(25);
+            ib[i] = RD.nextInt(25);
+            ic[i] = RD.nextInt(25);
+            la[i] = RD.nextLong(25);
+            lb[i] = RD.nextLong(25);
+            lc[i] = RD.nextLong(25);
+            fa[i] = RD.nextFloat((float) 25.0);
+            fb[i] = RD.nextFloat((float) 25.0);
+            fc[i] = RD.nextFloat((float) 25.0);
+            da[i] = RD.nextDouble(25.0);
+            db[i] = RD.nextDouble(25.0);
+            dc[i] = RD.nextDouble(25.0);
+            m[i] = RD.nextBoolean();
+        }
+    }
+
+    interface BTenOp {
+        byte apply(byte a, byte b, byte c);
+    }
+
+    interface STenOp {
+        short apply(short a, short b, short c);
+    }
+
+    interface ITenOp {
+        int apply(int a, int b, int c);
+    }
+
+    interface LTenOp {
+        long apply(long a, long b, long c);
+    }
+
+    interface FTenOp {
+        float apply(float a, float b, float c);
+    }
+
+    interface DTenOp {
+        double apply(double a, double b, double c);
+    }
+
+    private static void assertArrayEquals(byte[] r, byte[] a, byte[] b, byte[] c, boolean[] m, BTenOp f) {
+        for (int i = 0; i < LENGTH; i++) {
+            if (m[i % B_SPECIES.length()]) {
+                Asserts.assertEquals(f.apply(a[i], b[i], c[i]), r[i]);
+            } else {
+                Asserts.assertEquals(a[i], r[i]);
+            }
+        }
+    }
+
+    private static void assertArrayEquals(short[] r, short[] a, short[] b, short[] c, boolean[] m, STenOp f) {
+        for (int i = 0; i < LENGTH; i++) {
+            if (m[i % S_SPECIES.length()]) {
+                Asserts.assertEquals(f.apply(a[i], b[i], c[i]), r[i]);
+            } else {
+                Asserts.assertEquals(a[i], r[i]);
+            }
+        }
+    }
+
+    private static void assertArrayEquals(int[] r, int[] a, int[] b, int[] c, boolean[] m, ITenOp f) {
+        for (int i = 0; i < LENGTH; i++) {
+            if (m[i % I_SPECIES.length()]) {
+                Asserts.assertEquals(f.apply(a[i], b[i], c[i]), r[i]);
+            } else {
+                Asserts.assertEquals(a[i], r[i]);
+            }
+        }
+    }
+
+    private static void assertArrayEquals(long[] r, long[] a, long[] b, long[] c, boolean[] m, LTenOp f) {
+        for (int i = 0; i < LENGTH; i++) {
+            if (m[i % L_SPECIES.length()]) {
+                Asserts.assertEquals(f.apply(a[i], b[i], c[i]), r[i]);
+            } else {
+                Asserts.assertEquals(a[i], r[i]);
+            }
+        }
+    }
+
+    private static void assertArrayEquals(float[] r, float[] a, float[] b, float[] c, boolean[] m, FTenOp f) {
+        for (int i = 0; i < LENGTH; i++) {
+            if (m[i % F_SPECIES.length()]) {
+                Asserts.assertEquals(f.apply(a[i], b[i], c[i]), r[i]);
+            } else {
+                Asserts.assertEquals(a[i], r[i]);
+            }
+        }
+    }
+
+    private static void assertArrayEquals(double[] r, double[] a, double[] b, double[] c, boolean[] m, DTenOp f) {
+        for (int i = 0; i < LENGTH; i++) {
+            if (m[i % D_SPECIES.length()]) {
+                Asserts.assertEquals(f.apply(a[i], b[i], c[i]), r[i]);
+            } else {
+                Asserts.assertEquals(a[i], r[i]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = { "sve_mla", ">= 1" })
+    public static void testByteMultiplyAddMasked() {
+        VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += B_SPECIES.length()) {
+            ByteVector av = ByteVector.fromArray(B_SPECIES, ba, i);
+            ByteVector bv = ByteVector.fromArray(B_SPECIES, bb, i);
+            ByteVector cv = ByteVector.fromArray(B_SPECIES, bc, i);
+            av.add(bv.mul(cv), mask).intoArray(br, i);
+        }
+        assertArrayEquals(br, ba, bb, bc, m, (a, b, c) -> (byte) (a + b * c));
+    }
+
+    @Test
+    @IR(counts = { "sve_mls", ">= 1" })
+    public static void testByteMultiplySubMasked() {
+        VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += B_SPECIES.length()) {
+            ByteVector av = ByteVector.fromArray(B_SPECIES, ba, i);
+            ByteVector bv = ByteVector.fromArray(B_SPECIES, bb, i);
+            ByteVector cv = ByteVector.fromArray(B_SPECIES, bc, i);
+            av.sub(bv.mul(cv), mask).intoArray(br, i);
+        }
+        assertArrayEquals(br, ba, bb, bc, m, (a, b, c) -> (byte) (a - b * c));
+    }
+
+    @Test
+    @IR(counts = { "sve_mla", ">= 1" })
+    public static void testShortMultiplyAddMasked() {
+        VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
+            ShortVector av = ShortVector.fromArray(S_SPECIES, sa, i);
+            ShortVector bv = ShortVector.fromArray(S_SPECIES, sb, i);
+            ShortVector cv = ShortVector.fromArray(S_SPECIES, sc, i);
+            av.add(bv.mul(cv), mask).intoArray(sr, i);
+        }
+        assertArrayEquals(sr, sa, sb, sc, m, (a, b, c) -> (short) (a + b * c));
+    }
+
+    @Test
+    @IR(counts = { "sve_mls", ">= 1" })
+    public static void testShortMultiplySubMasked() {
+        VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
+            ShortVector av = ShortVector.fromArray(S_SPECIES, sa, i);
+            ShortVector bv = ShortVector.fromArray(S_SPECIES, sb, i);
+            ShortVector cv = ShortVector.fromArray(S_SPECIES, sc, i);
+            av.sub(bv.mul(cv), mask).intoArray(sr, i);
+        }
+        assertArrayEquals(sr, sa, sb, sc, m, (a, b, c) -> (short) (a - b * c));
+    }
+
+    @Test
+    @IR(counts = { "sve_mla", ">= 1" })
+    public static void testIntMultiplyAddMasked() {
+        VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
+            IntVector av = IntVector.fromArray(I_SPECIES, ia, i);
+            IntVector bv = IntVector.fromArray(I_SPECIES, ib, i);
+            IntVector cv = IntVector.fromArray(I_SPECIES, ic, i);
+            av.add(bv.mul(cv), mask).intoArray(ir, i);
+        }
+        assertArrayEquals(ir, ia, ib, ic, m, (a, b, c) -> (int) (a + b * c));
+    }
+
+    @Test
+    @IR(counts = { "sve_mls", ">= 1" })
+    public static void testIntMultiplySubMasked() {
+        VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
+            IntVector av = IntVector.fromArray(I_SPECIES, ia, i);
+            IntVector bv = IntVector.fromArray(I_SPECIES, ib, i);
+            IntVector cv = IntVector.fromArray(I_SPECIES, ic, i);
+            av.sub(bv.mul(cv), mask).intoArray(ir, i);
+        }
+        assertArrayEquals(ir, ia, ib, ic, m, (a, b, c) -> (int) (a - b * c));
+    }
+
+    @Test
+    @IR(counts = { "sve_mla", ">= 1" })
+    public static void testLongMultiplyAddMasked() {
+        VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
+            LongVector av = LongVector.fromArray(L_SPECIES, la, i);
+            LongVector bv = LongVector.fromArray(L_SPECIES, lb, i);
+            LongVector cv = LongVector.fromArray(L_SPECIES, lc, i);
+            av.add(bv.mul(cv), mask).intoArray(lr, i);
+        }
+        assertArrayEquals(lr, la, lb, lc, m, (a, b, c) -> (long) (a + b * c));
+    }
+
+    @Test
+    @IR(counts = { "sve_mls", ">= 1" })
+    public static void testLongMultiplySubMasked() {
+        VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
+            LongVector av = LongVector.fromArray(L_SPECIES, la, i);
+            LongVector bv = LongVector.fromArray(L_SPECIES, lb, i);
+            LongVector cv = LongVector.fromArray(L_SPECIES, lc, i);
+            av.sub(bv.mul(cv), mask).intoArray(lr, i);
+        }
+        assertArrayEquals(lr, la, lb, lc, m, (a, b, c) -> (long) (a - b * c));
+    }
+
+    @Test
+    @IR(counts = { "sve_fmsb", ">= 1" })
+    public static void testFloatMultiplySubMasked() {
+        VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
+            FloatVector av = FloatVector.fromArray(F_SPECIES, fa, i);
+            FloatVector bv = FloatVector.fromArray(F_SPECIES, fb, i);
+            FloatVector cv = FloatVector.fromArray(F_SPECIES, fc, i);
+            av.lanewise(VectorOperators.FMA, bv.neg(), cv, mask).intoArray(fr, i);
+        }
+        assertArrayEquals(fr, fa, fb, fc, m, (a, b, c) -> (float) Math.fma(a, -b, c));
+    }
+
+    @Test
+    @IR(counts = { "sve_fnmad", ">= 1" })
+    public static void testFloatNegatedMultiplyAddMasked() {
+        VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
+            FloatVector av = FloatVector.fromArray(F_SPECIES, fa, i);
+            FloatVector bv = FloatVector.fromArray(F_SPECIES, fb, i);
+            FloatVector cv = FloatVector.fromArray(F_SPECIES, fc, i);
+            av.lanewise(VectorOperators.FMA, bv.neg(), cv.neg(), mask).intoArray(fr, i);
+        }
+        assertArrayEquals(fr, fa, fb, fc, m, (a, b, c) -> (float) Math.fma(a, -b, -c));
+    }
+
+    @Test
+    @IR(counts = { "sve_fnmsb", ">= 1" })
+    public static void testFloatNegatedMultiplySubMasked() {
+        VectorMask<Float> mask = VectorMask.fromArray(F_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += F_SPECIES.length()) {
+            FloatVector av = FloatVector.fromArray(F_SPECIES, fa, i);
+            FloatVector bv = FloatVector.fromArray(F_SPECIES, fb, i);
+            FloatVector cv = FloatVector.fromArray(F_SPECIES, fc, i);
+            av.lanewise(VectorOperators.FMA, bv, cv.neg(), mask).intoArray(fr, i);
+        }
+        assertArrayEquals(fr, fa, fb, fc, m, (a, b, c) -> (float) Math.fma(a, b, -c));
+    }
+
+    @Test
+    @IR(counts = { "sve_fmsb", ">= 1" })
+    public static void testDoubleMultiplySubMasked() {
+        VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
+            DoubleVector av = DoubleVector.fromArray(D_SPECIES, da, i);
+            DoubleVector bv = DoubleVector.fromArray(D_SPECIES, db, i);
+            DoubleVector cv = DoubleVector.fromArray(D_SPECIES, dc, i);
+            av.lanewise(VectorOperators.FMA, bv.neg(), cv, mask).intoArray(dr, i);
+        }
+        assertArrayEquals(dr, da, db, dc, m, (a, b, c) -> (double) Math.fma(a, -b, c));
+    }
+
+    @Test
+    @IR(counts = { "sve_fnmad", ">= 1" })
+    public static void testDoubleNegatedMultiplyAddMasked() {
+        VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
+            DoubleVector av = DoubleVector.fromArray(D_SPECIES, da, i);
+            DoubleVector bv = DoubleVector.fromArray(D_SPECIES, db, i);
+            DoubleVector cv = DoubleVector.fromArray(D_SPECIES, dc, i);
+            av.lanewise(VectorOperators.FMA, bv.neg(), cv.neg(), mask).intoArray(dr, i);
+        }
+        assertArrayEquals(dr, da, db, dc, m, (a, b, c) -> (double) Math.fma(a, -b, -c));
+    }
+
+    @Test
+    @IR(counts = { "sve_fnmsb", ">= 1" })
+    public static void testDoubleNegatedMultiplySubMasked() {
+        VectorMask<Double> mask = VectorMask.fromArray(D_SPECIES, m, 0);
+        for (int i = 0; i < LENGTH; i += D_SPECIES.length()) {
+            DoubleVector av = DoubleVector.fromArray(D_SPECIES, da, i);
+            DoubleVector bv = DoubleVector.fromArray(D_SPECIES, db, i);
+            DoubleVector cv = DoubleVector.fromArray(D_SPECIES, dc, i);
+            av.lanewise(VectorOperators.FMA, bv, cv.neg(), mask).intoArray(dr, i);
+        }
+        assertArrayEquals(dr, da, db, dc, m, (a, b, c) -> (double) Math.fma(a, b, -c));
+    }
+
+    public static void main(String[] args) {
+        TestFramework.runWithFlags("--add-modules=jdk.incubator.vector",
+                                   "-XX:UseSVE=1");
+    }
+}
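The test drives itself through the IR framework, so the @IR counts above are
matched against the compiler's opto assembly output, where the format strings
of the new rules (sve_mla, sve_fmsb, sve_fnmad, sve_fnmsb, ...) appear; on
hardware without SVE the "@requires vm.cpu.features" line skips it. Under a
normal jdk build it can be run with, for example:

    make test TEST="test/hotspot/jtreg/compiler/vectorapi/VectorFusedMultiplyAddSubTest.java"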