8299038: Add AArch64 backend support for auto-vectorized FP16 conversions

Reviewed-by: xgong, ngasson
2023-01-16 10:47:38 +00:00 · 2023-01-16 10:47:38 +00:00 · 98d75f1879
commit 98d75f1879
parent cac72a6018
6 changed files with 209 additions and 77 deletions
--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@ -1,6 +1,6 @@
 //
-// Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
-// Copyright (c) 2020, 2022, Arm Limited. All rights reserved.
+// Copyright (c) 2020, 2023, Arm Limited. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@ -4159,6 +4159,52 @@ instruct vcvtDtoF_gt64b(vReg dst, vReg src, vReg tmp) %{
  ins_pipe(pipe_slow);
 %}
 // VectorCastHF2F
 //
 // Vector conversion from half-precision elements to single-precision
 // elements. A single rule serves both the neon and the sve code paths;
 // the path is picked at emit time from the byte length of the result vector.
 instruct vcvtHFtoF(vReg dst, vReg src) %{
  match(Set dst (VectorCastHF2F src));
  format %{ "vcvtHFtoF $dst, $src" %}
  ins_encode %{
    // The length is read from this node (the result), not from the input.
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      // 4HF to 4F
      __ fcvtl($dst$$FloatRegister, __ T4S, $src$$FloatRegister, __ T4H);
    } else {
      assert(UseSVE > 0, "must be sve");
      // Two steps: sve_vector_extend first writes dst from src going from
      // H-sized to S-sized elements (presumably a plain widening, the helper
      // is not shown here - TODO confirm), then sve_fcvt converts dst in
      // place from H to S under the ptrue predicate.
      __ sve_vector_extend($dst$$FloatRegister, __ S, $src$$FloatRegister, __ H);
      __ sve_fcvt($dst$$FloatRegister, __ S, ptrue, $dst$$FloatRegister, __ H);
    }
  %}
  ins_pipe(pipe_slow);
 %}
 // VectorCastF2HF
 //
 // neon variant of the single-precision to half-precision vector conversion.
 // The predicate inspects the byte length of the input vector, n->in(1),
 // and selects this rule when VM_Version::use_neon_for_vector accepts it.
 instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastF2HF src));
  format %{ "vcvtFtoHF_neon $dst, $src\t# 4F to 4HF" %}
  ins_encode %{
    // 4F to 4HF
    __ fcvtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
  %}
  ins_pipe(pipe_slow);
 %}
 // sve variant of the single-precision to half-precision vector conversion.
 // The predicate is the exact negation of the neon rule predicate: it fires
 // when the byte length of the input vector, n->in(1), is not accepted by
 // VM_Version::use_neon_for_vector. dst is declared TEMP_DEF and tmp is an
 // additional TEMP register that is passed to sve_vector_narrow.
 instruct vcvtFtoHF_sve(vReg dst, vReg src, vReg tmp) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastF2HF src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtFtoHF_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    // Convert first (S to H, written to dst under ptrue), then narrow dst in
    // place from S-sized to H-sized elements. tmp is handed to the narrow
    // helper as scratch (presumably clobbered, matching the KILL note in the
    // format string - TODO confirm against sve_vector_narrow).
    __ sve_fcvt($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister, __ S);
    __ sve_vector_narrow($dst$$FloatRegister, __ H,
                         $dst$$FloatRegister, __ S, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
 %}
 // ------------------------------ Replicate ------------------------------------
 // replicate from reg
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@ -1,6 +1,6 @@
 //
-// Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
-// Copyright (c) 2020, 2022, Arm Limited. All rights reserved.
+// Copyright (c) 2020, 2023, Arm Limited. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@ -2731,6 +2731,52 @@ instruct vcvtDtoF_gt64b(vReg dst, vReg src, vReg tmp) %{
  ins_pipe(pipe_slow);
 %}
 // VectorCastHF2F
 //
 // Vector conversion from half-precision elements to single-precision
 // elements. A single rule serves both the neon and the sve code paths;
 // the path is picked at emit time from the byte length of the result vector.
 instruct vcvtHFtoF(vReg dst, vReg src) %{
  match(Set dst (VectorCastHF2F src));
  format %{ "vcvtHFtoF $dst, $src" %}
  ins_encode %{
    // The length is read from this node (the result), not from the input.
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      // 4HF to 4F
      __ fcvtl($dst$$FloatRegister, __ T4S, $src$$FloatRegister, __ T4H);
    } else {
      assert(UseSVE > 0, "must be sve");
      // Two steps: sve_vector_extend first writes dst from src going from
      // H-sized to S-sized elements (presumably a plain widening, the helper
      // is not shown here - TODO confirm), then sve_fcvt converts dst in
      // place from H to S under the ptrue predicate.
      __ sve_vector_extend($dst$$FloatRegister, __ S, $src$$FloatRegister, __ H);
      __ sve_fcvt($dst$$FloatRegister, __ S, ptrue, $dst$$FloatRegister, __ H);
    }
  %}
  ins_pipe(pipe_slow);
 %}
 // VectorCastF2HF
 //
 // neon variant of the single-precision to half-precision vector conversion.
 // The predicate inspects the byte length of the input vector, n->in(1),
 // and selects this rule when VM_Version::use_neon_for_vector accepts it.
 instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastF2HF src));
  format %{ "vcvtFtoHF_neon $dst, $src\t# 4F to 4HF" %}
  ins_encode %{
    // 4F to 4HF
    __ fcvtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
  %}
  ins_pipe(pipe_slow);
 %}
 // sve variant of the single-precision to half-precision vector conversion.
 // The predicate is the exact negation of the neon rule predicate: it fires
 // when the byte length of the input vector, n->in(1), is not accepted by
 // VM_Version::use_neon_for_vector. dst is declared TEMP_DEF and tmp is an
 // additional TEMP register that is passed to sve_vector_narrow.
 instruct vcvtFtoHF_sve(vReg dst, vReg src, vReg tmp) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastF2HF src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtFtoHF_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    // Convert first (S to H, written to dst under ptrue), then narrow dst in
    // place from S-sized to H-sized elements. tmp is handed to the narrow
    // helper as scratch (presumably clobbered, matching the KILL note in the
    // format string - TODO confirm against sve_vector_narrow).
    __ sve_fcvt($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister, __ S);
    __ sve_vector_narrow($dst$$FloatRegister, __ H,
                         $dst$$FloatRegister, __ S, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
 %}
 // ------------------------------ Replicate ------------------------------------
 dnl REPLICATE_INT($1,   $2,       $3  )
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@ -3943,9 +3943,29 @@ void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
    starti;
    assert(T_src != B && T_dst != B && T_src != Q && T_dst != Q &&
           T_src != T_dst, "invalid register variant");
-    guarantee(T_src != H && T_dst != H, "half-precision unsupported");
+    // The encodings of fields op1 (bits 17-16) and op2 (bits 23-22)
-    f(0b01100101, 31, 24), f(0b11, 23, 22), f(0b0010, 21, 18);
+    // depend on T_src and T_dst as given below -
-    f(T_dst, 17, 16), f(0b101, 15, 13);
+    // +-----+------+---------------------------------------------+
    // | op2 | op1  |             Instruction Details             |
    // +-----+------+---------------------------------------------+
    // |  10 |  01  | FCVT - half-precision to single-precision   |
    // |  11 |  01  | FCVT - half-precision to double-precision   |
    // |  10 |  00  | FCVT - single-precision to half-precision   |
    // |  11 |  11  | FCVT - single-precision to double-precision |
    // |  11 |  00  | FCVT - double-precision to half-precision   |
    // |  11 |  10  | FCVT - double-precision to single-precision |
    // +-----+------+---------------------------------------------+
    int op1 = 0b00;
    int op2 = (T_src == D || T_dst == D) ? 0b11 : 0b10;
    if (T_src == H) {
      op1 = 0b01;
    } else if (T_dst == S) {
      op1 = 0b10;
    } else if (T_dst == D) {
      op1 = 0b11;
    }
    f(0b01100101, 31, 24), f(op2, 23, 22), f(0b0010, 21, 18);
    f(op1, 17, 16), f(0b101, 15, 13);
    pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
  }
--- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py
+++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
@ -1772,6 +1772,10 @@ generate(SpecialCases, [["ccmn",   "__ ccmn(zr, zr, 3u, Assembler::LE);",
                        ["scvtf",    "__ sve_scvtf(z6, __ H, p3, z1, __ H);",              "scvtf\tz6.h, p3/m, z1.h"],
                        ["fcvt",     "__ sve_fcvt(z5, __ D, p3, z4, __ S);",               "fcvt\tz5.d, p3/m, z4.s"],
                        ["fcvt",     "__ sve_fcvt(z1, __ S, p3, z0, __ D);",               "fcvt\tz1.s, p3/m, z0.d"],
                        ["fcvt",     "__ sve_fcvt(z5, __ S, p3, z4, __ H);",               "fcvt\tz5.s, p3/m, z4.h"],
                        ["fcvt",     "__ sve_fcvt(z1, __ H, p3, z0, __ S);",               "fcvt\tz1.h, p3/m, z0.s"],
                        ["fcvt",     "__ sve_fcvt(z5, __ D, p3, z4, __ H);",               "fcvt\tz5.d, p3/m, z4.h"],
                        ["fcvt",     "__ sve_fcvt(z1, __ H, p3, z0, __ D);",               "fcvt\tz1.h, p3/m, z0.d"],
                        ["fcvtzs",   "__ sve_fcvtzs(z19, __ D, p2, z1, __ D);",            "fcvtzs\tz19.d, p2/m, z1.d"],
                        ["fcvtzs",   "__ sve_fcvtzs(z9, __ S, p1, z8, __ S);",             "fcvtzs\tz9.s, p1/m, z8.s"],
                        ["fcvtzs",   "__ sve_fcvtzs(z1, __ S, p2, z0, __ D);",             "fcvtzs\tz1.s, p2/m, z0.d"],
--- a/test/hotspot/gtest/aarch64/asmtest.out.h
+++ b/test/hotspot/gtest/aarch64/asmtest.out.h
@ -915,6 +915,10 @@
    __ sve_scvtf(z6, __ H, p3, z1, __ H);              //       scvtf   z6.h, p3/m, z1.h
    __ sve_fcvt(z5, __ D, p3, z4, __ S);               //       fcvt    z5.d, p3/m, z4.s
    __ sve_fcvt(z1, __ S, p3, z0, __ D);               //       fcvt    z1.s, p3/m, z0.d
    __ sve_fcvt(z5, __ S, p3, z4, __ H);               //       fcvt    z5.s, p3/m, z4.h
    __ sve_fcvt(z1, __ H, p3, z0, __ S);               //       fcvt    z1.h, p3/m, z0.s
    __ sve_fcvt(z5, __ D, p3, z4, __ H);               //       fcvt    z5.d, p3/m, z4.h
    __ sve_fcvt(z1, __ H, p3, z0, __ D);               //       fcvt    z1.h, p3/m, z0.d
    __ sve_fcvtzs(z19, __ D, p2, z1, __ D);            //       fcvtzs  z19.d, p2/m, z1.d
    __ sve_fcvtzs(z9, __ S, p1, z8, __ S);             //       fcvtzs  z9.s, p1/m, z8.s
    __ sve_fcvtzs(z1, __ S, p2, z0, __ D);             //       fcvtzs  z1.s, p2/m, z0.d
@ -1245,30 +1249,30 @@
    0x9101a1a0,     0xb10a5cc8,     0xd10810aa,     0xf10fd061,
    0x120cb166,     0x321764bc,     0x52174681,     0x720c0227,
    0x9241018e,     0xb25a2969,     0xd278b411,     0xf26aad01,
-    0x14000000,     0x17ffffd7,     0x14000405,     0x94000000,
+    0x14000000,     0x17ffffd7,     0x14000409,     0x94000000,
-    0x97ffffd4,     0x94000402,     0x3400000a,     0x34fffa2a,
+    0x97ffffd4,     0x94000406,     0x3400000a,     0x34fffa2a,
-    0x34007fea,     0x35000008,     0x35fff9c8,     0x35007f88,
+    0x3400806a,     0x35000008,     0x35fff9c8,     0x35008008,
-    0xb400000b,     0xb4fff96b,     0xb4007f2b,     0xb500001d,
+    0xb400000b,     0xb4fff96b,     0xb4007fab,     0xb500001d,
-    0xb5fff91d,     0xb5007edd,     0x10000013,     0x10fff8b3,
+    0xb5fff91d,     0xb5007f5d,     0x10000013,     0x10fff8b3,
-    0x10007e73,     0x90000013,     0x36300016,     0x3637f836,
+    0x10007ef3,     0x90000013,     0x36300016,     0x3637f836,
-    0x36307df6,     0x3758000c,     0x375ff7cc,     0x37587d8c,
+    0x36307e76,     0x3758000c,     0x375ff7cc,     0x37587e0c,
    0x128313a0,     0x528a32c7,     0x7289173b,     0x92ab3acc,
    0xd2a0bf94,     0xf2c285e8,     0x9358722f,     0x330e652f,
    0x53067f3b,     0x93577c53,     0xb34a1aac,     0xd35a4016,
    0x13946c63,     0x93c3dbc8,     0x54000000,     0x54fff5a0,
-    0x54007b60,     0x54000001,     0x54fff541,     0x54007b01,
+    0x54007be0,     0x54000001,     0x54fff541,     0x54007b81,
-    0x54000002,     0x54fff4e2,     0x54007aa2,     0x54000002,
+    0x54000002,     0x54fff4e2,     0x54007b22,     0x54000002,
-    0x54fff482,     0x54007a42,     0x54000003,     0x54fff423,
+    0x54fff482,     0x54007ac2,     0x54000003,     0x54fff423,
-    0x540079e3,     0x54000003,     0x54fff3c3,     0x54007983,
+    0x54007a63,     0x54000003,     0x54fff3c3,     0x54007a03,
-    0x54000004,     0x54fff364,     0x54007924,     0x54000005,
+    0x54000004,     0x54fff364,     0x540079a4,     0x54000005,
-    0x54fff305,     0x540078c5,     0x54000006,     0x54fff2a6,
+    0x54fff305,     0x54007945,     0x54000006,     0x54fff2a6,
-    0x54007866,     0x54000007,     0x54fff247,     0x54007807,
+    0x540078e6,     0x54000007,     0x54fff247,     0x54007887,
-    0x54000008,     0x54fff1e8,     0x540077a8,     0x54000009,
+    0x54000008,     0x54fff1e8,     0x54007828,     0x54000009,
-    0x54fff189,     0x54007749,     0x5400000a,     0x54fff12a,
+    0x54fff189,     0x540077c9,     0x5400000a,     0x54fff12a,
-    0x540076ea,     0x5400000b,     0x54fff0cb,     0x5400768b,
+    0x5400776a,     0x5400000b,     0x54fff0cb,     0x5400770b,
-    0x5400000c,     0x54fff06c,     0x5400762c,     0x5400000d,
+    0x5400000c,     0x54fff06c,     0x540076ac,     0x5400000d,
-    0x54fff00d,     0x540075cd,     0x5400000e,     0x54ffefae,
+    0x54fff00d,     0x5400764d,     0x5400000e,     0x54ffefae,
-    0x5400756e,     0x5400000f,     0x54ffef4f,     0x5400750f,
+    0x540075ee,     0x5400000f,     0x54ffef4f,     0x5400758f,
    0xd40658e1,     0xd4014d22,     0xd4046543,     0xd4273f60,
    0xd44cad80,     0xd503201f,     0xd503203f,     0xd503205f,
    0xd503209f,     0xd50320bf,     0xd503219f,     0xd50323bf,
@ -1434,7 +1438,8 @@
    0x05733820,     0x05b238a4,     0x05f138e6,     0x0570396a,
    0x65d0a001,     0x65d6a443,     0x65d4a826,     0x6594ac26,
    0x6554ac26,     0x6556ac26,     0x6552ac26,     0x65cbac85,
-    0x65caac01,     0x65dea833,     0x659ca509,     0x65d8a801,
+    0x65caac01,     0x6589ac85,     0x6588ac01,     0x65c9ac85,
    0x65c8ac01,     0x65dea833,     0x659ca509,     0x65d8a801,
    0x65dcac01,     0x655cb241,     0x0520a1e0,     0x0521a601,
    0x052281e0,     0x05238601,     0x04a14026,     0x042244a6,
    0x046344a6,     0x04a444a6,     0x04e544a7,     0x0568aca7,
--- a/test/hotspot/jtreg/compiler/vectorization/TestFloatConversionsVector.java
+++ b/test/hotspot/jtreg/compiler/vectorization/TestFloatConversionsVector.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -26,7 +26,7 @@
 * @bug 8294588
 * @summary Auto-vectorize Float.floatToFloat16, Float.float16ToFloat APIs
 * @requires vm.compiler2.enabled
- * @requires os.simpleArch == "x64"
+ * @requires (os.simpleArch == "x64" & (vm.cpu.features ~= ".*avx512f.*" | vm.cpu.features ~= ".*f16c.*")) | os.arch == "aarch64"
 * @library /test/lib /
 * @run driver compiler.vectorization.TestFloatConversionsVector
 */
@ -34,62 +34,73 @@
 package compiler.vectorization;
 import compiler.lib.ir_framework.*;
 import jdk.test.lib.Asserts;
 public class TestFloatConversionsVector {
-  private static final int ARRLEN = 1024;
+    private static final int ARRLEN = 1024;
-  private static final int ITERS  = 11000;
+    private static final int ITERS  = 11000;
-  private static float  [] finp;
+    private static float  [] finp;
-  private static short  [] sout;
+    private static short  [] sout;
-  private static short  [] sinp;
+    private static short  [] sinp;
-  private static float  [] fout;
+    private static float  [] fout;
-  public static void main(String args[]) {
+    public static void main(String args[]) {
-      TestFramework.runWithFlags("-XX:-TieredCompilation",
+        TestFramework.runWithFlags("-XX:-TieredCompilation",
-                                 "-XX:CompileThresholdScaling=0.3");
+                                   "-XX:CompileThresholdScaling=0.3");
-      System.out.println("PASSED");
+        System.out.println("PASSED");
-  }
+    }
-  @Test
+    @Test
-  @IR(counts = {IRNode.VECTOR_CAST_F2HF, "> 0"}, applyIfCPUFeatureOr = {"avx512f", "true", "f16c", "true"})
+    @IR(counts = {IRNode.VECTOR_CAST_F2HF, "> 0"})
-  public void test_float_float16(short[] sout, float[] finp) {
+    public void test_float_float16(short[] sout, float[] finp) {
-      for (int i = 0; i < finp.length; i++) {
+        for (int i = 0; i < finp.length; i++) {
-          sout[i] = Float.floatToFloat16(finp[i]);
+            sout[i] = Float.floatToFloat16(finp[i]);
-      }
+        }
-  }
+    }
-  @Run(test = {"test_float_float16"}, mode = RunMode.STANDALONE)
+    @Run(test = {"test_float_float16"}, mode = RunMode.STANDALONE)
-  public void kernel_test_float_float16() {
+    public void kernel_test_float_float16() {
-      finp = new float[ARRLEN];
+        finp = new float[ARRLEN];
-      sout = new short[ARRLEN];
+        sout = new short[ARRLEN];
-      for (int i = 0; i < ARRLEN; i++) {
+        for (int i = 0; i < ARRLEN; i++) {
-          finp[i] = (float) i * 1.4f;
+            finp[i] = (float) i * 1.4f;
-      }
+        }
-      for (int i = 0; i < ITERS; i++) {
+        for (int i = 0; i < ITERS; i++) {
-         test_float_float16(sout, finp);
+            test_float_float16(sout, finp);
-      }
+        }
  }
-  @Test
+        // Verifying the result
-  @IR(counts = {IRNode.VECTOR_CAST_HF2F, "> 0"}, applyIfCPUFeatureOr = {"avx512f", "true", "f16c", "true"})
+        for (int i = 0; i < ARRLEN; i++) {
-  public void test_float16_float(float[] fout, short[] sinp) {
+            Asserts.assertEquals(Float.floatToFloat16(finp[i]), sout[i]);
-      for (int i = 0; i < sinp.length; i++) {
+        }
-          fout[i] = Float.float16ToFloat(sinp[i]);
+    }
      }
  }
-  @Run(test = {"test_float16_float"}, mode = RunMode.STANDALONE)
+    @Test
-  public void kernel_test_float16_float() {
+    @IR(counts = {IRNode.VECTOR_CAST_HF2F, "> 0"})
-      sinp = new short[ARRLEN];
+    public void test_float16_float(float[] fout, short[] sinp) {
-      fout = new float[ARRLEN];
+        for (int i = 0; i < sinp.length; i++) {
            fout[i] = Float.float16ToFloat(sinp[i]);
        }
    }
-      for (int i = 0; i < ARRLEN; i++) {
+    @Run(test = {"test_float16_float"}, mode = RunMode.STANDALONE)
-          sinp[i] = (short)i;
+    public void kernel_test_float16_float() {
-      }
+        sinp = new short[ARRLEN];
        fout = new float[ARRLEN];
-      for (int i = 0; i < ITERS; i++) {
+        for (int i = 0; i < ARRLEN; i++) {
-          test_float16_float(fout , sinp);
+            sinp[i] = (short)i;
-      }
+        }
-  }
+
        for (int i = 0; i < ITERS; i++) {
            test_float16_float(fout, sinp);
        }
        // Verifying the result
        for (int i = 0; i < ARRLEN; i++) {
            Asserts.assertEquals(Float.float16ToFloat(sinp[i]), fout[i]);
        }
    }
 }