8299038: Add AArch64 backend support for auto-vectorized FP16 conversions

Reviewed-by: xgong, ngasson
This commit is contained in:
Bhavana Kilambi 2023-01-16 10:47:38 +00:00 committed by Nick Gasson
parent cac72a6018
commit 98d75f1879
6 changed files with 209 additions and 77 deletions

View File

@ -1,6 +1,6 @@
// //
// Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2020, 2022, Arm Limited. All rights reserved. // Copyright (c) 2020, 2023, Arm Limited. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
// //
// This code is free software; you can redistribute it and/or modify it // This code is free software; you can redistribute it and/or modify it
@ -4159,6 +4159,52 @@ instruct vcvtDtoF_gt64b(vReg dst, vReg src, vReg tmp) %{
ins_pipe(pipe_slow); ins_pipe(pipe_slow);
%} %}
// VectorCastHF2F
// Converts a vector of half-precision values in src to single-precision
// values in dst. One rule serves both instruction sets: the choice between
// the NEON and the SVE sequence is made when the instruction is emitted,
// based on the byte length of the vector produced by this node.
instruct vcvtHFtoF(vReg dst, vReg src) %{
match(Set dst (VectorCastHF2F src));
format %{ "vcvtHFtoF $dst, $src" %}
ins_encode %{
// Byte length of the node being emitted, i.e. of the single-precision result.
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4HF to 4F: four half-precision lanes (T4H) become four single-precision lanes (T4S)
__ fcvtl($dst$$FloatRegister, __ T4S, $src$$FloatRegister, __ T4H);
} else {
assert(UseSVE > 0, "must be sve");
// Two steps: first spread the H-sized elements of src into S-sized elements
// of dst, then convert dst in place from H to S.
// NOTE(review): presumably ptrue is the all-true governing predicate, so
// every element gets converted - confirm against the register definitions.
__ sve_vector_extend($dst$$FloatRegister, __ S, $src$$FloatRegister, __ H);
__ sve_fcvt($dst$$FloatRegister, __ S, ptrue, $dst$$FloatRegister, __ H);
}
%}
ins_pipe(pipe_slow);
%}
// VectorCastF2HF
// NEON flavour: converts single-precision values in src to half-precision
// values in dst with a single narrowing convert.
instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
// The rule is selected by the byte length of input 1 of the matched node,
// not by the length of the narrower result.
// NOTE(review): input 1 is presumably the src float vector - confirm.
predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
match(Set dst (VectorCastF2HF src));
format %{ "vcvtFtoHF_neon $dst, $src\t# 4F to 4HF" %}
ins_encode %{
// 4F to 4HF: four single-precision lanes (T4S) become four half-precision lanes (T4H)
__ fcvtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
%}
ins_pipe(pipe_slow);
%}
// SVE flavour of the float to half-float vector cast. Its predicate is the
// exact negation of the one on vcvtFtoHF_neon, so for any given input length
// exactly one of the two rules applies.
instruct vcvtFtoHF_sve(vReg dst, vReg src, vReg tmp) %{
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
match(Set dst (VectorCastF2HF src));
// dst is written by the first instruction below and read back by the second;
// tmp is a scratch vector register handed to the narrowing helper.
// NOTE(review): TEMP_DEF presumably keeps dst in a register distinct from the
// inputs - confirm against the ADL effect documentation.
effect(TEMP_DEF dst, TEMP tmp);
format %{ "vcvtFtoHF_sve $dst, $src\t# KILL $tmp" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
// Step 1: convert each S element of src to half precision, result left in dst.
// NOTE(review): presumably ptrue is the all-true governing predicate - confirm.
__ sve_fcvt($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister, __ S);
// Step 2: narrow dst in place from S-sized to H-sized elements, using tmp as scratch.
__ sve_vector_narrow($dst$$FloatRegister, __ H,
$dst$$FloatRegister, __ S, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
// ------------------------------ Replicate ------------------------------------ // ------------------------------ Replicate ------------------------------------
// replicate from reg // replicate from reg

View File

@ -1,6 +1,6 @@
// //
// Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2020, 2022, Arm Limited. All rights reserved. // Copyright (c) 2020, 2023, Arm Limited. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
// //
// This code is free software; you can redistribute it and/or modify it // This code is free software; you can redistribute it and/or modify it
@ -2731,6 +2731,52 @@ instruct vcvtDtoF_gt64b(vReg dst, vReg src, vReg tmp) %{
ins_pipe(pipe_slow); ins_pipe(pipe_slow);
%} %}
// VectorCastHF2F
// Converts a vector of half-precision values in src to single-precision
// values in dst. One rule serves both instruction sets: the choice between
// the NEON and the SVE sequence is made when the instruction is emitted,
// based on the byte length of the vector produced by this node.
instruct vcvtHFtoF(vReg dst, vReg src) %{
match(Set dst (VectorCastHF2F src));
format %{ "vcvtHFtoF $dst, $src" %}
ins_encode %{
// Byte length of the node being emitted, i.e. of the single-precision result.
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4HF to 4F: four half-precision lanes (T4H) become four single-precision lanes (T4S)
__ fcvtl($dst$$FloatRegister, __ T4S, $src$$FloatRegister, __ T4H);
} else {
assert(UseSVE > 0, "must be sve");
// Two steps: first spread the H-sized elements of src into S-sized elements
// of dst, then convert dst in place from H to S.
// NOTE(review): presumably ptrue is the all-true governing predicate, so
// every element gets converted - confirm against the register definitions.
__ sve_vector_extend($dst$$FloatRegister, __ S, $src$$FloatRegister, __ H);
__ sve_fcvt($dst$$FloatRegister, __ S, ptrue, $dst$$FloatRegister, __ H);
}
%}
ins_pipe(pipe_slow);
%}
// VectorCastF2HF
// NEON flavour: converts single-precision values in src to half-precision
// values in dst with a single narrowing convert.
instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
// The rule is selected by the byte length of input 1 of the matched node,
// not by the length of the narrower result.
// NOTE(review): input 1 is presumably the src float vector - confirm.
predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
match(Set dst (VectorCastF2HF src));
format %{ "vcvtFtoHF_neon $dst, $src\t# 4F to 4HF" %}
ins_encode %{
// 4F to 4HF: four single-precision lanes (T4S) become four half-precision lanes (T4H)
__ fcvtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
%}
ins_pipe(pipe_slow);
%}
// SVE flavour of the float to half-float vector cast. Its predicate is the
// exact negation of the one on vcvtFtoHF_neon, so for any given input length
// exactly one of the two rules applies.
instruct vcvtFtoHF_sve(vReg dst, vReg src, vReg tmp) %{
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
match(Set dst (VectorCastF2HF src));
// dst is written by the first instruction below and read back by the second;
// tmp is a scratch vector register handed to the narrowing helper.
// NOTE(review): TEMP_DEF presumably keeps dst in a register distinct from the
// inputs - confirm against the ADL effect documentation.
effect(TEMP_DEF dst, TEMP tmp);
format %{ "vcvtFtoHF_sve $dst, $src\t# KILL $tmp" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
// Step 1: convert each S element of src to half precision, result left in dst.
// NOTE(review): presumably ptrue is the all-true governing predicate - confirm.
__ sve_fcvt($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister, __ S);
// Step 2: narrow dst in place from S-sized to H-sized elements, using tmp as scratch.
__ sve_vector_narrow($dst$$FloatRegister, __ H,
$dst$$FloatRegister, __ S, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
// ------------------------------ Replicate ------------------------------------ // ------------------------------ Replicate ------------------------------------
dnl REPLICATE_INT($1, $2, $3 ) dnl REPLICATE_INT($1, $2, $3 )

View File

@ -3943,9 +3943,29 @@ void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
starti; starti;
assert(T_src != B && T_dst != B && T_src != Q && T_dst != Q && assert(T_src != B && T_dst != B && T_src != Q && T_dst != Q &&
T_src != T_dst, "invalid register variant"); T_src != T_dst, "invalid register variant");
guarantee(T_src != H && T_dst != H, "half-precision unsupported"); // The encodings of fields op1 (bits 17-16) and op2 (bits 23-22)
f(0b01100101, 31, 24), f(0b11, 23, 22), f(0b0010, 21, 18); // depend on T_src and T_dst as given below -
f(T_dst, 17, 16), f(0b101, 15, 13); // +-----+------+---------------------------------------------+
// | op2 | op1 | Instruction Details |
// +-----+------+---------------------------------------------+
// | 10 | 01 | FCVT - half-precision to single-precision |
// | 11 | 01 | FCVT - half-precision to double-precision |
// | 10 | 00 | FCVT - single-precision to half-precision |
// | 11 | 11 | FCVT - single-precision to double-precision |
// | 11 | 00 | FCVT - double-precision to half-precision |
// | 11 | 10 | FCVT - double-precision to single-precision |
// +-----+------+---------------------------------------------+
int op1 = 0b00;
int op2 = (T_src == D || T_dst == D) ? 0b11 : 0b10;
if (T_src == H) {
op1 = 0b01;
} else if (T_dst == S) {
op1 = 0b10;
} else if (T_dst == D) {
op1 = 0b11;
}
f(0b01100101, 31, 24), f(op2, 23, 22), f(0b0010, 21, 18);
f(op1, 17, 16), f(0b101, 15, 13);
pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0); pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
} }

View File

@ -1772,6 +1772,10 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);",
["scvtf", "__ sve_scvtf(z6, __ H, p3, z1, __ H);", "scvtf\tz6.h, p3/m, z1.h"], ["scvtf", "__ sve_scvtf(z6, __ H, p3, z1, __ H);", "scvtf\tz6.h, p3/m, z1.h"],
["fcvt", "__ sve_fcvt(z5, __ D, p3, z4, __ S);", "fcvt\tz5.d, p3/m, z4.s"], ["fcvt", "__ sve_fcvt(z5, __ D, p3, z4, __ S);", "fcvt\tz5.d, p3/m, z4.s"],
["fcvt", "__ sve_fcvt(z1, __ S, p3, z0, __ D);", "fcvt\tz1.s, p3/m, z0.d"], ["fcvt", "__ sve_fcvt(z1, __ S, p3, z0, __ D);", "fcvt\tz1.s, p3/m, z0.d"],
["fcvt", "__ sve_fcvt(z5, __ S, p3, z4, __ H);", "fcvt\tz5.s, p3/m, z4.h"],
["fcvt", "__ sve_fcvt(z1, __ H, p3, z0, __ S);", "fcvt\tz1.h, p3/m, z0.s"],
["fcvt", "__ sve_fcvt(z5, __ D, p3, z4, __ H);", "fcvt\tz5.d, p3/m, z4.h"],
["fcvt", "__ sve_fcvt(z1, __ H, p3, z0, __ D);", "fcvt\tz1.h, p3/m, z0.d"],
["fcvtzs", "__ sve_fcvtzs(z19, __ D, p2, z1, __ D);", "fcvtzs\tz19.d, p2/m, z1.d"], ["fcvtzs", "__ sve_fcvtzs(z19, __ D, p2, z1, __ D);", "fcvtzs\tz19.d, p2/m, z1.d"],
["fcvtzs", "__ sve_fcvtzs(z9, __ S, p1, z8, __ S);", "fcvtzs\tz9.s, p1/m, z8.s"], ["fcvtzs", "__ sve_fcvtzs(z9, __ S, p1, z8, __ S);", "fcvtzs\tz9.s, p1/m, z8.s"],
["fcvtzs", "__ sve_fcvtzs(z1, __ S, p2, z0, __ D);", "fcvtzs\tz1.s, p2/m, z0.d"], ["fcvtzs", "__ sve_fcvtzs(z1, __ S, p2, z0, __ D);", "fcvtzs\tz1.s, p2/m, z0.d"],

View File

@ -915,6 +915,10 @@
__ sve_scvtf(z6, __ H, p3, z1, __ H); // scvtf z6.h, p3/m, z1.h __ sve_scvtf(z6, __ H, p3, z1, __ H); // scvtf z6.h, p3/m, z1.h
__ sve_fcvt(z5, __ D, p3, z4, __ S); // fcvt z5.d, p3/m, z4.s __ sve_fcvt(z5, __ D, p3, z4, __ S); // fcvt z5.d, p3/m, z4.s
__ sve_fcvt(z1, __ S, p3, z0, __ D); // fcvt z1.s, p3/m, z0.d __ sve_fcvt(z1, __ S, p3, z0, __ D); // fcvt z1.s, p3/m, z0.d
__ sve_fcvt(z5, __ S, p3, z4, __ H); // fcvt z5.s, p3/m, z4.h
__ sve_fcvt(z1, __ H, p3, z0, __ S); // fcvt z1.h, p3/m, z0.s
__ sve_fcvt(z5, __ D, p3, z4, __ H); // fcvt z5.d, p3/m, z4.h
__ sve_fcvt(z1, __ H, p3, z0, __ D); // fcvt z1.h, p3/m, z0.d
__ sve_fcvtzs(z19, __ D, p2, z1, __ D); // fcvtzs z19.d, p2/m, z1.d __ sve_fcvtzs(z19, __ D, p2, z1, __ D); // fcvtzs z19.d, p2/m, z1.d
__ sve_fcvtzs(z9, __ S, p1, z8, __ S); // fcvtzs z9.s, p1/m, z8.s __ sve_fcvtzs(z9, __ S, p1, z8, __ S); // fcvtzs z9.s, p1/m, z8.s
__ sve_fcvtzs(z1, __ S, p2, z0, __ D); // fcvtzs z1.s, p2/m, z0.d __ sve_fcvtzs(z1, __ S, p2, z0, __ D); // fcvtzs z1.s, p2/m, z0.d
@ -1245,30 +1249,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227, 0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x14000405, 0x94000000, 0x14000000, 0x17ffffd7, 0x14000409, 0x94000000,
0x97ffffd4, 0x94000402, 0x3400000a, 0x34fffa2a, 0x97ffffd4, 0x94000406, 0x3400000a, 0x34fffa2a,
0x34007fea, 0x35000008, 0x35fff9c8, 0x35007f88, 0x3400806a, 0x35000008, 0x35fff9c8, 0x35008008,
0xb400000b, 0xb4fff96b, 0xb4007f2b, 0xb500001d, 0xb400000b, 0xb4fff96b, 0xb4007fab, 0xb500001d,
0xb5fff91d, 0xb5007edd, 0x10000013, 0x10fff8b3, 0xb5fff91d, 0xb5007f5d, 0x10000013, 0x10fff8b3,
0x10007e73, 0x90000013, 0x36300016, 0x3637f836, 0x10007ef3, 0x90000013, 0x36300016, 0x3637f836,
0x36307df6, 0x3758000c, 0x375ff7cc, 0x37587d8c, 0x36307e76, 0x3758000c, 0x375ff7cc, 0x37587e0c,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x54007b60, 0x54000001, 0x54fff541, 0x54007b01, 0x54007be0, 0x54000001, 0x54fff541, 0x54007b81,
0x54000002, 0x54fff4e2, 0x54007aa2, 0x54000002, 0x54000002, 0x54fff4e2, 0x54007b22, 0x54000002,
0x54fff482, 0x54007a42, 0x54000003, 0x54fff423, 0x54fff482, 0x54007ac2, 0x54000003, 0x54fff423,
0x540079e3, 0x54000003, 0x54fff3c3, 0x54007983, 0x54007a63, 0x54000003, 0x54fff3c3, 0x54007a03,
0x54000004, 0x54fff364, 0x54007924, 0x54000005, 0x54000004, 0x54fff364, 0x540079a4, 0x54000005,
0x54fff305, 0x540078c5, 0x54000006, 0x54fff2a6, 0x54fff305, 0x54007945, 0x54000006, 0x54fff2a6,
0x54007866, 0x54000007, 0x54fff247, 0x54007807, 0x540078e6, 0x54000007, 0x54fff247, 0x54007887,
0x54000008, 0x54fff1e8, 0x540077a8, 0x54000009, 0x54000008, 0x54fff1e8, 0x54007828, 0x54000009,
0x54fff189, 0x54007749, 0x5400000a, 0x54fff12a, 0x54fff189, 0x540077c9, 0x5400000a, 0x54fff12a,
0x540076ea, 0x5400000b, 0x54fff0cb, 0x5400768b, 0x5400776a, 0x5400000b, 0x54fff0cb, 0x5400770b,
0x5400000c, 0x54fff06c, 0x5400762c, 0x5400000d, 0x5400000c, 0x54fff06c, 0x540076ac, 0x5400000d,
0x54fff00d, 0x540075cd, 0x5400000e, 0x54ffefae, 0x54fff00d, 0x5400764d, 0x5400000e, 0x54ffefae,
0x5400756e, 0x5400000f, 0x54ffef4f, 0x5400750f, 0x540075ee, 0x5400000f, 0x54ffef4f, 0x5400758f,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f, 0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf, 0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
@ -1434,7 +1438,8 @@
0x05733820, 0x05b238a4, 0x05f138e6, 0x0570396a, 0x05733820, 0x05b238a4, 0x05f138e6, 0x0570396a,
0x65d0a001, 0x65d6a443, 0x65d4a826, 0x6594ac26, 0x65d0a001, 0x65d6a443, 0x65d4a826, 0x6594ac26,
0x6554ac26, 0x6556ac26, 0x6552ac26, 0x65cbac85, 0x6554ac26, 0x6556ac26, 0x6552ac26, 0x65cbac85,
0x65caac01, 0x65dea833, 0x659ca509, 0x65d8a801, 0x65caac01, 0x6589ac85, 0x6588ac01, 0x65c9ac85,
0x65c8ac01, 0x65dea833, 0x659ca509, 0x65d8a801,
0x65dcac01, 0x655cb241, 0x0520a1e0, 0x0521a601, 0x65dcac01, 0x655cb241, 0x0520a1e0, 0x0521a601,
0x052281e0, 0x05238601, 0x04a14026, 0x042244a6, 0x052281e0, 0x05238601, 0x04a14026, 0x042244a6,
0x046344a6, 0x04a444a6, 0x04e544a7, 0x0568aca7, 0x046344a6, 0x04a444a6, 0x04e544a7, 0x0568aca7,

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -26,7 +26,7 @@
* @bug 8294588 * @bug 8294588
* @summary Auto-vectorize Float.floatToFloat16, Float.float16ToFloat APIs * @summary Auto-vectorize Float.floatToFloat16, Float.float16ToFloat APIs
* @requires vm.compiler2.enabled * @requires vm.compiler2.enabled
* @requires os.simpleArch == "x64" * @requires (os.simpleArch == "x64" & (vm.cpu.features ~= ".*avx512f.*" | vm.cpu.features ~= ".*f16c.*")) | os.arch == "aarch64"
* @library /test/lib / * @library /test/lib /
* @run driver compiler.vectorization.TestFloatConversionsVector * @run driver compiler.vectorization.TestFloatConversionsVector
*/ */
@ -34,62 +34,73 @@
package compiler.vectorization; package compiler.vectorization;
import compiler.lib.ir_framework.*; import compiler.lib.ir_framework.*;
import jdk.test.lib.Asserts;
public class TestFloatConversionsVector { public class TestFloatConversionsVector {
private static final int ARRLEN = 1024; private static final int ARRLEN = 1024;
private static final int ITERS = 11000; private static final int ITERS = 11000;
private static float [] finp; private static float [] finp;
private static short [] sout; private static short [] sout;
private static short [] sinp; private static short [] sinp;
private static float [] fout; private static float [] fout;
public static void main(String args[]) { public static void main(String args[]) {
TestFramework.runWithFlags("-XX:-TieredCompilation", TestFramework.runWithFlags("-XX:-TieredCompilation",
"-XX:CompileThresholdScaling=0.3"); "-XX:CompileThresholdScaling=0.3");
System.out.println("PASSED"); System.out.println("PASSED");
} }
@Test @Test
@IR(counts = {IRNode.VECTOR_CAST_F2HF, "> 0"}, applyIfCPUFeatureOr = {"avx512f", "true", "f16c", "true"}) @IR(counts = {IRNode.VECTOR_CAST_F2HF, "> 0"})
public void test_float_float16(short[] sout, float[] finp) { public void test_float_float16(short[] sout, float[] finp) {
for (int i = 0; i < finp.length; i++) { for (int i = 0; i < finp.length; i++) {
sout[i] = Float.floatToFloat16(finp[i]); sout[i] = Float.floatToFloat16(finp[i]);
} }
} }
@Run(test = {"test_float_float16"}, mode = RunMode.STANDALONE) @Run(test = {"test_float_float16"}, mode = RunMode.STANDALONE)
public void kernel_test_float_float16() { public void kernel_test_float_float16() {
finp = new float[ARRLEN]; finp = new float[ARRLEN];
sout = new short[ARRLEN]; sout = new short[ARRLEN];
for (int i = 0; i < ARRLEN; i++) { for (int i = 0; i < ARRLEN; i++) {
finp[i] = (float) i * 1.4f; finp[i] = (float) i * 1.4f;
} }
for (int i = 0; i < ITERS; i++) { for (int i = 0; i < ITERS; i++) {
test_float_float16(sout, finp); test_float_float16(sout, finp);
} }
}
@Test // Verifying the result
@IR(counts = {IRNode.VECTOR_CAST_HF2F, "> 0"}, applyIfCPUFeatureOr = {"avx512f", "true", "f16c", "true"}) for (int i = 0; i < ARRLEN; i++) {
public void test_float16_float(float[] fout, short[] sinp) { Asserts.assertEquals(Float.floatToFloat16(finp[i]), sout[i]);
for (int i = 0; i < sinp.length; i++) { }
fout[i] = Float.float16ToFloat(sinp[i]); }
}
}
@Run(test = {"test_float16_float"}, mode = RunMode.STANDALONE) @Test
public void kernel_test_float16_float() { @IR(counts = {IRNode.VECTOR_CAST_HF2F, "> 0"})
sinp = new short[ARRLEN]; public void test_float16_float(float[] fout, short[] sinp) {
fout = new float[ARRLEN]; for (int i = 0; i < sinp.length; i++) {
fout[i] = Float.float16ToFloat(sinp[i]);
}
}
for (int i = 0; i < ARRLEN; i++) { @Run(test = {"test_float16_float"}, mode = RunMode.STANDALONE)
sinp[i] = (short)i; public void kernel_test_float16_float() {
} sinp = new short[ARRLEN];
fout = new float[ARRLEN];
for (int i = 0; i < ITERS; i++) { for (int i = 0; i < ARRLEN; i++) {
test_float16_float(fout , sinp); sinp[i] = (short)i;
} }
}
for (int i = 0; i < ITERS; i++) {
test_float16_float(fout, sinp);
}
// Verifying the result
for (int i = 0; i < ARRLEN; i++) {
Asserts.assertEquals(Float.float16ToFloat(sinp[i]), fout[i]);
}
}
} }