8289552: Make intrinsic conversions between bit representations of half precision values and floats

Reviewed-by: kvn, sviswanathan, jbhateja
This commit is contained in:
Smita Kamath 2022-10-11 17:06:19 +00:00 committed by Sandhya Viswanathan
parent 2586b1a3c1
commit 07946aa49c
19 changed files with 345 additions and 11 deletions

View File

@ -1930,6 +1930,34 @@ void Assembler::vcvtdq2pd(XMMRegister dst, XMMRegister src, int vector_len) {
emit_int16((unsigned char)0xE6, (0xC0 | encode));
}
// Emits the register-to-register form of VCVTPS2PH (packed single-precision
// to packed half-precision). imm8 is emitted verbatim as the trailing
// immediate byte; vector_len is forwarded to the instruction attributes.
// Requires either F16C or AVX512VL support.
void Assembler::vcvtps2ph(XMMRegister dst, XMMRegister src, int imm8, int vector_len) {
  assert(VM_Version::supports_avx512vl() || VM_Version::supports_f16c(), "");
  InstructionAttr attr(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /*uses_vl */ true);
  // Note the operand order: src's encoding is passed first and dst's last.
  const int modrm_bits = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attr);
  // Opcode byte, register-direct ModRM byte (0xC0 | ...), then the immediate.
  emit_int24(0x1D, (0xC0 | modrm_bits), imm8);
}
// Emits the EVEX-encoded, opmask-controlled store form of VCVTPS2PH: src is
// converted to half precision and written to the memory operand dst under
// control of the opmask register 'mask'. imm8 is emitted verbatim as the
// trailing immediate byte. Requires AVX512VL.
void Assembler::evcvtps2ph(Address dst, KRegister mask, XMMRegister src, int imm8, int vector_len) {
assert(VM_Version::supports_avx512vl(), "");
InstructionMark im(this);
// no_mask_reg is false here because an opmask register is attached below.
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /*uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_64bit);
// NOTE(review): presumably selects merge-masking (unselected elements left
// untouched) rather than zeroing - confirm against InstructionAttr.
attributes.reset_is_clear_context();
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x1D);
// NOTE(review): the trailing 1 presumably tells emit_operand that one
// immediate byte follows the memory operand - confirm against emit_operand.
emit_operand(src, dst, 1);
emit_int8(imm8);
}
// Emits the register-to-register form of VCVTPH2PS (packed half-precision to
// packed single-precision). vector_len is forwarded to the instruction
// attributes. Requires either F16C or AVX512VL support.
void Assembler::vcvtph2ps(XMMRegister dst, XMMRegister src, int vector_len) {
  assert(VM_Version::supports_avx512vl() || VM_Version::supports_f16c(), "");
  InstructionAttr attr(vector_len, /* rex_w */ false, /* legacy_mode */false, /* no_mask_reg */ true, /* uses_vl */ true);
  // Here dst's encoding is passed first and src's last.
  const int modrm_bits = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attr);
  // Opcode byte followed by the register-direct ModRM byte (0xC0 | ...).
  emit_int16(0x13, (0xC0 | modrm_bits));
}
void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);

View File

@ -1156,6 +1156,11 @@ private:
void cvtdq2pd(XMMRegister dst, XMMRegister src);
void vcvtdq2pd(XMMRegister dst, XMMRegister src, int vector_len);
// Convert between Halffloat (half-precision) and Single-Precision Floating-Point values
void vcvtps2ph(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
void vcvtph2ps(XMMRegister dst, XMMRegister src, int vector_len);
void evcvtps2ph(Address dst, KRegister mask, XMMRegister src, int imm8, int vector_len);
// Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
void cvtdq2ps(XMMRegister dst, XMMRegister src);
void vcvtdq2ps(XMMRegister dst, XMMRegister src, int vector_len);

View File

@ -2883,6 +2883,8 @@ uint64_t VM_Version::feature_flags() {
_cpuid_info.xem_xcr0_eax.bits.ymm != 0) {
result |= CPU_AVX;
result |= CPU_VZEROUPPER;
if (_cpuid_info.std_cpuid1_ecx.bits.f16c != 0)
result |= CPU_F16C;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
result |= CPU_AVX2;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512f != 0 &&

View File

@ -89,7 +89,8 @@ class VM_Version : public Abstract_VM_Version {
: 1,
osxsave : 1,
avx : 1,
: 2,
f16c : 1,
: 1,
hv : 1;
} bits;
};
@ -374,7 +375,8 @@ protected:
decl(RDPID, "rdpid", 49) /* RDPID instruction */ \
decl(FSRM, "fsrm", 50) /* Fast Short REP MOV */ \
decl(GFNI, "gfni", 51) /* Vector GFNI instructions */ \
decl(AVX512_BITALG, "avx512_bitalg", 52) /* Vector sub-word popcount and bit gather instructions */
decl(AVX512_BITALG, "avx512_bitalg", 52) /* Vector sub-word popcount and bit gather instructions */\
decl(F16C, "f16c", 53) /* Half-precision and single precision FP conversion instructions*/
#define DECLARE_CPU_FEATURE_FLAG(id, name, bit) CPU_##id = (1ULL << bit),
CPU_FEATURE_FLAGS(DECLARE_CPU_FEATURE_FLAG)
@ -681,6 +683,7 @@ public:
static bool supports_avx512_vbmi2() { return (_features & CPU_AVX512_VBMI2) != 0; }
static bool supports_hv() { return (_features & CPU_HV) != 0; }
static bool supports_serialize() { return (_features & CPU_SERIALIZE) != 0; }
static bool supports_f16c() { return (_features & CPU_F16C) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&

View File

@ -1678,6 +1678,12 @@ const bool Matcher::match_rule_supported(int opcode) {
// Together with common x86 rules, this handles all UseSSE cases.
#endif
break;
case Op_ConvF2HF:
case Op_ConvHF2F:
if (!VM_Version::supports_f16c() && !VM_Version::supports_avx512vl()) {
return false;
}
break;
}
return true; // Match rules are supported by default.
}
@ -3652,6 +3658,41 @@ instruct sqrtD_reg(regD dst) %{
ins_pipe(pipe_slow);
%}
// ConvF2HF with the result in a general-purpose register: converts the float
// in $src to its half-precision bit pattern and leaves it in $dst.
// This rule carries no predicate of its own; Matcher::match_rule_supported()
// rejects Op_ConvF2HF when neither F16C nor AVX512VL is available.
instruct convF2HF_reg_reg(rRegI dst, regF src, regF tmp) %{
effect(TEMP tmp);
match(Set dst (ConvF2HF src));
ins_cost(125);
format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
ins_encode %{
// Convert into the XMM temporary. imm8 0x04 sets bit 2 of the rounding
// control, which per the Intel SDM selects the rounding mode from MXCSR.
__ vcvtps2ph($tmp$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
// Transfer the converted value from the XMM temporary to the integer register.
__ movdl($dst$$Register, $tmp$$XMMRegister);
// Sign-extend the low 16 bits so $dst holds the value as a short.
__ movswl($dst$$Register, $dst$$Register);
%}
ins_pipe( pipe_slow );
%}
// ConvF2HF fused with a 16-bit store (StoreC): converts the float in $src and
// writes the half-precision result directly to $mem using the masked EVEX
// store form. Only selected when UseAVX > 2 and AVX512VL is available.
instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
effect(TEMP ktmp, TEMP rtmp);
match(Set mem (StoreC mem (ConvF2HF src)));
format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
ins_encode %{
// Build an opmask with only bit 0 set, so that only the first converted
// element is written to memory.
__ movl($rtmp$$Register, 0x1);
__ kmovwl($ktmp$$KRegister, $rtmp$$Register);
// imm8 0x04 sets bit 2 of the rounding control, which per the Intel SDM
// selects the rounding mode from MXCSR.
__ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
%}
ins_pipe( pipe_slow );
%}
// ConvHF2F: converts the half-precision bit pattern held in the integer
// register $src to a float in $dst.
// This rule carries no predicate of its own; Matcher::match_rule_supported()
// rejects Op_ConvHF2F when neither F16C nor AVX512VL is available.
instruct convHF2F_reg_reg(regF dst, rRegI src) %{
match(Set dst (ConvHF2F src));
format %{ "vcvtph2ps $dst,$src" %}
ins_encode %{
// Move the bits into the XMM destination first, then convert in place.
__ movdl($dst$$XMMRegister, $src$$Register);
__ vcvtph2ps($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
%}
ins_pipe( pipe_slow );
%}
// ---------------------------------------- VectorReinterpret ------------------------------------
instruct reinterpret_mask(kReg dst) %{

View File

@ -224,6 +224,12 @@ class methodHandle;
do_name( doubleToLongBits_name, "doubleToLongBits") \
do_intrinsic(_longBitsToDouble, java_lang_Double, longBitsToDouble_name, long_double_signature, F_SN)\
do_name( longBitsToDouble_name, "longBitsToDouble") \
do_intrinsic(_float16ToFloat, java_lang_Float, float16ToFloat_name, f16_float_signature, F_S) \
do_name( float16ToFloat_name, "float16ToFloat") \
do_signature(f16_float_signature, "(S)F") \
do_intrinsic(_floatToFloat16, java_lang_Float, floatToFloat16_name, float_f16_signature, F_S) \
do_name( floatToFloat16_name, "floatToFloat16") \
do_signature(float_f16_signature, "(F)S") \
\
do_intrinsic(_compareUnsigned_i, java_lang_Integer, compareUnsigned_name, int2_int_signature, F_S) \
do_intrinsic(_compareUnsigned_l, java_lang_Long, compareUnsigned_name, long2_int_signature, F_S) \

View File

@ -300,6 +300,12 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
case vmIntrinsics::_remainderUnsigned_l:
if (!Matcher::match_rule_supported(Op_UModL)) return false;
break;
case vmIntrinsics::_float16ToFloat:
if (!Matcher::match_rule_supported(Op_ConvHF2F)) return false;
break;
case vmIntrinsics::_floatToFloat16:
if (!Matcher::match_rule_supported(Op_ConvF2HF)) return false;
break;
/* CompareAndSet, Object: */
case vmIntrinsics::_compareAndSetReference:

View File

@ -149,6 +149,8 @@ macro(ConvI2L)
macro(ConvL2D)
macro(ConvL2F)
macro(ConvL2I)
macro(ConvF2HF)
macro(ConvHF2F)
macro(CountedLoop)
macro(CountedLoopEnd)
macro(OuterStripMinedLoop)

View File

@ -161,6 +161,21 @@ const Type* ConvF2DNode::Value(PhaseGVN* phase) const {
return TypeD::make( (double)tf->getf() );
}
//=============================================================================
//------------------------------Value------------------------------------------
// Computes the type of the conversion result: TOP propagates, a non-constant
// float yields the full short range, and a float constant is folded by
// running the runtime conversion at compile time.
const Type* ConvF2HFNode::Value(PhaseGVN* phase) const {
  const Type* in_type = phase->type(in(1));
  if (in_type == Type::TOP) {
    return Type::TOP;
  }
  if (in_type == Type::FLOAT) {
    return TypeInt::SHORT;
  }
  // Anything else is treated as a float constant and folded.
  const TypeF* float_con = in_type->is_float_constant();
  return TypeInt::make(SharedRuntime::f2hf(float_con->getf()));
}
//------------------------------Identity---------------------------------------
// Collapses a float->half conversion whose input is itself a half->float
// conversion: ConvF2HF(ConvHF2F(x)) is replaced by x.
Node* ConvF2HFNode::Identity(PhaseGVN* phase) {
  Node* input = in(1);
  if (input->Opcode() == Op_ConvHF2F) {
    return input->in(1);
  }
  return this;
}
//=============================================================================
//------------------------------Value------------------------------------------
const Type* ConvF2INode::Value(PhaseGVN* phase) const {
@ -219,6 +234,18 @@ Node *ConvF2LNode::Ideal(PhaseGVN *phase, bool can_reshape) {
return NULL;
}
//=============================================================================
//------------------------------Value------------------------------------------
// Computes the type of the conversion result: TOP propagates, the full short
// range yields a generic float, an integer constant is folded by running the
// runtime conversion at compile time, and any other int type falls back to
// bottom_type().
const Type* ConvHF2FNode::Value(PhaseGVN* phase) const {
  const Type* in_type = phase->type(in(1));
  if (in_type == Type::TOP) {
    return Type::TOP;
  }
  if (in_type == TypeInt::SHORT) {
    return Type::FLOAT;
  }
  const TypeInt* int_type = in_type->is_int();
  if (!int_type->is_con()) {
    return bottom_type();
  }
  return TypeF::make(SharedRuntime::hf2f(int_type->get_con()));
}
//=============================================================================
//------------------------------Value------------------------------------------
const Type* ConvI2DNode::Value(PhaseGVN* phase) const {

View File

@ -100,6 +100,18 @@ class ConvF2DNode : public Node {
virtual uint ideal_reg() const { return Op_RegD; }
};
//------------------------------ConvF2HFNode------------------------------------
// Convert Float to Halffloat
// The result is the half-precision bit pattern, typed as an int in the short
// range (bottom_type() is TypeInt::SHORT) and kept in an integer register
// (ideal_reg() is Op_RegI).
class ConvF2HFNode : public Node {
public:
// in1 is the float value to convert; the first Node input is passed as 0.
ConvF2HFNode( Node *in1 ) : Node(0,in1) {}
virtual int Opcode() const;
virtual const Type *bottom_type() const { return TypeInt::SHORT; }
virtual const Type* Value(PhaseGVN* phase) const;
virtual Node* Identity(PhaseGVN* phase);
virtual uint ideal_reg() const { return Op_RegI; }
};
//------------------------------ConvF2INode------------------------------------
// Convert float to integer
class ConvF2INode : public Node {
@ -127,6 +139,17 @@ class ConvF2LNode : public Node {
virtual uint ideal_reg() const { return Op_RegL; }
};
//------------------------------ConvHF2FNode------------------------------------
// Convert Halffloat to float
// The input is the half-precision bit pattern; the result is a float
// (bottom_type() is Type::FLOAT) kept in a float register (ideal_reg() is
// Op_RegF).
class ConvHF2FNode : public Node {
public:
// in1 is the half-precision bit pattern to convert; the first Node input is
// passed as 0.
ConvHF2FNode( Node *in1 ) : Node(0,in1) {}
virtual int Opcode() const;
virtual const Type *bottom_type() const { return Type::FLOAT; }
virtual const Type* Value(PhaseGVN* phase) const;
virtual uint ideal_reg() const { return Op_RegF; }
};
//------------------------------ConvI2DNode------------------------------------
// Convert Integer to Double
class ConvI2DNode : public Node {

View File

@ -514,7 +514,9 @@ bool LibraryCallKit::try_to_inline(int predicate) {
case vmIntrinsics::_intBitsToFloat:
case vmIntrinsics::_doubleToRawLongBits:
case vmIntrinsics::_doubleToLongBits:
case vmIntrinsics::_longBitsToDouble: return inline_fp_conversions(intrinsic_id());
case vmIntrinsics::_longBitsToDouble:
case vmIntrinsics::_floatToFloat16:
case vmIntrinsics::_float16ToFloat: return inline_fp_conversions(intrinsic_id());
case vmIntrinsics::_floatIsFinite:
case vmIntrinsics::_floatIsInfinite:
@ -4440,6 +4442,8 @@ bool LibraryCallKit::inline_fp_conversions(vmIntrinsics::ID id) {
case vmIntrinsics::_intBitsToFloat: result = new MoveI2FNode(arg); break;
case vmIntrinsics::_doubleToRawLongBits: result = new MoveD2LNode(arg); break;
case vmIntrinsics::_longBitsToDouble: result = new MoveL2DNode(arg); break;
case vmIntrinsics::_floatToFloat16: result = new ConvF2HFNode(arg); break;
case vmIntrinsics::_float16ToFloat: result = new ConvHF2FNode(arg); break;
case vmIntrinsics::_doubleToLongBits: {
// two paths (plus control) merge in a wood

View File

@ -271,6 +271,10 @@ JRT_LEAF(jdouble, SharedRuntime::drem(jdouble x, jdouble y))
#endif
JRT_END
// Numeric conversion of a jint to a jfloat: the integer *value* is converted
// with a C-style cast. This does not reinterpret the integer's bit pattern as
// a float.
JRT_LEAF(jfloat, SharedRuntime::i2f(jint x))
return (jfloat)x;
JRT_END
#ifdef __SOFTFP__
JRT_LEAF(jfloat, SharedRuntime::fadd(jfloat x, jfloat y))
return x + y;
@ -304,10 +308,6 @@ JRT_LEAF(jdouble, SharedRuntime::ddiv(jdouble x, jdouble y))
return x / y;
JRT_END
JRT_LEAF(jfloat, SharedRuntime::i2f(jint x))
return (jfloat)x;
JRT_END
JRT_LEAF(jdouble, SharedRuntime::i2d(jint x))
return (jdouble)x;
JRT_END
@ -448,6 +448,86 @@ JRT_LEAF(jdouble, SharedRuntime::l2d(jlong x))
return (jdouble)x;
JRT_END
// Reference implementation at src/java.base/share/classes/java/lang/Float.java:floatToFloat16
JRT_LEAF(jshort, SharedRuntime::f2hf(jfloat x))
jint doppel = SharedRuntime::f2i(x);
jshort sign_bit = (jshort) ((doppel & 0x80000000) >> 16);
if (g_isnan(x))
return (jshort)(sign_bit | 0x7c00 | (doppel & 0x007fe000) >> 13 | (doppel & 0x00001ff0) >> 4 | (doppel & 0x0000000f));
jfloat abs_f = (x >= 0.0f) ? x : (x * -1.0f);
// Overflow threshold is halffloat max value + 1/2 ulp
if (abs_f >= (65504.0f + 16.0f)) {
return (jshort)(sign_bit | 0x7c00); // Positive or negative infinity
}
// Smallest magnitude of Halffloat is 0x1.0p-24, half-way or smaller rounds to zero
if (abs_f <= (pow(2, -24) * 0.5f)) { // Covers float zeros and subnormals.
return sign_bit; // Positive or negative zero
}
jint exp = 0x7f800000 & doppel;
// For binary16 subnormals, beside forcing exp to -15, retain
// the difference exp_delta = E_min - exp. This is the excess
// shift value, in addition to 13, to be used in the
// computations below. Further the (hidden) msb with value 1
// in f must be involved as well
jint exp_delta = 0;
jint msb = 0x00000000;
if (exp < -14) {
exp_delta = -14 - exp;
exp = -15;
msb = 0x00800000;
}
jint f_signif_bits = ((doppel & 0x007fffff) | msb);
// Significand bits as if using rounding to zero
jshort signif_bits = (jshort)(f_signif_bits >> (13 + exp_delta));
jint lsb = f_signif_bits & (1 << (13 + exp_delta));
jint round = f_signif_bits & (1 << (12 + exp_delta));
jint sticky = f_signif_bits & ((1 << (12 + exp_delta)) - 1);
if (round != 0 && ((lsb | sticky) != 0 )) {
signif_bits++;
}
return (jshort)(sign_bit | ( ((exp + 15) << 10) + signif_bits ) );
JRT_END
// Reference implementation at src/java.base/share/classes/java/lang/Float.java:float16ToFloat
JRT_LEAF(jfloat, SharedRuntime::hf2f(jshort x))
// Halffloat format has 1 signbit, 5 exponent bits and
// 10 significand bits
jint hf_arg = (jint)x;
jint hf_sign_bit = 0x8000 & hf_arg;
jint hf_exp_bits = 0x7c00 & hf_arg;
jint hf_significand_bits = 0x03ff & hf_arg;
jint significand_shift = 13; //difference between float and halffloat precision
jfloat sign = (hf_sign_bit != 0) ? -1.0f : 1.0f;
// Extract halffloat exponent, remove its bias
jint hf_exp = (hf_exp_bits >> 10) - 15;
if (hf_exp == -15) {
// For subnormal values, return 2^-24 * significand bits
return (sign * (pow(2,-24)) * hf_significand_bits);
}else if (hf_exp == 16) {
return (hf_significand_bits == 0) ? sign * float_infinity : (SharedRuntime::i2f((hf_sign_bit << 16) | 0x7f800000 |
(hf_significand_bits << significand_shift)));
}
// Add the bias of float exponent and shift
int float_exp_bits = (hf_exp + 127) << (24 - 1);
// Combine sign, exponent and significand bits
return SharedRuntime::i2f((hf_sign_bit << 16) | float_exp_bits | (hf_significand_bits << significand_shift));
JRT_END
// Exception handling across interpreter/compiler boundaries
//
// exception_handler_for_return_address(...) returns the continuation address.

View File

@ -129,9 +129,11 @@ class SharedRuntime: AllStatic {
static jfloat d2f (jdouble x);
static jfloat l2f (jlong x);
static jdouble l2d (jlong x);
static jfloat hf2f(jshort x);
static jshort f2hf(jfloat x);
static jfloat i2f (jint x);
#ifdef __SOFTFP__
static jfloat i2f (jint x);
static jdouble i2d (jint x);
static jdouble f2d (jfloat x);
#endif // __SOFTFP__

View File

@ -1013,7 +1013,7 @@ public final class Float extends Number
* @param floatBinary16 the binary16 value to convert to {@code float}
* @since 20
*/
// @IntrinsicCandidate
@IntrinsicCandidate
public static float float16ToFloat(short floatBinary16) {
/*
* The binary16 format has 1 sign bit, 5 exponent bits, and 10
@ -1088,7 +1088,7 @@ public final class Float extends Number
* @param f the {@code float} value to convert to binary16
* @since 20
*/
// @IntrinsicCandidate
@IntrinsicCandidate
public static short floatToFloat16(float f) {
int doppel = Float.floatToRawIntBits(f);
short sign_bit = (short)((doppel & 0x8000_0000) >> 16);

View File

@ -226,6 +226,7 @@ public class AMD64 extends Architecture {
FSRM,
GFNI,
AVX512_BITALG,
F16C,
}
private final EnumSet<CPUFeature> features;

View File

@ -28,6 +28,8 @@
* @library ../Math
* @build FloatConsts
* @run main Binary16Conversion
* @run main/othervm -XX:+UnlockDiagnosticVMOptions
* -XX:DisableIntrinsic=_float16ToFloat,_floatToFloat16 Binary16Conversion
*/
public class Binary16Conversion {

View File

@ -26,6 +26,9 @@
* @bug 8289551
* @requires (os.arch != "x86" & os.arch != "i386") | vm.opt.UseSSE == "null" | vm.opt.UseSSE > 0
* @summary Verify NaN sign and significand bits are preserved across conversions
* @run main/othervm -XX:-TieredCompilation -XX:CompileThresholdScaling=0.1 Binary16ConversionNaN
* @run main/othervm -XX:+UnlockDiagnosticVMOptions
* -XX:DisableIntrinsic=_float16ToFloat,_floatToFloat16 Binary16ConversionNaN
*/
/*

View File

@ -63,7 +63,8 @@ public class CPUInfoTest {
"vzeroupper", "avx512_vpopcntdq", "avx512_vpclmulqdq", "avx512_vaes",
"avx512_vnni", "clflush", "clflushopt", "clwb",
"avx512_vbmi2", "avx512_vbmi", "rdtscp", "rdpid",
"hv", "fsrm", "avx512_bitalg", "gfni"
"hv", "fsrm", "avx512_bitalg", "gfni",
"f16c"
);
// @formatter:on
// Checkstyle: resume

View File

@ -0,0 +1,98 @@
/*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.math;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.*;
/**
 * JMH micro-benchmark for {@code Float.floatToFloat16} and
 * {@code Float.float16ToFloat}, covering both array loops and single
 * conversions that read and write static fields.
 */
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(value = 3)
public class Fp16ConversionBenchmark {

    /** Number of elements in each input/output array. */
    @Param({"2048"})
    public int size;

    /** binary16 inputs and outputs for the array benchmarks. */
    public short[] f16in;
    public short[] f16out;

    /** float inputs and outputs for the array benchmarks. */
    public float[] fin;
    public float[] fout;

    /** Scalars used by the single-value ("Memory") benchmarks. */
    public static short f16;
    public static short s;
    public static float f;

    /**
     * Allocates the arrays and fills the inputs from a fixed-seed generator,
     * so every trial sees the same data.
     */
    @Setup(Level.Trial)
    public void BmSetup() {
        final Random rng = new Random(1024);

        f16in = new short[size];
        f16out = new short[size];
        f16 = (short) rng.nextInt();
        for (int idx = 0; idx < size; idx++) {
            f16in[idx] = Float.floatToFloat16(rng.nextFloat());
        }

        fin = new float[size];
        fout = new float[size];
        f = rng.nextFloat();
        for (int idx = 0; idx < size; idx++) {
            fin[idx] = Float.float16ToFloat((short) rng.nextInt());
        }
    }

    /** Converts every element of {@code fin} to binary16. */
    @Benchmark
    public short[] floatToFloat16() {
        for (int idx = 0; idx < fin.length; idx++) {
            f16out[idx] = Float.floatToFloat16(fin[idx]);
        }
        return f16out;
    }

    /** Converts every element of {@code f16in} to float. */
    @Benchmark
    public float[] float16ToFloat() {
        for (int idx = 0; idx < f16in.length; idx++) {
            fout[idx] = Float.float16ToFloat(f16in[idx]);
        }
        return fout;
    }

    /** Single binary16-to-float conversion through static fields. */
    @Benchmark
    public float float16ToFloatMemory() {
        f = Float.float16ToFloat(f16);
        return f;
    }

    /** Single float-to-binary16 conversion through static fields. */
    @Benchmark
    public short floatToFloat16Memory() {
        s = Float.floatToFloat16(f);
        return s;
    }
}