From 87d2761f1b4572633de020b2d4681918c6f10f06 Mon Sep 17 00:00:00 2001 From: Marcus G K Williams Date: Sat, 14 Aug 2021 00:34:51 +0000 Subject: [PATCH] 8271883: Math CopySign optimization for x86 Reviewed-by: jbhateja, sviswanathan, kvn --- src/hotspot/cpu/x86/vm_version_x86.cpp | 3 + src/hotspot/cpu/x86/x86.ad | 58 ++++++++++++++++++- .../org/openjdk/bench/vm/compiler/Signum.java | 51 ++++++++++++++++ 3 files changed, 111 insertions(+), 1 deletion(-) diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp index 525fea564fa..6d940f3274c 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.cpp +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp @@ -1736,6 +1736,9 @@ void VM_Version::get_processor_features() { if (FLAG_IS_DEFAULT(UseSignumIntrinsic)) { FLAG_SET_DEFAULT(UseSignumIntrinsic, true); } + if (FLAG_IS_DEFAULT(UseCopySignIntrinsic)) { + FLAG_SET_DEFAULT(UseCopySignIntrinsic, true); + } } void VM_Version::print_platform_virtualization_info(outputStream* st) { diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 4453d01e3d5..1ea22eee800 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1560,6 +1560,15 @@ const bool Matcher::match_rule_supported(int opcode) { return false; } break; + case Op_CopySignD: + case Op_CopySignF: + if (UseAVX < 3 || !is_LP64) { + return false; + } + if (!VM_Version::supports_avx512vl()) { + return false; + } + break; #ifndef _LP64 case Op_AddReductionVF: case Op_AddReductionVD: @@ -5776,7 +5785,7 @@ instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktm ins_pipe( pipe_slow ); %} -// --------------------------------- Signum --------------------------- +// --------------------------------- Signum/CopySign --------------------------- instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{ match(Set dst (SignumF dst (Binary zero one))); @@ -5800,6 +5809,53 @@ instruct signumD_reg(regD dst, regD zero, regD one, 
rRegP scratch, rFlagsReg cr) ins_pipe( pipe_slow ); %} +// --------------------------------------- +// For copySign use 0xE4 as the imm8 truth table for vpternlog +// Desired Truth Table: A -> xmm0 bit ($dst), B -> xmm1 bit ($src), C -> xmm2 bit ($tmp1) +// C (xmm2) is set to 0x7FFFFFFF (float) or 0x7FFFFFFFFFFFFFFF (double): every bit except the sign bit +// Wherever xmm2 is 0, we want to pick from B (sign) +// Wherever xmm2 is 1, we want to pick from A (magnitude) +// +// A B C Result +// 0 0 0 0 +// 0 0 1 0 +// 0 1 0 1 +// 0 1 1 0 +// 1 0 0 0 +// 1 0 1 1 +// 1 1 0 1 +// 1 1 1 1 +// +// Result going from high bit to low bit is 0b11100100 = 0xE4 +// --------------------------------------- + +#ifdef _LP64 +instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{ + match(Set dst (CopySignF dst src)); + effect(TEMP tmp1, TEMP tmp2); + format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %} + ins_encode %{ + __ movl($tmp2$$Register, 0x7FFFFFFF); + __ movdl($tmp1$$XMMRegister, $tmp2$$Register); + __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit); + %} + ins_pipe( pipe_slow ); +%} + +instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{ + match(Set dst (CopySignD dst (Binary src zero))); + ins_cost(100); + effect(TEMP tmp1, TEMP tmp2); + format %{ "CopySignD $dst, $src\t! 
using $tmp1 and $tmp2 as TEMP" %} + ins_encode %{ + __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF); + __ movq($tmp1$$XMMRegister, $tmp2$$Register); + __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit); + %} + ins_pipe( pipe_slow ); +%} +#endif // _LP64 + // --------------------------------- Sqrt -------------------------------------- instruct vsqrtF_reg(vec dst, vec src) %{ diff --git a/test/micro/org/openjdk/bench/vm/compiler/Signum.java b/test/micro/org/openjdk/bench/vm/compiler/Signum.java index a5e406dbedc..65292814521 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/Signum.java +++ b/test/micro/org/openjdk/bench/vm/compiler/Signum.java @@ -100,6 +100,16 @@ public class Signum { return Math.signum(data); } + private static double Copysign_Kernel(double data, double sign) + { + return Math.copySign(data, sign); + } + + private static float Copysign_Kernel(float data, float sign) + { + return Math.copySign(data, sign); + } + @Benchmark @OperationsPerInvocation(ITERATIONS * 17) public void _1_signumFloatTest(Blackhole bh) { @@ -139,4 +149,45 @@ public class Signum { } } } + + @Benchmark + @OperationsPerInvocation(ITERATIONS * 17) + public void _5_copySignFloatTest(Blackhole bh) { + for (int i = 0; i < ITERATIONS; i++) { + for (float f : float_values) { + bh.consume(Copysign_Kernel(floatValue, f)); + } + } + } + + @Benchmark + @OperationsPerInvocation(ITERATIONS * 17) + public void _6_overheadCopySignFloat(Blackhole bh) { + for (int i = 0; i < ITERATIONS; i++) { + for (float f : float_values) { + bh.consume(f); + } + } + } + + @Benchmark + @OperationsPerInvocation(ITERATIONS * 17) + public void _7_copySignDoubleTest(Blackhole bh) { + for (int i = 0; i < ITERATIONS; i++) { + for (double d : double_values) { + bh.consume(Copysign_Kernel(doubleValue, d)); + } + } + } + + @Benchmark + @OperationsPerInvocation(ITERATIONS * 17) + public void _8_overheadCopySignDouble(Blackhole bh) { + for (int i = 0; i < ITERATIONS; 
i++) { + for (double d : double_values) { + bh.consume(d); + } + } + } + }