8271883: Math CopySign optimization for x86

Reviewed-by: jbhateja, sviswanathan, kvn
This commit is contained in:
Marcus G K Williams 2021-08-14 00:34:51 +00:00 committed by Sandhya Viswanathan
parent 6b8b160e37
commit 87d2761f1b
3 changed files with 111 additions and 1 deletions

View File

@ -1736,6 +1736,9 @@ void VM_Version::get_processor_features() {
if (FLAG_IS_DEFAULT(UseSignumIntrinsic)) {
FLAG_SET_DEFAULT(UseSignumIntrinsic, true);
}
if (FLAG_IS_DEFAULT(UseCopySignIntrinsic)) {
FLAG_SET_DEFAULT(UseCopySignIntrinsic, true);
}
}
void VM_Version::print_platform_virtualization_info(outputStream* st) {

View File

@ -1560,6 +1560,15 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_CopySignD:
case Op_CopySignF:
if (UseAVX < 3 || !is_LP64) {
return false;
}
if (!VM_Version::supports_avx512vl()) {
return false;
}
break;
#ifndef _LP64
case Op_AddReductionVF:
case Op_AddReductionVD:
@ -5776,7 +5785,7 @@ instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktm
ins_pipe( pipe_slow );
%}
// --------------------------------- Signum ---------------------------
// --------------------------------- Signum/CopySign ---------------------------
instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
match(Set dst (SignumF dst (Binary zero one)));
@ -5800,6 +5809,53 @@ instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr)
ins_pipe( pipe_slow );
%}
// ---------------------------------------
// For copySign use 0xE4 as the imm8 truth table for vpternlog
// Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
// C (xmm2) is set to 0x7FFFFFFF (0x7FFFFFFFFFFFFFFF for double), i.e. every bit except the sign bit
// Wherever xmm2 is 0, we want to pick from B (sign)
// Wherever xmm2 is 1, we want to pick from A (magnitude)
//
// A B C Result
// 0 0 0 0
// 0 0 1 0
// 0 1 0 1
// 0 1 1 0
// 1 0 0 0
// 1 0 1 1
// 1 1 0 1
// 1 1 1 1
//
// Result going from high bit to low bit is 0b11100100 = 0xE4
// ---------------------------------------
#ifdef _LP64
// Scalar float copySign.
// Matches a CopySignF node whose first input and result share $dst and whose
// second input is $src. The encoding builds the constant 0x7FFFFFFF in $tmp1
// (staged through the integer register $tmp2) and then combines $dst, $src and
// $tmp1 bit by bit with vpternlogd using the truth table 0xE4.
instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
match(Set dst (CopySignF dst src));
// Both temporaries are only used to materialize the mask constant.
effect(TEMP tmp1, TEMP tmp2);
format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
ins_encode %{
// 0x7FFFFFFF has every bit set except bit 31.
__ movl($tmp2$$Register, 0x7FFFFFFF);
// Transfer the mask from the general-purpose register into the XMM temporary
// so it can be used as the third vpternlogd operand.
__ movdl($tmp1$$XMMRegister, $tmp2$$Register);
// Operands in order: $dst (A), $src (B), $tmp1 (C). With 0xE4 the result bit is
// A where C is 1 and B where C is 0, so only bit 31 is taken from $src.
__ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
%}
ins_pipe( pipe_slow );
%}
// Scalar double copySign.
// Matches a CopySignD node whose first input and result share $dst and whose
// remaining inputs are $src paired (via Binary) with a double constant operand.
// The encoding builds the constant 0x7FFFFFFFFFFFFFFF in $tmp1 (staged through
// the long register $tmp2) and then combines $dst, $src and $tmp1 bit by bit
// with vpternlogq using the truth table 0xE4.
// NOTE(review): the constant operand is named "zero" and is never referenced in
// the encoding; presumably the node always carries 0.0 there -- confirm against
// where CopySignD nodes are created.
instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
match(Set dst (CopySignD dst (Binary src zero)));
ins_cost(100);
// Both temporaries are only used to materialize the mask constant.
effect(TEMP tmp1, TEMP tmp2);
format %{ "CopySignD $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
ins_encode %{
// 0x7FFFFFFFFFFFFFFF has every bit set except bit 63.
__ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
// Transfer the mask from the general-purpose register into the XMM temporary
// so it can be used as the third vpternlogq operand.
__ movq($tmp1$$XMMRegister, $tmp2$$Register);
// Operands in order: $dst (A), $src (B), $tmp1 (C). With 0xE4 the result bit is
// A where C is 1 and B where C is 0, so only bit 63 is taken from $src.
__ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
%}
ins_pipe( pipe_slow );
%}
#endif // _LP64
// --------------------------------- Sqrt --------------------------------------
instruct vsqrtF_reg(vec dst, vec src) %{

View File

@ -100,6 +100,16 @@ public class Signum {
return Math.signum(data);
}
/**
 * Double-precision benchmark kernel: forwards to {@link Math#copySign(double, double)}.
 *
 * @param data value whose magnitude is kept
 * @param sign value whose sign is applied
 * @return the result of {@code Math.copySign(data, sign)}
 */
private static double Copysign_Kernel(double data, double sign) {
    final double signedValue = Math.copySign(data, sign);
    return signedValue;
}
/**
 * Single-precision benchmark kernel: forwards to {@link Math#copySign(float, float)}.
 *
 * @param data value whose magnitude is kept
 * @param sign value whose sign is applied
 * @return the result of {@code Math.copySign(data, sign)}
 */
private static float Copysign_Kernel(float data, float sign) {
    final float signedValue = Math.copySign(data, sign);
    return signedValue;
}
@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _1_signumFloatTest(Blackhole bh) {
@ -139,4 +149,45 @@ public class Signum {
}
}
}
/**
 * Float copySign benchmark. For each of ITERATIONS passes, every element of
 * float_values is used as the sign argument of Copysign_Kernel with floatValue
 * as the magnitude argument, and each result is handed to the Blackhole.
 * The operation count is declared as ITERATIONS * 17; presumably float_values
 * holds 17 entries -- confirm against its declaration.
 *
 * @param bh sink that consumes every computed value
 */
@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _5_copySignFloatTest(Blackhole bh) {
    int pass = 0;
    while (pass < ITERATIONS) {
        for (float signSource : float_values) {
            bh.consume(Copysign_Kernel(floatValue, signSource));
        }
        pass++;
    }
}
/**
 * Baseline for the float copySign benchmark. Runs the same two nested loops
 * over ITERATIONS and float_values but hands each element to the Blackhole
 * unchanged, without calling the kernel.
 *
 * @param bh sink that consumes every element
 */
@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _6_overheadCopySignFloat(Blackhole bh) {
    int pass = 0;
    while (pass < ITERATIONS) {
        for (float element : float_values) {
            bh.consume(element);
        }
        pass++;
    }
}
/**
 * Double copySign benchmark. For each of ITERATIONS passes, every element of
 * double_values is used as the sign argument of Copysign_Kernel with
 * doubleValue as the magnitude argument, and each result is handed to the
 * Blackhole.
 * The operation count is declared as ITERATIONS * 17; presumably double_values
 * holds 17 entries -- confirm against its declaration.
 *
 * @param bh sink that consumes every computed value
 */
@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _7_copySignDoubleTest(Blackhole bh) {
    int pass = 0;
    while (pass < ITERATIONS) {
        for (double signSource : double_values) {
            bh.consume(Copysign_Kernel(doubleValue, signSource));
        }
        pass++;
    }
}
/**
 * Baseline for the double copySign benchmark. Runs the same two nested loops
 * over ITERATIONS and double_values but hands each element to the Blackhole
 * unchanged, without calling the kernel.
 *
 * @param bh sink that consumes every element
 */
@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _8_overheadCopySignDouble(Blackhole bh) {
    int pass = 0;
    while (pass < ITERATIONS) {
        for (double element : double_values) {
            bh.consume(element);
        }
        pass++;
    }
}
}