From 0881f2b0c43870ed10b1166d04cef9832e58629e Mon Sep 17 00:00:00 2001
From: Sandhya Viswanathan
Date: Fri, 17 Nov 2023 20:10:17 +0000
Subject: [PATCH] 8318562: Computational test more than 2x slower when AVX instructions are used

Reviewed-by: kvn
---
Reviewer notes (free text between "---" and the first "diff --git" line; it is
not part of the commit message or of any hunk):

What the patch does
- Adds MacroAssembler wrappers for cvtss2sd, cvtsd2ss, cvtsi2sdl, cvtsi2ssl
  and, under _LP64, cvtsi2sdq and cvtsi2ssq. When UseAVX > 0 each wrapper emits
  xorps/xorpd on dst before delegating to the Assembler instruction of the same
  name. The XMM register-to-register forms skip the xor when dst == src.
  NOTE(review): presumably the xor is there to break the dependency on the
  previous contents of dst - confirm against the JBS issue.
- Marks dst as TEMP in convF2D_reg_reg and convD2F_reg_reg in x86_64.ad.
- Adds the JMH benchmark ComputePI. Its methods are named
  compute_pi_<index type>_<accumulator type>.

Changes made to the benchmark during review
- The package declaration is now org.openjdk.bench.vm.compiler.x86, matching
  the directory the file is created in.
- compute_pi_dbl_flt now iterates with a double index. It previously used a
  float index and a float accumulator, which duplicated the float/float case
  and never performed a double -> float conversion inside the loop.
- Both changes replace a line in place, so hunk line counts and the diffstat
  are unchanged. The commit id on the first line and the new-file blob id on
  the ComputePI.java "index" line describe the unmodified content.

 src/hotspot/cpu/x86/macroAssembler_x86.cpp |  86 +++++++++++
 src/hotspot/cpu/x86/macroAssembler_x86.hpp |  17 +++
 src/hotspot/cpu/x86/x86_64.ad              |   4 +-
 .../bench/vm/compiler/x86/ComputePI.java   | 142 ++++++++++++++++++
 4 files changed, 247 insertions(+), 2 deletions(-)
 create mode 100644 test/micro/org/openjdk/bench/vm/compiler/x86/ComputePI.java

diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
index 306c4ef67f1..ed8a23771d3 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@@ -1871,6 +1871,92 @@ void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
 }
 #endif
 
+void MacroAssembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
+  if ((UseAVX > 0) && (dst != src)) {
+    xorpd(dst, dst);
+  }
+  Assembler::cvtss2sd(dst, src);
+}
+
+void MacroAssembler::cvtss2sd(XMMRegister dst, Address src) {
+  if (UseAVX > 0) {
+    xorpd(dst, dst);
+  }
+  Assembler::cvtss2sd(dst, src);
+}
+
+void MacroAssembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
+  if ((UseAVX > 0) && (dst != src)) {
+    xorps(dst, dst);
+  }
+  Assembler::cvtsd2ss(dst, src);
+}
+
+void MacroAssembler::cvtsd2ss(XMMRegister dst, Address src) {
+  if (UseAVX > 0) {
+    xorps(dst, dst);
+  }
+  Assembler::cvtsd2ss(dst, src);
+}
+
+void MacroAssembler::cvtsi2sdl(XMMRegister dst, Register src) {
+  if (UseAVX > 0) {
+    xorpd(dst, dst);
+  }
+  Assembler::cvtsi2sdl(dst, src);
+}
+
+void MacroAssembler::cvtsi2sdl(XMMRegister dst, Address src) {
+  if (UseAVX > 0) {
+    xorpd(dst, dst);
+  }
+  Assembler::cvtsi2sdl(dst, src);
+}
+
+void MacroAssembler::cvtsi2ssl(XMMRegister dst, Register src) {
+  if (UseAVX > 0) {
+    xorps(dst, dst);
+  }
+  Assembler::cvtsi2ssl(dst, src);
+}
+
+void MacroAssembler::cvtsi2ssl(XMMRegister dst, Address src) {
+  if (UseAVX > 0) {
+    xorps(dst, dst);
+  }
+  Assembler::cvtsi2ssl(dst, src);
+}
+
+#ifdef _LP64
+void MacroAssembler::cvtsi2sdq(XMMRegister dst, Register src) {
+  if (UseAVX > 0) {
+    xorpd(dst, dst);
+  }
+  Assembler::cvtsi2sdq(dst, src);
+}
+
+void MacroAssembler::cvtsi2sdq(XMMRegister dst, Address src) {
+  if (UseAVX > 0) {
+    xorpd(dst, dst);
+  }
+  Assembler::cvtsi2sdq(dst, src);
+}
+
+void MacroAssembler::cvtsi2ssq(XMMRegister dst, Register src) {
+  if (UseAVX > 0) {
+    xorps(dst, dst);
+  }
+  Assembler::cvtsi2ssq(dst, src);
+}
+
+void MacroAssembler::cvtsi2ssq(XMMRegister dst, Address src) {
+  if (UseAVX > 0) {
+    xorps(dst, dst);
+  }
+  Assembler::cvtsi2ssq(dst, src);
+}
+#endif  // _LP64
+
 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
   assert(rscratch != noreg || always_reachable(adr), "missing");
 
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
index ab461aaffdf..f875089ac67 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -800,6 +800,23 @@ public:
 
   void cmpxchgptr(Register reg, Address adr);
 
+
+  // cvt instructions
+  void cvtss2sd(XMMRegister dst, XMMRegister src);
+  void cvtss2sd(XMMRegister dst, Address src);
+  void cvtsd2ss(XMMRegister dst, XMMRegister src);
+  void cvtsd2ss(XMMRegister dst, Address src);
+  void cvtsi2sdl(XMMRegister dst, Register src);
+  void cvtsi2sdl(XMMRegister dst, Address src);
+  void cvtsi2ssl(XMMRegister dst, Register src);
+  void cvtsi2ssl(XMMRegister dst, Address src);
+#ifdef _LP64
+  void cvtsi2sdq(XMMRegister dst, Register src);
+  void cvtsi2sdq(XMMRegister dst, Address src);
+  void cvtsi2ssq(XMMRegister dst, Register src);
+  void cvtsi2ssq(XMMRegister dst, Address src);
+#endif
+
   void locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch = noreg);
 
   void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
index 4cdd3bb7bbb..e41cadb4d4f 100644
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -10149,7 +10149,7 @@ instruct cmpD_imm(rRegI dst, regD src, immD con, rFlagsReg cr) %{
 instruct convF2D_reg_reg(regD dst, regF src)
 %{
   match(Set dst (ConvF2D src));
-
+  effect(TEMP dst);
   format %{ "cvtss2sd $dst, $src" %}
   ins_encode %{
     __ cvtss2sd ($dst$$XMMRegister, $src$$XMMRegister);
@@ -10171,7 +10171,7 @@ instruct convF2D_reg_mem(regD dst, memory src)
 instruct convD2F_reg_reg(regF dst, regD src)
 %{
   match(Set dst (ConvD2F src));
-
+  effect(TEMP dst);
   format %{ "cvtsd2ss $dst, $src" %}
   ins_encode %{
     __ cvtsd2ss ($dst$$XMMRegister, $src$$XMMRegister);
diff --git a/test/micro/org/openjdk/bench/vm/compiler/x86/ComputePI.java b/test/micro/org/openjdk/bench/vm/compiler/x86/ComputePI.java
new file mode 100644
index 00000000000..7d8e479172e
--- /dev/null
+++ b/test/micro/org/openjdk/bench/vm/compiler/x86/ComputePI.java
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package org.openjdk.bench.vm.compiler.x86;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.util.concurrent.TimeUnit;
+
+@State(Scope.Thread)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Warmup(iterations = 5, time = 5, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 5, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 3)
+public class ComputePI {
+
+    @Benchmark
+    public double compute_pi_int_dbl() {
+        double pi = 4.0;
+        boolean sign = false;
+
+        for (int i = 3; i < 1000; i += 2) {
+            if (sign) {
+                pi += 4.0 / i;
+            } else {
+                pi -= 4.0 / i;
+            }
+            sign = !sign;
+        }
+        return pi;
+    }
+
+    @Benchmark
+    public double compute_pi_int_flt() {
+        float pi = 4.0f;
+        boolean sign = false;
+
+        for (int i = 3; i < 1000; i += 2) {
+            if (sign) {
+                pi += 4.0f / i;
+            } else {
+                pi -= 4.0f / i;
+            }
+            sign = !sign;
+        }
+        return pi;
+    }
+
+    @Benchmark
+    public double compute_pi_long_dbl() {
+        double pi = 4.0;
+        boolean sign = false;
+
+        for (long i = 3; i < 1000; i += 2) {
+            if (sign) {
+                pi += 4.0 / i;
+            } else {
+                pi -= 4.0 / i;
+            }
+            sign = !sign;
+        }
+        return pi;
+    }
+
+    @Benchmark
+    public double compute_pi_long_flt() {
+        float pi = 4.0f;
+        boolean sign = false;
+
+        for (long i = 3; i < 1000; i += 2) {
+            if (sign) {
+                pi += 4.0f / i;
+            } else {
+                pi -= 4.0f / i;
+            }
+            sign = !sign;
+        }
+        return pi;
+    }
+
+    @Benchmark
+    public double compute_pi_flt_dbl() {
+        double pi = 4.0;
+        boolean sign = false;
+
+        for (float i = 3.0f; i < 1000.0f; i += 2.0f) {
+            if (sign) {
+                pi += 4.0 / i;
+            } else {
+                pi -= 4.0 / i;
+            }
+            sign = !sign;
+        }
+        return pi;
+    }
+
+    @Benchmark
+    public double compute_pi_dbl_flt() {
+        float pi = 4.0f;
+        boolean sign = false;
+
+        for (double i = 3.0; i < 1000.0; i += 2.0) {
+            if (sign) {
+                pi += 4.0f / i;
+            } else {
+                pi -= 4.0f / i;
+            }
+            sign = !sign;
+        }
+        return pi;
+    }
+}