From dc9a6ef6100d73a431cd0cfa2c252acf7743f8a3 Mon Sep 17 00:00:00 2001 From: Jatin Bhateja Date: Thu, 21 Nov 2024 18:13:32 +0000 Subject: [PATCH] 8341137: Optimize long vector multiplication using x86 VPMUL[U]DQ instruction Co-authored-by: Vladimir Ivanov Reviewed-by: vlivanov, sviswanathan --- src/hotspot/cpu/x86/x86.ad | 28 ++ src/hotspot/share/opto/node.hpp | 3 + src/hotspot/share/opto/vectornode.cpp | 49 ++++ src/hotspot/share/opto/vectornode.hpp | 6 +- .../compiler/vectorapi/VectorMultiplyOpt.java | 249 ++++++++++++++++++ .../vector/VectorMultiplyOptBenchmark.java | 125 +++++++++ .../vector/VectorXXH3HashingBenchmark.java | 85 ++++++ 7 files changed, 544 insertions(+), 1 deletion(-) create mode 100644 test/hotspot/jtreg/compiler/vectorapi/VectorMultiplyOpt.java create mode 100644 test/micro/org/openjdk/bench/jdk/incubator/vector/VectorMultiplyOptBenchmark.java create mode 100644 test/micro/org/openjdk/bench/jdk/incubator/vector/VectorXXH3HashingBenchmark.java diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index d135c7bacfa..09023562dd6 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -6179,6 +6179,7 @@ instruct evmulL_reg(vec dst, vec src1, vec src2) %{ VM_Version::supports_avx512dq()) || VM_Version::supports_avx512vldq()); match(Set dst (MulVL src1 src2)); + ins_cost(500); format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %} ins_encode %{ assert(UseAVX > 2, "required"); @@ -6195,6 +6196,7 @@ instruct evmulL_mem(vec dst, vec src, memory mem) %{ VM_Version::supports_avx512vldq())); match(Set dst (MulVL src (LoadVector mem))); format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %} + ins_cost(500); ins_encode %{ assert(UseAVX > 2, "required"); int vlen_enc = vector_length_encoding(this); @@ -6206,6 +6208,7 @@ instruct evmulL_mem(vec dst, vec src, memory mem) %{ instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{ predicate(UseAVX == 0); match(Set dst (MulVL src1 src2)); + ins_cost(500); effect(TEMP dst, TEMP xtmp); format %{ "mulVL $dst, $src1, $src2\t! using $xtmp as TEMP" %} ins_encode %{ @@ -6232,6 +6235,7 @@ instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{ !VM_Version::supports_avx512vldq()))); match(Set dst (MulVL src1 src2)); effect(TEMP xtmp1, TEMP xtmp2); + ins_cost(500); format %{ "vmulVL $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %} ins_encode %{ int vlen_enc = vector_length_encoding(this); @@ -6248,6 +6252,30 @@ instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{ ins_pipe( pipe_slow ); %} +instruct vmuludq_reg(vec dst, vec src1, vec src2) %{ + predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs()); + match(Set dst (MulVL src1 src2)); + ins_cost(100); + format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmuldq_reg(vec dst, vec src1, vec src2) %{ + predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs()); + match(Set dst (MulVL src1 src2)); + ins_cost(100); + format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + // Floats vector mul instruct vmulF(vec dst, vec src) %{ predicate(UseAVX == 0); diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index db8b00c0bda..678b12b04ac 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -193,6 +193,7 @@ class VectorUnboxNode; class VectorSet; class VectorReinterpretNode; class ShiftVNode; +class MulVLNode; class ExpandVNode; class CompressVNode; class CompressMNode; @@ -743,6 +744,7 @@ public: DEFINE_CLASS_ID(Reduction, Vector, 7) DEFINE_CLASS_ID(NegV, Vector, 8) DEFINE_CLASS_ID(SaturatingVector, Vector, 9) + DEFINE_CLASS_ID(MulVL, Vector, 10) DEFINE_CLASS_ID(Con, Type, 8) DEFINE_CLASS_ID(ConI, Con, 0) DEFINE_CLASS_ID(SafePointScalarMerge, Type, 9) @@ -970,6 +972,7 @@ public: DEFINE_CLASS_QUERY(Mul) DEFINE_CLASS_QUERY(Multi) DEFINE_CLASS_QUERY(MultiBranch) + DEFINE_CLASS_QUERY(MulVL) DEFINE_CLASS_QUERY(Neg) DEFINE_CLASS_QUERY(NegV) DEFINE_CLASS_QUERY(NeverBranch) diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index cc2fff23acc..dedac80d102 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -2085,6 +2085,55 @@ Node* VectorBlendNode::Identity(PhaseGVN* phase) { } return this; } +static bool is_replicate_uint_constant(const Node* n) { + return n->Opcode() == Op_Replicate && + n->in(1)->is_Con() && + n->in(1)->bottom_type()->isa_long() && + n->in(1)->bottom_type()->is_long()->get_con() <= 0xFFFFFFFFL; +} + +static bool has_vector_elements_fit_uint(Node* n) { + auto is_lower_doubleword_mask_pattern = [](const Node* n) { + return n->Opcode() == Op_AndV && + (is_replicate_uint_constant(n->in(1)) || + is_replicate_uint_constant(n->in(2))); + }; + + auto is_clear_upper_doubleword_uright_shift_pattern = [](const Node* n) { + return n->Opcode() == Op_URShiftVL && + n->in(2)->Opcode() == Op_RShiftCntV && n->in(2)->in(1)->is_Con() && + n->in(2)->in(1)->bottom_type()->isa_int() && + n->in(2)->in(1)->bottom_type()->is_int()->get_con() >= 32; + }; + return is_lower_doubleword_mask_pattern(n) || // (AndV SRC (Replicate C)) where C <= 0xFFFFFFFF + is_clear_upper_doubleword_uright_shift_pattern(n); // (URShiftV SRC S) where S >= 32 +} + +static bool has_vector_elements_fit_int(Node* n) { + auto is_cast_integer_to_long_pattern = [](const Node* n) { + return n->Opcode() == Op_VectorCastI2X && Matcher::vector_element_basic_type(n) == T_LONG; + }; + + auto is_clear_upper_doubleword_right_shift_pattern = [](const Node* n) { + return n->Opcode() == Op_RShiftVL && + n->in(2)->Opcode() == Op_RShiftCntV && n->in(2)->in(1)->is_Con() && + n->in(2)->in(1)->bottom_type()->isa_int() && + n->in(2)->in(1)->bottom_type()->is_int()->get_con() >= 32; + }; + + return is_cast_integer_to_long_pattern(n) || // (VectorCastI2X SRC) + is_clear_upper_doubleword_right_shift_pattern(n); // (RShiftV SRC S) where S >= 32 +} + +bool MulVLNode::has_int_inputs() const { + return has_vector_elements_fit_int(in(1)) && + has_vector_elements_fit_int(in(2)); +} + +bool MulVLNode::has_uint_inputs() const { + return has_vector_elements_fit_uint(in(1)) && + has_vector_elements_fit_uint(in(2)); +} #ifndef PRODUCT void VectorBoxAllocateNode::dump_spec(outputStream *st) const { diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 25a381408ca..3f737e6e881 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -441,8 +441,12 @@ class MulVINode : public VectorNode { // Vector multiply long class MulVLNode : public VectorNode { public: - MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) { + init_class_id(Class_MulVL); + } virtual int Opcode() const; + bool has_int_inputs() const; + bool has_uint_inputs() const; }; //------------------------------MulVFNode-------------------------------------- diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorMultiplyOpt.java b/test/hotspot/jtreg/compiler/vectorapi/VectorMultiplyOpt.java new file mode 100644 index 00000000000..a48cd25e47f --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorMultiplyOpt.java @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.vectorapi; + +import jdk.incubator.vector.*; +import java.util.Random; +import java.util.stream.IntStream; +import compiler.lib.ir_framework.*; +import java.lang.reflect.Array; + +/** + * @test + * @bug 8341137 + * @summary Optimize long vector multiplication using x86 VPMUL[U]DQ instruction. + * @modules jdk.incubator.vector + * @library /test/lib / + * @run driver compiler.vectorapi.VectorMultiplyOpt + */ + +public class VectorMultiplyOpt { + + public static int[] isrc1; + public static int[] isrc2; + public static long[] lsrc1; + public static long[] lsrc2; + public static long[] res; + + public static final int SIZE = 1024; + public static final Random r = jdk.test.lib.Utils.getRandomInstance(); + public static final VectorSpecies LSP = LongVector.SPECIES_PREFERRED; + public static final VectorSpecies ISP = IntVector.SPECIES_PREFERRED; + + public static final long mask1 = r.nextLong(0xFFFFFFFFL); + public static final long mask2 = r.nextLong(0xFFFFFFFFL); + public static final long mask3 = r.nextLong(0xFFFFFFFFL); + public static final long mask4 = r.nextLong(0xFFFFFFFFL); + public static final long mask5 = r.nextLong(0xFFFFFFFFL); + public static final long mask6 = r.nextLong(0xFFFFFFFFL); + + public static final int shift1 = r.nextInt(32) + 32; + public static final int shift2 = r.nextInt(32) + 32; + public static final int shift3 = r.nextInt(32) + 32; + public static final int shift4 = r.nextInt(32) + 32; + public static final int shift5 = r.nextInt(32) + 32; + + public VectorMultiplyOpt() { + lsrc1 = new long[SIZE]; + lsrc2 = new long[SIZE]; + res = new long[SIZE]; + isrc1 = new int[SIZE + 16]; + isrc2 = new int[SIZE + 16]; + IntStream.range(0, SIZE).forEach(i -> { lsrc1[i] = Long.MAX_VALUE * r.nextLong(); }); + IntStream.range(0, SIZE).forEach(i -> { lsrc2[i] = Long.MAX_VALUE * r.nextLong(); }); + IntStream.range(0, SIZE).forEach(i -> { isrc1[i] = Integer.MAX_VALUE * r.nextInt(); }); + IntStream.range(0, SIZE).forEach(i -> { isrc2[i] = Integer.MAX_VALUE * r.nextInt(); }); + } + + + public static void main(String[] args) { + TestFramework testFramework = new TestFramework(); + testFramework.setDefaultWarmup(5000) + .addFlags("--add-modules=jdk.incubator.vector") + .start(); + System.out.println("PASSED"); + } + + interface Validator { + public long apply(long src1, long src2); + } + + public static void validate(String msg, long[] actual, Object src1, Object src2, Validator func) { + for (int i = 0; i < actual.length; i++) { + long expected; + if (long[].class == src1.getClass()) { + expected = func.apply(Array.getLong(src1, i), Array.getLong(src2, i)); + } else { + assert int[].class == src1.getClass(); + expected = func.apply(Array.getInt(src1, i), Array.getInt(src2, i)); + } + if (actual[i] != expected) { + throw new AssertionError(msg + "index " + i + ": src1 = " + Array.get(src1, i) + " src2 = " + + Array.get(src2, i) + " actual = " + actual[i] + " expected = " + expected); + } + } + } + + @Test + @IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.AND_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {"vmuludq", " >0 "}, phase = CompilePhase.FINAL_CODE, applyIfCPUFeature = {"avx", "true"}) + @Warmup(value = 10000) + public static void test_pattern1() { + int i = 0; + for (; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.AND, mask1) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.AND, mask1)) + .intoArray(res, i); + } + for (; i < res.length; i++) { + res[i] = (lsrc1[i] & mask1) * (lsrc2[i] & mask1); + } + } + + @Check(test = "test_pattern1") + public void test_pattern1_validate() { + validate("pattern1 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 & mask1) * (l2 & mask1)); + } + + @Test + @IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.AND_VL, " >0 ", IRNode.URSHIFT_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {"vmuludq", " >0 "}, phase = CompilePhase.FINAL_CODE, applyIfCPUFeature = {"avx", "true"}) + @Warmup(value = 10000) + public static void test_pattern2() { + int i = 0; + for (; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.AND, mask2) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.LSHR, shift1)) + .intoArray(res, i); + } + for (; i < res.length; i++) { + res[i] = (lsrc1[i] & mask2) * (lsrc2[i] >>> shift1); + } + } + + @Check(test = "test_pattern2") + public void test_pattern2_validate() { + validate("pattern2 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 & mask2) * (l2 >>> shift1)); + } + + @Test + @IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.URSHIFT_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {"vmuludq", " >0 "}, phase = CompilePhase.FINAL_CODE, applyIfCPUFeature = {"avx", "true"}) + @Warmup(value = 10000) + public static void test_pattern3() { + int i = 0; + for (; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.LSHR, shift2) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.LSHR, shift3)) + .intoArray(res, i); + } + for (; i < res.length; i++) { + res[i] = (lsrc1[i] >>> shift2) * (lsrc2[i] >>> shift3); + } + } + + @Check(test = "test_pattern3") + public void test_pattern3_validate() { + validate("pattern3 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 >>> shift2) * (l2 >>> shift3)); + } + + @Test + @IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.URSHIFT_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {"vmuludq", " >0 "}, applyIfCPUFeature = {"avx", "true"}, phase = CompilePhase.FINAL_CODE) + @Warmup(value = 10000) + public static void test_pattern4() { + int i = 0; + for (; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.LSHR, shift4) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.AND, mask4)) + .intoArray(res, i); + } + for (; i < res.length; i++) { + res[i] = (lsrc1[i] >>> shift4) * (lsrc2[i] & mask4); + } + } + + @Check(test = "test_pattern4") + public void test_pattern4_validate() { + validate("pattern4 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 >>> shift4) * (l2 & mask4)); + } + + @Test + @IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.VECTOR_CAST_I2L, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {"vmuldq", " >0 "}, applyIfCPUFeature = {"avx", "true"}, phase = CompilePhase.FINAL_CODE) + @Warmup(value = 10000) + public static void test_pattern5() { + int i = 0; + for (; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = IntVector.fromArray(ISP, isrc1, i) + .convert(VectorOperators.I2L, 0) + .reinterpretAsLongs(); + LongVector vsrc2 = IntVector.fromArray(ISP, isrc2, i) + .convert(VectorOperators.I2L, 0) + .reinterpretAsLongs(); + vsrc1.lanewise(VectorOperators.MUL, vsrc2).intoArray(res, i); + } + for (; i < res.length; i++) { + res[i] = Math.multiplyFull(isrc1[i], isrc2[i]); + } + } + + @Check(test = "test_pattern5") + public void test_pattern5_validate() { + validate("pattern5 ", res, isrc1, isrc2, (i1, i2) -> Math.multiplyFull((int)i1, (int)i2)); + } + + + @Test + @IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.RSHIFT_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {"vmuldq", " >0 "}, applyIfCPUFeature = {"avx", "true"}, phase = CompilePhase.FINAL_CODE) + @Warmup(value = 10000) + public static void test_pattern6() { + int i = 0; + for (; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.ASHR, shift5) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.ASHR, shift5)) + .intoArray(res, i); + } + for (; i < res.length; i++) { + res[i] = (lsrc1[i] >> shift5) * (lsrc2[i] >> shift5); + } + } + + @Check(test = "test_pattern6") + public void test_pattern6_validate() { + validate("pattern6 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 >> shift5) * (l2 >> shift5)); + } + +} diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorMultiplyOptBenchmark.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorMultiplyOptBenchmark.java new file mode 100644 index 00000000000..51f8300a044 --- /dev/null +++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorMultiplyOptBenchmark.java @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.jdk.incubator.vector; + +import java.util.concurrent.TimeUnit; +import org.openjdk.jmh.annotations.*; +import jdk.incubator.vector.*; +import java.util.stream.*; + +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) +public class VectorMultiplyOptBenchmark { + @Param({"1024", "2048", "4096"}) + private int SIZE; + private int [] isrc1; + private int [] isrc2; + private long [] lsrc1; + private long [] lsrc2; + private long [] res; + + private static final VectorSpecies LSP = LongVector.SPECIES_PREFERRED; + private static final VectorSpecies ISP = IntVector.SPECIES_PREFERRED; + + @Setup(Level.Trial) + public void Setup() { + lsrc1 = LongStream.range(Long.MAX_VALUE - SIZE, Long.MAX_VALUE).toArray(); + lsrc2 = LongStream.range(0, SIZE).toArray(); + isrc1 = IntStream.range(Integer.MAX_VALUE - 2 * SIZE, Integer.MAX_VALUE).toArray(); + isrc2 = IntStream.range(0, 2 * SIZE).toArray(); + res = new long[SIZE]; + } + + @Benchmark + public void test_bm_pattern1() { + for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.AND, 0xFFFFFFFFL) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.AND, 0xFFFFFFFFL)) + .intoArray(res, i); + } + } + + @Benchmark + public void test_bm_pattern2() { + for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.AND, 0xFFFFFFFFL) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.LSHR, 32)) + .intoArray(res, i); + } + } + + @Benchmark + public void test_bm_pattern3() { + for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.LSHR, 32) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.LSHR, 32)) + .intoArray(res, i); + } + } + + @Benchmark + public void test_bm_pattern4() { + for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.LSHR, 32) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.AND, 0xFFFFFFFFL)) + .intoArray(res, i); + } + } + + @Benchmark + public void test_bm_pattern5() { + for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = IntVector.fromArray(ISP, isrc1, i) + .convert(VectorOperators.I2L, 0) + .reinterpretAsLongs(); + LongVector vsrc2 = IntVector.fromArray(ISP, isrc2, i) + .convert(VectorOperators.I2L, 0) + .reinterpretAsLongs(); + vsrc1.lanewise(VectorOperators.MUL, vsrc2).intoArray(res, i); + } + } + + @Benchmark + public void test_bm_pattern6() { + for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) { + LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i); + LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i); + vsrc1.lanewise(VectorOperators.ASHR, 32) + .lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.ASHR, 32)) + .intoArray(res, i); + } + } + +} diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorXXH3HashingBenchmark.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorXXH3HashingBenchmark.java new file mode 100644 index 00000000000..32056da49aa --- /dev/null +++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorXXH3HashingBenchmark.java @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.jdk.incubator.vector; + +import java.util.concurrent.TimeUnit; +import org.openjdk.jmh.annotations.*; +import jdk.incubator.vector.*; +import java.util.stream.*; + +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) +public class VectorXXH3HashingBenchmark { + @Param({"1024", "2048", "4096", "8192"}) + private int SIZE; + private long [] accumulators; + private byte [] input; + private byte [] SECRET; + + private static final VectorShuffle LONG_SHUFFLE_PREFERRED = VectorShuffle.fromOp(LongVector.SPECIES_PREFERRED, i -> i ^ 1); + + @Setup(Level.Trial) + public void Setup() { + accumulators = new long[SIZE]; + input = new byte[SIZE * 8]; + SECRET = new byte[SIZE*8]; + IntStream.range(0, SIZE*8).forEach( + i -> { + input[i] = (byte)i; + SECRET[i] = (byte)-i; + } + ); + } + + @Benchmark + public void hashingKernel() { + for (int block = 0; block < input.length / 1024; block++) { + for (int stripe = 0; stripe < 16; stripe++) { + int inputOffset = block * 1024 + stripe * 64; + int secretOffset = stripe * 8; + + for (int i = 0; i < 8; i += LongVector.SPECIES_PREFERRED.length()) { + LongVector accumulatorsVector = LongVector.fromArray(LongVector.SPECIES_PREFERRED, accumulators, i); + LongVector inputVector = ByteVector.fromArray(ByteVector.SPECIES_PREFERRED, input, inputOffset + i * 8).reinterpretAsLongs(); + LongVector secretVector = ByteVector.fromArray(ByteVector.SPECIES_PREFERRED, SECRET, secretOffset + i * 8).reinterpretAsLongs(); + + LongVector key = inputVector + .lanewise(VectorOperators.XOR, secretVector) + .reinterpretAsLongs(); + + LongVector low = key.and(0xFFFF_FFFFL); + LongVector high = key.lanewise(VectorOperators.LSHR, 32); + + accumulatorsVector + .add(inputVector.rearrange(LONG_SHUFFLE_PREFERRED)) + .add(high.mul(low)) + .intoArray(accumulators, i); + } + } + } + } +}