8341137: Optimize long vector multiplication using x86 VPMUL[U]DQ instruction
Co-authored-by: Vladimir Ivanov <vlivanov@openjdk.org> Reviewed-by: vlivanov, sviswanathan
This commit is contained in:
parent
191b38e712
commit
dc9a6ef610
@ -6179,6 +6179,7 @@ instruct evmulL_reg(vec dst, vec src1, vec src2) %{
|
|||||||
VM_Version::supports_avx512dq()) ||
|
VM_Version::supports_avx512dq()) ||
|
||||||
VM_Version::supports_avx512vldq());
|
VM_Version::supports_avx512vldq());
|
||||||
match(Set dst (MulVL src1 src2));
|
match(Set dst (MulVL src1 src2));
|
||||||
|
ins_cost(500);
|
||||||
format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
|
format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
|
||||||
ins_encode %{
|
ins_encode %{
|
||||||
assert(UseAVX > 2, "required");
|
assert(UseAVX > 2, "required");
|
||||||
@ -6195,6 +6196,7 @@ instruct evmulL_mem(vec dst, vec src, memory mem) %{
|
|||||||
VM_Version::supports_avx512vldq()));
|
VM_Version::supports_avx512vldq()));
|
||||||
match(Set dst (MulVL src (LoadVector mem)));
|
match(Set dst (MulVL src (LoadVector mem)));
|
||||||
format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
|
format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
|
||||||
|
ins_cost(500);
|
||||||
ins_encode %{
|
ins_encode %{
|
||||||
assert(UseAVX > 2, "required");
|
assert(UseAVX > 2, "required");
|
||||||
int vlen_enc = vector_length_encoding(this);
|
int vlen_enc = vector_length_encoding(this);
|
||||||
@ -6206,6 +6208,7 @@ instruct evmulL_mem(vec dst, vec src, memory mem) %{
|
|||||||
instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
|
instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
|
||||||
predicate(UseAVX == 0);
|
predicate(UseAVX == 0);
|
||||||
match(Set dst (MulVL src1 src2));
|
match(Set dst (MulVL src1 src2));
|
||||||
|
ins_cost(500);
|
||||||
effect(TEMP dst, TEMP xtmp);
|
effect(TEMP dst, TEMP xtmp);
|
||||||
format %{ "mulVL $dst, $src1, $src2\t! using $xtmp as TEMP" %}
|
format %{ "mulVL $dst, $src1, $src2\t! using $xtmp as TEMP" %}
|
||||||
ins_encode %{
|
ins_encode %{
|
||||||
@ -6232,6 +6235,7 @@ instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
|
|||||||
!VM_Version::supports_avx512vldq())));
|
!VM_Version::supports_avx512vldq())));
|
||||||
match(Set dst (MulVL src1 src2));
|
match(Set dst (MulVL src1 src2));
|
||||||
effect(TEMP xtmp1, TEMP xtmp2);
|
effect(TEMP xtmp1, TEMP xtmp2);
|
||||||
|
ins_cost(500);
|
||||||
format %{ "vmulVL $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
|
format %{ "vmulVL $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
|
||||||
ins_encode %{
|
ins_encode %{
|
||||||
int vlen_enc = vector_length_encoding(this);
|
int vlen_enc = vector_length_encoding(this);
|
||||||
@ -6248,6 +6252,30 @@ instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
|
|||||||
ins_pipe( pipe_slow );
|
ins_pipe( pipe_slow );
|
||||||
%}
|
%}
|
||||||
|
|
||||||
|
// Long vector multiply for the case where the MulVL node reports (via
// has_uint_inputs()) that both operands hold lanes fitting in an unsigned
// 32-bit value. A single vpmuludq (unsigned doubleword multiply producing
// quadword results, see the Intel SDM) then yields the full 64-bit product.
// The cost of 100 is lower than the 500 used by the other MulVL rules in this
// file, so this rule is preferred whenever its predicate holds.
instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
  match(Set dst (MulVL src1 src2));
  ins_cost(100);
  format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
  ins_encode %{
    // Vector length encoding is derived from this node's vector size.
    __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
                vector_length_encoding(this));
  %}
  ins_pipe( pipe_slow );
%}
|
||||||
|
|
||||||
|
// Long vector multiply for the case where the MulVL node reports (via
// has_int_inputs()) that both operands hold lanes fitting in a signed 32-bit
// value. A single vpmuldq (signed doubleword multiply producing quadword
// results, see the Intel SDM) then yields the full 64-bit product.
// The cost of 100 is lower than the 500 used by the other MulVL rules in this
// file, so this rule is preferred whenever its predicate holds.
instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
  match(Set dst (MulVL src1 src2));
  ins_cost(100);
  format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
  ins_encode %{
    // Vector length encoding is derived from this node's vector size.
    __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
               vector_length_encoding(this));
  %}
  ins_pipe( pipe_slow );
%}
|
||||||
|
|
||||||
// Floats vector mul
|
// Floats vector mul
|
||||||
instruct vmulF(vec dst, vec src) %{
|
instruct vmulF(vec dst, vec src) %{
|
||||||
predicate(UseAVX == 0);
|
predicate(UseAVX == 0);
|
||||||
|
@ -193,6 +193,7 @@ class VectorUnboxNode;
|
|||||||
class VectorSet;
|
class VectorSet;
|
||||||
class VectorReinterpretNode;
|
class VectorReinterpretNode;
|
||||||
class ShiftVNode;
|
class ShiftVNode;
|
||||||
|
class MulVLNode;
|
||||||
class ExpandVNode;
|
class ExpandVNode;
|
||||||
class CompressVNode;
|
class CompressVNode;
|
||||||
class CompressMNode;
|
class CompressMNode;
|
||||||
@ -743,6 +744,7 @@ public:
|
|||||||
DEFINE_CLASS_ID(Reduction, Vector, 7)
|
DEFINE_CLASS_ID(Reduction, Vector, 7)
|
||||||
DEFINE_CLASS_ID(NegV, Vector, 8)
|
DEFINE_CLASS_ID(NegV, Vector, 8)
|
||||||
DEFINE_CLASS_ID(SaturatingVector, Vector, 9)
|
DEFINE_CLASS_ID(SaturatingVector, Vector, 9)
|
||||||
|
DEFINE_CLASS_ID(MulVL, Vector, 10)
|
||||||
DEFINE_CLASS_ID(Con, Type, 8)
|
DEFINE_CLASS_ID(Con, Type, 8)
|
||||||
DEFINE_CLASS_ID(ConI, Con, 0)
|
DEFINE_CLASS_ID(ConI, Con, 0)
|
||||||
DEFINE_CLASS_ID(SafePointScalarMerge, Type, 9)
|
DEFINE_CLASS_ID(SafePointScalarMerge, Type, 9)
|
||||||
@ -970,6 +972,7 @@ public:
|
|||||||
DEFINE_CLASS_QUERY(Mul)
|
DEFINE_CLASS_QUERY(Mul)
|
||||||
DEFINE_CLASS_QUERY(Multi)
|
DEFINE_CLASS_QUERY(Multi)
|
||||||
DEFINE_CLASS_QUERY(MultiBranch)
|
DEFINE_CLASS_QUERY(MultiBranch)
|
||||||
|
DEFINE_CLASS_QUERY(MulVL)
|
||||||
DEFINE_CLASS_QUERY(Neg)
|
DEFINE_CLASS_QUERY(Neg)
|
||||||
DEFINE_CLASS_QUERY(NegV)
|
DEFINE_CLASS_QUERY(NegV)
|
||||||
DEFINE_CLASS_QUERY(NeverBranch)
|
DEFINE_CLASS_QUERY(NeverBranch)
|
||||||
|
@ -2085,6 +2085,55 @@ Node* VectorBlendNode::Identity(PhaseGVN* phase) {
|
|||||||
}
|
}
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
static bool is_replicate_uint_constant(const Node* n) {
|
||||||
|
return n->Opcode() == Op_Replicate &&
|
||||||
|
n->in(1)->is_Con() &&
|
||||||
|
n->in(1)->bottom_type()->isa_long() &&
|
||||||
|
n->in(1)->bottom_type()->is_long()->get_con() <= 0xFFFFFFFFL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true if the shape of 'n' shows that every long lane it produces has
// a zero upper doubleword. Two shapes are recognized:
//   (AndV SRC (Replicate C))  where the constant is accepted by
//                             is_replicate_uint_constant (either operand order)
//   (URShiftVL SRC S)         where S is an RShiftCntV of an int constant >= 32
static bool has_vector_elements_fit_uint(Node* n) {
  switch (n->Opcode()) {
    case Op_AndV:
      // Lower doubleword mask pattern; the constant may be on either side.
      return is_replicate_uint_constant(n->in(1)) ||
             is_replicate_uint_constant(n->in(2));

    case Op_URShiftVL: {
      // Logical right shift by a constant count of at least 32.
      const Node* shift_cnt = n->in(2);
      if (shift_cnt->Opcode() != Op_RShiftCntV) {
        return false;
      }
      const Node* cnt_val = shift_cnt->in(1);
      if (!cnt_val->is_Con() || !cnt_val->bottom_type()->isa_int()) {
        return false;
      }
      return cnt_val->bottom_type()->is_int()->get_con() >= 32;
    }

    default:
      return false;
  }
}
|
||||||
|
|
||||||
|
// Returns true if the shape of 'n' shows that every long lane it produces is
// the sign extension of a 32-bit value. Two shapes are recognized:
//   (VectorCastI2X SRC)  whose result element type is T_LONG
//   (RShiftVL SRC S)     where S is an RShiftCntV of an int constant >= 32
static bool has_vector_elements_fit_int(Node* n) {
  const int opc = n->Opcode();

  // Int-to-long vector cast.
  if (opc == Op_VectorCastI2X) {
    return Matcher::vector_element_basic_type(n) == T_LONG;
  }

  // Arithmetic right shift by a constant count of at least 32.
  if (opc != Op_RShiftVL) {
    return false;
  }
  const Node* shift_cnt = n->in(2);
  if (shift_cnt->Opcode() != Op_RShiftCntV) {
    return false;
  }
  const Node* cnt_val = shift_cnt->in(1);
  if (!cnt_val->is_Con() || !cnt_val->bottom_type()->isa_int()) {
    return false;
  }
  return cnt_val->bottom_type()->is_int()->get_con() >= 32;
}
|
||||||
|
|
||||||
|
bool MulVLNode::has_int_inputs() const {
|
||||||
|
return has_vector_elements_fit_int(in(1)) &&
|
||||||
|
has_vector_elements_fit_int(in(2));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool MulVLNode::has_uint_inputs() const {
|
||||||
|
return has_vector_elements_fit_uint(in(1)) &&
|
||||||
|
has_vector_elements_fit_uint(in(2));
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef PRODUCT
|
#ifndef PRODUCT
|
||||||
void VectorBoxAllocateNode::dump_spec(outputStream *st) const {
|
void VectorBoxAllocateNode::dump_spec(outputStream *st) const {
|
||||||
|
@ -441,8 +441,12 @@ class MulVINode : public VectorNode {
|
|||||||
// Vector multiply long
|
// Vector multiply long
|
||||||
class MulVLNode : public VectorNode {
|
class MulVLNode : public VectorNode {
|
||||||
public:
|
public:
|
||||||
MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
|
MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {
|
||||||
|
init_class_id(Class_MulVL);
|
||||||
|
}
|
||||||
virtual int Opcode() const;
|
virtual int Opcode() const;
|
||||||
|
bool has_int_inputs() const;
|
||||||
|
bool has_uint_inputs() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
//------------------------------MulVFNode--------------------------------------
|
//------------------------------MulVFNode--------------------------------------
|
||||||
|
249
test/hotspot/jtreg/compiler/vectorapi/VectorMultiplyOpt.java
Normal file
249
test/hotspot/jtreg/compiler/vectorapi/VectorMultiplyOpt.java
Normal file
@ -0,0 +1,249 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package compiler.vectorapi;
|
||||||
|
|
||||||
|
import jdk.incubator.vector.*;
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
import compiler.lib.ir_framework.*;
|
||||||
|
import java.lang.reflect.Array;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @test
|
||||||
|
* @bug 8341137
|
||||||
|
* @summary Optimize long vector multiplication using x86 VPMUL[U]DQ instruction.
|
||||||
|
* @modules jdk.incubator.vector
|
||||||
|
* @library /test/lib /
|
||||||
|
* @run driver compiler.vectorapi.VectorMultiplyOpt
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class VectorMultiplyOpt {
|
||||||
|
|
||||||
|
public static int[] isrc1;
|
||||||
|
public static int[] isrc2;
|
||||||
|
public static long[] lsrc1;
|
||||||
|
public static long[] lsrc2;
|
||||||
|
public static long[] res;
|
||||||
|
|
||||||
|
public static final int SIZE = 1024;
|
||||||
|
public static final Random r = jdk.test.lib.Utils.getRandomInstance();
|
||||||
|
public static final VectorSpecies<Long> LSP = LongVector.SPECIES_PREFERRED;
|
||||||
|
public static final VectorSpecies<Integer> ISP = IntVector.SPECIES_PREFERRED;
|
||||||
|
|
||||||
|
public static final long mask1 = r.nextLong(0xFFFFFFFFL);
|
||||||
|
public static final long mask2 = r.nextLong(0xFFFFFFFFL);
|
||||||
|
public static final long mask3 = r.nextLong(0xFFFFFFFFL);
|
||||||
|
public static final long mask4 = r.nextLong(0xFFFFFFFFL);
|
||||||
|
public static final long mask5 = r.nextLong(0xFFFFFFFFL);
|
||||||
|
public static final long mask6 = r.nextLong(0xFFFFFFFFL);
|
||||||
|
|
||||||
|
public static final int shift1 = r.nextInt(32) + 32;
|
||||||
|
public static final int shift2 = r.nextInt(32) + 32;
|
||||||
|
public static final int shift3 = r.nextInt(32) + 32;
|
||||||
|
public static final int shift4 = r.nextInt(32) + 32;
|
||||||
|
public static final int shift5 = r.nextInt(32) + 32;
|
||||||
|
|
||||||
|
public VectorMultiplyOpt() {
|
||||||
|
lsrc1 = new long[SIZE];
|
||||||
|
lsrc2 = new long[SIZE];
|
||||||
|
res = new long[SIZE];
|
||||||
|
isrc1 = new int[SIZE + 16];
|
||||||
|
isrc2 = new int[SIZE + 16];
|
||||||
|
IntStream.range(0, SIZE).forEach(i -> { lsrc1[i] = Long.MAX_VALUE * r.nextLong(); });
|
||||||
|
IntStream.range(0, SIZE).forEach(i -> { lsrc2[i] = Long.MAX_VALUE * r.nextLong(); });
|
||||||
|
IntStream.range(0, SIZE).forEach(i -> { isrc1[i] = Integer.MAX_VALUE * r.nextInt(); });
|
||||||
|
IntStream.range(0, SIZE).forEach(i -> { isrc2[i] = Integer.MAX_VALUE * r.nextInt(); });
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
TestFramework testFramework = new TestFramework();
|
||||||
|
testFramework.setDefaultWarmup(5000)
|
||||||
|
.addFlags("--add-modules=jdk.incubator.vector")
|
||||||
|
.start();
|
||||||
|
System.out.println("PASSED");
|
||||||
|
}
|
||||||
|
|
||||||
|
interface Validator {
|
||||||
|
public long apply(long src1, long src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void validate(String msg, long[] actual, Object src1, Object src2, Validator func) {
|
||||||
|
for (int i = 0; i < actual.length; i++) {
|
||||||
|
long expected;
|
||||||
|
if (long[].class == src1.getClass()) {
|
||||||
|
expected = func.apply(Array.getLong(src1, i), Array.getLong(src2, i));
|
||||||
|
} else {
|
||||||
|
assert int[].class == src1.getClass();
|
||||||
|
expected = func.apply(Array.getInt(src1, i), Array.getInt(src2, i));
|
||||||
|
}
|
||||||
|
if (actual[i] != expected) {
|
||||||
|
throw new AssertionError(msg + "index " + i + ": src1 = " + Array.get(src1, i) + " src2 = " +
|
||||||
|
Array.get(src2, i) + " actual = " + actual[i] + " expected = " + expected);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.AND_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"})
|
||||||
|
@IR(counts = {"vmuludq", " >0 "}, phase = CompilePhase.FINAL_CODE, applyIfCPUFeature = {"avx", "true"})
|
||||||
|
@Warmup(value = 10000)
|
||||||
|
public static void test_pattern1() {
|
||||||
|
int i = 0;
|
||||||
|
for (; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.AND, mask1)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.AND, mask1))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
for (; i < res.length; i++) {
|
||||||
|
res[i] = (lsrc1[i] & mask1) * (lsrc2[i] & mask1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Check(test = "test_pattern1")
|
||||||
|
public void test_pattern1_validate() {
|
||||||
|
validate("pattern1 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 & mask1) * (l2 & mask1));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.AND_VL, " >0 ", IRNode.URSHIFT_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"})
|
||||||
|
@IR(counts = {"vmuludq", " >0 "}, phase = CompilePhase.FINAL_CODE, applyIfCPUFeature = {"avx", "true"})
|
||||||
|
@Warmup(value = 10000)
|
||||||
|
public static void test_pattern2() {
|
||||||
|
int i = 0;
|
||||||
|
for (; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.AND, mask2)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.LSHR, shift1))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
for (; i < res.length; i++) {
|
||||||
|
res[i] = (lsrc1[i] & mask2) * (lsrc2[i] >>> shift1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Check(test = "test_pattern2")
|
||||||
|
public void test_pattern2_validate() {
|
||||||
|
validate("pattern2 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 & mask2) * (l2 >>> shift1));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.URSHIFT_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"})
|
||||||
|
@IR(counts = {"vmuludq", " >0 "}, phase = CompilePhase.FINAL_CODE, applyIfCPUFeature = {"avx", "true"})
|
||||||
|
@Warmup(value = 10000)
|
||||||
|
public static void test_pattern3() {
|
||||||
|
int i = 0;
|
||||||
|
for (; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.LSHR, shift2)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.LSHR, shift3))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
for (; i < res.length; i++) {
|
||||||
|
res[i] = (lsrc1[i] >>> shift2) * (lsrc2[i] >>> shift3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Check(test = "test_pattern3")
|
||||||
|
public void test_pattern3_validate() {
|
||||||
|
validate("pattern3 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 >>> shift2) * (l2 >>> shift3));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.URSHIFT_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"})
|
||||||
|
@IR(counts = {"vmuludq", " >0 "}, applyIfCPUFeature = {"avx", "true"}, phase = CompilePhase.FINAL_CODE)
|
||||||
|
@Warmup(value = 10000)
|
||||||
|
public static void test_pattern4() {
|
||||||
|
int i = 0;
|
||||||
|
for (; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.LSHR, shift4)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.AND, mask4))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
for (; i < res.length; i++) {
|
||||||
|
res[i] = (lsrc1[i] >>> shift4) * (lsrc2[i] & mask4);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Check(test = "test_pattern4")
|
||||||
|
public void test_pattern4_validate() {
|
||||||
|
validate("pattern4 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 >>> shift4) * (l2 & mask4));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.VECTOR_CAST_I2L, " >0 "}, applyIfCPUFeature = {"avx", "true"})
|
||||||
|
@IR(counts = {"vmuldq", " >0 "}, applyIfCPUFeature = {"avx", "true"}, phase = CompilePhase.FINAL_CODE)
|
||||||
|
@Warmup(value = 10000)
|
||||||
|
public static void test_pattern5() {
|
||||||
|
int i = 0;
|
||||||
|
for (; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = IntVector.fromArray(ISP, isrc1, i)
|
||||||
|
.convert(VectorOperators.I2L, 0)
|
||||||
|
.reinterpretAsLongs();
|
||||||
|
LongVector vsrc2 = IntVector.fromArray(ISP, isrc2, i)
|
||||||
|
.convert(VectorOperators.I2L, 0)
|
||||||
|
.reinterpretAsLongs();
|
||||||
|
vsrc1.lanewise(VectorOperators.MUL, vsrc2).intoArray(res, i);
|
||||||
|
}
|
||||||
|
for (; i < res.length; i++) {
|
||||||
|
res[i] = Math.multiplyFull(isrc1[i], isrc2[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Check(test = "test_pattern5")
|
||||||
|
public void test_pattern5_validate() {
|
||||||
|
validate("pattern5 ", res, isrc1, isrc2, (i1, i2) -> Math.multiplyFull((int)i1, (int)i2));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.MUL_VL, " >0 ", IRNode.RSHIFT_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"})
|
||||||
|
@IR(counts = {"vmuldq", " >0 "}, applyIfCPUFeature = {"avx", "true"}, phase = CompilePhase.FINAL_CODE)
|
||||||
|
@Warmup(value = 10000)
|
||||||
|
public static void test_pattern6() {
|
||||||
|
int i = 0;
|
||||||
|
for (; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.ASHR, shift5)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.ASHR, shift5))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
for (; i < res.length; i++) {
|
||||||
|
res[i] = (lsrc1[i] >> shift5) * (lsrc2[i] >> shift5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Check(test = "test_pattern6")
|
||||||
|
public void test_pattern6_validate() {
|
||||||
|
validate("pattern6 ", res, lsrc1, lsrc2, (l1, l2) -> (l1 >> shift5) * (l2 >> shift5));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,125 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
package org.openjdk.bench.jdk.incubator.vector;
|
||||||
|
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import org.openjdk.jmh.annotations.*;
|
||||||
|
import jdk.incubator.vector.*;
|
||||||
|
import java.util.stream.*;
|
||||||
|
|
||||||
|
@BenchmarkMode(Mode.Throughput)
|
||||||
|
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||||
|
@State(Scope.Benchmark)
|
||||||
|
@Warmup(iterations = 3, time = 1)
|
||||||
|
@Measurement(iterations = 5, time = 1)
|
||||||
|
@Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
|
||||||
|
public class VectorMultiplyOptBenchmark {
|
||||||
|
@Param({"1024", "2048", "4096"})
|
||||||
|
private int SIZE;
|
||||||
|
private int [] isrc1;
|
||||||
|
private int [] isrc2;
|
||||||
|
private long [] lsrc1;
|
||||||
|
private long [] lsrc2;
|
||||||
|
private long [] res;
|
||||||
|
|
||||||
|
private static final VectorSpecies<Long> LSP = LongVector.SPECIES_PREFERRED;
|
||||||
|
private static final VectorSpecies<Integer> ISP = IntVector.SPECIES_PREFERRED;
|
||||||
|
|
||||||
|
@Setup(Level.Trial)
|
||||||
|
public void Setup() {
|
||||||
|
lsrc1 = LongStream.range(Long.MAX_VALUE - SIZE, Long.MAX_VALUE).toArray();
|
||||||
|
lsrc2 = LongStream.range(0, SIZE).toArray();
|
||||||
|
isrc1 = IntStream.range(Integer.MAX_VALUE - 2 * SIZE, Integer.MAX_VALUE).toArray();
|
||||||
|
isrc2 = IntStream.range(0, 2 * SIZE).toArray();
|
||||||
|
res = new long[SIZE];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void test_bm_pattern1() {
|
||||||
|
for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.AND, 0xFFFFFFFFL)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.AND, 0xFFFFFFFFL))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void test_bm_pattern2() {
|
||||||
|
for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.AND, 0xFFFFFFFFL)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.LSHR, 32))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void test_bm_pattern3() {
|
||||||
|
for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.LSHR, 32)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.LSHR, 32))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void test_bm_pattern4() {
|
||||||
|
for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.LSHR, 32)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.AND, 0xFFFFFFFFL))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void test_bm_pattern5() {
|
||||||
|
for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = IntVector.fromArray(ISP, isrc1, i)
|
||||||
|
.convert(VectorOperators.I2L, 0)
|
||||||
|
.reinterpretAsLongs();
|
||||||
|
LongVector vsrc2 = IntVector.fromArray(ISP, isrc2, i)
|
||||||
|
.convert(VectorOperators.I2L, 0)
|
||||||
|
.reinterpretAsLongs();
|
||||||
|
vsrc1.lanewise(VectorOperators.MUL, vsrc2).intoArray(res, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void test_bm_pattern6() {
|
||||||
|
for (int i = 0; i < LSP.loopBound(res.length); i += LSP.length()) {
|
||||||
|
LongVector vsrc1 = LongVector.fromArray(LSP, lsrc1, i);
|
||||||
|
LongVector vsrc2 = LongVector.fromArray(LSP, lsrc2, i);
|
||||||
|
vsrc1.lanewise(VectorOperators.ASHR, 32)
|
||||||
|
.lanewise(VectorOperators.MUL, vsrc2.lanewise(VectorOperators.ASHR, 32))
|
||||||
|
.intoArray(res, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,85 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
package org.openjdk.bench.jdk.incubator.vector;
|
||||||
|
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import org.openjdk.jmh.annotations.*;
|
||||||
|
import jdk.incubator.vector.*;
|
||||||
|
import java.util.stream.*;
|
||||||
|
|
||||||
|
@BenchmarkMode(Mode.Throughput)
|
||||||
|
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||||
|
@State(Scope.Benchmark)
|
||||||
|
@Warmup(iterations = 3, time = 1)
|
||||||
|
@Measurement(iterations = 5, time = 1)
|
||||||
|
@Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
|
||||||
|
public class VectorXXH3HashingBenchmark {
|
||||||
|
@Param({"1024", "2048", "4096", "8192"})
|
||||||
|
private int SIZE;
|
||||||
|
private long [] accumulators;
|
||||||
|
private byte [] input;
|
||||||
|
private byte [] SECRET;
|
||||||
|
|
||||||
|
private static final VectorShuffle<Long> LONG_SHUFFLE_PREFERRED = VectorShuffle.fromOp(LongVector.SPECIES_PREFERRED, i -> i ^ 1);
|
||||||
|
|
||||||
|
@Setup(Level.Trial)
|
||||||
|
public void Setup() {
|
||||||
|
accumulators = new long[SIZE];
|
||||||
|
input = new byte[SIZE * 8];
|
||||||
|
SECRET = new byte[SIZE*8];
|
||||||
|
IntStream.range(0, SIZE*8).forEach(
|
||||||
|
i -> {
|
||||||
|
input[i] = (byte)i;
|
||||||
|
SECRET[i] = (byte)-i;
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void hashingKernel() {
|
||||||
|
for (int block = 0; block < input.length / 1024; block++) {
|
||||||
|
for (int stripe = 0; stripe < 16; stripe++) {
|
||||||
|
int inputOffset = block * 1024 + stripe * 64;
|
||||||
|
int secretOffset = stripe * 8;
|
||||||
|
|
||||||
|
for (int i = 0; i < 8; i += LongVector.SPECIES_PREFERRED.length()) {
|
||||||
|
LongVector accumulatorsVector = LongVector.fromArray(LongVector.SPECIES_PREFERRED, accumulators, i);
|
||||||
|
LongVector inputVector = ByteVector.fromArray(ByteVector.SPECIES_PREFERRED, input, inputOffset + i * 8).reinterpretAsLongs();
|
||||||
|
LongVector secretVector = ByteVector.fromArray(ByteVector.SPECIES_PREFERRED, SECRET, secretOffset + i * 8).reinterpretAsLongs();
|
||||||
|
|
||||||
|
LongVector key = inputVector
|
||||||
|
.lanewise(VectorOperators.XOR, secretVector)
|
||||||
|
.reinterpretAsLongs();
|
||||||
|
|
||||||
|
LongVector low = key.and(0xFFFF_FFFFL);
|
||||||
|
LongVector high = key.lanewise(VectorOperators.LSHR, 32);
|
||||||
|
|
||||||
|
accumulatorsVector
|
||||||
|
.add(inputVector.rearrange(LONG_SHUFFLE_PREFERRED))
|
||||||
|
.add(high.mul(low))
|
||||||
|
.intoArray(accumulators, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user