8289186: Support predicated vector load/store operations over X86 AVX2 targets.

Reviewed-by: xgong, kvn
This commit is contained in:
Jatin Bhateja 2022-07-09 15:13:25 +00:00
parent 3c08e6b311
commit 81ee7d28f8
8 changed files with 291 additions and 40 deletions

View File

@ -3038,6 +3038,60 @@ void Assembler::vmovdqu(Address dst, XMMRegister src) {
emit_operand(src, dst);
}
// Emits the VEX-encoded masked doubleword load (66 0F38 map, W0, opcode 0x8C)
// from memory operand `src` into `dst`. The `mask` register is passed as the
// second (nds) source of the VEX prefix. Only asserted valid for AVX2 with a
// 256-bit vector length.
void Assembler::vpmaskmovd(XMMRegister dst, XMMRegister mask, Address src, int vector_len) {
  assert((VM_Version::supports_avx2() && vector_len == AVX_256bit), "");
  InstructionMark inst_mark_scope(this);
  // Encoding attributes, spelled out by name instead of inline comments.
  const bool vex_w       = false;
  const bool legacy_mode = true;
  const bool no_mask_reg = false;
  const bool uses_vl     = false;
  InstructionAttr attr(vector_len, vex_w, legacy_mode, no_mask_reg, uses_vl);
  vex_prefix(src, mask->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attr);
  emit_int8(static_cast<unsigned char>(0x8C));
  emit_operand(dst, src);
}
// Emits the VEX-encoded masked quadword load (66 0F38 map, W1, opcode 0x8C)
// from memory operand `src` into `dst`. It differs from the doubleword form
// only in VEX.W. The `mask` register is passed as the second (nds) source of
// the VEX prefix. Only asserted valid for AVX2 with a 256-bit vector length.
void Assembler::vpmaskmovq(XMMRegister dst, XMMRegister mask, Address src, int vector_len) {
  assert((VM_Version::supports_avx2() && vector_len == AVX_256bit), "");
  InstructionMark inst_mark_scope(this);
  // Encoding attributes, spelled out by name instead of inline comments.
  const bool vex_w       = true;
  const bool legacy_mode = true;
  const bool no_mask_reg = false;
  const bool uses_vl     = false;
  InstructionAttr attr(vector_len, vex_w, legacy_mode, no_mask_reg, uses_vl);
  vex_prefix(src, mask->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attr);
  emit_int8(static_cast<unsigned char>(0x8C));
  emit_operand(dst, src);
}
// Load form of VMASKMOVPS (66 0F38 map, W0, opcode 0x2C): masked load of
// packed single-precision lanes from `src` into `dst`. `mask` is passed as the
// second (nds) source of the VEX prefix. Needs any level of AVX.
void Assembler::vmaskmovps(XMMRegister dst, Address src, XMMRegister mask, int vector_len) {
  assert(UseAVX > 0, "requires some form of AVX");
  InstructionMark inst_mark_scope(this);
  // Encoding attributes, spelled out by name instead of inline comments.
  const bool vex_w       = false;
  const bool legacy_mode = true;
  const bool no_mask_reg = true;
  const bool uses_vl     = false;
  InstructionAttr attr(vector_len, vex_w, legacy_mode, no_mask_reg, uses_vl);
  vex_prefix(src, mask->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attr);
  emit_int8(0x2C);
  emit_operand(dst, src);
}
// Load form of VMASKMOVPD (66 0F38 map, W0, opcode 0x2D): masked load of
// packed double-precision lanes from `src` into `dst`. `mask` is passed as the
// second (nds) source of the VEX prefix. Needs any level of AVX.
void Assembler::vmaskmovpd(XMMRegister dst, Address src, XMMRegister mask, int vector_len) {
  assert(UseAVX > 0, "requires some form of AVX");
  InstructionMark inst_mark_scope(this);
  // Encoding attributes, spelled out by name instead of inline comments.
  const bool vex_w       = false;
  const bool legacy_mode = true;
  const bool no_mask_reg = true;
  const bool uses_vl     = false;
  InstructionAttr attr(vector_len, vex_w, legacy_mode, no_mask_reg, uses_vl);
  vex_prefix(src, mask->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attr);
  emit_int8(0x2D);
  emit_operand(dst, src);
}
// Store form of VMASKMOVPS (66 0F38 map, W0, opcode 0x2E): masked store of
// packed single-precision lanes from `src` to memory operand `dst`. `mask` is
// passed as the second (nds) source of the VEX prefix. Needs any level of AVX.
void Assembler::vmaskmovps(Address dst, XMMRegister src, XMMRegister mask, int vector_len) {
  // Same diagnostic text as the load overload, so a failing assert is self-explanatory.
  assert(UseAVX > 0, "requires some form of AVX");
  InstructionMark im(this);
  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
  // The memory operand is the r/m field; the stored register goes in the reg field.
  vex_prefix(dst, mask->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  emit_int8(0x2E);
  emit_operand(src, dst);
}
// Store form of VMASKMOVPD (66 0F38 map, W0, opcode 0x2F): masked store of
// packed double-precision lanes from `src` to memory operand `dst`. `mask` is
// passed as the second (nds) source of the VEX prefix. Needs any level of AVX.
void Assembler::vmaskmovpd(Address dst, XMMRegister src, XMMRegister mask, int vector_len) {
  // Same diagnostic text as the load overload, so a failing assert is self-explanatory.
  assert(UseAVX > 0, "requires some form of AVX");
  InstructionMark im(this);
  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
  // The memory operand is the r/m field; the stored register goes in the reg field.
  vex_prefix(dst, mask->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  emit_int8(0x2F);
  emit_operand(src, dst);
}
// Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
void Assembler::evmovdqub(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512vlbw(), "");
@ -4394,14 +4448,6 @@ void Assembler::vmovmskpd(Register dst, XMMRegister src, int vec_enc) {
emit_int16(0x50, (0xC0 | encode));
}
// Emits the VEX-encoded masked doubleword load (66 0F38 map, W0, opcode 0x8C)
// from memory operand `src` into `dst`; `nds` is the second source register
// encoded in the VEX prefix. Only asserted valid for AVX2 with a 256-bit
// vector length.
void Assembler::vpmaskmovd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
assert((VM_Version::supports_avx2() && vector_len == AVX_256bit), "");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
// Opcode byte, followed by the ModRM/SIB/displacement for (dst, src).
emit_int8((unsigned char)0x8C);
emit_operand(dst, src);
}
void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");

View File

@ -1804,6 +1804,13 @@ private:
void vmovmskps(Register dst, XMMRegister src, int vec_enc);
void vmovmskpd(Register dst, XMMRegister src, int vec_enc);
// AVX2 masked integer loads (32-bit / 64-bit elements); `mask` selects the lanes to load.
void vpmaskmovd(XMMRegister dst, XMMRegister mask, Address src, int vector_len);
void vpmaskmovq(XMMRegister dst, XMMRegister mask, Address src, int vector_len);
// AVX masked floating-point loads (packed single / packed double).
void vmaskmovps(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
void vmaskmovpd(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
// AVX masked floating-point stores (packed single / packed double).
void vmaskmovps(Address dst, XMMRegister src, XMMRegister mask, int vector_len);
void vmaskmovpd(Address dst, XMMRegister src, XMMRegister mask, int vector_len);
// SSE 4.1 extract
void pextrd(Register dst, XMMRegister src, int imm8);

View File

@ -2025,6 +2025,39 @@ void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, X
MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
// Masked vector load from `src` into `dst` under a vector-register `mask`.
// 32-bit element types (T_INT, T_FLOAT) go through vmaskmovps and 64-bit
// element types (T_LONG, T_DOUBLE) through vmaskmovpd; any other element type
// is a fatal error.
void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
                                 int vec_enc) {
  const bool is_32bit_elem = (elem_bt == T_INT) || (elem_bt == T_FLOAT);
  if (is_32bit_elem) {
    vmaskmovps(dst, src, mask, vec_enc);
    return;
  }
  const bool is_64bit_elem = (elem_bt == T_LONG) || (elem_bt == T_DOUBLE);
  if (is_64bit_elem) {
    vmaskmovpd(dst, src, mask, vec_enc);
    return;
  }
  fatal("Unsupported type %s", type2name(elem_bt));
}
// Masked vector store of `src` to `dst` under a vector-register `mask`.
// 32-bit element types (T_INT, T_FLOAT) go through vmaskmovps and 64-bit
// element types (T_LONG, T_DOUBLE) through vmaskmovpd; any other element type
// is a fatal error.
void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
                                 int vec_enc) {
  const bool is_32bit_elem = (elem_bt == T_INT) || (elem_bt == T_FLOAT);
  if (is_32bit_elem) {
    vmaskmovps(dst, src, mask, vec_enc);
    return;
  }
  const bool is_64bit_elem = (elem_bt == T_LONG) || (elem_bt == T_DOUBLE);
  if (is_64bit_elem) {
    vmaskmovpd(dst, src, mask, vec_enc);
    return;
  }
  fatal("Unsupported type %s", type2name(elem_bt));
}
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
XMMRegister dst, XMMRegister src,

View File

@ -442,4 +442,9 @@ public:
void vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
KRegister ktmp1, int vec_enc);
// Masked vector load (memory -> register) with the mask held in a vector register.
// Supports T_INT/T_FLOAT and T_LONG/T_DOUBLE element types; other types are fatal.
void vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, int vec_enc);
// Masked vector store (register -> memory) with the mask held in a vector register.
// Supports T_INT/T_FLOAT and T_LONG/T_DOUBLE element types; other types are fatal.
void vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, int vec_enc);
#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP

View File

@ -1589,8 +1589,6 @@ const bool Matcher::match_rule_supported(int opcode) {
case Op_VectorCmpMasked:
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
if (!is_LP64 || UseAVX < 3 || !VM_Version::supports_bmi2()) {
return false;
}
@ -1753,8 +1751,6 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
case Op_ClearArray:
case Op_VectorMaskGen:
case Op_VectorCmpMasked:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
if (!is_LP64 || !VM_Version::supports_avx512bw()) {
return false;
}
@ -1762,6 +1758,12 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false;
}
break;
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
return false;
}
break;
case Op_CMoveVD:
if (vlen != 4) {
return false; // implementation limitation (only vcmov4D_reg is present)
@ -9082,9 +9084,59 @@ instruct vprorate(vec dst, vec src, vec shift) %{
ins_pipe( pipe_slow );
%}
#ifdef _LP64
// ---------------------------------- Masked Operations ------------------------------------
// Masked vector load with the mask in a vector register. Matched only when
// input 3 of the LoadVectorMasked node does not have a vectmask bottom type.
instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
predicate(!n->in(3)->bottom_type()->isa_vectmask());
match(Set dst (LoadVectorMasked mem mask));
format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
ins_encode %{
// Element type and vector length encoding come from this node's own vector type.
BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
int vlen_enc = vector_length_encoding(this);
__ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
// Masked vector load with the mask in a kReg operand. Matched only when
// input 3 of the LoadVectorMasked node has a vectmask bottom type.
instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
predicate(n->in(3)->bottom_type()->isa_vectmask());
match(Set dst (LoadVectorMasked mem mask));
format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
ins_encode %{
// Element type and vector length encoding come from this node's own vector type.
BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
int vector_len = vector_length_encoding(this);
// NOTE(review): the literal `false` is presumably evmovdqu's merge flag - confirm against its declaration.
__ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
%}
ins_pipe( pipe_slow );
%}
// Masked vector store with the mask in a vector register. Matched only when
// the second child of the Binary node at input 3 (the mask in the match rule)
// does not have a vectmask bottom type.
instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
match(Set mem (StoreVectorMasked mem (Binary src mask)));
format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
ins_encode %{
// Element type and vector length encoding are read from the node feeding $src.
const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
int vlen_enc = vector_length_encoding(src_node);
BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
__ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
// Masked vector store with the mask in a kReg operand. Matched only when the
// second child of the Binary node at input 3 (the mask in the match rule) has
// a vectmask bottom type.
instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
match(Set mem (StoreVectorMasked mem (Binary src mask)));
format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
ins_encode %{
// Element type and vector length encoding are read from the node feeding $src.
const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
int vlen_enc = vector_length_encoding(src_node);
// NOTE(review): the literal `true` is presumably evmovdqu's merge flag - confirm against its declaration.
__ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
#ifdef _LP64
instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
@ -9111,17 +9163,6 @@ instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kR
%}
// Masked vector load with the mask in a kReg operand; this rule has no
// predicate, so it applies to every LoadVectorMasked with these operand classes.
instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
match(Set dst (LoadVectorMasked mem mask));
format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
ins_encode %{
// Element type and vector length encoding come from this node's own vector type.
BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
int vector_len = vector_length_encoding(this);
__ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
match(Set dst (VectorMaskGen len));
effect(TEMP temp);
@ -9143,18 +9184,6 @@ instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
ins_pipe( pipe_slow );
%}
// Masked vector store with the mask in a kReg operand; this rule has no
// predicate, so it applies to every StoreVectorMasked with these operand classes.
instruct vmasked_store64(memory mem, vec src, kReg mask) %{
match(Set mem (StoreVectorMasked mem (Binary src mask)));
format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
ins_encode %{
// Element type and vector length encoding are read from the node feeding $src.
const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
int vector_len = vector_length_encoding(src_node);
__ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
predicate(n->in(1)->bottom_type()->isa_vectmask());
match(Set dst (VectorMaskToLong mask));

View File

@ -1217,8 +1217,7 @@ bool LibraryCallKit::inline_vector_mem_masked_operation(bool is_store) {
int mem_num_elem = using_byte_array ? num_elem * type2aelembytes(elem_bt) : num_elem;
BasicType mem_elem_bt = using_byte_array ? T_BYTE : elem_bt;
bool supports_predicate = arch_supports_vector(is_store ? Op_StoreVectorMasked : Op_LoadVectorMasked,
mem_num_elem, mem_elem_bt,
(VectorMaskUseType) (VecMaskUseLoad | VecMaskUsePred));
mem_num_elem, mem_elem_bt, VecMaskUseLoad);
// If current arch does not support the predicated operations, we have to bail
// out when current case uses the predicate feature.

View File

@ -922,7 +922,6 @@ class StoreVectorMaskedNode : public StoreVectorNode {
public:
StoreVectorMaskedNode(Node* c, Node* mem, Node* dst, Node* src, const TypePtr* at, Node* mask)
: StoreVectorNode(c, mem, dst, at, src) {
assert(mask->bottom_type()->isa_vectmask(), "sanity");
init_class_id(Class_StoreVectorMasked);
set_mismatched_access();
add_req(mask);
@ -943,7 +942,6 @@ class LoadVectorMaskedNode : public LoadVectorNode {
LoadVectorMaskedNode(Node* c, Node* mem, Node* src, const TypePtr* at, const TypeVect* vt, Node* mask,
ControlDependency control_dependency = LoadNode::DependsOnlyOnTest)
: LoadVectorNode(c, mem, src, at, vt, control_dependency) {
assert(mask->bottom_type()->isa_vectmask(), "sanity");
init_class_id(Class_LoadVectorMasked);
set_mismatched_access();
add_req(mask);

View File

@ -0,0 +1,134 @@
/*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.jdk.incubator.vector;
import java.util.concurrent.TimeUnit;
import jdk.incubator.vector.*;
import org.openjdk.jmh.annotations.*;
/**
 * JMH throughput benchmark for masked vector stores into an array that is
 * shorter than the span the loop walks over.
 *
 * <p>With the default parameters the input arrays hold 1024 elements and the
 * output arrays 1022. Each benchmark walks the input in steps of the species
 * length and stores with a mask that disables the two highest lanes, so the
 * final iteration's vector reaches past the end of the output array while all
 * enabled lanes stay in bounds.
 */
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
public class StoreMaskedIOOBEBenchmark {
    /** Number of elements in every input array. */
    @Param({"1024"})
    private int inSize;

    /** Number of elements in every output array. */
    @Param({"1022"})
    private int outSize;

    private byte[] byteIn;
    private byte[] byteOut;
    private short[] shortIn;
    private short[] shortOut;
    private int[] intIn;
    private int[] intOut;
    private long[] longIn;
    private long[] longOut;
    private float[] floatIn;
    private float[] floatOut;
    private double[] doubleIn;
    private double[] doubleOut;

    private static final VectorSpecies<Byte> bspecies = VectorSpecies.ofLargestShape(byte.class);
    private static final VectorSpecies<Short> sspecies = VectorSpecies.ofLargestShape(short.class);
    private static final VectorSpecies<Integer> ispecies = VectorSpecies.ofLargestShape(int.class);
    private static final VectorSpecies<Long> lspecies = VectorSpecies.ofLargestShape(long.class);
    private static final VectorSpecies<Float> fspecies = VectorSpecies.ofLargestShape(float.class);
    private static final VectorSpecies<Double> dspecies = VectorSpecies.ofLargestShape(double.class);

    /**
     * Builds a mask with every lane set except the two highest ones.
     *
     * <p>The shift is done on a {@code long}: an {@code int} shift uses only
     * the low five bits of the distance, so a 64-lane species would get
     * {@code 1 << 30} instead of {@code 1L << 62} and enable too few lanes.
     */
    private static <E> VectorMask<E> allButLastTwoLanes(VectorSpecies<E> species) {
        return VectorMask.fromLong(species, (1L << (species.length() - 2)) - 1);
    }

    /** Allocates all arrays and fills the inputs with their own index values. */
    @Setup(Level.Trial)
    public void Setup() {
        byteIn = new byte[inSize];
        byteOut = new byte[outSize];
        shortIn = new short[inSize];
        shortOut = new short[outSize];
        intIn = new int[inSize];
        intOut = new int[outSize];
        longIn = new long[inSize];
        longOut = new long[outSize];
        floatIn = new float[inSize];
        floatOut = new float[outSize];
        doubleIn = new double[inSize];
        doubleOut = new double[outSize];

        for (int i = 0; i < inSize; i++) {
            byteIn[i] = (byte) i;
            shortIn[i] = (short) i;
            intIn[i] = i;
            longIn[i] = i;
            floatIn[i] = (float) i;
            doubleIn[i] = (double) i;
        }
    }

    /** Byte variant; unlike the other variants, the load is masked as well as the store. */
    @Benchmark
    public void byteStoreArrayMaskIOOBE() {
        VectorMask<Byte> mask = allButLastTwoLanes(bspecies);
        for (int i = 0; i < inSize; i += bspecies.length()) {
            ByteVector.fromArray(bspecies, byteIn, i, mask).intoArray(byteOut, i, mask);
        }
    }

    /** Short variant: unmasked load, masked store. */
    @Benchmark
    public void shortStoreArrayMaskIOOBE() {
        VectorMask<Short> mask = allButLastTwoLanes(sspecies);
        for (int i = 0; i < inSize; i += sspecies.length()) {
            ShortVector.fromArray(sspecies, shortIn, i).intoArray(shortOut, i, mask);
        }
    }

    /** Int variant: unmasked load, masked store. */
    @Benchmark
    public void intStoreArrayMaskIOOBE() {
        VectorMask<Integer> mask = allButLastTwoLanes(ispecies);
        for (int i = 0; i < inSize; i += ispecies.length()) {
            IntVector.fromArray(ispecies, intIn, i).intoArray(intOut, i, mask);
        }
    }

    /** Long variant: unmasked load, masked store. */
    @Benchmark
    public void longStoreArrayMaskIOOBE() {
        VectorMask<Long> mask = allButLastTwoLanes(lspecies);
        for (int i = 0; i < inSize; i += lspecies.length()) {
            LongVector.fromArray(lspecies, longIn, i).intoArray(longOut, i, mask);
        }
    }

    /** Float variant: unmasked load, masked store. */
    @Benchmark
    public void floatStoreArrayMaskIOOBE() {
        VectorMask<Float> mask = allButLastTwoLanes(fspecies);
        for (int i = 0; i < inSize; i += fspecies.length()) {
            FloatVector.fromArray(fspecies, floatIn, i).intoArray(floatOut, i, mask);
        }
    }

    /** Double variant: unmasked load, masked store. */
    @Benchmark
    public void doubleStoreArrayMaskIOOBE() {
        VectorMask<Double> mask = allButLastTwoLanes(dspecies);
        for (int i = 0; i < inSize; i += dspecies.length()) {
            DoubleVector.fromArray(dspecies, doubleIn, i).intoArray(doubleOut, i, mask);
        }
    }
}