diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 58c4428bf73..e361f69b6c9 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -5121,6 +5121,18 @@ void Assembler::pshufb(XMMRegister dst, XMMRegister src) { emit_int16(0x00, (0xC0 | encode)); } +void Assembler::evpshufb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { /* Opmask-predicated register-to-register byte shuffle: same 0x00 opcode byte as pshufb above, encoded with the 66 prefix in the 0F 38 map, flagged as an EVEX instruction with `mask` as its embedded opmask. */ + assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), ""); /* AVX512BW always; AVX512VL as well unless vector_len is AVX_512bit */ + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { /* NOTE(review): presumably selects merge-masking (dst bytes whose mask bit is clear are kept rather than zeroed) - confirm against InstructionAttr */ + attributes.reset_is_clear_context(); + } + int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x00, (0xC0 | encode)); +} + void Assembler::vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(vector_len == AVX_128bit? VM_Version::supports_avx() : vector_len == AVX_256bit? 
VM_Version::supports_avx2() : diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index d130ed24df8..f0731837e84 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -1915,6 +1915,8 @@ private: void pshufb(XMMRegister dst, XMMRegister src); void pshufb(XMMRegister dst, Address src); void vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evpshufb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); + // Shuffle Packed Doublewords void pshufd(XMMRegister dst, XMMRegister src, int mode); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index 53799f94762..de99f867b1d 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -5680,3 +5680,49 @@ void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, R } #endif +void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, + int vlen_enc) { /* Rearranges the bytes of src into dst using the per-byte indices in shuffle, as four opmask-predicated in-lane byte shuffles, one per 128 bit lane of src. xtmp1, xtmp2, xtmp3, rtmp and ktmp are overwritten. */ + assert(VM_Version::supports_avx512bw(), ""); + // Byte shuffles are inlane operations and indices are determined using + // lower 4 bits of each shuffle lane, thus all shuffle indices are + // normalized to index range 0-15. This makes sure that all the multiples + // of an index value are placed at same relative position in 128 bit + // lane i.e. shuffle indices 16, 32 and 48 each select the first element + // of their respective 128 bit lanes. 
+ movl(rtmp, 16); + evpbroadcastb(xtmp1, rtmp, vlen_enc); + + // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, + // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using + // original shuffle indices and move the shuffled lanes corresponding to true + // mask to destination vector. + evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); + evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); + evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); + + // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 + // and broadcasting second 128 bit lane (xtmp2 = xtmp1 << 1 holds the upper bound 32). + evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); + vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); + evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); + evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); + evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); + + // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 + // and broadcasting third 128 bit lane (xtmp1 = 16 + 32 holds the upper bound 48). + evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); + vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); + evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); + evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); + evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); + + // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 + // and broadcasting fourth 128 bit lane (xtmp2 = 32 << 1 holds the upper bound 64). 
+ evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); + vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); + evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); + evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); + evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); +} + diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index dd9a401dbbe..6a6dd060aca 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -458,4 +458,7 @@ public: void vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, int vec_enc); + void rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, int vlen_enc); + #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index c6e955290a7..3c2eee1bf88 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1848,10 +1848,6 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType return false; // Implementation limitation due to how shuffle is loaded } else if (size_in_bits == 256 && UseAVX < 2) { return false; // Implementation limitation - } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi()) { - return false; // Implementation limitation - } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw()) { - return false; // Implementation limitation } break; case Op_VectorLoadMask: @@ -8529,7 +8525,23 @@ instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVe ins_pipe( pipe_slow ); %} -instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{ + +instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{ + predicate(Matcher::vector_element_basic_type(n) == T_BYTE && + 
Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi()); + match(Set dst (VectorRearrange src shuffle)); + effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp); + format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, + $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, + $rtmp$$Register, $ktmp$$KRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{ predicate(Matcher::vector_element_basic_type(n) == T_BYTE && Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi()); match(Set dst (VectorRearrange src shuffle)); diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/RearrangeBytesBenchmark.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/RearrangeBytesBenchmark.java new file mode 100644 index 00000000000..daa15d41067 --- /dev/null +++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/RearrangeBytesBenchmark.java @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +package org.openjdk.bench.jdk.incubator.vector; + +import java.util.Random; +import jdk.incubator.vector.*; +import java.util.concurrent.TimeUnit; +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; + +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Thread) +@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) +public class RearrangeBytesBenchmark { /* JMH microbenchmark of ByteVector.rearrange for the 64, 128, 256 and 512 bit byte species; the numeric suffix of each benchmark method is the vector length in bytes. */ + @Param({"256", "512", "1024"}) + int size; + + int [][] shuffles; /* shuffles[k] holds the lane indices for the species of bits[k] bits, see BmSetup */ + byte[] byteinp; + byte[] byteres; + + static final VectorSpecies<Byte> bspecies64 = ByteVector.SPECIES_64; + static final VectorSpecies<Byte> bspecies128 = ByteVector.SPECIES_128; + static final VectorSpecies<Byte> bspecies256 = ByteVector.SPECIES_256; + static final VectorSpecies<Byte> bspecies512 = ByteVector.SPECIES_512; + + static final byte[] specialvalsbyte = {0, -0, Byte.MIN_VALUE, Byte.MAX_VALUE}; + + @Setup(Level.Trial) + public void BmSetup() { + Random r = new Random(1024); /* fixed seed keeps the generated shuffles identical across runs */ + int [] bits = {64, 128, 256, 512}; + byteinp = new byte[size]; + byteres = new byte[size]; + + for (int i = specialvalsbyte.length; i < size; i++) { + byteinp[i] = (byte)i; + } + for (int i = 0; i < specialvalsbyte.length; i++) { + byteinp[i] = specialvalsbyte[i]; + } + + shuffles = new int[4][]; + for (int i = 0; i < 
bits.length; i++) { + int bytes = bits[i] >> 3; + shuffles[i] = new int[bytes]; + for (int j = 0; j < bytes ; j++) { + shuffles[i][j] = r.nextInt(bytes); /* bound is exclusive, so every lane index 0..bytes-1 can occur */ + } + } + } + + @Benchmark + public void testRearrangeBytes64() { + VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspecies512, shuffles[3], 0); + for (int j = 0; j < bspecies512.loopBound(size); j += bspecies512.length()) { + ByteVector.fromArray(bspecies512, byteinp, j) + .rearrange(shuffle) + .intoArray(byteres, j); + } + } + @Benchmark + public void testRearrangeBytes32() { + VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspecies256, shuffles[2], 0); + for (int j = 0; j < bspecies256.loopBound(size); j += bspecies256.length()) { + ByteVector.fromArray(bspecies256, byteinp, j) + .rearrange(shuffle) + .intoArray(byteres, j); + } + } + @Benchmark + public void testRearrangeBytes16() { + VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspecies128, shuffles[1], 0); + for (int j = 0; j < bspecies128.loopBound(size); j += bspecies128.length()) { + ByteVector.fromArray(bspecies128, byteinp, j) + .rearrange(shuffle) + .intoArray(byteres, j); + } + } + @Benchmark + public void testRearrangeBytes8() { + VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspecies64, shuffles[0], 0); + for (int j = 0; j < bspecies64.loopBound(size); j += bspecies64.length()) { + ByteVector.fromArray(bspecies64, byteinp, j) + .rearrange(shuffle) + .intoArray(byteres, j); + } + } +}