From 38e17148faef7799515478bd834ed2fa1a5153de Mon Sep 17 00:00:00 2001
From: Quan Anh Mai
Date: Sat, 25 Mar 2023 05:30:16 +0000
Subject: [PATCH] 8304258: x86: Improve the code generation of VectorRearrange
 with int and float

Reviewed-by: kvn, jbhateja, sviswanathan
---
 src/hotspot/cpu/x86/assembler_x86.cpp         | 22 +++++++++++++++++--
 src/hotspot/cpu/x86/assembler_x86.hpp         |  2 ++
 src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp | 11 ++++++++++
 src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp |  3 +++
 src/hotspot/cpu/x86/x86.ad                    | 18 +++++++--------
 5 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp
index 9c374ac3d15..f2166acdb25 100644
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@@ -4209,7 +4209,8 @@ void Assembler::vpermw(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
 }
 
 void Assembler::vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  assert(vector_len <= AVX_256bit ? VM_Version::supports_avx2() : VM_Version::supports_evex(), "");
+  assert((vector_len == AVX_256bit && VM_Version::supports_avx2()) ||
+         (vector_len == AVX_512bit && VM_Version::supports_evex()), "");
   // VEX.NDS.256.66.0F38.W0 36 /r
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -4217,7 +4218,8 @@ void Assembler::vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
 }
 
 void Assembler::vpermd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  assert(vector_len <= AVX_256bit ? VM_Version::supports_avx2() : VM_Version::supports_evex(), "");
+  assert((vector_len == AVX_256bit && VM_Version::supports_avx2()) ||
+         (vector_len == AVX_512bit && VM_Version::supports_evex()), "");
   // VEX.NDS.256.66.0F38.W0 36 /r
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -4226,6 +4228,15 @@ void Assembler::vpermd(XMMRegister dst, XMMRegister nds, Address src, int vector
   emit_operand(dst, src, 0);
 }
 
+void Assembler::vpermps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert((vector_len == AVX_256bit && VM_Version::supports_avx2()) ||
+         (vector_len == AVX_512bit && VM_Version::supports_evex()), "");
+  // VEX.NDS.XXX.66.0F38.W0 16 /r
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int16(0x16, (0xC0 | encode));
+}
+
 void Assembler::vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) {
   assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(AVX_256bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -4247,6 +4258,13 @@ void Assembler::vpermilps(XMMRegister dst, XMMRegister src, int imm8, int vector
   emit_int24(0x04, (0xC0 | encode), imm8);
 }
 
+void Assembler::vpermilps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(vector_len <= AVX_256bit ? VM_Version::supports_avx() : VM_Version::supports_evex(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int16(0x0C, (0xC0 | encode));
+}
+
 void Assembler::vpermilpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len) {
   assert(vector_len <= AVX_256bit ? VM_Version::supports_avx() : VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(),/* legacy_mode */ false,/* no_mask_reg */ true, /* uses_vl */ false);
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp
index 3fdca51dddb..a8acf7aacb1 100644
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1762,9 +1762,11 @@ private:
   void vpermw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpermd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpermps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
   void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
   void vpermilps(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
+  void vpermilps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpermilpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
   void vpermpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
   void evpermi2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
index f5d56bf04a2..8a723b3e2b6 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -6095,3 +6095,14 @@ void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XM
   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
 }
 
+void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
+                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
+  if (vlen_enc == AVX_128bit) {
+    vpermilps(dst, src, shuffle, vlen_enc);
+  } else if (bt == T_INT) {
+    vpermd(dst, shuffle, src, vlen_enc);
+  } else {
+    assert(bt == T_FLOAT, "");
+    vpermps(dst, shuffle, src, vlen_enc);
+  }
+}
diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
index 61c46ada32d..e91937b6e34 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@@ -485,4 +485,7 @@ public:
   void rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, int vlen_enc);
 
+  void vector_rearrange_int_float(BasicType bt, XMMRegister dst, XMMRegister shuffle,
+                                  XMMRegister src, int vlen_enc);
+
 #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP
diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad
index e3d35fac61e..56e754965d3 100644
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@@ -8591,7 +8591,7 @@ instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 
 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
-            Matcher::vector_length(n) == 4 && UseAVX < 2);
+            Matcher::vector_length(n) == 4 && UseAVX == 0);
   match(Set dst (VectorLoadShuffle src));
   effect(TEMP dst, TEMP vtmp);
   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
@@ -8620,8 +8620,8 @@ instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 %}
 
 instruct rearrangeI(vec dst, vec shuffle) %{
-  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
-            Matcher::vector_length(n) == 4 && UseAVX < 2);
+  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
+            UseAVX == 0);
   match(Set dst (VectorRearrange dst shuffle));
   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
   ins_encode %{
@@ -8633,11 +8633,11 @@ instruct rearrangeI(vec dst, vec shuffle) %{
 
 instruct loadShuffleI_avx(vec dst, vec src) %{
   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
-            UseAVX >= 2);
+            UseAVX > 0);
   match(Set dst (VectorLoadShuffle src));
   format %{ "vector_load_shuffle $dst, $src" %}
   ins_encode %{
-  int vlen_enc = vector_length_encoding(this);
+    int vlen_enc = vector_length_encoding(this);
     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
   %}
   ins_pipe( pipe_slow );
@@ -8645,15 +8645,13 @@ instruct loadShuffleI_avx(vec dst, vec src) %{
 
 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
-            UseAVX >= 2);
+            UseAVX > 0);
   match(Set dst (VectorRearrange src shuffle));
   format %{ "vector_rearrange $dst, $shuffle, $src" %}
   ins_encode %{
     int vlen_enc = vector_length_encoding(this);
-    if (vlen_enc == Assembler::AVX_128bit) {
-      vlen_enc = Assembler::AVX_256bit;
-    }
-    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
+    BasicType bt = Matcher::vector_element_basic_type(this);
+    __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
   %}
   ins_pipe( pipe_slow );
 %}