diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index b256fb2f152..58c4428bf73 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -896,6 +896,8 @@ address Assembler::locate_operand(address inst, WhichOperand which) { tail_size = 1; break; + case 0x10: // movups + case 0x11: // movups case 0x12: // movlps case 0x28: // movaps case 0x2E: // ucomiss @@ -2561,10 +2563,22 @@ void Assembler::movddup(XMMRegister dst, XMMRegister src) { emit_int16(0x12, 0xC0 | encode); } +void Assembler::movddup(XMMRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_sse3(), "")); + InstructionMark im(this); + InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_DUP, /* input_size_in_bits */ EVEX_64bit); + attributes.set_rex_vex_w_reverted(); + simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); + emit_int8(0x12); + emit_operand(dst, src); +} + void Assembler::vmovddup(XMMRegister dst, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); InstructionMark im(this); InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_DUP, /* input_size_in_bits */ EVEX_64bit); attributes.set_rex_vex_w_reverted(); simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x12); @@ -3505,6 +3519,46 @@ void Assembler::movswl(Register dst, Register src) { // movsxw emit_int24(0x0F, (unsigned char)0xBF, (0xC0 | encode)); } +void Assembler::movups(XMMRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + InstructionMark im(this); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* 
no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_32bit); + simd_prefix(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8(0x10); + emit_operand(dst, src); +} + +void Assembler::vmovups(XMMRegister dst, Address src, int vector_len) { + assert(vector_len == AVX_512bit ? VM_Version::supports_evex() : VM_Version::supports_avx(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_32bit); + simd_prefix(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8(0x10); + emit_operand(dst, src); +} + +void Assembler::movups(Address dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + InstructionMark im(this); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_32bit); + simd_prefix(src, xnoreg, dst, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8(0x11); + emit_operand(src, dst); +} + +void Assembler::vmovups(Address dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_512bit ? 
VM_Version::supports_evex() : VM_Version::supports_avx(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_32bit); + simd_prefix(src, xnoreg, dst, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8(0x11); + emit_operand(src, dst); +} + void Assembler::movw(Address dst, int imm16) { InstructionMark im(this); @@ -5156,7 +5210,7 @@ void Assembler::evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, i emit_int24(0x43, (0xC0 | encode), imm8 & 0xFF); } -void Assembler::pshufpd(XMMRegister dst, XMMRegister src, int imm8) { +void Assembler::shufpd(XMMRegister dst, XMMRegister src, int imm8) { assert(isByte(imm8), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -5164,14 +5218,14 @@ void Assembler::pshufpd(XMMRegister dst, XMMRegister src, int imm8) { emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF); } -void Assembler::vpshufpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { +void Assembler::vshufpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_rex_vex_w_reverted(); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF); } -void Assembler::pshufps(XMMRegister dst, XMMRegister src, int imm8) { +void Assembler::shufps(XMMRegister dst, XMMRegister src, int imm8) { assert(isByte(imm8), "invalid value"); 
NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -5179,7 +5233,7 @@ void Assembler::pshufps(XMMRegister dst, XMMRegister src, int imm8) { emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF); } -void Assembler::vpshufps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { +void Assembler::vshufps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 901a6330494..66605cc2bca 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -1492,6 +1492,7 @@ private: void movb(Register dst, Address src); void movddup(XMMRegister dst, XMMRegister src); + void movddup(XMMRegister dst, Address src); void vmovddup(XMMRegister dst, Address src, int vector_len); void kandbl(KRegister dst, KRegister src1, KRegister src2); @@ -1663,6 +1664,11 @@ private: void movswq(Register dst, Register src); #endif + void movups(XMMRegister dst, Address src); + void vmovups(XMMRegister dst, Address src, int vector_len); + void movups(Address dst, XMMRegister src); + void vmovups(Address dst, XMMRegister src, int vector_len); + void movw(Address dst, int imm16); void movw(Register dst, Address src); void movw(Address dst, Register src); @@ -1942,10 +1948,10 @@ private: void pshuflw(XMMRegister dst, Address src, int mode); //shuffle floats and doubles - void pshufps(XMMRegister, XMMRegister, int); - void pshufpd(XMMRegister, XMMRegister, int); - void 
vpshufps(XMMRegister, XMMRegister, XMMRegister, int, int); - void vpshufpd(XMMRegister, XMMRegister, XMMRegister, int, int); + void shufps(XMMRegister, XMMRegister, int); + void shufpd(XMMRegister, XMMRegister, int); + void vshufps(XMMRegister, XMMRegister, XMMRegister, int, int); + void vshufpd(XMMRegister, XMMRegister, XMMRegister, int, int); // Shuffle packed values at 128 bit granularity void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index 8284cd071c2..f6fa13b620a 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -1643,12 +1643,12 @@ void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegi void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { switch (vlen_in_bytes) { - case 4: movdl(dst, src); break; - case 8: movq(dst, src); break; - case 16: movdqu(dst, src); break; - case 32: vmovdqu(dst, src); break; - case 64: evmovdquq(dst, src, Assembler::AVX_512bit); break; - default: ShouldNotReachHere(); + case 4: movdl(dst, src); break; + case 8: movq(dst, src); break; + case 16: movdqu(dst, src); break; + case 32: vmovdqu(dst, src); break; + case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; + default: ShouldNotReachHere(); } } @@ -1661,6 +1661,38 @@ void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vle } } +void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { + int vlen_enc = vector_length_encoding(vlen); + if (VM_Version::supports_avx()) { + if (bt == T_LONG) { + if (VM_Version::supports_avx2()) { + vpbroadcastq(dst, src, vlen_enc, noreg); + } else { + vmovddup(dst, src, vlen_enc, noreg); + } + } else if (bt == T_DOUBLE) { + if (vlen_enc != Assembler::AVX_128bit) { + vbroadcastsd(dst, src, vlen_enc, 
noreg); + } else { + vmovddup(dst, src, vlen_enc, noreg); + } + } else { + if (VM_Version::supports_avx2() && is_integral_type(bt)) { + vpbroadcastd(dst, src, vlen_enc, noreg); + } else { + vbroadcastss(dst, src, vlen_enc, noreg); + } + } + } else if (VM_Version::supports_sse3()) { + movddup(dst, src); + } else { + movq(dst, src); + if (vlen == 16) { + punpcklqdq(dst, dst); + } + } +} + void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); if (vlen_in_bytes <= 4) { @@ -2317,9 +2349,9 @@ void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src if (typ == T_FLOAT) { if (UseAVX == 0) { movdqu(dst, src); - pshufps(dst, dst, eindex); + shufps(dst, dst, eindex); } else { - vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); + vshufps(dst, src, src, eindex, Assembler::AVX_128bit); } } else { if (UseAVX == 0) { diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index 21b294a7ff7..dd9a401dbbe 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -159,6 +159,7 @@ public: void load_vector(XMMRegister dst, Address src, int vlen_in_bytes); void load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch = rscratch1); + void load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen); void load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes); // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index 4660ac22e31..746c2d115e3 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -2732,6 +2732,15 @@ void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) { } } +void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) { + if (reachable(src)) { + Assembler::movddup(dst, as_Address(src)); + } else { + lea(rscratch, src); + Assembler::movddup(dst, Address(rscratch, 0)); + } +} + void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { if (reachable(src)) { Assembler::vmovddup(dst, as_Address(src), vector_len); @@ -3288,9 +3297,13 @@ void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, } } -void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) { - assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); - Assembler::vpbroadcastw(dst, src, vector_len); +void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { + if (reachable(src)) { + Assembler::vpbroadcastd(dst, as_Address(src), vector_len); + } else { + lea(rscratch, src); + Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len); + } } void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { @@ -3311,6 +3324,15 @@ void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vecto } } +void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { + if (reachable(src)) { + Assembler::vbroadcastss(dst, as_Address(src), vector_len); + } else { + lea(rscratch, src); + Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len); + } +} + void MacroAssembler::vpcmpeqb(XMMRegister dst, 
XMMRegister nds, XMMRegister src, int vector_len) { assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); Assembler::vpcmpeqb(dst, nds, src, vector_len); @@ -4354,10 +4376,14 @@ void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, void MacroAssembler::vallones(XMMRegister dst, int vector_len) { if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { + // Only pcmpeq has dependency breaking treatment (i.e. the execution can begin without + // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog vpternlogd(dst, 0xFF, dst, dst, vector_len); + } else if (VM_Version::supports_avx()) { + vpcmpeqd(dst, dst, dst, vector_len); } else { - assert(UseAVX > 0, ""); - vpcmpeqb(dst, dst, dst, vector_len); + assert(VM_Version::supports_sse2(), ""); + pcmpeqd(dst, dst); } } diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 998903f5797..e01a1d591dc 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -1114,6 +1114,12 @@ public: void addpd(XMMRegister dst, Address src) { Assembler::addpd(dst, src); } void addpd(XMMRegister dst, AddressLiteral src); + using Assembler::vbroadcastsd; + void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1); + + using Assembler::vbroadcastss; + void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1); + void divsd(XMMRegister dst, XMMRegister src) { Assembler::divsd(dst, src); } void divsd(XMMRegister dst, Address src) { Assembler::divsd(dst, src); } void divsd(XMMRegister dst, AddressLiteral src); @@ -1150,6 +1156,11 @@ public: void kmov(Register dst, KRegister src); void kmov(KRegister dst, Register src); + using Assembler::movddup; + void movddup(XMMRegister dst, 
AddressLiteral src, Register rscratch = rscratch1); + using Assembler::vmovddup; + void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1); + // AVX Unaligned forms void vmovdqu(Address dst, XMMRegister src); void vmovdqu(XMMRegister dst, Address src); @@ -1157,7 +1168,6 @@ public: void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len); - // AVX512 Unaligned void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len); void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len); @@ -1229,9 +1239,6 @@ public: void movsd(XMMRegister dst, Address src) { Assembler::movsd(dst, src); } void movsd(XMMRegister dst, AddressLiteral src); - using Assembler::vmovddup; - void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1); - void mulpd(XMMRegister dst, XMMRegister src) { Assembler::mulpd(dst, src); } void mulpd(XMMRegister dst, Address src) { Assembler::mulpd(dst, src); } void mulpd(XMMRegister dst, AddressLiteral src); @@ -1337,16 +1344,11 @@ public: void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); } void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); - void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len); - void vpbroadcastw(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastw(dst, src, vector_len); } + using Assembler::vpbroadcastd; + void vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1); - using Assembler::vbroadcastsd; - void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1); + using Assembler::vpbroadcastq; void 
vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1); - void vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpbroadcastq(dst, src, vector_len); } - void vpbroadcastq(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastq(dst, src, vector_len); } - - void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 31b5347a06d..c6e955290a7 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -4107,37 +4107,43 @@ instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rReg // ====================REPLICATE======================================= // Replicate byte scalar to be vector -instruct ReplB_reg(vec dst, rRegI src) %{ +instruct vReplB_reg(vec dst, rRegI src) %{ + predicate(UseAVX >= 2); match(Set dst (ReplicateB src)); format %{ "replicateB $dst,$src" %} ins_encode %{ uint vlen = Matcher::vector_length(this); + int vlen_enc = vector_length_encoding(this); if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW - int vlen_enc = vector_length_encoding(this); __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc); - } else if (VM_Version::supports_avx2()) { - int vlen_enc = vector_length_encoding(this); - __ movdl($dst$$XMMRegister, $src$$Register); - __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); } else { __ movdl($dst$$XMMRegister, $src$$Register); - __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister); - __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - if (vlen >= 16) { - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - if (vlen >= 32) { - assert(vlen == 32, "sanity"); - __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister); - } - } + __ vpbroadcastb($dst$$XMMRegister, 
$dst$$XMMRegister, vlen_enc); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct ReplB_reg(vec dst, rRegI src) %{ + predicate(UseAVX < 2); + match(Set dst (ReplicateB src)); + format %{ "replicateB $dst,$src" %} + ins_encode %{ + uint vlen = Matcher::vector_length(this); + __ movdl($dst$$XMMRegister, $src$$Register); + __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister); + __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + if (vlen >= 16) { + assert(vlen == 16, ""); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); } %} ins_pipe( pipe_slow ); %} instruct ReplB_mem(vec dst, memory mem) %{ - predicate(VM_Version::supports_avx2()); + predicate(UseAVX >= 2); match(Set dst (ReplicateB (LoadB mem))); format %{ "replicateB $dst,$mem" %} ins_encode %{ @@ -4147,48 +4153,45 @@ instruct ReplB_mem(vec dst, memory mem) %{ ins_pipe( pipe_slow ); %} -instruct ReplB_imm(vec dst, immI con) %{ - match(Set dst (ReplicateB con)); - format %{ "replicateB $dst,$con" %} - ins_encode %{ - InternalAddress addr = $constantaddress(T_BYTE, vreplicate_imm(T_BYTE, $con$$constant, Matcher::vector_length(this))); - __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this)); - %} - ins_pipe( pipe_slow ); -%} - // ====================ReplicateS======================================= -instruct ReplS_reg(vec dst, rRegI src) %{ +instruct vReplS_reg(vec dst, rRegI src) %{ + predicate(UseAVX >= 2); match(Set dst (ReplicateS src)); format %{ "replicateS $dst,$src" %} ins_encode %{ uint vlen = Matcher::vector_length(this); + int vlen_enc = vector_length_encoding(this); if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW - int vlen_enc = vector_length_encoding(this); __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc); - } else if (VM_Version::supports_avx2()) { - int vlen_enc = vector_length_encoding(this); - __ movdl($dst$$XMMRegister, 
$src$$Register); - __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); } else { __ movdl($dst$$XMMRegister, $src$$Register); - __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - if (vlen >= 8) { - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - if (vlen >= 16) { - assert(vlen == 16, "sanity"); - __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister); - } - } + __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct ReplS_reg(vec dst, rRegI src) %{ + predicate(UseAVX < 2); + match(Set dst (ReplicateS src)); + format %{ "replicateS $dst,$src" %} + ins_encode %{ + uint vlen = Matcher::vector_length(this); + int vlen_enc = vector_length_encoding(this); + __ movdl($dst$$XMMRegister, $src$$Register); + __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + if (vlen >= 8) { + assert(vlen == 8, ""); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); } %} ins_pipe( pipe_slow ); %} instruct ReplS_mem(vec dst, memory mem) %{ - predicate(VM_Version::supports_avx2()); + predicate(UseAVX >= 2); match(Set dst (ReplicateS (LoadS mem))); format %{ "replicateS $dst,$mem" %} ins_encode %{ @@ -4198,16 +4201,6 @@ instruct ReplS_mem(vec dst, memory mem) %{ ins_pipe( pipe_slow ); %} -instruct ReplS_imm(vec dst, immI con) %{ - match(Set dst (ReplicateS con)); - format %{ "replicateS $dst,$con" %} - ins_encode %{ - InternalAddress addr = $constantaddress(T_SHORT, vreplicate_imm(T_SHORT, $con$$constant, Matcher::vector_length(this))); - __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this)); - %} - ins_pipe( pipe_slow ); -%} - // ====================ReplicateI======================================= instruct ReplI_reg(vec dst, rRegI src) %{ @@ -4215,20 +4208,15 @@ instruct ReplI_reg(vec dst, rRegI src) %{ format %{ "replicateI $dst,$src" %} ins_encode %{ uint vlen = Matcher::vector_length(this); + int vlen_enc = vector_length_encoding(this); if (vlen == 16 || 
VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands - int vlen_enc = vector_length_encoding(this); __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc); } else if (VM_Version::supports_avx2()) { - int vlen_enc = vector_length_encoding(this); __ movdl($dst$$XMMRegister, $src$$Register); __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); } else { __ movdl($dst$$XMMRegister, $src$$Register); __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - if (vlen >= 8) { - assert(vlen == 8, "sanity"); - __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister); - } } %} ins_pipe( pipe_slow ); @@ -4238,25 +4226,32 @@ instruct ReplI_mem(vec dst, memory mem) %{ match(Set dst (ReplicateI (LoadI mem))); format %{ "replicateI $dst,$mem" %} ins_encode %{ - uint vlen = Matcher::vector_length(this); - if (vlen <= 4) { + int vlen_enc = vector_length_encoding(this); + if (VM_Version::supports_avx2()) { + __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc); + } else if (VM_Version::supports_avx()) { + __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc); + } else { __ movdl($dst$$XMMRegister, $mem$$Address); __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - } else { - assert(VM_Version::supports_avx2(), "sanity"); - int vlen_enc = vector_length_encoding(this); - __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc); } %} ins_pipe( pipe_slow ); %} instruct ReplI_imm(vec dst, immI con) %{ + match(Set dst (ReplicateB con)); + match(Set dst (ReplicateS con)); match(Set dst (ReplicateI con)); format %{ "replicateI $dst,$con" %} ins_encode %{ - InternalAddress addr = $constantaddress(T_INT, vreplicate_imm(T_INT, $con$$constant, Matcher::vector_length(this))); - __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this)); + InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this), + vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant, + (VM_Version::supports_sse3() 
? (VM_Version::supports_avx() ? 4 : 8) : 8) / + type2aelembytes(Matcher::vector_element_basic_type(this)))); + BasicType bt = Matcher::vector_element_basic_type(this); + int vlen = Matcher::vector_length_in_bytes(this); + __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen); %} ins_pipe( pipe_slow ); %} @@ -4268,23 +4263,21 @@ instruct ReplI_zero(vec dst, immI_0 zero) %{ match(Set dst (ReplicateI zero)); format %{ "replicateI $dst,$zero" %} ins_encode %{ - uint vsize = Matcher::vector_length_in_bytes(this); - if (vsize <= 16) { - __ pxor($dst$$XMMRegister, $dst$$XMMRegister); - } else { - int vlen_enc = vector_length_encoding(this); + int vlen_enc = vector_length_encoding(this); + if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + } else { + __ pxor($dst$$XMMRegister, $dst$$XMMRegister); } %} ins_pipe( fpu_reg_reg ); %} instruct ReplI_M1(vec dst, immI_M1 con) %{ - predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) >= 16); + predicate(UseSSE >= 2); match(Set dst (ReplicateB con)); match(Set dst (ReplicateS con)); match(Set dst (ReplicateI con)); - effect(TEMP dst); format %{ "vallones $dst" %} ins_encode %{ int vector_len = vector_length_encoding(this); @@ -4301,23 +4294,16 @@ instruct ReplL_reg(vec dst, rRegL src) %{ match(Set dst (ReplicateL src)); format %{ "replicateL $dst,$src" %} ins_encode %{ - uint vlen = Matcher::vector_length(this); - if (vlen == 2) { - __ movdq($dst$$XMMRegister, $src$$Register); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands - int vlen_enc = vector_length_encoding(this); + int vlen = Matcher::vector_length(this); + int vlen_enc = vector_length_encoding(this); + if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc); } else if 
(VM_Version::supports_avx2()) { - assert(vlen == 4, "sanity"); - int vlen_enc = vector_length_encoding(this); __ movdq($dst$$XMMRegister, $src$$Register); __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); } else { - assert(vlen == 4, "sanity"); __ movdq($dst$$XMMRegister, $src$$Register); __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister); } %} ins_pipe( pipe_slow ); @@ -4382,14 +4368,14 @@ instruct ReplL_mem(vec dst, memory mem) %{ match(Set dst (ReplicateL (LoadL mem))); format %{ "replicateL $dst,$mem" %} ins_encode %{ - uint vlen = Matcher::vector_length(this); - if (vlen == 2) { + int vlen_enc = vector_length_encoding(this); + if (VM_Version::supports_avx2()) { + __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc); + } else if (VM_Version::supports_sse3()) { + __ movddup($dst$$XMMRegister, $mem$$Address); + } else { __ movq($dst$$XMMRegister, $mem$$Address); __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - } else { - assert(VM_Version::supports_avx2(), "sanity"); - int vlen_enc = vector_length_encoding(this); - __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc); } %} ins_pipe( pipe_slow ); @@ -4400,8 +4386,9 @@ instruct ReplL_imm(vec dst, immL con) %{ match(Set dst (ReplicateL con)); format %{ "replicateL $dst,$con" %} ins_encode %{ - InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, Matcher::vector_length(this))); - __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this)); + InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1)); + int vlen = Matcher::vector_length_in_bytes(this); + __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen); %} ins_pipe( pipe_slow ); %} @@ -4410,21 +4397,19 @@ instruct ReplL_zero(vec dst, immL0 zero) %{ match(Set dst (ReplicateL zero)); format %{ "replicateL $dst,$zero" %} ins_encode %{ - int vlen = Matcher::vector_length(this); - 
if (vlen == 2) { - __ pxor($dst$$XMMRegister, $dst$$XMMRegister); - } else { - int vlen_enc = vector_length_encoding(this); + int vlen_enc = vector_length_encoding(this); + if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + } else { + __ pxor($dst$$XMMRegister, $dst$$XMMRegister); } %} ins_pipe( fpu_reg_reg ); %} instruct ReplL_M1(vec dst, immL_M1 con) %{ - predicate(UseAVX > 0); + predicate(UseSSE >= 2); match(Set dst (ReplicateL con)); - effect(TEMP dst); format %{ "vallones $dst" %} ins_encode %{ int vector_len = vector_length_encoding(this); @@ -4435,38 +4420,43 @@ instruct ReplL_M1(vec dst, immL_M1 con) %{ // ====================ReplicateF======================================= -instruct ReplF_reg(vec dst, vlRegF src) %{ +instruct vReplF_reg(vec dst, vlRegF src) %{ + predicate(UseAVX > 0); match(Set dst (ReplicateF src)); format %{ "replicateF $dst,$src" %} ins_encode %{ uint vlen = Matcher::vector_length(this); + int vlen_enc = vector_length_encoding(this); if (vlen <= 4) { - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); - } else if (VM_Version::supports_avx2()) { - int vlen_enc = vector_length_encoding(this); + __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit); + } else if (VM_Version::supports_avx2()) { __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2 } else { assert(vlen == 8, "sanity"); - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); + __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit); __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister); } %} ins_pipe( pipe_slow ); %} +instruct ReplF_reg(vec dst, vlRegF src) %{ + predicate(UseAVX == 0); + match(Set dst (ReplicateF src)); + format %{ "replicateF $dst,$src" %} + ins_encode %{ + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); + %} + ins_pipe( pipe_slow ); +%} + instruct 
ReplF_mem(vec dst, memory mem) %{ + predicate(UseAVX > 0); match(Set dst (ReplicateF (LoadF mem))); format %{ "replicateF $dst,$mem" %} ins_encode %{ - uint vlen = Matcher::vector_length(this); - if (vlen <= 4) { - __ movdl($dst$$XMMRegister, $mem$$Address); - __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - } else { - assert(VM_Version::supports_avx(), "sanity"); - int vlen_enc = vector_length_encoding(this); - __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc); - } + int vlen_enc = vector_length_encoding(this); + __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4476,8 +4466,10 @@ instruct ReplF_imm(vec dst, immF con) %{ match(Set dst (ReplicateF con)); format %{ "replicateF $dst,$con" %} ins_encode %{ - InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant, Matcher::vector_length(this))); - __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this)); + InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant, + VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 
1 : 2) : 2)); + int vlen = Matcher::vector_length_in_bytes(this); + __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen); %} ins_pipe( pipe_slow ); %} @@ -4486,12 +4478,11 @@ instruct ReplF_zero(vec dst, immF0 zero) %{ match(Set dst (ReplicateF zero)); format %{ "replicateF $dst,$zero" %} ins_encode %{ - uint vlen = Matcher::vector_length(this); - if (vlen <= 4) { - __ xorps($dst$$XMMRegister, $dst$$XMMRegister); + int vlen_enc = vector_length_encoding(this); + if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) { + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); } else { - int vlen_enc = vector_length_encoding(this); - __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ + __ xorps($dst$$XMMRegister, $dst$$XMMRegister); } %} ins_pipe( fpu_reg_reg ); @@ -4500,37 +4491,46 @@ instruct ReplF_zero(vec dst, immF0 zero) %{ // ====================ReplicateD======================================= // Replicate double (8 bytes) scalar to be vector -instruct ReplD_reg(vec dst, vlRegD src) %{ +instruct vReplD_reg(vec dst, vlRegD src) %{ + predicate(UseSSE >= 3); match(Set dst (ReplicateD src)); format %{ "replicateD $dst,$src" %} ins_encode %{ uint vlen = Matcher::vector_length(this); - if (vlen == 2) { - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); + int vlen_enc = vector_length_encoding(this); + if (vlen <= 2) { + __ movddup($dst$$XMMRegister, $src$$XMMRegister); } else if (VM_Version::supports_avx2()) { - int vlen_enc = vector_length_encoding(this); __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2 } else { assert(vlen == 4, "sanity"); - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); + __ movddup($dst$$XMMRegister, $src$$XMMRegister); __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister); } %} ins_pipe( pipe_slow ); %} +instruct ReplD_reg(vec dst, vlRegD src) %{ + predicate(UseSSE < 3); 
+ match(Set dst (ReplicateD src)); + format %{ "replicateD $dst,$src" %} + ins_encode %{ + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); + %} + ins_pipe( pipe_slow ); +%} + instruct ReplD_mem(vec dst, memory mem) %{ + predicate(UseSSE >= 3); match(Set dst (ReplicateD (LoadD mem))); format %{ "replicateD $dst,$mem" %} ins_encode %{ - uint vlen = Matcher::vector_length(this); - if (vlen == 2) { - __ movq($dst$$XMMRegister, $mem$$Address); - __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44); - } else { - assert(VM_Version::supports_avx(), "sanity"); + if (Matcher::vector_length(this) >= 4) { int vlen_enc = vector_length_encoding(this); __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc); + } else { + __ movddup($dst$$XMMRegister, $mem$$Address); } %} ins_pipe( pipe_slow ); @@ -4541,8 +4541,9 @@ instruct ReplD_imm(vec dst, immD con) %{ match(Set dst (ReplicateD con)); format %{ "replicateD $dst,$con" %} ins_encode %{ - InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, Matcher::vector_length(this))); - __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this)); + InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1)); + int vlen = Matcher::vector_length_in_bytes(this); + __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen); %} ins_pipe( pipe_slow ); %} @@ -4551,12 +4552,11 @@ instruct ReplD_zero(vec dst, immD0 zero) %{ match(Set dst (ReplicateD zero)); format %{ "replicateD $dst,$zero" %} ins_encode %{ - uint vlen = Matcher::vector_length(this); - if (vlen == 2) { - __ xorpd($dst$$XMMRegister, $dst$$XMMRegister); + int vlen_enc = vector_length_encoding(this); + if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) { + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); } else { - int vlen_enc = vector_length_encoding(this); - __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 
vlen_enc); // 512bit vxorps requires AVX512DQ + __ xorps($dst$$XMMRegister, $dst$$XMMRegister); } %} ins_pipe( fpu_reg_reg ); @@ -8335,7 +8335,7 @@ instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{ effect(TEMP_DEF dst, TEMP vtmp); ins_encode %{ int vlen_enc = Assembler::AVX_128bit; - __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit); + __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit); __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1); __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc); __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc); diff --git a/src/hotspot/share/adlc/adlparse.cpp b/src/hotspot/share/adlc/adlparse.cpp index e13a25b8a25..26a3c3a7d5f 100644 --- a/src/hotspot/share/adlc/adlparse.cpp +++ b/src/hotspot/share/adlc/adlparse.cpp @@ -3334,20 +3334,20 @@ void ADLParser::constant_parse_expression(EncClass* encoding, char* ec_name) { if (_curchar == '(') { parens_depth++; encoding->add_code("("); - next_char(); + next_char_or_line(); } else if (_curchar == ')') { parens_depth--; if (parens_depth > 0) encoding->add_code(")"); - next_char(); + next_char_or_line(); } else { // (1) // Check if there is a string to pass through to output char *start = _ptr; // Record start of the next string while ((_curchar != '$') && (_curchar != '(') && (_curchar != ')')) { - next_char(); + next_char_or_line(); } // If a string was found, terminate it and record in EncClass if (start != _ptr) { diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index 7921cff886c..d09c6a5ca92 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -65,7 +65,7 @@ InstructForm::InstructForm(const char *id, InstructForm *instr, MatchRule *rule) : _ident(id), _ideal_only(false), _localNames(instr->_localNames), _effects(instr->_effects), - 
_is_mach_constant(false), + _is_mach_constant(instr->_is_mach_constant), _needs_constant_base(false), _has_call(false) { @@ -4090,12 +4090,6 @@ int MatchRule::is_expensive() const { strcmp(opType,"ReverseBytesL")==0 || strcmp(opType,"ReverseBytesUS")==0 || strcmp(opType,"ReverseBytesS")==0 || - strcmp(opType,"ReplicateB")==0 || - strcmp(opType,"ReplicateS")==0 || - strcmp(opType,"ReplicateI")==0 || - strcmp(opType,"ReplicateL")==0 || - strcmp(opType,"ReplicateF")==0 || - strcmp(opType,"ReplicateD")==0 || strcmp(opType,"PopulateIndex")==0 || strcmp(opType,"AddReductionVI")==0 || strcmp(opType,"AddReductionVL")==0 || @@ -4111,8 +4105,9 @@ int MatchRule::is_expensive() const { strcmp(opType,"OrReductionV")==0 || strcmp(opType,"XorReductionV")==0 || strcmp(opType,"MaskAll")==0 || - 0 /* 0 to line up columns nicely */ ) + 0 /* 0 to line up columns nicely */ ) { return 1; + } } return 0; } diff --git a/src/hotspot/share/asm/assembler.hpp b/src/hotspot/share/asm/assembler.hpp index 21229d37462..7b5ddb657e9 100644 --- a/src/hotspot/share/asm/assembler.hpp +++ b/src/hotspot/share/asm/assembler.hpp @@ -429,11 +429,11 @@ class AbstractAssembler : public ResourceObj { } return ptr; } - address array_constant(BasicType bt, GrowableArray* c) { + address array_constant(BasicType bt, GrowableArray* c, int alignment) { CodeSection* c1 = _code_section; int len = c->length(); int size = type2aelembytes(bt) * len; - address ptr = start_a_const(size, MIN2(round_up_power_of_2(size), 8)); + address ptr = start_a_const(size, alignment); if (ptr != NULL) { for (int i = 0; i < len; i++) { jvalue e = c->at(i); diff --git a/src/hotspot/share/opto/constantTable.cpp b/src/hotspot/share/opto/constantTable.cpp index c3bb5c97730..91452408c2b 100644 --- a/src/hotspot/share/opto/constantTable.cpp +++ b/src/hotspot/share/opto/constantTable.cpp @@ -36,7 +36,30 @@ bool ConstantTable::Constant::operator==(const Constant& other) { if (type() != other.type() ) return false; if (can_be_reused() != 
other.can_be_reused()) return false; if (is_array() || other.is_array()) { - return is_array() && other.is_array() && _v._array == other._v._array; + if (is_array() != other.is_array() || + get_array()->length() != other.get_array()->length()) { + return false; + } + for (int i = 0; i < get_array()->length(); i++) { + jvalue ele1 = get_array()->at(i); + jvalue ele2 = other.get_array()->at(i); + bool is_eq; + switch (type()) { + case T_BOOLEAN: is_eq = ele1.z == ele2.z; break; + case T_BYTE: is_eq = ele1.b == ele2.b; break; + case T_CHAR: is_eq = ele1.c == ele2.c; break; + case T_SHORT: is_eq = ele1.s == ele2.s; break; + case T_INT: is_eq = ele1.i == ele2.i; break; + case T_LONG: is_eq = ele1.j == ele2.j; break; + case T_FLOAT: is_eq = jint_cast(ele1.f) == jint_cast(ele2.f); break; + case T_DOUBLE: is_eq = jlong_cast(ele1.d) == jlong_cast(ele2.d); break; + default: ShouldNotReachHere(); is_eq = false; + } + if (!is_eq) { + return false; + } + } + return true; } // For floating point values we compare the bit pattern. switch (type()) { @@ -104,7 +127,7 @@ void ConstantTable::calculate_offsets_and_size() { // Align offset for type. 
int typesize = constant_size(con); assert(typesize <= 8 || con->is_array(), "sanity"); - offset = align_up(offset, MIN2(round_up_power_of_2(typesize), 8)); + offset = align_up(offset, con->alignment()); con->set_offset(offset); // set constant's offset if (con->type() == T_VOID) { @@ -127,7 +150,7 @@ bool ConstantTable::emit(CodeBuffer& cb) const { Constant con = _constants.at(i); address constant_addr = NULL; if (con.is_array()) { - constant_addr = _masm.array_constant(con.type(), con.get_array()); + constant_addr = _masm.array_constant(con.type(), con.get_array(), con.alignment()); } else { switch (con.type()) { case T_INT: constant_addr = _masm.int_constant( con.get_jint() ); break; @@ -229,12 +252,18 @@ ConstantTable::Constant ConstantTable::add(Metadata* metadata) { return con; } -ConstantTable::Constant ConstantTable::add(MachConstantNode* n, BasicType bt, GrowableArray* array) { - Constant con(bt, array); +ConstantTable::Constant ConstantTable::add(MachConstantNode* n, BasicType bt, + GrowableArray* array, int alignment) { + Constant con(bt, array, alignment); add(con); return con; } +ConstantTable::Constant ConstantTable::add(MachConstantNode* n, BasicType bt, + GrowableArray* array) { + return add(n, bt, array, array->length() * type2aelembytes(bt)); +} + ConstantTable::Constant ConstantTable::add(MachConstantNode* n, MachOper* oper) { jvalue value; BasicType type = oper->type()->basic_type(); diff --git a/src/hotspot/share/opto/constantTable.hpp b/src/hotspot/share/opto/constantTable.hpp index 1452a27d046..001193f6d0d 100644 --- a/src/hotspot/share/opto/constantTable.hpp +++ b/src/hotspot/share/opto/constantTable.hpp @@ -39,6 +39,7 @@ public: private: BasicType _type; bool _is_array; + int _alignment; union { jvalue _value; Metadata* _metadata; @@ -49,7 +50,7 @@ public: bool _can_be_reused; // true (default) if the value can be shared with other users. 
public: - Constant() : _type(T_ILLEGAL), _is_array(false), _offset(-1), _freq(0.0f), _can_be_reused(true) { _v._value.l = 0; } + Constant() : _type(T_ILLEGAL), _is_array(false), _alignment(-1), _offset(-1), _freq(0.0f), _can_be_reused(true) { _v._value.l = 0; } Constant(BasicType type, jvalue value, float freq = 0.0f, bool can_be_reused = true) : _type(type), _is_array(false), @@ -59,24 +60,28 @@ public: { assert(type != T_METADATA, "wrong constructor"); _v._value = value; + _alignment = type == T_VOID ? sizeof(jobject) : type2aelembytes(type); } Constant(Metadata* metadata, bool can_be_reused = true) : _type(T_METADATA), _is_array(false), + _alignment(sizeof(Metadata*)), _offset(-1), _freq(0.0f), _can_be_reused(can_be_reused) { _v._metadata = metadata; } - Constant(BasicType type, GrowableArray* array) : + Constant(BasicType type, GrowableArray* array, int alignment, bool can_be_reused = true) : _type(type), _is_array(true), + _alignment(alignment), _offset(-1), _freq(0.0f), - _can_be_reused(false) + _can_be_reused(can_be_reused) { assert(is_java_primitive(type), "not applicable for %s", type2name(type)); + assert(is_power_of_2(alignment), "invalid alignment %d", alignment); _v._array = new GrowableArray(array->length()); for (jvalue ele : *array) { _v._array->append(ele); @@ -87,6 +92,7 @@ public: BasicType type() const { return _type; } bool is_array() const { return _is_array; } + int alignment() const { return _alignment; } jint get_jint() const { return _v._value.i; } jlong get_jlong() const { return _v._value.j; } @@ -145,6 +151,7 @@ public: Constant add(MachConstantNode* n, BasicType type, jvalue value); Constant add(Metadata* metadata); Constant add(MachConstantNode* n, BasicType bt, GrowableArray* array); + Constant add(MachConstantNode* n, BasicType bt, GrowableArray* array, int alignment); Constant add(MachConstantNode* n, MachOper* oper); Constant add(MachConstantNode* n, jint i) { jvalue value; value.i = i; diff --git 
a/src/hotspot/share/opto/machnode.cpp b/src/hotspot/share/opto/machnode.cpp index 9ba8b231acc..bccccc7b6e3 100644 --- a/src/hotspot/share/opto/machnode.cpp +++ b/src/hotspot/share/opto/machnode.cpp @@ -474,14 +474,15 @@ bool MachNode::rematerialize() const { } // Stretching lots of inputs - don't do it. - if (req() > 2) { + // A MachConstant has the last input being the constant base + if (req() > (is_MachConstant() ? 3U : 2U)) { return false; } - if (req() == 2 && in(1) && in(1)->ideal_reg() == Op_RegFlags) { + if (req() >= 2 && in(1) && in(1)->ideal_reg() == Op_RegFlags) { // In(1) will be rematerialized, too. // Stretching lots of inputs - don't do it. - if (in(1)->req() > 2) { + if (in(1)->req() > (in(1)->is_MachConstant() ? 3U : 2U)) { return false; } } @@ -491,7 +492,7 @@ bool MachNode::rematerialize() const { uint idx = oper_input_base(); if (req() > idx) { const RegMask &rm = in_RegMask(idx); - if (rm.is_bound(ideal_reg())) { + if (rm.is_NotEmpty() && rm.is_bound(ideal_reg())) { return false; } } diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/SpiltReplicate.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/SpiltReplicate.java new file mode 100644 index 00000000000..9f717908569 --- /dev/null +++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/SpiltReplicate.java @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.jdk.incubator.vector; + +import java.util.concurrent.TimeUnit; +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.IntVector; +import jdk.incubator.vector.LongVector; +import org.openjdk.jmh.annotations.*; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Fork(1) +public class SpiltReplicate { + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public long broadcastInt() { + var species = IntVector.SPECIES_PREFERRED; + var sum = IntVector.zero(species); + return sum.add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8) + .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16) + .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24) + .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32) + .add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8) + .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16) + .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24) + .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32) + .reinterpretAsLongs() + .lane(0); + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public long broadcastLong() { + var species = LongVector.SPECIES_PREFERRED; + var sum = LongVector.zero(species); + return sum.add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8) + .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16) + 
.add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24) + .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32) + .add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8) + .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16) + .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24) + .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32) + .reinterpretAsLongs() + .lane(0); + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public long broadcastFloat() { + var species = FloatVector.SPECIES_PREFERRED; + var sum = FloatVector.zero(species); + return sum.add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8) + .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16) + .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24) + .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32) + .add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8) + .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16) + .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24) + .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32) + .reinterpretAsLongs() + .lane(0); + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public long broadcastDouble() { + var species = DoubleVector.SPECIES_PREFERRED; + var sum = DoubleVector.zero(species); + return sum.add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8) + .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16) + .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24) + .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32) + .add(1).add(2).add(3).add(4).add(5).add(6).add(7).add(8) + .add(9).add(10).add(11).add(12).add(13).add(14).add(15).add(16) + .add(17).add(18).add(19).add(20).add(21).add(22).add(23).add(24) + .add(25).add(26).add(27).add(28).add(29).add(30).add(31).add(32) + .reinterpretAsLongs() + .lane(0); + } + + @Benchmark + public void testInt() { + broadcastInt(); + } + + @Benchmark + 
public void testLong() { + broadcastLong(); + } + + @Benchmark + public void testFloat() { + broadcastFloat(); + } + + @Benchmark + public void testDouble() { + broadcastDouble(); + } +} \ No newline at end of file