From 2ceb80c60f2c1a479e5d79aac7d983e0bf29b253 Mon Sep 17 00:00:00 2001 From: Jatin Bhateja Date: Wed, 12 Oct 2022 01:05:44 +0000 Subject: [PATCH] 8288043: Optimize FP to word/sub-word integral type conversion on X86 AVX2 platforms Reviewed-by: kvn, sviswanathan --- src/hotspot/cpu/x86/assembler_x86.cpp | 19 +- src/hotspot/cpu/x86/assembler_x86.hpp | 3 + src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp | 245 +++++-- src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp | 65 +- src/hotspot/cpu/x86/matcher_x86.hpp | 7 + src/hotspot/cpu/x86/x86.ad | 141 ++-- src/hotspot/share/opto/loopTransform.cpp | 2 + .../vectorapi/VectorFPtoIntCastTest.java | 17 +- .../reshape/utils/TestCastMethods.java | 16 +- .../vector/VectorFPtoIntCastOperations.java | 624 +++++++++++++++++- 10 files changed, 940 insertions(+), 199 deletions(-) diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 0871e99ac77..6c9dfb8b7cd 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -2157,6 +2157,13 @@ void Assembler::vcvttps2dq(XMMRegister dst, XMMRegister src, int vector_len) { emit_int16(0x5B, (0xC0 | encode)); } +void Assembler::vcvttpd2dq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len <= AVX_256bit ? VM_Version::supports_avx() : VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xE6, (0xC0 | encode)); +} + void Assembler::vcvtps2dq(XMMRegister dst, XMMRegister src, int vector_len) { assert(vector_len <= AVX_256bit ? 
VM_Version::supports_avx() : VM_Version::supports_evex(), ""); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -2165,7 +2172,7 @@ void Assembler::vcvtps2dq(XMMRegister dst, XMMRegister src, int vector_len) { } void Assembler::evcvttps2qq(XMMRegister dst, XMMRegister src, int vector_len) { - assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); + assert(VM_Version::supports_avx512dq(), ""); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); @@ -2173,7 +2180,7 @@ void Assembler::evcvttps2qq(XMMRegister dst, XMMRegister src, int vector_len) { } void Assembler::evcvtpd2qq(XMMRegister dst, XMMRegister src, int vector_len) { - assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); + assert(VM_Version::supports_avx512dq(), ""); InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); @@ -2181,7 +2188,7 @@ void Assembler::evcvtpd2qq(XMMRegister dst, XMMRegister src, int vector_len) { } void Assembler::evcvtqq2ps(XMMRegister dst, XMMRegister src, int vector_len) { - assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); + assert(VM_Version::supports_avx512dq(), ""); InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); @@ -2189,7 +2196,7 @@ void Assembler::evcvtqq2ps(XMMRegister dst, XMMRegister src, int 
vector_len) { } void Assembler::evcvttpd2qq(XMMRegister dst, XMMRegister src, int vector_len) { - assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); + assert(VM_Version::supports_avx512dq(), ""); InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); @@ -2197,7 +2204,7 @@ void Assembler::evcvttpd2qq(XMMRegister dst, XMMRegister src, int vector_len) { } void Assembler::evcvtqq2pd(XMMRegister dst, XMMRegister src, int vector_len) { - assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); + assert(VM_Version::supports_avx512dq(), ""); InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); @@ -2205,7 +2212,7 @@ void Assembler::evcvtqq2pd(XMMRegister dst, XMMRegister src, int vector_len) { } void Assembler::evpmovwb(XMMRegister dst, XMMRegister src, int vector_len) { - assert(UseAVX > 2 && VM_Version::supports_avx512bw(), ""); + assert(VM_Version::supports_avx512bw(), ""); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 2ff29dffa04..71136469c95 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -1201,6 +1201,9 @@ private: void evcvtpd2qq(XMMRegister dst, XMMRegister src, int vector_len); void evcvttpd2qq(XMMRegister dst, XMMRegister src, int 
vector_len); + // Convert vector double to int + void vcvttpd2dq(XMMRegister dst, XMMRegister src, int vector_len); + // Evex casts with truncation void evpmovwb(XMMRegister dst, XMMRegister src, int vector_len); void evpmovdw(XMMRegister dst, XMMRegister src, int vector_len); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index 3c370afaa7b..9027bafeb17 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -4359,9 +4359,11 @@ void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, * the result is equal to the value of Integer.MAX_VALUE. */ -void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, - Register rscratch) { +void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, + Register rscratch, AddressLiteral float_sign_flip, + int vec_enc) { + assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); Label done; vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); @@ -4387,10 +4389,11 @@ void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMM bind(done); } -void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, - KRegister ktmp1, KRegister ktmp2, - Register rscratch) { +void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, + 
Register rscratch, AddressLiteral float_sign_flip, + int vec_enc) { + assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); Label done; evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); @@ -4408,11 +4411,10 @@ void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XM bind(done); } -void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, - AddressLiteral double_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, - KRegister ktmp1, KRegister ktmp2, - Register rscratch) { +void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, + Register rscratch, AddressLiteral double_sign_flip, + int vec_enc) { assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); Label done; @@ -4432,6 +4434,28 @@ void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister bind(done); } +void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, + Register rscratch, AddressLiteral float_sign_flip, + int vec_enc) { + assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); + Label done; + evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); + Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); + kortestwl(ktmp1, ktmp1); + jccb(Assembler::equal, done); + + vpxor(xtmp2, xtmp2, xtmp2, vec_enc); + evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); + evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); + + kxorwl(ktmp1, ktmp1, ktmp2); + evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); + vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); + evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); + bind(done); +} + /* * Following 
routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. * If src is NaN, the result is 0. @@ -4440,10 +4464,10 @@ void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, * the result is equal to the value of Long.MAX_VALUE. */ -void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, - AddressLiteral double_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, - Register rscratch) { +void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, + Register rscratch, AddressLiteral double_sign_flip, + int vec_enc) { assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); Label done; @@ -4463,6 +4487,82 @@ void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, X bind(done); } +void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, + XMMRegister xtmp, int index, int vec_enc) { + assert(vec_enc < Assembler::AVX_512bit, ""); + if (vec_enc == Assembler::AVX_256bit) { + vextractf128_high(xtmp, src); + vshufps(dst, src, xtmp, index, vec_enc); + } else { + vshufps(dst, src, zero, index, vec_enc); + } +} + +void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, + AddressLiteral float_sign_flip, int src_vec_enc) { + assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); + + Label done; + // Compare the destination lanes with float_sign_flip + // value to get mask for all special values. 
+ movdqu(xtmp1, float_sign_flip, rscratch); + vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); + ptest(xtmp2, xtmp2); + jccb(Assembler::equal, done); + + // Flip float_sign_flip to get max integer value. + vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); + pxor(xtmp1, xtmp4); + + // Set destination lanes corresponding to unordered source lanes as zero. + vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); + vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); + + // Shuffle mask vector and pack lower doubleword from each quadword lane. + vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); + vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); + + // Recompute the mask for remaining special value. + pxor(xtmp2, xtmp3); + // Extract mask corresponding to non-negative source lanes. + vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); + + // Shuffle mask vector and pack lower doubleword from each quadword lane. + vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); + pand(xtmp3, xtmp2); + + // Replace destination lanes holding special value(0x80000000) with max int + // if corresponding source lane holds a +ve value. 
+ vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); + bind(done); +} + + +void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, + XMMRegister xtmp, Register rscratch, int vec_enc) { + switch(to_elem_bt) { + case T_SHORT: + assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); + vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); + vpackusdw(dst, dst, zero, vec_enc); + if (vec_enc == Assembler::AVX_256bit) { + vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); + } + break; + case T_BYTE: + assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); + vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); + vpackusdw(dst, dst, zero, vec_enc); + if (vec_enc == Assembler::AVX_256bit) { + vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); + } + vpackuswb(dst, dst, zero, vec_enc); + break; + default: assert(false, "%s", type2name(to_elem_bt)); + } +} + /* * Algorithm for vector D2L and F2I conversions:- * a) Perform vector D2L/F2I cast. @@ -4473,50 +4573,71 @@ void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, X * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 
*/ -void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) { - assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); - - evcvttpd2qq(dst, src, vec_enc); - vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc, - xtmp1, xtmp2, ktmp1, ktmp2, rscratch); -} - -void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, Register rscratch) { - assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); - +void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, + AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { + int to_elem_sz = type2aelembytes(to_elem_bt); + assert(to_elem_sz <= 4, ""); vcvttps2dq(dst, src, vec_enc); - vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc, - xtmp1, xtmp2, xtmp3, xtmp4, rscratch); + vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); + if (to_elem_sz < 4) { + vpxor(xtmp4, xtmp4, xtmp4, vec_enc); + vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); + } } -void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) { - assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); - +void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral 
float_sign_flip, + Register rscratch, int vec_enc) { + int to_elem_sz = type2aelembytes(to_elem_bt); + assert(to_elem_sz <= 4, ""); vcvttps2dq(dst, src, vec_enc); - vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc, - xtmp1, xtmp2, ktmp1, ktmp2, rscratch); + vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); + switch(to_elem_bt) { + case T_INT: + break; + case T_SHORT: + evpmovdw(dst, dst, vec_enc); + break; + case T_BYTE: + evpmovdb(dst, dst, vec_enc); + break; + default: assert(false, "%s", type2name(to_elem_bt)); + } } -void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) { - assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); - +void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, + Register rscratch, int vec_enc) { evcvttps2qq(dst, src, vec_enc); - vector_cast_float_to_long_special_cases_evex(dst, src, float_sign_flip, vec_enc, - xtmp1, xtmp2, ktmp1, ktmp2, rscratch); + vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); } -void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) { - assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); +// Handling for downcasting from double to integer or sub-word types on AVX2. 
+void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, + AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { + int to_elem_sz = type2aelembytes(to_elem_bt); + assert(to_elem_sz < 8, ""); + vcvttpd2dq(dst, src, vec_enc); + vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, + float_sign_flip, vec_enc); + if (to_elem_sz < 4) { + // xtmp4 holds all zero lanes. + vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); + } +} - vector_castD2L_evex(dst, src, double_sign_flip, vec_enc, - xtmp1, xtmp2, ktmp1, ktmp2, rscratch); - if (to_elem_bt != T_LONG) { +void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, + XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, + KRegister ktmp2, AddressLiteral sign_flip, + Register rscratch, int vec_enc) { + if (VM_Version::supports_avx512dq()) { + evcvttpd2qq(dst, src, vec_enc); + vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); switch(to_elem_bt) { + case T_LONG: + break; case T_INT: evpmovsqd(dst, dst, vec_enc); break; @@ -4530,6 +4651,21 @@ void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister ds break; default: assert(false, "%s", type2name(to_elem_bt)); } + } else { + assert(type2aelembytes(to_elem_bt) <= 4, ""); + vcvttpd2dq(dst, src, vec_enc); + vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); + switch(to_elem_bt) { + case T_INT: + break; + case T_SHORT: + evpmovdw(dst, dst, vec_enc); + break; + case T_BYTE: + evpmovdb(dst, dst, vec_enc); + break; + default: assert(false, "%s", type2name(to_elem_bt)); + } } } @@ -4545,8 +4681,8 @@ void C2_MacroAssembler::vector_round_double_evex(XMMRegister 
dst, XMMRegister sr evpbroadcastq(xtmp1, tmp, vec_enc); vaddpd(xtmp1, src , xtmp1, vec_enc); evcvtpd2qq(dst, xtmp1, vec_enc); - vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc, - xtmp1, xtmp2, ktmp1, ktmp2, tmp); + vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, + double_sign_flip, vec_enc);; ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); } @@ -4563,8 +4699,8 @@ void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src vbroadcastss(xtmp1, xtmp1, vec_enc); vaddps(xtmp1, src , xtmp1, vec_enc); vcvtps2dq(dst, xtmp1, vec_enc); - vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc, - xtmp1, xtmp2, ktmp1, ktmp2, tmp); + vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, + float_sign_flip, vec_enc); ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); } @@ -4581,8 +4717,7 @@ void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, vbroadcastss(xtmp1, xtmp1, vec_enc); vaddps(xtmp1, src , xtmp1, vec_enc); vcvtps2dq(dst, xtmp1, vec_enc); - vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc, - xtmp1, xtmp2, xtmp3, xtmp4, tmp); + vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); } diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index a7c5cee5847..4d063f0ae3c 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -308,36 +308,59 @@ public: void masked_op(int ideal_opc, int mask_len, KRegister dst, KRegister src1, KRegister src2); - void vector_castF2I_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc, - 
XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, Register rscratch = noreg); + void vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, + BasicType from_elem_bt, BasicType to_elem_bt); - void vector_castF2I_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch = noreg); + void vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, + XMMRegister xtmp, Register rscratch, int vec_enc); - void vector_castF2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch = noreg); + void vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, + AddressLiteral float_sign_flip, Register rscratch, int vec_enc); - void vector_castD2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch = noreg ); + void vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, + Register rscratch, int vec_enc); - void vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch = noreg); + void vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, + Register rscratch, int vec_enc); - void vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, BasicType from_elem_bt, BasicType 
to_elem_bt); + void vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral sign_flip, + Register rscratch, int vec_enc); - void vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch = noreg ); + void vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, + AddressLiteral float_sign_flip, Register rscratch, int vec_enc); - void vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch = noreg); - void vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, - Register rscratch = noreg); + void vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, + AddressLiteral float_sign_flip, int vec_enc); - void vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc, - XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, - Register rscratch = noreg); + void vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, Register rscratch, AddressLiteral float_sign_flip, + int vec_enc); + + void vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 
+ KRegister ktmp1, KRegister ktmp2, Register rscratch, AddressLiteral double_sign_flip, + int vec_enc); + + void vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, Register rscratch, AddressLiteral float_sign_flip, + int vec_enc); + + void vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, Register rscratch, AddressLiteral double_sign_flip, + int vec_enc); + + void vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, + XMMRegister xtmp4, Register rscratch, AddressLiteral float_sign_flip, + int vec_enc); + + void vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, + XMMRegister xtmp, int index, int vec_enc); #ifdef _LP64 void vector_round_double_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, diff --git a/src/hotspot/cpu/x86/matcher_x86.hpp b/src/hotspot/cpu/x86/matcher_x86.hpp index a9cd1b37d49..f24aff27b79 100644 --- a/src/hotspot/cpu/x86/matcher_x86.hpp +++ b/src/hotspot/cpu/x86/matcher_x86.hpp @@ -184,10 +184,17 @@ static const bool supports_encode_ascii_array = true; // Returns pre-selection estimated size of a vector operation. + // Currently, it's a rudimentary heuristic based on emitted code size for complex + // IR nodes used by unroll policy. Idea is to constrain unrolling factor and prevent + // generating bloated loop bodies. static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) { switch(vopc) { default: return 0; + case Op_VectorCastF2X: // fall through + case Op_VectorCastD2X: { + return is_floating_point_type(ety) ? 0 : (is_subword_type(ety) ? 
35 : 30); + } case Op_CountTrailingZerosV: case Op_CountLeadingZerosV: return VM_Version::supports_avx512cd() && (ety == T_INT || ety == T_LONG) ? 0 : 40; diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index f2924e33def..7637b3d4981 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1392,6 +1392,8 @@ Assembler::Width widthForType(BasicType bt) { static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); } static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); } static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); } + static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();} + static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();} //============================================================================= const bool Matcher::match_rule_supported(int opcode) { @@ -1883,16 +1885,19 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType return false; } break; + case Op_VectorCastF2X: { + // As per JLS section 5.1.3 narrowing conversion to sub-word types + // happen after intermediate conversion to integer and special handling + // code needs AVX2 vpcmpeqd instruction for 256 bit vectors. + int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte; + if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) { + return false; + } + } + // fallthrough case Op_VectorCastD2X: - // Conversion to integral type is only supported on AVX-512 platforms with avx512dq. 
- // Need avx512vl for size_in_bits < 512 - if (is_integral_type(bt)) { - if (!VM_Version::supports_avx512dq()) { - return false; - } - if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) { - return false; - } + if (bt == T_LONG && !VM_Version::supports_avx512dq()) { + return false; } break; case Op_RoundVD: @@ -1900,23 +1905,6 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType return false; } break; - case Op_VectorCastF2X: - // F2I is supported on all AVX and above platforms - // For conversion to other integral types need AVX512: - // Conversion to long in addition needs avx512dq - // Need avx512vl for size_in_bits < 512 - if (is_integral_type(bt) && (bt != T_INT)) { - if (UseAVX <= 2) { - return false; - } - if ((bt == T_LONG) && !VM_Version::supports_avx512dq()) { - return false; - } - if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) { - return false; - } - } - break; case Op_MulReductionVI: if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) { return false; @@ -7376,66 +7364,47 @@ instruct vcastFtoD_reg(vec dst, vec src) %{ %} -instruct castFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{ - // F2I conversion for < 64 byte vector using AVX instructions - // AVX512 platforms that dont support avx512vl also use AVX instructions to support F2I - predicate(!VM_Version::supports_avx512vl() && - Matcher::vector_length_in_bytes(n) < 64 && - Matcher::vector_element_basic_type(n) == T_INT); +instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{ + predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4); match(Set dst (VectorCastF2X src)); effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr); - format %{ "vector_cast_f2i $dst,$src\t! 
using $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %} + format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %} ins_encode %{ - int vlen_enc = vector_length_encoding(this); - __ vector_castF2I_avx($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vlen_enc, - $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct castFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{ - predicate((VM_Version::supports_avx512vl() || - Matcher::vector_length_in_bytes(n) == 64) && - Matcher::vector_element_basic_type(n) == T_INT); - match(Set dst (VectorCastF2X src)); - effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr); - format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vlen_enc, - $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister); + int vlen_enc = vector_length_encoding(this, $src); + BasicType to_elem_bt = Matcher::vector_element_basic_type(this); + // JDK-8292878 removed the need for an explicit scratch register to load addresses wider than + // 32 bits in register indirect addressing mode, since stub constants + // are part of the code cache and ReservedCodeCacheSize is currently capped at 2G. + // Targets are free to raise this limit, but a code cache + // larger than 2G looks unreasonable in practice; on the flip side, with the given + // cap we save a temporary register allocation, which in the limiting case can prevent + // spilling in blocks with high register pressure.
+ __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, + ExternalAddress(vector_float_signflip()), noreg, vlen_enc); %} ins_pipe( pipe_slow ); %} instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{ - // F2X conversion for integral non T_INT target using AVX512 instructions - // Platforms that dont support avx512vl can only support 64 byte vectors - predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && - Matcher::vector_element_basic_type(n) != T_INT); + predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) && + is_integral_type(Matcher::vector_element_basic_type(n))); match(Set dst (VectorCastF2X src)); effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr); - format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %} + format %{ "vector_cast_f2x $dst,$src\t! 
using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %} ins_encode %{ BasicType to_elem_bt = Matcher::vector_element_basic_type(this); if (to_elem_bt == T_LONG) { int vlen_enc = vector_length_encoding(this); - __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vlen_enc, - $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister); + __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, + ExternalAddress(vector_double_signflip()), noreg, vlen_enc); } else { int vlen_enc = vector_length_encoding(this, $src); - __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vlen_enc, - $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister); - if (to_elem_bt == T_SHORT) { - __ evpmovdw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); - } else { - assert(to_elem_bt == T_BYTE, "required"); - __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); - } + __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, + ExternalAddress(vector_float_signflip()), noreg, vlen_enc); } %} ins_pipe( pipe_slow ); @@ -7452,17 +7421,35 @@ instruct vcastDtoF_reg(vec dst, vec src) %{ ins_pipe( pipe_slow ); %} -instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{ - predicate(is_integral_type(Matcher::vector_element_basic_type(n))); +instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{ + predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 && + is_integral_type(Matcher::vector_element_basic_type(n))); match(Set dst (VectorCastD2X src)); - effect(TEMP dst, TEMP xtmp1, TEMP 
xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr); - format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %} + effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr); + format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %} ins_encode %{ int vlen_enc = vector_length_encoding(this, $src); BasicType to_elem_bt = Matcher::vector_element_basic_type(this); - __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vlen_enc, - $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister); + __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister, + ExternalAddress(vector_float_signflip()), noreg, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{ + predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) && + is_integral_type(Matcher::vector_element_basic_type(n))); + match(Set dst (VectorCastD2X src)); + effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr); + format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this, $src); + BasicType to_elem_bt = Matcher::vector_element_basic_type(this); + AddressLiteral signflip = VM_Version::supports_avx512dq() ? 
ExternalAddress(vector_double_signflip()) : + ExternalAddress(vector_float_signflip()); + __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc); %} ins_pipe( pipe_slow ); %} diff --git a/src/hotspot/share/opto/loopTransform.cpp b/src/hotspot/share/opto/loopTransform.cpp index cabf62a96d6..b4ba8decbda 100644 --- a/src/hotspot/share/opto/loopTransform.cpp +++ b/src/hotspot/share/opto/loopTransform.cpp @@ -1004,6 +1004,8 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) { case Op_ReverseV: case Op_RoundVF: case Op_RoundVD: + case Op_VectorCastD2X: + case Op_VectorCastF2X: case Op_PopCountVI: case Op_PopCountVL: { const TypeVect* vt = n->bottom_type()->is_vect(); diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorFPtoIntCastTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorFPtoIntCastTest.java index 3e0bdbb4524..ddc543e4a33 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorFPtoIntCastTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorFPtoIntCastTest.java @@ -27,7 +27,6 @@ * @summary Test float/double to integral cast * @modules jdk.incubator.vector * @requires vm.compiler2.enabled -* @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx512dq.*") * @library /test/lib / * @run driver compiler.vectorapi.VectorFPtoIntCastTest */ @@ -87,7 +86,7 @@ public class VectorFPtoIntCastTest { } @Test - @IR(counts = {IRNode.VECTOR_CAST_F2X, "> 0"}) + @IR(counts = {IRNode.VECTOR_CAST_F2X, "> 0"}, applyIfCPUFeature = {"avx512f", "true"}) public void float2int() { var cvec = (IntVector)fvec512.convertShape(VectorOperators.F2I, ispec512, 0); cvec.intoArray(int_arr, 0); @@ -104,7 +103,7 @@ public class VectorFPtoIntCastTest { } @Test - @IR(counts = {IRNode.VECTOR_CAST_F2X, "> 0"}) + @IR(counts = {IRNode.VECTOR_CAST_F2X, "> 0"}, applyIfCPUFeature = {"avx512dq", "true"}) public void float2long() { var cvec = 
(LongVector)fvec512.convertShape(VectorOperators.F2L, lspec512, 0); cvec.intoArray(long_arr, 0); @@ -121,7 +120,7 @@ public class VectorFPtoIntCastTest { } @Test - @IR(counts = {IRNode.VECTOR_CAST_F2X, "> 0"}) + @IR(counts = {IRNode.VECTOR_CAST_F2X, "> 0"}, applyIfCPUFeature = {"avx512f", "true"}) public void float2short() { var cvec = (ShortVector)fvec512.convertShape(VectorOperators.F2S, sspec256, 0); cvec.intoArray(short_arr, 0); @@ -138,7 +137,7 @@ public class VectorFPtoIntCastTest { } @Test - @IR(counts = {IRNode.VECTOR_CAST_F2X, "> 0"}) + @IR(counts = {IRNode.VECTOR_CAST_F2X, "> 0"}, applyIfCPUFeature = {"avx512f", "true"}) public void float2byte() { var cvec = (ByteVector)fvec512.convertShape(VectorOperators.F2B, bspec128, 0); cvec.intoArray(byte_arr, 0); @@ -155,7 +154,7 @@ public class VectorFPtoIntCastTest { } @Test - @IR(counts = {IRNode.VECTOR_CAST_D2X, "> 0"}) + @IR(counts = {IRNode.VECTOR_CAST_D2X, "> 0"}, applyIfCPUFeature = {"avx512f", "true"}) public void double2int() { var cvec = (IntVector)dvec512.convertShape(VectorOperators.D2I, ispec256, 0); cvec.intoArray(int_arr, 0); @@ -172,7 +171,7 @@ public class VectorFPtoIntCastTest { } @Test - @IR(counts = {IRNode.VECTOR_CAST_D2X, "> 0"}) + @IR(counts = {IRNode.VECTOR_CAST_D2X, "> 0"}, applyIfCPUFeature = {"avx512dq", "true"}) public void double2long() { var cvec = (LongVector)dvec512.convertShape(VectorOperators.D2L, lspec512, 0); cvec.intoArray(long_arr, 0); @@ -189,7 +188,7 @@ public class VectorFPtoIntCastTest { } @Test - @IR(counts = {IRNode.VECTOR_CAST_D2X, "> 0"}) + @IR(counts = {IRNode.VECTOR_CAST_D2X, "> 0"}, applyIfCPUFeature = {"avx512f", "true"}) public void double2short() { var cvec = (ShortVector)dvec512.convertShape(VectorOperators.D2S, sspec128, 0); cvec.intoArray(short_arr, 0); @@ -206,7 +205,7 @@ public class VectorFPtoIntCastTest { } @Test - @IR(counts = {IRNode.VECTOR_CAST_D2X, "> 0"}) + @IR(counts = {IRNode.VECTOR_CAST_D2X, "> 0"}, applyIfCPUFeature = {"avx512f", "true"}) public 
void double2byte() { var cvec = (ByteVector)dvec512.convertShape(VectorOperators.D2B, bspec64, 0); cvec.intoArray(byte_arr, 0); diff --git a/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java b/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java index 194295b2d2f..2c1c765f4bf 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java +++ b/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java @@ -62,8 +62,11 @@ public class TestCastMethods { makePair(FSPEC128, ISPEC128), makePair(FSPEC64, DSPEC128), makePair(FSPEC128, DSPEC256), + makePair(FSPEC128, ISPEC128), + makePair(FSPEC128, SSPEC64), makePair(DSPEC128, FSPEC64), makePair(DSPEC256, FSPEC128), + makePair(DSPEC128, ISPEC64), makePair(BSPEC64, SSPEC64, true), makePair(BSPEC64, SSPEC128, true), makePair(BSPEC64, ISPEC128, true), @@ -74,6 +77,11 @@ public class TestCastMethods { ); public static final List AVX2_CAST_TESTS = Stream.concat(AVX1_CAST_TESTS.stream(), Stream.of( + makePair(DSPEC256, ISPEC128), + makePair(DSPEC256, SSPEC64), + makePair(FSPEC256, ISPEC256), + makePair(FSPEC256, SSPEC128), + makePair(FSPEC256, BSPEC64), makePair(BSPEC128, SSPEC256), makePair(BSPEC64, ISPEC256), makePair(BSPEC64, LSPEC256), @@ -89,7 +97,6 @@ public class TestCastMethods { makePair(LSPEC256, BSPEC64), makePair(LSPEC256, SSPEC64), makePair(LSPEC256, ISPEC128), - makePair(FSPEC256, ISPEC256), makePair(BSPEC128, SSPEC256, true), makePair(BSPEC64, ISPEC256, true), makePair(BSPEC64, LSPEC256, true), @@ -115,9 +122,14 @@ public class TestCastMethods { makePair(LSPEC512, BSPEC64), makePair(LSPEC512, SSPEC128), makePair(LSPEC512, ISPEC256), - makePair(FSPEC512, ISPEC512), makePair(FSPEC256, DSPEC512), makePair(DSPEC512, FSPEC256), + makePair(DSPEC512, ISPEC256), + makePair(DSPEC512, SSPEC128), + makePair(DSPEC512, BSPEC64), + makePair(FSPEC512, ISPEC512), + makePair(FSPEC512, SSPEC256), + makePair(FSPEC512, BSPEC128), makePair(BSPEC128, 
ISPEC512, true), makePair(BSPEC64, LSPEC512, true), makePair(SSPEC256, ISPEC512, true), diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java index 02138975ed6..0318971ffef 100644 --- a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java +++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java @@ -24,6 +24,7 @@ package org.openjdk.bench.jdk.incubator.vector; +import java.util.Random; import jdk.incubator.vector.*; import java.util.concurrent.TimeUnit; import org.openjdk.jmh.annotations.*; @@ -32,65 +33,630 @@ import org.openjdk.jmh.annotations.*; @State(Scope.Thread) @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public class VectorFPtoIntCastOperations { + @Param({"512", "1024"}) + static int SIZE; - FloatVector fvec256; - FloatVector fvec512; - DoubleVector dvec512; - - static final float [] float_arr = { - 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, - 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f + static final float [] float_sp_vals = { + Float.NaN, + Float.POSITIVE_INFINITY, + Float.NEGATIVE_INFINITY, + 0.0f, + -0.0f }; - static final double [] double_arr = { - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, - 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 + static final double [] double_sp_vals = { + Double.NaN, + Double.POSITIVE_INFINITY, + Double.NEGATIVE_INFINITY, + 0.0, + -0.0 }; + static float [] float_arr; + + static double [] double_arr; + + static long [] long_res; + + static int [] int_res; + + static short [] short_res; + + static byte [] byte_res; + @Setup(Level.Trial) public void BmSetup() { - fvec256 = FloatVector.fromArray(FloatVector.SPECIES_256, float_arr, 0); - fvec512 = FloatVector.fromArray(FloatVector.SPECIES_512, float_arr, 0); - dvec512 = DoubleVector.fromArray(DoubleVector.SPECIES_512, double_arr, 0); + Random r = new Random(1024); 
+ float_arr = new float[SIZE]; + double_arr = new double[SIZE]; + long_res = new long[SIZE]; + int_res = new int[SIZE * 2]; + short_res = new short[SIZE * 4]; + byte_res = new byte[SIZE * 8]; + for(int i = 0; i < SIZE; i++) { + float_arr[i] = SIZE * r.nextFloat(); + double_arr[i] = SIZE * r.nextDouble(); + } + for(int i = 0 ; i < SIZE; i += 100) { + System.arraycopy(float_sp_vals, 0, float_arr, i, float_sp_vals.length); + System.arraycopy(double_sp_vals, 0, double_arr, i, double_sp_vals.length); + } + } + + + @Benchmark + public void microFloat128ToByte128() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = ByteVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2B, OSPECIES, 0) + .reinterpretAsBytes() + .intoArray(byte_res, j); + } } @Benchmark - public Vector microFloat2Int() { - return fvec512.convertShape(VectorOperators.F2I, IntVector.SPECIES_512, 0); + public void microFloat128ToByte256() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = ByteVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 8); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2B, OSPECIES, 0) + .reinterpretAsBytes() + .intoArray(byte_res, j); + } } @Benchmark - public Vector microFloat2Long() { - return fvec256.convertShape(VectorOperators.F2L, LongVector.SPECIES_512, 0); + public void microFloat128ToByte512() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = ByteVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 16); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2B, OSPECIES, 0) + .reinterpretAsBytes() + .intoArray(byte_res, j); + } 
} @Benchmark - public Vector microFloat2Short() { - return fvec512.convertShape(VectorOperators.F2S, ShortVector.SPECIES_256, 0); + public void microFloat128ToShort128() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = ShortVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2S, OSPECIES, 0) + .reinterpretAsShorts() + .intoArray(short_res, j); + } } @Benchmark - public Vector microFloat2Byte() { - return fvec512.convertShape(VectorOperators.F2B, ByteVector.SPECIES_128, 0); + public void microFloat128ToShort256() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = ShortVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2S, OSPECIES, 0) + .reinterpretAsShorts() + .intoArray(short_res, j); + } } @Benchmark - public Vector microDouble2Int() { - return dvec512.convertShape(VectorOperators.D2I, IntVector.SPECIES_256, 0); + public void microFloat128ToShort512() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = ShortVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 8); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2S, OSPECIES, 0) + .reinterpretAsShorts() + .intoArray(short_res, j); + } } @Benchmark - public Vector microDouble2Long() { - return dvec512.convertShape(VectorOperators.D2L, LongVector.SPECIES_512, 0); + public void microFloat128ToInteger128() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = IntVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { + 
FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2I, OSPECIES, 0) + .reinterpretAsInts() + .intoArray(int_res, j); + } } @Benchmark - public Vector microDouble2Short() { - return dvec512.convertShape(VectorOperators.D2S, ShortVector.SPECIES_128, 0); + public void microFloat128ToInteger256() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = IntVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2I, OSPECIES, 0) + .reinterpretAsInts() + .intoArray(int_res, j); + } } @Benchmark - public Vector microDouble2Byte() { - return dvec512.convertShape(VectorOperators.D2B, ByteVector.SPECIES_64, 0); + public void microFloat128ToInteger512() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = IntVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2I, OSPECIES, 0) + .reinterpretAsInts() + .intoArray(int_res, j); + } + } + + @Benchmark + public void microFloat128ToLong128() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = LongVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2L, OSPECIES, 0) + .reinterpretAsLongs() + .intoArray(long_res, j); + } + } + + @Benchmark + public void microFloat128ToLong256() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = LongVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2L, 
OSPECIES, 0) + .reinterpretAsLongs() + .intoArray(long_res, j); + } + } + + @Benchmark + public void microFloat128ToLong512() { + VectorSpecies ISPECIES = FloatVector.SPECIES_128; + VectorSpecies OSPECIES = LongVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2L, OSPECIES, 0) + .reinterpretAsLongs() + .intoArray(long_res, j); + } + } + + @Benchmark + public void microFloat256ToByte128() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = ByteVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2B, OSPECIES, 0) + .reinterpretAsBytes() + .intoArray(byte_res, j); + } + } + + @Benchmark + public void microFloat256ToByte256() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = ByteVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2B, OSPECIES, 0) + .reinterpretAsBytes() + .intoArray(byte_res, j); + } + } + + @Benchmark + public void microFloat256ToByte512() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = ByteVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 8); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2B, OSPECIES, 0) + .reinterpretAsBytes() + .intoArray(byte_res, j); + } + } + + @Benchmark + public void microFloat256ToShort128() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = ShortVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += 
ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2S, OSPECIES, 0) + .reinterpretAsShorts() + .intoArray(short_res, j); + } + } + + @Benchmark + public void microFloat256ToShort256() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = ShortVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2S, OSPECIES, 0) + .reinterpretAsShorts() + .intoArray(short_res, j); + } + } + + @Benchmark + public void microFloat256ToShort512() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = ShortVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2S, OSPECIES, 0) + .reinterpretAsShorts() + .intoArray(short_res, j); + } + } + + @Benchmark + public void microFloat256ToInteger128() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = IntVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2I, OSPECIES, 0) + .reinterpretAsInts() + .intoArray(int_res, j); + } + } + + @Benchmark + public void microFloat256ToInteger256() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = IntVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2I, OSPECIES, 0) + .reinterpretAsInts() + .intoArray(int_res, j); + } + } + + @Benchmark + public void microFloat256ToInteger512() { + VectorSpecies ISPECIES = 
FloatVector.SPECIES_256; + VectorSpecies OSPECIES = IntVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2I, OSPECIES, 0) + .reinterpretAsInts() + .intoArray(int_res, j); + } + } + + @Benchmark + public void microFloat256ToLong128() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = LongVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2L, OSPECIES, 0) + .reinterpretAsLongs() + .intoArray(long_res, j); + } + } + + @Benchmark + public void microFloat256ToLong256() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = LongVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2L, OSPECIES, 0) + .reinterpretAsLongs() + .intoArray(long_res, j); + } + } + + @Benchmark + public void microFloat256ToLong512() { + VectorSpecies ISPECIES = FloatVector.SPECIES_256; + VectorSpecies OSPECIES = LongVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { + FloatVector.fromArray(ISPECIES, float_arr, i) + .convertShape(VectorOperators.F2L, OSPECIES, 0) + .reinterpretAsLongs() + .intoArray(long_res, j); + } + } + + @Benchmark + public void microDouble128ToByte128() { + VectorSpecies ISPECIES = DoubleVector.SPECIES_128; + VectorSpecies OSPECIES = ByteVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 8); i += ISPECIES.length(), j += OSPECIES.length()) { + DoubleVector.fromArray(ISPECIES, double_arr, i) + .convertShape(VectorOperators.D2B, OSPECIES, 0) + .reinterpretAsBytes() + 
.intoArray(byte_res, j); + } + } + + @Benchmark + public void microDouble128ToByte256() { + VectorSpecies ISPECIES = DoubleVector.SPECIES_128; + VectorSpecies OSPECIES = ByteVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 16); i += ISPECIES.length(), j += OSPECIES.length()) { + DoubleVector.fromArray(ISPECIES, double_arr, i) + .convertShape(VectorOperators.D2B, OSPECIES, 0) + .reinterpretAsBytes() + .intoArray(byte_res, j); + } + } + + @Benchmark + public void microDouble128ToByte512() { + VectorSpecies ISPECIES = DoubleVector.SPECIES_128; + VectorSpecies OSPECIES = ByteVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 32); i += ISPECIES.length(), j += OSPECIES.length()) { + DoubleVector.fromArray(ISPECIES, double_arr, i) + .convertShape(VectorOperators.D2B, OSPECIES, 0) + .reinterpretAsBytes() + .intoArray(byte_res, j); + } + } + + @Benchmark + public void microDouble128ToShort128() { + VectorSpecies ISPECIES = DoubleVector.SPECIES_128; + VectorSpecies OSPECIES = ShortVector.SPECIES_128; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { + DoubleVector.fromArray(ISPECIES, double_arr, i) + .convertShape(VectorOperators.D2S, OSPECIES, 0) + .reinterpretAsShorts() + .intoArray(short_res, j); + } + } + + @Benchmark + public void microDouble128ToShort256() { + VectorSpecies ISPECIES = DoubleVector.SPECIES_128; + VectorSpecies OSPECIES = ShortVector.SPECIES_256; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 8); i += ISPECIES.length(), j += OSPECIES.length()) { + DoubleVector.fromArray(ISPECIES, double_arr, i) + .convertShape(VectorOperators.D2S, OSPECIES, 0) + .reinterpretAsShorts() + .intoArray(short_res, j); + } + } + + @Benchmark + public void microDouble128ToShort512() { + VectorSpecies ISPECIES = DoubleVector.SPECIES_128; + VectorSpecies OSPECIES = ShortVector.SPECIES_512; + for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 16); i += 
ISPECIES.length(), j += OSPECIES.length()) {
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2S, OSPECIES, 0)
+                .reinterpretAsShorts()
+                .intoArray(short_res, j);
+        }
+    }
+
+    @Benchmark /* D2I cast: double_arr is read as DoubleVector.SPECIES_128 and the converted vector is stored to int_res as IntVector.SPECIES_128. */
+    public void microDouble128ToInteger128() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_128;
+        VectorSpecies OSPECIES = IntVector.SPECIES_128;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 2 bound on i presumably keeps j within int_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2I, OSPECIES, 0)
+                .reinterpretAsInts()
+                .intoArray(int_res, j);
+        }
+    }
+
+    @Benchmark /* D2I cast: double_arr is read as DoubleVector.SPECIES_128 and the converted vector is stored to int_res as IntVector.SPECIES_256. */
+    public void microDouble128ToInteger256() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_128;
+        VectorSpecies OSPECIES = IntVector.SPECIES_256;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 4 bound on i presumably keeps j within int_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2I, OSPECIES, 0)
+                .reinterpretAsInts()
+                .intoArray(int_res, j);
+        }
+    }
+
+    @Benchmark /* D2I cast: double_arr is read as DoubleVector.SPECIES_128 and the converted vector is stored to int_res as IntVector.SPECIES_512. */
+    public void microDouble128ToInteger512() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_128;
+        VectorSpecies OSPECIES = IntVector.SPECIES_512;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 8); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 8 bound on i presumably keeps j within int_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2I, OSPECIES, 0)
+                .reinterpretAsInts()
+                .intoArray(int_res, j);
+        }
+    }
+
+    @Benchmark /* D2L cast: double_arr is read as DoubleVector.SPECIES_128 and the converted vector is stored to long_res as LongVector.SPECIES_128. */
+    public void microDouble128ToLong128() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_128;
+        VectorSpecies OSPECIES = LongVector.SPECIES_128;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): i is bounded by SIZE itself; presumably j then also stays within long_res - confirm against the array sizes declared outside this chunk. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2L, OSPECIES, 0)
+                .reinterpretAsLongs()
+                .intoArray(long_res, j);
+        }
+    }
+
+    @Benchmark /* D2L cast: double_arr is read as DoubleVector.SPECIES_128 and the converted vector is stored to long_res as LongVector.SPECIES_256. */
+    public void microDouble128ToLong256() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_128;
+        VectorSpecies OSPECIES = LongVector.SPECIES_256;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 2 bound on i presumably keeps j within long_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2L, OSPECIES, 0)
+                .reinterpretAsLongs()
+                .intoArray(long_res, j);
+        }
+    }
+
+    @Benchmark /* D2L cast: double_arr is read as DoubleVector.SPECIES_128 and the converted vector is stored to long_res as LongVector.SPECIES_512. */
+    public void microDouble128ToLong512() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_128;
+        VectorSpecies OSPECIES = LongVector.SPECIES_512;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 4 bound on i presumably keeps j within long_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2L, OSPECIES, 0)
+                .reinterpretAsLongs()
+                .intoArray(long_res, j);
+        }
+    }
+
+    @Benchmark /* D2B cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to byte_res as ByteVector.SPECIES_128. */
+    public void microDouble256ToByte128() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = ByteVector.SPECIES_128;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 4 bound on i presumably keeps j within byte_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2B, OSPECIES, 0)
+                .reinterpretAsBytes()
+                .intoArray(byte_res, j);
+        }
+    }
+
+    @Benchmark /* D2B cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to byte_res as ByteVector.SPECIES_256. */
+    public void microDouble256ToByte256() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = ByteVector.SPECIES_256;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 8); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 8 bound on i presumably keeps j within byte_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2B, OSPECIES, 0)
+                .reinterpretAsBytes()
+                .intoArray(byte_res, j);
+        }
+    }
+
+    @Benchmark /* D2B cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to byte_res as ByteVector.SPECIES_512. */
+    public void microDouble256ToByte512() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = ByteVector.SPECIES_512;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 16); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 16 bound on i presumably keeps j within byte_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2B, OSPECIES, 0)
+                .reinterpretAsBytes()
+                .intoArray(byte_res, j);
+        }
+    }
+
+    @Benchmark /* D2S cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to short_res as ShortVector.SPECIES_128. */
+    public void microDouble256ToShort128() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = ShortVector.SPECIES_128;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 2 bound on i presumably keeps j within short_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2S, OSPECIES, 0)
+                .reinterpretAsShorts()
+                .intoArray(short_res, j);
+        }
+    }
+
+    @Benchmark /* D2S cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to short_res as ShortVector.SPECIES_256. */
+    public void microDouble256ToShort256() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = ShortVector.SPECIES_256;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 4 bound on i presumably keeps j within short_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2S, OSPECIES, 0)
+                .reinterpretAsShorts()
+                .intoArray(short_res, j);
+        }
+    }
+
+    @Benchmark /* D2S cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to short_res as ShortVector.SPECIES_512. */
+    public void microDouble256ToShort512() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = ShortVector.SPECIES_512;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 8); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 8 bound on i presumably keeps j within short_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2S, OSPECIES, 0)
+                .reinterpretAsShorts()
+                .intoArray(short_res, j);
+        }
+    }
+
+    @Benchmark /* D2I cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to int_res as IntVector.SPECIES_128. */
+    public void microDouble256ToInteger128() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = IntVector.SPECIES_128;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): i is bounded by SIZE itself; presumably j then also stays within int_res - confirm against the array sizes declared outside this chunk. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2I, OSPECIES, 0)
+                .reinterpretAsInts()
+                .intoArray(int_res, j);
+        }
+    }
+
+    @Benchmark /* D2I cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to int_res as IntVector.SPECIES_256. */
+    public void microDouble256ToInteger256() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = IntVector.SPECIES_256;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 2 bound on i presumably keeps j within int_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2I, OSPECIES, 0)
+                .reinterpretAsInts()
+                .intoArray(int_res, j);
+        }
+    }
+
+    @Benchmark /* D2I cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to int_res as IntVector.SPECIES_512. */
+    public void microDouble256ToInteger512() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = IntVector.SPECIES_512;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 4); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 4 bound on i presumably keeps j within int_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2I, OSPECIES, 0)
+                .reinterpretAsInts()
+                .intoArray(int_res, j);
+        }
+    }
+
+    @Benchmark /* D2L cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to long_res as LongVector.SPECIES_128. */
+    public void microDouble256ToLong128() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = LongVector.SPECIES_128;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): i is bounded by SIZE itself; presumably j then also stays within long_res - confirm against the array sizes declared outside this chunk. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2L, OSPECIES, 0)
+                .reinterpretAsLongs()
+                .intoArray(long_res, j);
+        }
+    }
+
+    @Benchmark /* D2L cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to long_res as LongVector.SPECIES_256. */
+    public void microDouble256ToLong256() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = LongVector.SPECIES_256;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): i is bounded by SIZE itself; presumably j then also stays within long_res - confirm against the array sizes declared outside this chunk. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2L, OSPECIES, 0)
+                .reinterpretAsLongs()
+                .intoArray(long_res, j);
+        }
+    }
+
+    @Benchmark /* D2L cast: double_arr is read as DoubleVector.SPECIES_256 and the converted vector is stored to long_res as LongVector.SPECIES_512. */
+    public void microDouble256ToLong512() {
+        VectorSpecies ISPECIES = DoubleVector.SPECIES_256;
+        VectorSpecies OSPECIES = LongVector.SPECIES_512;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) { /* NOTE(review): the SIZE / 2 bound on i presumably keeps j within long_res; array sizes are declared outside this chunk - confirm. */
+            DoubleVector.fromArray(ISPECIES, double_arr, i)
+                .convertShape(VectorOperators.D2L, OSPECIES, 0)
+                .reinterpretAsLongs()
+                .intoArray(long_res, j);
+        }
     }
 }