diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 3505e081d38..fbf95ae02b5 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -1995,6 +1995,13 @@ void Assembler::cvttsd2sil(Register dst, XMMRegister src) { emit_int16(0x2C, (0xC0 | encode)); } +void Assembler::cvtss2sil(Register dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); + emit_int16(0x2D, (0xC0 | encode)); +} + void Assembler::cvttss2sil(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); @@ -2088,6 +2095,21 @@ void Assembler::vcvttps2dq(XMMRegister dst, XMMRegister src, int vector_len) { emit_int16(0x5B, (0xC0 | encode)); } +void Assembler::vcvtps2dq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len <= AVX_256bit ? 
VM_Version::supports_avx() : VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x5B, (0xC0 | encode)); +} + +void Assembler::evcvtpd2qq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x7B, (0xC0 | encode)); +} + void Assembler::evcvtqq2ps(XMMRegister dst, XMMRegister src, int vector_len) { assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -6493,7 +6515,6 @@ void Assembler::vrndscalepd(XMMRegister dst, Address src, int32_t rmode, int vec emit_int8((rmode)); } - void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -12285,6 +12306,13 @@ void Assembler::cvttsd2siq(Register dst, XMMRegister src) { emit_int16(0x2C, (0xC0 | encode)); } +void Assembler::cvtsd2siq(Register dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); + emit_int16(0x2D, (0xC0 | 
encode)); +} + void Assembler::cvttss2siq(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 7141e4b96c4..6af93b52fc6 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -1149,6 +1149,7 @@ private: void cvtss2sd(XMMRegister dst, Address src); // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer + void cvtsd2siq(Register dst, XMMRegister src); void cvttsd2sil(Register dst, Address src); void cvttsd2sil(Register dst, XMMRegister src); void cvttsd2siq(Register dst, Address src); @@ -1157,6 +1158,7 @@ private: // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer void cvttss2sil(Register dst, XMMRegister src); void cvttss2siq(Register dst, XMMRegister src); + void cvtss2sil(Register dst, XMMRegister src); // Convert vector double to int void cvttpd2dq(XMMRegister dst, XMMRegister src); @@ -1166,6 +1168,7 @@ private: void vcvtpd2ps(XMMRegister dst, XMMRegister src, int vector_len); // Convert vector float and int + void vcvtps2dq(XMMRegister dst, XMMRegister src, int vector_len); void vcvttps2dq(XMMRegister dst, XMMRegister src, int vector_len); // Convert vector long to vector FP @@ -1173,6 +1176,7 @@ private: void evcvtqq2pd(XMMRegister dst, XMMRegister src, int vector_len); // Convert vector double to long + void evcvtpd2qq(XMMRegister dst, XMMRegister src, int vector_len); void evcvttpd2qq(XMMRegister dst, XMMRegister src, int vector_len); // Evex casts with truncation diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index 6d8b9101303..c4411be23cf 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ 
b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -4061,41 +4061,18 @@ void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, } /* - * Algorithm for vector D2L and F2I conversions:- - * a) Perform vector D2L/F2I cast. - * b) Choose fast path if none of the result vector lane contains 0x80000000 value. - * It signifies that source value could be any of the special floating point - * values(NaN,-Inf,Inf,Max,-Min). - * c) Set destination to zero if source is NaN value. - * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. + * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. + * If src is NaN, the result is 0. + * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, + * the result is equal to the value of Integer.MIN_VALUE. + * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, + * the result is equal to the value of Integer.MAX_VALUE. 
*/ - -void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, - KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, - Register scratch, int vec_enc) { +void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, + Register scratch, AddressLiteral float_sign_flip, + int vec_enc) { Label done; - evcvttpd2qq(dst, src, vec_enc); - evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch); - evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); - kortestwl(ktmp1, ktmp1); - jccb(Assembler::equal, done); - - vpxor(xtmp2, xtmp2, xtmp2, vec_enc); - evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); - evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); - - kxorwl(ktmp1, ktmp1, ktmp2); - evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); - vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); - evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); - bind(done); -} - -void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, - XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, - AddressLiteral float_sign_flip, Register scratch, int vec_enc) { - Label done; - vcvttps2dq(dst, src, vec_enc); vmovdqu(xtmp1, float_sign_flip, scratch, vec_enc); vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); vptest(xtmp2, xtmp2, vec_enc); @@ -4120,11 +4097,11 @@ void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMM bind(done); } -void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, - KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, - Register scratch, int vec_enc) { +void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, + Register scratch, AddressLiteral float_sign_flip, + 
int vec_enc) { Label done; - vcvttps2dq(dst, src, vec_enc); evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, scratch); Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); kortestwl(ktmp1, ktmp1); @@ -4141,6 +4118,115 @@ void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XM bind(done); } +/* + * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. + * If src is NaN, the result is 0. + * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, + * the result is equal to the value of Long.MIN_VALUE. + * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, + * the result is equal to the value of Long.MAX_VALUE. + */ +void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, + Register scratch, AddressLiteral double_sign_flip, + int vec_enc) { + Label done; + evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch); + evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); + kortestwl(ktmp1, ktmp1); + jccb(Assembler::equal, done); + + vpxor(xtmp2, xtmp2, xtmp2, vec_enc); + evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); + evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); + + kxorwl(ktmp1, ktmp1, ktmp2); + evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); + vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); + evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); + bind(done); +} + +/* + * Algorithm for vector D2L and F2I conversions:- + * a) Perform vector D2L/F2I cast. + * b) Choose fast path if none of the result vector lane contains 0x80000000 value. + * It signifies that source value could be any of the special floating point + * values(NaN,-Inf,Inf,Max,-Min). + * c) Set destination to zero if source is NaN value. + * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 
+ */ + +void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, + Register scratch, int vec_enc) { + evcvttpd2qq(dst, src, vec_enc); + vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, double_sign_flip, vec_enc); +} + +void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, + AddressLiteral float_sign_flip, Register scratch, int vec_enc) { + vcvttps2dq(dst, src, vec_enc); + vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, scratch, float_sign_flip, vec_enc); +} + +void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, + Register scratch, int vec_enc) { + vcvttps2dq(dst, src, vec_enc); + vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, float_sign_flip, vec_enc); +} + +#ifdef _LP64 +void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, + AddressLiteral new_mxcsr, Register scratch, int vec_enc) { + // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. + // and re-instantiate original MXCSR.RC mode after that. 
+ ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std()); + ldmxcsr(new_mxcsr, scratch); + mov64(scratch, julong_cast(0.5L)); + evpbroadcastq(xtmp1, scratch, vec_enc); + vaddpd(xtmp1, src , xtmp1, vec_enc); + evcvtpd2qq(dst, xtmp1, vec_enc); + vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, double_sign_flip, vec_enc); + ldmxcsr(mxcsr_std, scratch); +} + +void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, + AddressLiteral new_mxcsr, Register scratch, int vec_enc) { + // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. + // and re-instantiate original MXCSR.RC mode after that. + ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std()); + ldmxcsr(new_mxcsr, scratch); + movl(scratch, jint_cast(0.5)); + movq(xtmp1, scratch); + vbroadcastss(xtmp1, xtmp1, vec_enc); + vaddps(xtmp1, src , xtmp1, vec_enc); + vcvtps2dq(dst, xtmp1, vec_enc); + vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, float_sign_flip, vec_enc); + ldmxcsr(mxcsr_std, scratch); +} + +void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + XMMRegister xtmp3, XMMRegister xtmp4, AddressLiteral float_sign_flip, + AddressLiteral new_mxcsr, Register scratch, int vec_enc) { + // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. + // and re-instantiate original MXCSR.RC mode after that. 
+ ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std()); + ldmxcsr(new_mxcsr, scratch); + movl(scratch, jint_cast(0.5)); + movq(xtmp1, scratch); + vbroadcastss(xtmp1, xtmp1, vec_enc); + vaddps(xtmp1, src , xtmp1, vec_enc); + vcvtps2dq(dst, xtmp1, vec_enc); + vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, scratch, float_sign_flip, vec_enc); + ldmxcsr(mxcsr_std, scratch); +} +#endif + void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, BasicType from_elem_bt, BasicType to_elem_bt) { switch (from_elem_bt) { diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index 5ecdf20700d..5f8e38a93b2 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -303,6 +303,7 @@ public: KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, Register scratch, int vec_enc); + void vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, Register scratch, int vec_enc); @@ -310,6 +311,33 @@ public: void vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, BasicType from_elem_bt, BasicType to_elem_bt); + void vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, Register scratch, AddressLiteral double_sign_flip, + int vec_enc); + + void vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, Register scratch, AddressLiteral float_sign_flip, + int vec_enc); + + void vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, + Register scratch, AddressLiteral float_sign_flip, + int vec_enc); + +#ifdef _LP64 + void 
vector_round_double_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, + AddressLiteral new_mxcsr, Register scratch, int vec_enc); + + void vector_round_float_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, + AddressLiteral new_mxcsr, Register scratch, int vec_enc); + + void vector_round_float_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, + XMMRegister xtmp3, XMMRegister xtmp4, AddressLiteral float_sign_flip, + AddressLiteral new_mxcsr, Register scratch, int vec_enc); +#endif + void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, bool merge, BasicType bt, int vlen_enc); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index 67faa195457..855c855089d 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -2252,12 +2252,12 @@ void MacroAssembler::fld_x(AddressLiteral src) { Assembler::fld_x(as_Address(src)); } -void MacroAssembler::ldmxcsr(AddressLiteral src) { +void MacroAssembler::ldmxcsr(AddressLiteral src, Register scratchReg) { if (reachable(src)) { Assembler::ldmxcsr(as_Address(src)); } else { - lea(rscratch1, src); - Assembler::ldmxcsr(Address(rscratch1, 0)); + lea(scratchReg, src); + Assembler::ldmxcsr(Address(scratchReg, 0)); } } @@ -9120,6 +9120,80 @@ void MacroAssembler::convert_f2l(Register dst, XMMRegister src) { bind(done); } +void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) { + // Following code is line by line assembly translation rounding algorithm. + // Please refer to java.lang.Math.round(float) algorithm for details. 
+ const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000; + const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24; + const int32_t FloatConsts_EXP_BIAS = 127; + const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF; + const int32_t MINUS_32 = 0xFFFFFFE0; + Label L_special_case, L_block1, L_exit; + movl(rtmp, FloatConsts_EXP_BIT_MASK); + movdl(dst, src); + andl(dst, rtmp); + sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1); + movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS); + subl(rtmp, dst); + movl(rcx, rtmp); + movl(dst, MINUS_32); + testl(rtmp, dst); + jccb(Assembler::notEqual, L_special_case); + movdl(dst, src); + andl(dst, FloatConsts_SIGNIF_BIT_MASK); + orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1); + movdl(rtmp, src); + testl(rtmp, rtmp); + jccb(Assembler::greaterEqual, L_block1); + negl(dst); + bind(L_block1); + sarl(dst); + addl(dst, 0x1); + sarl(dst, 0x1); + jmp(L_exit); + bind(L_special_case); + convert_f2i(dst, src); + bind(L_exit); +} + +void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) { + // Following code is line by line assembly translation rounding algorithm. + // Please refer to java.lang.Math.round(double) algorithm for details. 
+ const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L; + const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53; + const int64_t DoubleConsts_EXP_BIAS = 1023; + const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL; + const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L; + Label L_special_case, L_block1, L_exit; + mov64(rtmp, DoubleConsts_EXP_BIT_MASK); + movq(dst, src); + andq(dst, rtmp); + sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1); + mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS); + subq(rtmp, dst); + movq(rcx, rtmp); + mov64(dst, MINUS_64); + testq(rtmp, dst); + jccb(Assembler::notEqual, L_special_case); + movq(dst, src); + mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK); + andq(dst, rtmp); + mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1); + orq(dst, rtmp); + movq(rtmp, src); + testq(rtmp, rtmp); + jccb(Assembler::greaterEqual, L_block1); + negq(dst); + bind(L_block1); + sarq(dst); + addq(dst, 0x1); + sarq(dst, 0x1); + jmp(L_exit); + bind(L_special_case); + convert_d2l(dst, src); + bind(L_exit); +} + void MacroAssembler::convert_d2l(Register dst, XMMRegister src) { Label done; cvttsd2siq(dst, src); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 9b3da9d5de1..303b6b0c83c 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -906,7 +906,7 @@ public: void fld_x(AddressLiteral src); void ldmxcsr(Address src) { Assembler::ldmxcsr(src); } - void ldmxcsr(AddressLiteral src); + void ldmxcsr(AddressLiteral src, Register scratchReg = rscratch1); #ifdef _LP64 private: @@ -1994,6 +1994,8 @@ public: void convert_d2i(Register dst, XMMRegister src); void convert_f2l(Register dst, XMMRegister src); void convert_d2l(Register dst, XMMRegister src); + void round_double(Register dst, XMMRegister src, Register rtmp, Register rcx); + void round_float(Register dst, XMMRegister src, Register rtmp, Register rcx); void cache_wb(Address line); void 
cache_wbsync(bool is_pre); diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index ab28ebd5ca5..cf182d9880d 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1468,6 +1468,16 @@ const bool Matcher::match_rule_supported(int opcode) { return false; } break; + case Op_RoundVF: + if (UseAVX < 2) { // enabled for AVX2 only + return false; + } + break; + case Op_RoundVD: + if (UseAVX < 3) { + return false; // enabled for AVX3 only + } + break; case Op_CompareAndSwapL: #ifdef _LP64 case Op_CompareAndSwapP: @@ -1572,6 +1582,12 @@ const bool Matcher::match_rule_supported(int opcode) { return false; } break; + case Op_RoundF: + case Op_RoundD: + if (!is_LP64) { + return false; + } + break; case Op_CopySignD: case Op_CopySignF: if (UseAVX < 3 || !is_LP64) { @@ -1817,6 +1833,11 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType return false; } break; + case Op_RoundVD: + if (!VM_Version::supports_avx512dq()) { + return false; + } + break; case Op_VectorCastF2X: if (is_subword_type(bt) || bt == T_LONG) { return false; @@ -7173,13 +7194,14 @@ instruct vcastFtoD_reg(vec dst, vec src) %{ ins_pipe( pipe_slow ); %} -instruct vcastFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{ + +instruct castFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{ predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) < 64 && Matcher::vector_element_basic_type(n) == T_INT); match(Set dst (VectorCastF2X src)); effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr); - format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %} + format %{ "vector_cast_f2i $dst,$src\t! 
using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $scratch as TEMP" %} ins_encode %{ int vlen_enc = vector_length_encoding(this); __ vector_castF2I_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, @@ -7189,13 +7211,13 @@ instruct vcastFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, ve ins_pipe( pipe_slow ); %} -instruct vcastFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{ +instruct castFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{ predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) && Matcher::vector_element_basic_type(n) == T_INT); match(Set dst (VectorCastF2X src)); effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr); - format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %} + format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %} ins_encode %{ int vlen_enc = vector_length_encoding(this); __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, @@ -7216,11 +7238,11 @@ instruct vcastDtoF_reg(vec dst, vec src) %{ ins_pipe( pipe_slow ); %} -instruct vcastDtoL_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{ +instruct castDtoL_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{ predicate(Matcher::vector_element_basic_type(n) == T_LONG); match(Set dst (VectorCastD2X src)); effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr); - format %{ "vector_cast_d2l $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %} + format %{ "vector_cast_d2l $dst,$src\t! 
using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %} ins_encode %{ int vlen_enc = vector_length_encoding(this); __ vector_castD2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, @@ -7246,6 +7268,56 @@ instruct vucast(vec dst, vec src) %{ ins_pipe( pipe_slow ); %} +#ifdef _LP64 +instruct vround_float_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{ + predicate(!VM_Version::supports_avx512vl() && + Matcher::vector_length_in_bytes(n) < 64 && + Matcher::vector_element_basic_type(n) == T_INT); + match(Set dst (RoundVF src)); + effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr); + format %{ "vector_round_float $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $scratch as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + InternalAddress new_mxcsr = $constantaddress((jint)0x3F80); + __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, + ExternalAddress(vector_float_signflip()), new_mxcsr, $scratch$$Register, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vround_float_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{ + predicate((VM_Version::supports_avx512vl() || + Matcher::vector_length_in_bytes(n) == 64) && + Matcher::vector_element_basic_type(n) == T_INT); + match(Set dst (RoundVF src)); + effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr); + format %{ "vector_round_float $dst,$src\t! 
using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + InternalAddress new_mxcsr = $constantaddress((jint)0x3F80); + __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, + ExternalAddress(vector_float_signflip()), new_mxcsr, $scratch$$Register, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vround_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{ + predicate(Matcher::vector_element_basic_type(n) == T_LONG); + match(Set dst (RoundVD src)); + effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr); + format %{ "vector_round_long $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + InternalAddress new_mxcsr = $constantaddress((jint)0x3F80); + __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, + ExternalAddress(vector_double_signflip()), new_mxcsr, $scratch$$Register, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} +#endif // --------------------------------- VectorMaskCmp -------------------------------------- instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{ diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad index 09ff7075994..62132ea930f 100644 --- a/src/hotspot/cpu/x86/x86_64.ad +++ b/src/hotspot/cpu/x86/x86_64.ad @@ -10821,6 +10821,28 @@ instruct convD2L_reg_reg(rRegL dst, regD src, rFlagsReg cr) ins_pipe(pipe_slow); %} +instruct round_double_reg(rRegL dst, regD src, rRegL rtmp, rcx_RegL rcx, rFlagsReg cr) +%{ + match(Set dst (RoundD src)); + effect(TEMP dst, TEMP rtmp, TEMP rcx, KILL cr); + format %{ "round_double $dst,$src \t! 
using $rtmp and $rcx as TEMP"%} + ins_encode %{ + __ round_double($dst$$Register, $src$$XMMRegister, $rtmp$$Register, $rcx$$Register); + %} + ins_pipe(pipe_slow); +%} + +instruct round_float_reg(rRegI dst, regF src, rRegL rtmp, rcx_RegL rcx, rFlagsReg cr) +%{ + match(Set dst (RoundF src)); + effect(TEMP dst, TEMP rtmp, TEMP rcx, KILL cr); + format %{ "round_float $dst,$src" %} + ins_encode %{ + __ round_float($dst$$Register, $src$$XMMRegister, $rtmp$$Register, $rcx$$Register); + %} + ins_pipe(pipe_slow); +%} + instruct convI2F_reg_reg(regF dst, rRegI src) %{ predicate(!UseXmmI2F); diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index 7ae8d123642..ba65dd706fb 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -4239,6 +4239,7 @@ bool MatchRule::is_vector() const { "FmaVD","FmaVF","PopCountVI", "PopCountVL", "VectorLongToMask", // Next are vector mask ops. "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast", + "RoundVF", "RoundVD", // Next are not supported currently. "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D", "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD" diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 5b2c6a9ce56..b2757ca86bc 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -134,6 +134,7 @@ class methodHandle; do_name(log_name,"log") do_name(log10_name,"log10") do_name(pow_name,"pow") \ do_name(exp_name,"exp") do_name(min_name,"min") do_name(max_name,"max") \ do_name(floor_name, "floor") do_name(ceil_name, "ceil") do_name(rint_name, "rint") \ + do_name(round_name, "round") \ \ do_name(addExact_name,"addExact") \ do_name(decrementExact_name,"decrementExact") \ @@ -185,6 +186,8 @@ class methodHandle; do_intrinsic(_minF, java_lang_Math, min_name, float2_float_signature, F_S) \ do_intrinsic(_maxD, java_lang_Math, max_name, double2_double_signature, F_S) \ do_intrinsic(_minD, java_lang_Math, min_name, double2_double_signature, F_S) \ + do_intrinsic(_roundD, java_lang_Math, round_name, double_long_signature, F_S) \ + do_intrinsic(_roundF, java_lang_Math, round_name, float_int_signature, F_S) \ do_intrinsic(_dcopySign, java_lang_Math, copySign_name, double2_double_signature, F_S) \ do_intrinsic(_fcopySign, java_lang_Math, copySign_name, float2_float_signature, F_S) \ do_intrinsic(_dsignum, java_lang_Math, signum_name, double_double_signature, F_S) \ diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp index 94e7fb4f4a0..dfa2a86e602 100644 --- a/src/hotspot/share/opto/c2compiler.cpp +++ b/src/hotspot/share/opto/c2compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -523,6 +523,8 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt case vmIntrinsics::_dlog: case vmIntrinsics::_dlog10: case vmIntrinsics::_dpow: + case vmIntrinsics::_roundD: + case vmIntrinsics::_roundF: case vmIntrinsics::_min: case vmIntrinsics::_max: case vmIntrinsics::_min_strict: diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index 5cedd71ec56..a268399f1b9 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -311,6 +311,8 @@ macro(SignumD) macro(SignumF) macro(SqrtD) macro(SqrtF) +macro(RoundF) +macro(RoundD) macro(Start) macro(StartOSR) macro(StoreB) @@ -446,6 +448,8 @@ macro(ReplicateI) macro(ReplicateL) macro(ReplicateF) macro(ReplicateD) +macro(RoundVF) +macro(RoundVD) macro(Extract) macro(ExtractB) macro(ExtractUB) diff --git a/src/hotspot/share/opto/convertnode.hpp b/src/hotspot/share/opto/convertnode.hpp index fb670191fc3..87fca38d43f 100644 --- a/src/hotspot/share/opto/convertnode.hpp +++ b/src/hotspot/share/opto/convertnode.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -81,6 +81,14 @@ class ConvD2LNode : public Node { virtual uint ideal_reg() const { return Op_RegL; } }; +class RoundDNode : public Node { + public: + RoundDNode( Node *dbl ) : Node(0,dbl) {} + virtual int Opcode() const; + virtual const Type *bottom_type() const { return TypeLong::LONG; } + virtual uint ideal_reg() const { return Op_RegL; } +}; + //------------------------------ConvF2DNode------------------------------------ // Convert Float to a Double. 
class ConvF2DNode : public Node { @@ -105,6 +113,7 @@ class ConvF2INode : public Node { virtual uint ideal_reg() const { return Op_RegI; } }; + //------------------------------ConvF2LNode------------------------------------ // Convert float to long class ConvF2LNode : public Node { @@ -141,6 +150,14 @@ class ConvI2FNode : public Node { virtual uint ideal_reg() const { return Op_RegF; } }; +class RoundFNode : public Node { + public: + RoundFNode( Node *in1 ) : Node(0,in1) {} + virtual int Opcode() const; + virtual const Type *bottom_type() const { return TypeInt::INT; } + virtual uint ideal_reg() const { return Op_RegI; } +}; + //------------------------------ConvI2LNode------------------------------------ // Convert integer to long class ConvI2LNode : public TypeNode { diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 2f7c7fdc84c..fd23e272c71 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -269,6 +269,8 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_dcopySign: case vmIntrinsics::_fcopySign: case vmIntrinsics::_dsignum: + case vmIntrinsics::_roundF: + case vmIntrinsics::_roundD: case vmIntrinsics::_fsignum: return inline_math_native(intrinsic_id()); case vmIntrinsics::_notify: @@ -1605,6 +1607,7 @@ Node* LibraryCallKit::round_double_node(Node* n) { // public static double Math.sqrt(double) // public static double Math.log(double) // public static double Math.log10(double) +// public static double Math.round(double) bool LibraryCallKit::inline_double_math(vmIntrinsics::ID id) { Node* arg = round_double_node(argument(0)); Node* n = NULL; @@ -1616,6 +1619,7 @@ bool LibraryCallKit::inline_double_math(vmIntrinsics::ID id) { case vmIntrinsics::_ceil: n = RoundDoubleModeNode::make(_gvn, arg, RoundDoubleModeNode::rmode_ceil); break; case vmIntrinsics::_floor: n = RoundDoubleModeNode::make(_gvn, arg, RoundDoubleModeNode::rmode_floor); break; case vmIntrinsics::_rint: n = RoundDoubleModeNode::make(_gvn, arg, RoundDoubleModeNode::rmode_rint); break; + case vmIntrinsics::_roundD: n = new RoundDNode(arg); break; case vmIntrinsics::_dcopySign: n = CopySignDNode::make(_gvn, arg, round_double_node(argument(2))); break; case vmIntrinsics::_dsignum: n = SignumDNode::make(_gvn, arg); break; default: fatal_unexpected_iid(id); break; @@ -1637,6 +1641,7 @@ bool LibraryCallKit::inline_math(vmIntrinsics::ID id) { case vmIntrinsics::_labs: n = new AbsLNode( arg); break; case vmIntrinsics::_fcopySign: n = new CopySignFNode(arg, argument(1)); break; case vmIntrinsics::_fsignum: n = SignumFNode::make(_gvn, arg); break; + case vmIntrinsics::_roundF: n = new RoundFNode(arg); break; default: fatal_unexpected_iid(id); break; } set_result(_gvn.transform(n)); @@ -1752,9 +1757,11 @@ bool LibraryCallKit::inline_math_native(vmIntrinsics::ID id) { 
runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dlog10), "LOG10"); // These intrinsics are supported on all hardware + case vmIntrinsics::_roundD: return Matcher::match_rule_supported(Op_RoundD) ? inline_double_math(id) : false; case vmIntrinsics::_ceil: case vmIntrinsics::_floor: case vmIntrinsics::_rint: return Matcher::match_rule_supported(Op_RoundDoubleMode) ? inline_double_math(id) : false; + case vmIntrinsics::_dsqrt: case vmIntrinsics::_dsqrt_strict: return Matcher::match_rule_supported(Op_SqrtD) ? inline_double_math(id) : false; @@ -1774,6 +1781,7 @@ bool LibraryCallKit::inline_math_native(vmIntrinsics::ID id) { case vmIntrinsics::_fcopySign: return inline_math(id); case vmIntrinsics::_dsignum: return Matcher::match_rule_supported(Op_SignumD) ? inline_double_math(id) : false; case vmIntrinsics::_fsignum: return Matcher::match_rule_supported(Op_SignumF) ? inline_math(id) : false; + case vmIntrinsics::_roundF: return Matcher::match_rule_supported(Op_RoundF) ? inline_math(id) : false; // These intrinsics are not yet correctly implemented case vmIntrinsics::_datan2: diff --git a/src/hotspot/share/opto/loopTransform.cpp b/src/hotspot/share/opto/loopTransform.cpp index 253340a6be5..a9839941669 100644 --- a/src/hotspot/share/opto/loopTransform.cpp +++ b/src/hotspot/share/opto/loopTransform.cpp @@ -970,6 +970,10 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) { case Op_ModL: body_size += 30; break; case Op_DivL: body_size += 30; break; case Op_MulL: body_size += 10; break; + case Op_RoundF: body_size += 30; break; + case Op_RoundD: body_size += 30; break; + case Op_RoundVF: body_size += 30; break; + case Op_RoundVD: body_size += 30; break; case Op_PopCountVI: case Op_PopCountVL: { const TypeVect* vt = n->bottom_type()->is_vect(); diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 8a116212783..1459e3b7541 100644 --- a/src/hotspot/share/opto/superword.cpp +++ 
b/src/hotspot/share/opto/superword.cpp @@ -2563,6 +2563,7 @@ bool SuperWord::output() { opc == Op_AbsF || opc == Op_AbsD || opc == Op_AbsI || opc == Op_AbsL || opc == Op_NegF || opc == Op_NegD || + opc == Op_RoundF || opc == Op_RoundD || opc == Op_PopCountI || opc == Op_PopCountL) { assert(n->req() == 2, "only one input expected"); Node* in = vector_opd(p, 1); diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index ae9bb537959..8b385f28f40 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -157,6 +157,10 @@ int VectorNode::opcode(int sopc, BasicType bt) { return (bt == T_FLOAT ? Op_SqrtVF : 0); case Op_SqrtD: return (bt == T_DOUBLE ? Op_SqrtVD : 0); + case Op_RoundF: + return (bt == T_INT ? Op_RoundVF : 0); + case Op_RoundD: + return (bt == T_LONG ? Op_RoundVD : 0); case Op_PopCountI: // Unimplemented for subword types since bit count changes // depending on size of lane (and sign bit). @@ -585,6 +589,9 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, const TypeVect* vt, b case Op_SqrtVF: return new SqrtVFNode(n1, vt); case Op_SqrtVD: return new SqrtVDNode(n1, vt); + case Op_RoundVF: return new RoundVFNode(n1, vt); + case Op_RoundVD: return new RoundVDNode(n1, vt); + case Op_PopCountVI: return new PopCountVINode(n1, vt); case Op_PopCountVL: return new PopCountVLNode(n1, vt); case Op_RotateLeftV: return new RotateLeftVNode(n1, n2, vt); diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index d853a71f8cc..74fd1f63991 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -1544,6 +1544,14 @@ class VectorCastD2XNode : public VectorCastNode { virtual int Opcode() const; }; +class RoundVFNode : public VectorNode { + public: + RoundVFNode(Node* in, const TypeVect* vt) :VectorNode(in, vt) { + assert(in->bottom_type()->is_vect()->element_basic_type() == T_FLOAT, "must be float"); + } + virtual 
int Opcode() const; +}; + class VectorUCastB2XNode : public VectorCastNode { public: VectorUCastB2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) { @@ -1552,6 +1560,14 @@ class VectorUCastB2XNode : public VectorCastNode { virtual int Opcode() const; }; +class RoundVDNode : public VectorNode { + public: + RoundVDNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) { + assert(in->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE, "must be double"); + } + virtual int Opcode() const; +}; + class VectorUCastS2XNode : public VectorCastNode { public: VectorUCastS2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) { diff --git a/src/java.base/share/classes/java/lang/Math.java b/src/java.base/share/classes/java/lang/Math.java index 6d8fa48c976..4155e8616f5 100644 --- a/src/java.base/share/classes/java/lang/Math.java +++ b/src/java.base/share/classes/java/lang/Math.java @@ -753,6 +753,7 @@ public final class Math { * @see java.lang.Integer#MAX_VALUE * @see java.lang.Integer#MIN_VALUE */ + @IntrinsicCandidate public static int round(float a) { int intBits = Float.floatToRawIntBits(a); int biasedExp = (intBits & FloatConsts.EXP_BIT_MASK) @@ -802,6 +803,7 @@ public final class Math { * @see java.lang.Long#MAX_VALUE * @see java.lang.Long#MIN_VALUE */ + @IntrinsicCandidate public static long round(double a) { long longBits = Double.doubleToRawLongBits(a); long biasedExp = (longBits & DoubleConsts.EXP_BIT_MASK) diff --git a/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java b/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java index 8822b38a99a..cc6f860a6a9 100644 --- a/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java +++ b/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2022, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -44,14 +44,16 @@ public class TestDoubleVect { System.out.println("Testing Double vectors"); int errn = test(); if (errn > 0) { - System.err.println("FAILED: " + errn + " errors"); - System.exit(97); + System.err.println("FAILED: " + errn + " errors"); + System.exit(97); } System.out.println("PASSED"); } static int test() { double[] a0 = new double[ARRLEN]; + long [] l0 = new long[ARRLEN]; + double[] a1 = new double[ARRLEN]; double[] a2 = new double[ARRLEN]; double[] a3 = new double[ARRLEN]; @@ -91,6 +93,7 @@ public class TestDoubleVect { test_ceil(a0, a1); test_floor(a0, a1); test_sqrt(a0, a1); + test_round(l0, a1); } // Test and verify results System.out.println("Verification"); @@ -355,6 +358,7 @@ public class TestDoubleVect { errn += verify("test_negc: ", i, a0[i], (double)(-((double)(ADD_INIT+i)))); } + // To test -ve and +ve Zero scenarios. double [] other_corner_cases = { -0.0, 0.0, 9.007199254740992E15 }; double [] other_corner_cases_res = new double[3]; @@ -421,6 +425,35 @@ public class TestDoubleVect { for (int i=8; i 0) @@ -564,6 +597,12 @@ public class TestDoubleVect { end = System.currentTimeMillis(); System.out.println("test_sqrt_n: " + (end - start)); + start = System.currentTimeMillis(); + for (int i=0; i 0) @@ -512,6 +544,12 @@ public class TestFloatVect { end = System.currentTimeMillis(); System.out.println("test_sqrt_n: " + (end - start)); + start = System.currentTimeMillis(); + for (int i=0; i 0 "}) + public void test_round_double(long[] lout, double[] dinp) { + for (int i = 0; i < lout.length; i+=1) { + lout[i] = Math.round(dinp[i]); + } + } + + @Run(test = {"test_round_double"}, mode = RunMode.STANDALONE) + public void kernel_test_round_double() { + dinp = new double[ARRLEN]; + lout = new long[ARRLEN]; + for(int i = 0 ; i < ARRLEN; i++) { + dinp[i] = (double)i*1.4; + } + for (int i = 0; i < ITERS; i++) { + 
test_round_double(lout , dinp); + } + } +} diff --git a/test/hotspot/jtreg/compiler/vectorization/TestRoundVectFloat.java b/test/hotspot/jtreg/compiler/vectorization/TestRoundVectFloat.java new file mode 100644 index 00000000000..78e8d7b55cc --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorization/TestRoundVectFloat.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +/** + * @test + * @bug 8279508 + * @summary Auto-vectorize Math.round API + * @requires vm.compiler2.enabled + * @requires vm.cpu.features ~= ".*avx.*" + * @requires os.simpleArch == "x64" + * @library /test/lib / + * @run driver compiler.vectorization.TestRoundVectFloat + */ + +package compiler.vectorization; + +import compiler.lib.ir_framework.*; + +public class TestRoundVectFloat { + private static final int ARRLEN = 1024; + private static final int ITERS = 11000; + private static float [] finp; + private static int [] iout; + + public static void main(String args[]) { + TestFramework.runWithFlags("-XX:-TieredCompilation", + "-XX:UseAVX=1", + "-XX:CompileThresholdScaling=0.3"); + System.out.println("PASSED"); + } + + @Test + @IR(applyIf = {"UseAVX", " > 1"}, counts = {"RoundVF" , " > 0 "}) + public void test_round_float(int[] iout, float[] finp) { + for (int i = 0; i < finp.length; i+=1) { + iout[i] = Math.round(finp[i]); + } + } + + @Run(test = {"test_round_float"}, mode = RunMode.STANDALONE) + public void kernel_test_round() { + finp = new float[ARRLEN]; + iout = new int[ARRLEN]; + for(int i = 0 ; i < ARRLEN; i++) { + finp[i] = (float)i*1.4f; + } + for (int i = 0; i < ITERS; i++) { + test_round_float(iout , finp); + } + } +} diff --git a/test/jdk/java/lang/Math/RoundTests.java b/test/jdk/java/lang/Math/RoundTests.java index 0a51b426386..e8f4e817a66 100644 --- a/test/jdk/java/lang/Math/RoundTests.java +++ b/test/jdk/java/lang/Math/RoundTests.java @@ -25,20 +25,22 @@ * @test * @bug 6430675 8010430 * @summary Check for correct implementation of {Math, StrictMath}.round + * @run main/othervm -XX:Tier3CompileThreshold=50 -XX:CompileThresholdScaling=0.01 -XX:+TieredCompilation RoundTests */ public class RoundTests { public static void main(String... 
args) { int failures = 0; + for (int i = 0; i < 500; i++) { + failures += testNearFloatHalfCases(); + failures += testNearDoubleHalfCases(); + failures += testUnityULPCases(); + failures += testSpecialCases(); - failures += testNearFloatHalfCases(); - failures += testNearDoubleHalfCases(); - failures += testUnityULPCases(); - failures += testSpecialCases(); - - if (failures > 0) { - System.err.println("Testing {Math, StrictMath}.round incurred " - + failures + " failures."); - throw new RuntimeException(); + if (failures > 0) { + System.err.println("Testing {Math, StrictMath}.round incurred " + + failures + " failures."); + throw new RuntimeException(); + } } } diff --git a/test/micro/org/openjdk/bench/java/math/FpRoundingBenchmark.java b/test/micro/org/openjdk/bench/java/math/FpRoundingBenchmark.java index cf0eed32e07..8381673028e 100644 --- a/test/micro/org/openjdk/bench/java/math/FpRoundingBenchmark.java +++ b/test/micro/org/openjdk/bench/java/math/FpRoundingBenchmark.java @@ -1,5 +1,5 @@ // -// Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
// // This code is free software; you can redistribute it and/or modify it @@ -26,53 +26,97 @@ package org.openjdk.bench.java.math; import java.util.Random; import java.util.concurrent.TimeUnit; import org.openjdk.jmh.annotations.*; -import org.openjdk.jmh.infra.Blackhole; @OutputTimeUnit(TimeUnit.MILLISECONDS) @State(Scope.Thread) public class FpRoundingBenchmark { - @Param({"1024"}) + @Param({"1024", "2048"}) public int TESTSIZE; public double[] DargV1; - - public double[] Res; + public double[] ResD; + public long[] ResL; + public float[] FargV1; + public float[] ResF; + public int[] ResI; public final double[] DspecialVals = { - 0.0, -0.0, Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY}; + 0.0, -0.0, Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, + Double.MAX_VALUE, -Double.MAX_VALUE, Double.MIN_VALUE, -Double.MIN_VALUE, + Double.MIN_NORMAL + }; + + public final float[] FspecialVals = { + 0.0f, -0.0f, Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, + Float.MAX_VALUE, -Float.MAX_VALUE, Float.MIN_VALUE, -Float.MIN_VALUE, + Float.MIN_NORMAL + }; @Setup(Level.Trial) public void BmSetup() { - int i = 0; - Random r = new Random(1024); - DargV1 = new double[TESTSIZE]; - Res = new double[TESTSIZE]; + int i = 0; + Random r = new Random(1024); - for (; i < DspecialVals.length; i++) { - DargV1[i] = DspecialVals[i]; - } + DargV1 = new double[TESTSIZE]; + ResD = new double[TESTSIZE]; - for (; i < TESTSIZE; i++) { - DargV1[i] = r.nextDouble()*TESTSIZE; - } + for (; i < DspecialVals.length; i++) { + DargV1[i] = DspecialVals[i]; + } + + for (; i < TESTSIZE; i++) { + DargV1[i] = Double.longBitsToDouble(r.nextLong());; + } + + FargV1 = new float[TESTSIZE]; + ResF = new float[TESTSIZE]; + + i = 0; + for (; i < FspecialVals.length; i++) { + FargV1[i] = FspecialVals[i]; + } + + for (; i < TESTSIZE; i++) { + FargV1[i] = Float.intBitsToFloat(r.nextInt()); + } + + ResI = new int[TESTSIZE]; + ResL = new long[TESTSIZE]; } @Benchmark - public 
void testceil(Blackhole bh) { - for (int i = 0; i < TESTSIZE; i++) - Res[i] = Math.ceil(DargV1[i]); + public void test_ceil() { + for (int i = 0; i < TESTSIZE; i++) { + ResD[i] = Math.ceil(DargV1[i]); + } } @Benchmark - public void testfloor(Blackhole bh) { - for (int i = 0; i < TESTSIZE; i++) - Res[i] = Math.floor(DargV1[i]); + public void test_floor() { + for (int i = 0; i < TESTSIZE; i++) { + ResD[i] = Math.floor(DargV1[i]); + } } @Benchmark - public void testrint(Blackhole bh) { - for (int i = 0; i < TESTSIZE; i++) - Res[i] = Math.rint(DargV1[i]); + public void test_rint() { + for (int i = 0; i < TESTSIZE; i++) { + ResD[i] = Math.rint(DargV1[i]); + } + } + + @Benchmark + public void test_round_double() { + for (int i = 0; i < TESTSIZE; i++) { + ResL[i] = Math.round(DargV1[i]); + } + } + + @Benchmark + public void test_round_float() { + for (int i = 0; i < TESTSIZE; i++) { + ResI[i] = Math.round(FargV1[i]); + } } }