8308966: Add intrinsic for float/double modulo for x86 AVX2 and AVX512

Co-authored-by: Marius Cornea <marius.cornea@intel.com>
Reviewed-by: jbhateja, sviswanathan
Authored by Scott Gibbons on 2023-06-12 15:06:42 +00:00; committed by Sandhya Viswanathan
parent 8e4e6b056c
commit 5d5ae35288
12 changed files with 902 additions and 1 deletion

@@ -3555,6 +3555,14 @@ void Assembler::movsd(Address dst, XMMRegister src) {
emit_operand(src, dst, 0);
}
void Assembler::vmovsd(XMMRegister dst, XMMRegister src, XMMRegister src2) {
assert(UseAVX > 0, "Requires some form of AVX");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(src2->encoding(), src->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int16(0x11, (0xC0 | encode));
}
void Assembler::movss(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -6531,6 +6539,29 @@ void Assembler::vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2)
emit_int16((unsigned char)0xB9, (0xC0 | encode));
}
void Assembler::evfnmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2, EvexRoundPrefix rmode) { // Need to add rmode for rounding mode support
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(rmode, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_extended_context();
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0xAD, (0xC0 | encode));
}
void Assembler::vfnmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
assert(VM_Version::supports_fma(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0xAD, (0xC0 | encode));
}
void Assembler::vfnmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
assert(VM_Version::supports_fma(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0xBD, (0xC0 | encode));
}
void Assembler::vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
assert(VM_Version::supports_fma(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -6892,6 +6923,22 @@ void Assembler::vroundpd(XMMRegister dst, Address src, int32_t rmode, int vecto
emit_int8((rmode));
}
void Assembler::vroundsd(XMMRegister dst, XMMRegister src, XMMRegister src2, int32_t rmode) {
assert(VM_Version::supports_avx(), "");
assert(rmode <= 0x0f, "rmode 0x%x", rmode);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x0B, (0xC0 | encode), (rmode));
}
void Assembler::vrndscalesd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int32_t rmode) {
assert(VM_Version::supports_evex(), "requires EVEX support");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x0B, (0xC0 | encode), (rmode));
}
void Assembler::vrndscalepd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len) {
assert(VM_Version::supports_evex(), "requires EVEX support");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -8857,6 +8904,19 @@ void Assembler::vextractf64x4(Address dst, XMMRegister src, uint8_t imm8) {
emit_int8(imm8 & 0x01);
}
void Assembler::extractps(Register dst, XMMRegister src, uint8_t imm8) {
assert(VM_Version::supports_sse4_1(), "");
assert(imm8 <= 0x03, "imm8: %u", imm8);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
// imm8:
// 0x00 - extract from bits 31:0
// 0x01 - extract from bits 63:32
// 0x02 - extract from bits 95:64
// 0x03 - extract from bits 127:96
emit_int24(0x17, (0xC0 | encode), imm8 & 0x03);
}
// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
void Assembler::vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_avx2(), "");
@@ -9531,6 +9591,15 @@ void Assembler::evdivpd(XMMRegister dst, KRegister mask, XMMRegister nds, Addres
emit_operand(dst, src, 0);
}
void Assembler::evdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(rmode, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_extended_context();
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int16(0x5E, (0xC0 | encode));
}
void Assembler::evpabsb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), "");
InstructionAttr attributes(vector_len, /* vex_w */ false,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);

@@ -528,6 +528,13 @@ class Assembler : public AbstractAssembler {
EVEX_Z = 0x80
};
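// Embedded rounding control for EVEX-encoded scalar instructions that take an
// EvexRoundPrefix argument (e.g. evdivsd, evfnmadd213sd below): RNE = round to
// nearest even, RD = round down, RU = round up, RZ = round toward zero. The
// selected mode applies only to that instruction and overrides MXCSR.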
enum EvexRoundPrefix {
EVEX_RNE = 0x0,
EVEX_RD = 0x1,
EVEX_RU = 0x2,
EVEX_RZ = 0x3
};
enum VexSimdPrefix {
VEX_SIMD_NONE = 0x0,
VEX_SIMD_66 = 0x1,
@@ -886,6 +893,8 @@ private:
void movsd(Address dst, XMMRegister src);
void movlpd(XMMRegister dst, Address src);
void vmovsd(XMMRegister dst, XMMRegister src, XMMRegister src2);
// New cpus require use of movaps and movapd to avoid partial register stall
// when moving between registers.
void movaps(XMMRegister dst, XMMRegister src);
@@ -2242,9 +2251,13 @@ private:
void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void evdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode);
void vdivss(XMMRegister dst, XMMRegister nds, Address src);
void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vfnmadd213sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void evfnmadd213sd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode);
void vfnmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
@@ -2334,8 +2347,11 @@ private:
// Round Packed Double precision value.
void vroundpd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);
void vroundpd(XMMRegister dst, Address src, int32_t rmode, int vector_len);
void vrndscalesd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int32_t rmode);
void vrndscalepd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);
void vrndscalepd(XMMRegister dst, Address src, int32_t rmode, int vector_len);
void vroundsd(XMMRegister dst, XMMRegister src, XMMRegister src2, int32_t rmode);
void vroundsd(XMMRegister dst, XMMRegister src, Address src2, int32_t rmode);
// Bitwise Logical AND of Packed Floating-Point Values
void andpd(XMMRegister dst, XMMRegister src);
@@ -2719,6 +2735,8 @@ private:
void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);
void extractps(Register dst, XMMRegister src, uint8_t imm8);
// xmm/mem sourced byte/word/dword/qword replicate
void vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
void vpbroadcastb(XMMRegister dst, Address src, int vector_len);
@@ -2952,6 +2970,8 @@ public:
_embedded_opmask_register_specifier = mask->encoding() & 0x7;
}
void set_extended_context(void) { _is_extended_context = true; }
};
#endif // CPU_X86_ASSEMBLER_X86_HPP

@@ -968,7 +968,7 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
break;
case vmIntrinsics::_dpow:
if (StubRoutines::dpow() != nullptr) {
__ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
__ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
} else {
__ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), getThreadTemp(), result_reg, cc->args());
}

@@ -87,6 +87,8 @@ void SharedRuntime::inline_check_hashcode_from_object_header(MacroAssembler* mas
#if defined(TARGET_COMPILER_gcc) && !defined(_WIN64)
JRT_LEAF(jfloat, SharedRuntime::frem(jfloat x, jfloat y))
jfloat retval;
const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
if (!is_LP64 || UseAVX < 1 || !UseFMA) {
asm ("\
1: \n\
fprem \n\
@@ -97,11 +99,21 @@ jne 1b \n\
:"=t"(retval)
:"0"(x), "u"(y)
:"cc", "ax");
} else {
assert(StubRoutines::fmod() != nullptr, "");
jdouble (*addr)(jdouble, jdouble) = (double (*)(double, double))StubRoutines::fmod();
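// Both float operands are exactly representable as doubles, and the true remainder of
// two floats is itself exactly representable as a float, so computing frem through the
// double fmod stub and narrowing the result introduces no extra rounding.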
jdouble dx = (jdouble) x;
jdouble dy = (jdouble) y;
retval = (jfloat) (*addr)(dx, dy);
}
return retval;
JRT_END
JRT_LEAF(jdouble, SharedRuntime::drem(jdouble x, jdouble y))
jdouble retval;
const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
if (!is_LP64 || UseAVX < 1 || !UseFMA) {
asm ("\
1: \n\
fprem \n\
@@ -112,6 +124,12 @@ jne 1b \n\
:"=t"(retval)
:"0"(x), "u"(y)
:"cc", "ax");
} else {
assert(StubRoutines::fmod() != nullptr, "");
jdouble (*addr)(jdouble, jdouble) = (double (*)(double, double))StubRoutines::fmod();
retval = (*addr)(x, y);
}
return retval;
JRT_END
#endif // TARGET_COMPILER_gcc && !_WIN64

@@ -3937,6 +3937,10 @@ void StubGenerator::generate_initial_stubs() {
}
generate_libm_stubs();
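// The fmod stub is generated only when one of its two code paths in generate_libmFmod()
// can be used: the AVX512 path (supports_avx512vlbwdq) or the AVX2 path (supports_fma).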
if ((UseAVX >= 1) && (VM_Version::supports_avx512vlbwdq() || VM_Version::supports_fma())) {
StubRoutines::_fmod = generate_libmFmod(); // from stubGenerator_x86_64_fmod.cpp
}
}
void StubGenerator::generate_continuation_stubs() {

@@ -486,6 +486,7 @@ class StubGenerator: public StubCodeGenerator {
address generate_libmPow();
address generate_libmLog();
address generate_libmLog10();
address generate_libmFmod();
// Shared constants
static address ZERO;

@@ -0,0 +1,524 @@
/*
* Copyright (c) 2023, Intel Corporation. All rights reserved.
* Intel Math Library (LIBM) Source Code
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "macroAssembler_x86.hpp"
#include "stubGenerator_x86_64.hpp"
/******************************************************************************/
// ALGORITHM DESCRIPTION - FMOD()
// ---------------------
//
// If either value1 or value2 is NaN, the result is NaN.
//
// If neither value1 nor value2 is NaN, the sign of the result equals the sign of the dividend.
//
// If the dividend is an infinity or the divisor is a zero or both, the result is NaN.
//
// If the dividend is finite and the divisor is an infinity, the result equals the dividend.
//
// If the dividend is a zero and the divisor is finite, the result equals the dividend.
//
// In the remaining cases, where neither operand is an infinity, a zero, or NaN, the floating-point
// remainder result from a dividend value1 and a divisor value2 is defined by the mathematical
// relation result = value1 - (value2 * q), where q is an integer that is negative only if
// value1 / value2 is negative, and positive only if value1 / value2 is positive, and whose magnitude
// is as large as possible without exceeding the magnitude of the true mathematical quotient of value1 and value2.
//
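// For example, fmod(5.5, -2.0): q = trunc(5.5 / -2.0) = trunc(-2.75) = -2 (negative, like the
// quotient), so the result is 5.5 - (-2.0 * -2) = 5.5 - 4.0 = 1.5, carrying the sign of the dividend.
//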
/******************************************************************************/
#define __ _masm->
ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_NaN[] = {
0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL // NaN vector
};
ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_1p260[] = {
0x5030000000000000ULL, // 0x1p+260
};
ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_MAX[] = {
0x7FEFFFFFFFFFFFFFULL, // Max
};
ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_INF[] = {
0x7FF0000000000000ULL, // Inf
};
ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_e307[] = {
0x7FE0000000000000ULL
};
address StubGenerator::generate_libmFmod() {
StubCodeMark mark(this, "StubRoutines", "libmFmod");
address start = __ pc();
__ enter(); // required for proper stackwalking of RuntimeStub frame
if (VM_Version::supports_avx512vlbwdq()) { // AVX512 version
// Source used to generate the AVX512 fmod assembly below:
//
// #include <ia32intrin.h>
// #include <emmintrin.h>
// #pragma float_control(precise, on)
//
// #define UINT32 unsigned int
// #define SINT32 int
// #define UINT64 unsigned __int64
// #define SINT64 __int64
//
// #define DP_FMA(a, b, c) __fence(_mm_cvtsd_f64(_mm_fmadd_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c))))
// #define DP_FMA_RN(a, b, c) _mm_cvtsd_f64(_mm_fmadd_round_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))
// #define DP_FMA_RZ(a, b, c) __fence(_mm_cvtsd_f64(_mm_fmadd_round_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c), (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC))))
//
// #define DP_ROUND_RZ(a) _mm_cvtsd_f64(_mm_roundscale_sd(_mm_setzero_pd(), _mm_set_sd(a), (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)))
//
// #define DP_CONST(C) _castu64_f64(0x##C##ull)
// #define DP_AND(X, Y) _mm_cvtsd_f64(_mm_and_pd(_mm_set_sd(X), _mm_set_sd(Y)))
// #define DP_XOR(X, Y) _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(X), _mm_set_sd(Y)))
// #define DP_OR(X, Y) _mm_cvtsd_f64(_mm_or_pd(_mm_set_sd(X), _mm_set_sd(Y)))
// #define DP_DIV_RZ(a, b) __fence(_mm_cvtsd_f64(_mm_div_round_sd(_mm_set_sd(a), _mm_set_sd(b), (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC))))
// #define DP_FNMA(a, b, c) __fence(_mm_cvtsd_f64(_mm_fnmadd_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c))))
// #define DP_FNMA_RZ(a, b, c) __fence(_mm_cvtsd_f64(_mm_fnmadd_round_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c), (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC))))
//
// #define D2L(x) _mm_castpd_si128(x)
// // transfer highest 32 bits (of low 64b) to GPR
// #define TRANSFER_HIGH_INT32(X) _mm_extract_epi32(D2L(_mm_set_sd(X)), 1)
//
// double fmod(double x, double y)
// {
// double a, b, sgn_a, q, bs, bs2;
// unsigned eq;
Label L_5280, L_52a0, L_5256, L_5300, L_5320, L_52c0, L_52d0, L_5360, L_5380, L_53b0, L_5390;
Label L_53c0, L_52a6, L_53d0, L_exit;
__ movdqa(xmm2, xmm0);
// // |x|, |y|
// a = DP_AND(x, DP_CONST(7fffffffffffffff));
__ movq(xmm0, xmm0);
__ mov64(rax, 0x7FFFFFFFFFFFFFFFULL);
__ evpbroadcastq(xmm3, rax, Assembler::AVX_128bit);
__ vpand(xmm6, xmm0, xmm3, Assembler::AVX_128bit);
// b = DP_AND(y, DP_CONST(7fffffffffffffff));
__ vpand(xmm4, xmm1, xmm3, Assembler::AVX_128bit);
// // sign(x)
// sgn_a = DP_XOR(x, a);
__ vpxor(xmm3, xmm6, xmm0, Assembler::AVX_128bit);
// q = DP_DIV_RZ(a, b);
__ movq(xmm5, xmm4);
__ evdivsd(xmm0, xmm6, xmm5, Assembler::EVEX_RZ);
// q = DP_ROUND_RZ(q);
__ movq(xmm0, xmm0);
// a = DP_AND(x, DP_CONST(7fffffffffffffff));
__ vxorpd(xmm7, xmm7, xmm7, Assembler::AVX_128bit);
// q = DP_ROUND_RZ(q);
__ vroundsd(xmm0, xmm7, xmm0, 0xb);
// eq = TRANSFER_HIGH_INT32(q);
__ extractps(rax, xmm0, 1);
// if (!eq) return x + sgn_a;
__ testl(rax, rax);
__ jcc(Assembler::equal, L_5280);
// if (eq >= 0x7fefffffu) goto SPECIAL_FMOD;
__ cmpl(rax, 0x7feffffe);
__ jcc(Assembler::belowEqual, L_52a0);
__ vpxor(xmm2, xmm2, xmm2, Assembler::AVX_128bit);
// SPECIAL_FMOD:
//
// // y==0 or x==Inf?
// if ((b == 0.0) || (!(a <= DP_CONST(7fefffffffffffff))))
__ ucomisd(xmm4, xmm2);
__ jcc(Assembler::notEqual, L_5256);
__ jcc(Assembler::noParity, L_5300);
__ bind(L_5256);
__ movsd(xmm2, ExternalAddress((address)CONST_MAX), rax);
__ ucomisd(xmm2, xmm6);
__ jcc(Assembler::below, L_5300);
__ movsd(xmm0, ExternalAddress((address)CONST_INF), rax);
// return DP_FNMA(b, q, a); // NaN
// // y is NaN?
// if (!(b <= DP_CONST(7ff0000000000000))) return y + y;
__ ucomisd(xmm0, xmm4);
__ jcc(Assembler::aboveEqual, L_5320);
__ vaddsd(xmm0, xmm1, xmm1);
__ jmp(L_exit);
// if (!eq) return x + sgn_a;
__ align32();
__ bind(L_5280);
__ vaddsd(xmm0, xmm3, xmm2);
__ jmp(L_exit);
// a = DP_FNMA_RZ(b, q, a);
__ align(8);
__ bind(L_52a0);
__ evfnmadd213sd(xmm0, xmm4, xmm6, Assembler::EVEX_RZ);
// while (b <= a)
__ bind(L_52a6);
__ ucomisd(xmm0, xmm4);
__ jcc(Assembler::aboveEqual, L_52c0);
// a = DP_XOR(a, sgn_a);
__ vpxor(xmm0, xmm3, xmm0, Assembler::AVX_128bit);
__ jmp(L_exit);
__ bind(L_52c0);
__ movq(xmm6, xmm0);
// q = DP_ROUND_RZ(q);
__ vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
__ align32();
__ bind(L_52d0);
// q = DP_DIV_RZ(a, b);
__ evdivsd(xmm2, xmm6, xmm5, Assembler::EVEX_RZ);
// q = DP_ROUND_RZ(q);
__ movq(xmm2, xmm2);
__ vroundsd(xmm2, xmm1, xmm2, 0xb);
// a = DP_FNMA_RZ(b, q, a);
__ evfnmadd213sd(xmm2, xmm4, xmm0, Assembler::EVEX_RZ);
// while (b <= a)
__ ucomisd(xmm2, xmm4);
__ movq(xmm6, xmm2);
__ movapd(xmm0, xmm2);
__ jcc(Assembler::aboveEqual, L_52d0);
// a = DP_XOR(a, sgn_a);
__ vpxor(xmm0, xmm3, xmm2, Assembler::AVX_128bit);
__ jmp(L_exit);
// return DP_FNMA(b, q, a); // NaN
__ bind(L_5300);
__ vfnmadd213sd(xmm0, xmm4, xmm6);
__ jmp(L_exit);
// bs = b * DP_CONST(7fe0000000000000);
__ bind(L_5320);
__ vmulsd(xmm1, xmm4, ExternalAddress((address)CONST_e307), rax);
// q = DP_DIV_RZ(a, bs);
__ movq(xmm2, xmm1);
__ evdivsd(xmm0, xmm6, xmm2, Assembler::EVEX_RZ);
// q = DP_ROUND_RZ(q);
__ movq(xmm0, xmm0);
__ vroundsd(xmm7, xmm7, xmm0, 0xb);
// eq = TRANSFER_HIGH_INT32(q);
__ extractps(rax, xmm7, 1);
// if (eq >= 0x7fefffffu)
__ cmpl(rax, 0x7fefffff);
__ jcc(Assembler::below, L_5360);
// // b* 2*1023 * 2^1023
// bs2 = bs * DP_CONST(7fe0000000000000);
__ vmulsd(xmm0, xmm1, ExternalAddress((address)CONST_e307), rax);
// while (bs2 <= a)
__ ucomisd(xmm6, xmm0);
__ jcc(Assembler::aboveEqual, L_5380);
__ movapd(xmm7, xmm6);
__ jmp(L_53b0);
// a = DP_FNMA_RZ(b, q, a);
__ bind(L_5360);
__ evfnmadd213sd(xmm7, xmm1, xmm6, Assembler::EVEX_RZ);
__ jmp(L_53b0);
// q = DP_ROUND_RZ(q);
__ bind(L_5380);
__ vxorpd(xmm8, xmm8, xmm8, Assembler::AVX_128bit);
// q = DP_DIV_RZ(qa, bs2);
__ align32();
__ bind(L_5390);
__ evdivsd(xmm7, xmm6, xmm0, Assembler::EVEX_RZ);
// q = DP_ROUND_RZ(q);
__ movq(xmm7, xmm7);
__ vroundsd(xmm7, xmm8, xmm7, 0xb);
// a = DP_FNMA_RZ(bs2, q, a);
__ evfnmadd213sd(xmm7, xmm0, xmm6, Assembler::EVEX_RZ);
// while (bs2 <= a)
__ ucomisd(xmm7, xmm0);
__ movapd(xmm6, xmm7);
__ jcc(Assembler::aboveEqual, L_5390);
// while (bs <= a)
__ bind(L_53b0);
__ ucomisd(xmm7, xmm1);
__ jcc(Assembler::aboveEqual, L_53c0);
__ movapd(xmm0, xmm7);
__ jmp(L_52a6);
// q = DP_ROUND_RZ(q);
__ bind(L_53c0);
__ vxorpd(xmm6, xmm6, xmm6, Assembler::AVX_128bit);
// q = DP_DIV_RZ(a, bs);
__ align32();
__ bind(L_53d0);
__ evdivsd(xmm0, xmm7, xmm2, Assembler::EVEX_RZ);
// q = DP_ROUND_RZ(q);
__ movq(xmm0, xmm0);
__ vroundsd(xmm0, xmm6, xmm0, 0xb);
// a = DP_FNMA_RZ(bs, q, a);
__ evfnmadd213sd(xmm0, xmm1, xmm7, Assembler::EVEX_RZ);
// while (bs <= a)
__ ucomisd(xmm0, xmm1);
__ movapd(xmm7, xmm0);
__ jcc(Assembler::aboveEqual, L_53d0);
__ jmp(L_52a6);
__ bind(L_exit);
////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////
// AVX2 code
////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////
} else if (VM_Version::supports_fma()) { // AVX2 version
Label L_104a, L_11bd, L_10c1, L_1090, L_11b9, L_10e7, L_11af, L_111c, L_10f3, L_116e, L_112a;
Label L_1173, L_1157, L_117f, L_11a0;
// double fmod(double x, double y)
// {
// double a, b, sgn_a, q, bs, bs2, corr, res;
// unsigned eq;
// unsigned mxcsr, mxcsr_rz;
// __asm { stmxcsr DWORD PTR[mxcsr] }
// mxcsr_rz = 0x7f80 | mxcsr;
__ push(rax);
__ stmxcsr(Address(rsp, 0));
__ movl(rax, Address(rsp, 0));
__ movl(rcx, rax);
__ orl(rcx, 0x7f80);
__ movl(Address(rsp, 0x04), rcx);
// // |x|, |y|
// a = DP_AND(x, DP_CONST(7fffffffffffffff));
__ movq(xmm2, xmm0);
__ vmovdqu(xmm3, ExternalAddress((address)CONST_NaN), rcx);
__ vpand(xmm4, xmm2, xmm3, Assembler::AVX_128bit);
// b = DP_AND(y, DP_CONST(7fffffffffffffff));
__ vpand(xmm3, xmm1, xmm3, Assembler::AVX_128bit);
// // sign(x)
// sgn_a = DP_XOR(x, a);
__ mov64(rcx, 0x8000000000000000ULL);
__ movq(xmm5, rcx);
__ vpand(xmm2, xmm2, xmm5, Assembler::AVX_128bit);
// if (a < b) return x + sgn_a;
__ ucomisd(xmm3, xmm4);
__ jcc(Assembler::belowEqual, L_104a);
__ vaddsd(xmm0, xmm2, xmm0);
__ jmp(L_11bd);
// if (((mxcsr & 0x6000)!=0x2000) && (a < b * 0x1p+260))
__ bind(L_104a);
__ andl(rax, 0x6000);
__ cmpl(rax, 0x2000);
__ jcc(Assembler::equal, L_10c1);
__ vmulsd(xmm0, xmm3, ExternalAddress((address)CONST_1p260), rax);
__ ucomisd(xmm0, xmm4);
__ jcc(Assembler::belowEqual, L_10c1);
// {
// q = DP_DIV(a, b);
__ vdivpd(xmm0, xmm4, xmm3, Assembler::AVX_128bit);
// corr = DP_SHR(DP_FNMA(b, q, a), 63);
__ movapd(xmm1, xmm0);
__ vfnmadd213sd(xmm1, xmm3, xmm4);
__ movq(xmm5, xmm1);
__ vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
__ vpcmpgtq(xmm5, xmm1, xmm5, Assembler::AVX_128bit);
// q = DP_PSUBQ(q, corr);
__ vpaddq(xmm0, xmm5, xmm0, Assembler::AVX_128bit);
// q = DP_TRUNC(q);
__ vroundsd(xmm0, xmm0, xmm0, 3);
// a = DP_FNMA(b, q, a);
__ vfnmadd213sd(xmm0, xmm3, xmm4);
__ align32();
// while (b <= a)
__ bind(L_1090);
__ ucomisd(xmm0, xmm3);
__ jcc(Assembler::below, L_11b9);
// {
// q = DP_DIV(a, b);
__ vdivsd(xmm4, xmm0, xmm3);
// corr = DP_SHR(DP_FNMA(b, q, a), 63);
__ movapd(xmm5, xmm4);
__ vfnmadd213sd(xmm5, xmm3, xmm0);
__ movq(xmm5, xmm5);
__ vpcmpgtq(xmm5, xmm1, xmm5, Assembler::AVX_128bit);
// q = DP_PSUBQ(q, corr);
__ vpaddq(xmm4, xmm5, xmm4, Assembler::AVX_128bit);
// q = DP_TRUNC(q);
__ vroundsd(xmm4, xmm4, xmm4, 3);
// a = DP_FNMA(b, q, a);
__ vfnmadd231sd(xmm0, xmm3, xmm4);
__ jmp(L_1090);
// }
// return DP_XOR(a, sgn_a);
// }
// __asm { ldmxcsr DWORD PTR [mxcsr_rz] }
__ bind(L_10c1);
__ ldmxcsr(Address(rsp, 0x04));
// q = DP_DIV(a, b);
__ vdivpd(xmm0, xmm4, xmm3, Assembler::AVX_128bit);
// q = DP_TRUNC(q);
__ vroundsd(xmm0, xmm0, xmm0, 3);
// eq = TRANSFER_HIGH_INT32(q);
__ extractps(rax, xmm0, 1);
// if (__builtin_expect((eq >= 0x7fefffffu), (0==1))) goto SPECIAL_FMOD;
__ cmpl(rax, 0x7feffffe);
__ jcc(Assembler::above, L_10e7);
// a = DP_FNMA(b, q, a);
__ vfnmadd213sd(xmm0, xmm3, xmm4);
__ jmp(L_11af);
// SPECIAL_FMOD:
// // y==0 or x==Inf?
// if ((b == 0.0) || (!(a <= DP_CONST(7fefffffffffffff))))
__ bind(L_10e7);
__ vpxor(xmm5, xmm5, xmm5, Assembler::AVX_128bit);
__ ucomisd(xmm3, xmm5);
__ jcc(Assembler::notEqual, L_10f3);
__ jcc(Assembler::noParity, L_111c);
__ bind(L_10f3);
__ movsd(xmm5, ExternalAddress((address)CONST_MAX), rax);
__ ucomisd(xmm5, xmm4);
__ jcc(Assembler::below, L_111c);
// return res;
// }
// // y is NaN?
// if (!(b <= DP_CONST(7ff0000000000000))) {
__ movsd(xmm0, ExternalAddress((address)CONST_INF), rax);
__ ucomisd(xmm0, xmm3);
__ jcc(Assembler::aboveEqual, L_112a);
// res = y + y;
__ vaddsd(xmm0, xmm1, xmm1);
// __asm { ldmxcsr DWORD PTR[mxcsr] }
__ ldmxcsr(Address(rsp, 0));
__ jmp(L_11bd);
// {
// res = DP_FNMA(b, q, a); // NaN
__ bind(L_111c);
__ vfnmadd213sd(xmm0, xmm3, xmm4);
// __asm { ldmxcsr DWORD PTR[mxcsr] }
__ ldmxcsr(Address(rsp, 0));
__ jmp(L_11bd);
// return res;
// }
// // b* 2*1023
// bs = b * DP_CONST(7fe0000000000000);
__ bind(L_112a);
__ vmulsd(xmm1, xmm3, ExternalAddress((address)CONST_e307), rax);
// q = DP_DIV(a, bs);
__ vdivsd(xmm0, xmm4, xmm1);
// q = DP_TRUNC(q);
__ vroundsd(xmm0, xmm0, xmm0, 3);
// eq = TRANSFER_HIGH_INT32(q);
__ extractps(rax, xmm0, 1);
// if (eq >= 0x7fefffffu)
__ cmpl(rax, 0x7fefffff);
__ jcc(Assembler::below, L_116e);
// {
// // b* 2*1023 * 2^1023
// bs2 = bs * DP_CONST(7fe0000000000000);
__ vmulsd(xmm0, xmm1, ExternalAddress((address)CONST_e307), rax);
// while (bs2 <= a)
__ ucomisd(xmm4, xmm0);
__ jcc(Assembler::below, L_1173);
// {
// q = DP_DIV(a, bs2);
__ bind(L_1157);
__ vdivsd(xmm5, xmm4, xmm0);
// q = DP_TRUNC(q);
__ vroundsd(xmm5, xmm5, xmm5, 3);
// a = DP_FNMA(bs2, q, a);
__ vfnmadd231sd(xmm4, xmm0, xmm5);
// while (bs2 <= a)
__ ucomisd(xmm4, xmm0);
__ jcc(Assembler::aboveEqual, L_1157);
__ jmp(L_1173);
// }
// }
// else
// a = DP_FNMA(bs, q, a);
__ bind(L_116e);
__ vfnmadd231sd(xmm4, xmm1, xmm0);
// while (bs <= a)
__ bind(L_1173);
__ ucomisd(xmm4, xmm1);
__ jcc(Assembler::aboveEqual, L_117f);
__ movapd(xmm0, xmm4);
__ jmp(L_11af);
// {
// q = DP_DIV(a, bs);
__ bind(L_117f);
__ vdivsd(xmm0, xmm4, xmm1);
// q = DP_TRUNC(q);
__ vroundsd(xmm0, xmm0, xmm0, 3);
// a = DP_FNMA(bs, q, a);
__ vfnmadd213sd(xmm0, xmm1, xmm4);
// while (bs <= a)
__ ucomisd(xmm0, xmm1);
__ movapd(xmm4, xmm0);
__ jcc(Assembler::aboveEqual, L_117f);
__ jmp(L_11af);
__ align32();
// {
// q = DP_DIV(a, b);
__ bind(L_11a0);
__ vdivsd(xmm1, xmm0, xmm3);
// q = DP_TRUNC(q);
__ vroundsd(xmm1, xmm1, xmm1, 3);
// a = DP_FNMA(b, q, a);
__ vfnmadd231sd(xmm0, xmm3, xmm1);
// FMOD_CONT:
// while (b <= a)
__ bind(L_11af);
__ ucomisd(xmm0, xmm3);
__ jcc(Assembler::aboveEqual, L_11a0);
// }
// __asm { ldmxcsr DWORD PTR[mxcsr] }
__ ldmxcsr(Address(rsp, 0));
__ bind(L_11b9);
__ vpxor(xmm0, xmm2, xmm0, Assembler::AVX_128bit);
// }
// goto FMOD_CONT;
// }
__ bind(L_11bd);
__ pop(rax);
} else { // SSE version
assert(false, "SSE not implemented");
}
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
#undef __

@@ -161,6 +161,7 @@ address StubRoutines::_vectorizedMismatch = nullptr;
address StubRoutines::_dexp = nullptr;
address StubRoutines::_dlog = nullptr;
address StubRoutines::_dlog10 = nullptr;
address StubRoutines::_fmod = nullptr;
address StubRoutines::_dpow = nullptr;
address StubRoutines::_dsin = nullptr;
address StubRoutines::_dcos = nullptr;

@@ -249,6 +249,7 @@ class StubRoutines: AllStatic {
static address _dlibm_reduce_pi04l;
static address _dlibm_tan_cot_huge;
static address _dtan;
static address _fmod;
static address _f2hf;
static address _hf2f;
@@ -425,6 +426,7 @@ class StubRoutines: AllStatic {
static address dlog() { return _dlog; }
static address dlog10() { return _dlog10; }
static address dpow() { return _dpow; }
static address fmod() { return _fmod; }
static address dsin() { return _dsin; }
static address dcos() { return _dcos; }
static address dlibm_reduce_pi04l() { return _dlibm_reduce_pi04l; }

@@ -556,6 +556,7 @@
static_field(StubRoutines, _dlog, address) \
static_field(StubRoutines, _dlog10, address) \
static_field(StubRoutines, _dpow, address) \
static_field(StubRoutines, _fmod, address) \
static_field(StubRoutines, _dsin, address) \
static_field(StubRoutines, _dcos, address) \
static_field(StubRoutines, _dtan, address) \

@@ -0,0 +1,131 @@
/*
* Copyright (c) 2023, Intel Corporation. All rights reserved.
* Intel Math Library (LIBM) Source Code
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/**
* @test
* @bug 8308966
* @summary Add intrinsic for float/double modulo for x86 AVX2 and AVX512
* @run main compiler.floatingpoint.DmodTest
*/
package compiler.floatingpoint;
import java.lang.Double;
public class DmodTest {
static double [] op1 = { 1.2345d, 0.0d, -0.0d, 1.0d/0.0d, -1.0d/0.0d, 0.0d/0.0d };
static double [] op2 = { 1.2345d, 0.0d, -0.0d, 1.0d/0.0d, -1.0d/0.0d, 0.0d/0.0d };
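// res[j][k] holds the expected value of op1[j] % op2[k] for the operand values above
// (a finite value, signed zeros, infinities, and NaN), per the Java % semantics.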
static double [][] res = {
{
0.0d,
Double.NaN,
Double.NaN,
1.2345d,
1.2345d,
Double.NaN,
},
{
0.0d,
Double.NaN,
Double.NaN,
0.0d,
0.0d,
Double.NaN,
},
{
-0.0d,
Double.NaN,
Double.NaN,
-0.0d,
-0.0d,
Double.NaN,
},
{
Double.NaN,
Double.NaN,
Double.NaN,
Double.NaN,
Double.NaN,
Double.NaN,
},
{
Double.NaN,
Double.NaN,
Double.NaN,
Double.NaN,
Double.NaN,
Double.NaN,
},
{
Double.NaN,
Double.NaN,
Double.NaN,
Double.NaN,
Double.NaN,
Double.NaN,
},
};
public static void main(String[] args) throws Exception {
double f1, f2, f3;
boolean failure = false;
boolean print_failure = false;
for (int i = 0; i < 100_000; i++) {
for (int j = 0; j < op1.length; j++) {
for (int k = 0; k < op2.length; k++) {
f1 = op1[j];
f2 = op2[k];
f3 = f1 % f2;
if (Double.isNaN(res[j][k])) {
if (!Double.isNaN(f3)) {
failure = true;
print_failure = true;
}
} else if (Double.isNaN(f3)) {
failure = true;
print_failure = true;
} else if (f3 != res[j][k]) {
failure = true;
print_failure = true;
}
if (print_failure) {
System.out.println( "Actual " + f1 + " % " + f2 + " = " + f3);
System.out.println( "Expected " + f1 + " % " + f2 + " = " + res[j][k]);
print_failure = false;
}
}
}
}
if (failure) {
throw new RuntimeException("Test Failed");
} else {
System.out.println("Test passed.");
}
}
}

@@ -0,0 +1,130 @@
/*
* Copyright (c) 2023, Intel Corporation. All rights reserved.
* Intel Math Library (LIBM) Source Code
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/**
* @test
* @bug 8308966
* @summary Add intrinsic for float/double modulo for x86 AVX2 and AVX512
* @run main compiler.floatingpoint.FmodTest
*/
package compiler.floatingpoint;
import java.lang.Float;
public class FmodTest {
static float [] op1 = { 1.2345f, 0.0f, -0.0f, 1.0f/0.0f, -1.0f/0.0f, 0.0f/0.0f };
static float [] op2 = { 1.2345f, 0.0f, -0.0f, 1.0f/0.0f, -1.0f/0.0f, 0.0f/0.0f };
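// res[j][k] holds the expected value of op1[j] % op2[k] for the operand values above
// (a finite value, signed zeros, infinities, and NaN), per the Java % semantics.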
static float [][] res = {
{
0.0f,
Float.NaN,
Float.NaN,
1.2345f,
1.2345f,
Float.NaN,
},
{
0.0f,
Float.NaN,
Float.NaN,
0.0f,
0.0f,
Float.NaN,
},
{
-0.0f,
Float.NaN,
Float.NaN,
-0.0f,
-0.0f,
Float.NaN,
},
{
Float.NaN,
Float.NaN,
Float.NaN,
Float.NaN,
Float.NaN,
Float.NaN,
},
{
Float.NaN,
Float.NaN,
Float.NaN,
Float.NaN,
Float.NaN,
Float.NaN,
},
{
Float.NaN,
Float.NaN,
Float.NaN,
Float.NaN,
Float.NaN,
Float.NaN,
},
};
public static void main(String[] args) throws Exception {
float f1, f2, f3;
boolean failure = false;
boolean print_failure = false;
for (int i = 0; i < 100_000; i++) {
for (int j = 0; j < op1.length; j++) {
for (int k = 0; k < op2.length; k++) {
f1 = op1[j];
f2 = op2[k];
f3 = f1 % f2;
if (Float.isNaN(res[j][k])) {
if (!Float.isNaN(f3)) {
failure = true;
print_failure = true;
}
} else if (Float.isNaN(f3)) {
failure = true;
print_failure = true;
} else if (f3 != res[j][k]) {
failure = true;
print_failure = true;
}
if (print_failure) {
System.out.println( "Actual " + f1 + " % " + f2 + " = " + f3);
System.out.println( "Expected " + f1 + " % " + f2 + " = " + res[j][k]);
print_failure = false;
}
}
}
}
if (failure) {
throw new RuntimeException("Test Failed");
} else {
System.out.println("Test passed.");
}
}
}