8132160: support for AVX 512 call frames and stack management
Simplify save/restore frame on x86 systems which support EVEX.
Reviewed-by: kvn, iveresov
Parent: 9f9739c156
Commit: d67924dc8e
@@ -438,7 +438,9 @@ class ArrayAddress VALUE_OBJ_CLASS_SPEC {

};

const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);
// 64-bit: reflect the fxsave size, which is 512 bytes, plus the new xsave area on EVEX, which is another 2176 bytes
// See the fxsave and xsave (EVEX enabled) documentation for layout
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);

// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
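A minimal sketch of the sizing arithmetic above. The constant names are illustrative only (not identifiers from the patch); the numbers come straight from the comment: a 512-byte fxsave image plus a 2176-byte EVEX-era xsave extension.

const int kFxSaveBytes    = 512;                              // legacy FXSAVE image
const int kEvexXsaveBytes = 2176;                             // additional xsave area with EVEX state
const int kFpuStateBytes  = kFxSaveBytes + kEvexXsaveBytes;   // = 2688
// With wordSize == 8 on LP64, FPUStateSizeInWords becomes 2688 / 8 = 336 words.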
@@ -594,11 +596,16 @@ class Assembler : public AbstractAssembler {

 private:

  int evex_encoding;
  int input_size_in_bits;
  int avx_vector_len;
  int tuple_type;
  bool is_evex_instruction;
  int _evex_encoding;
  int _input_size_in_bits;
  int _avx_vector_len;
  int _tuple_type;
  bool _is_evex_instruction;
  bool _legacy_mode_bw;
  bool _legacy_mode_dq;
  bool _legacy_mode_vl;
  bool _legacy_mode_vlbw;
  bool _instruction_uses_vl;

  // 64bit prefixes
  int prefix_and_encode(int reg_enc, bool byteinst = false);
@@ -972,11 +979,16 @@ private:
  // belong in macro assembler but there is no need for both varieties to exist

  void init_attributes(void) {
    evex_encoding = 0;
    input_size_in_bits = 0;
    avx_vector_len = AVX_NoVec;
    tuple_type = EVEX_ETUP;
    is_evex_instruction = false;
    _evex_encoding = 0;
    _input_size_in_bits = 0;
    _avx_vector_len = AVX_NoVec;
    _tuple_type = EVEX_ETUP;
    _is_evex_instruction = false;
    _legacy_mode_bw = (VM_Version::supports_avx512bw() == false);
    _legacy_mode_dq = (VM_Version::supports_avx512dq() == false);
    _legacy_mode_vl = (VM_Version::supports_avx512vl() == false);
    _legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false);
    _instruction_uses_vl = false;
  }

  void lea(Register dst, Address src);
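The renamed fields above track per-instruction EVEX encoding state, and the new _legacy_mode_* flags record which AVX-512 subsets (BW, DQ, VL, VL+BW) are missing so encoding can fall back to VEX/legacy forms. The sketch below is illustrative only, not a function from this patch; it just shows the kind of decision these flags enable.

// Illustrative sketch only: not a function from this patch. It shows the kind
// of decision the new _legacy_mode_* flags enable: use EVEX only when the
// required AVX-512 subset is present, otherwise stay on the VEX/legacy path.
bool Assembler::use_evex_for_byte_word_op(int vector_len) const {
  if (_legacy_mode_bw) {
    return false;                      // AVX512BW absent: encode with VEX/legacy
  }
  if (vector_len < AVX_512bit && _legacy_mode_vl) {
    return false;                      // sub-512-bit EVEX forms need AVX512VL
  }
  return true;                         // full EVEX encoding is safe here
}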
@@ -1344,8 +1356,10 @@ private:
  void fxch(int i = 1);

  void fxrstor(Address src);
  void xrstor(Address src);

  void fxsave(Address dst);
  void xsave(Address dst);

  void fyl2x();
  void frndint();
@@ -1479,11 +1493,12 @@ private:
  void movb(Address dst, int imm8);
  void movb(Register dst, Address src);

  void kmovq(KRegister dst, KRegister src);
  void kmovql(KRegister dst, KRegister src);
  void kmovql(KRegister dst, Register src);
  void kmovdl(KRegister dst, Register src);
  void kmovq(Address dst, KRegister src);
  void kmovq(KRegister dst, Address src);
  void kmovwl(KRegister dst, Register src);
  void kmovql(Address dst, KRegister src);
  void kmovql(KRegister dst, Address src);

  void movdl(XMMRegister dst, Register src);
  void movdl(Register dst, XMMRegister src);
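The kmov* declarations above add moves into the AVX-512 opmask registers in several source widths (the trailing size letter: w = 16-bit, d = 32-bit, q = 64-bit). A hedged usage sketch of the pattern that appears later in this patch (generate_fill, fp_runtime_fallback, kernel_crc32), where a full 16-lane mask is loaded into k1 before 512-bit operations; rtmp stands in for whatever scratch GPR the caller has free.

movl(rtmp, 0xffff);   // 16 ones: one mask bit per 32-bit lane of a ZMM register
kmovwl(k1, rtmp);     // load the 16-bit mask into opmask register k1, replacing the
                      // LP64/ILP32-specific kmovql/kmovdl pair used previously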
@@ -1509,9 +1524,12 @@ private:
  void vmovdqu(XMMRegister dst, XMMRegister src);

  // Move Unaligned 512bit Vector
  void evmovdqu(Address dst, XMMRegister src, int vector_len);
  void evmovdqu(XMMRegister dst, Address src, int vector_len);
  void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
  void evmovdqul(Address dst, XMMRegister src, int vector_len);
  void evmovdqul(XMMRegister dst, Address src, int vector_len);
  void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
  void evmovdquq(Address dst, XMMRegister src, int vector_len);
  void evmovdquq(XMMRegister dst, Address src, int vector_len);
  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);

  // Move lower 64bit to high 64bit in 128bit register
  void movlhps(XMMRegister dst, XMMRegister src);
@@ -1643,6 +1661,7 @@ private:

  // Permutation of 64bit words
  void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
  void vpermq(XMMRegister dst, XMMRegister src, int imm8);

  void pause();

@@ -2061,6 +2080,9 @@ private:
  void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
  void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
  void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
  void vextractf32x4h(Address dst, XMMRegister src, int value);
  void vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
  void vinsertf32x4h(XMMRegister dst, Address src, int value);

  // duplicate 4-byte integer data from src into 8 locations in dest
  void vpbroadcastd(XMMRegister dst, XMMRegister src);
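The untyped evmovdqu declarations above are replaced by element-typed variants: evmovdqul moves unaligned data as 32-bit (dword) elements and evmovdquq as 64-bit (qword) elements, which matters under EVEX because the element width determines masking granularity and disp8*N displacement scaling. A hedged usage sketch; the register and buffer choices are illustrative, though the first line matches what the fill stub in this patch does.

evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);   // 16 x 32-bit lanes, dword-typed store
evmovdquq(Address(to, 0), xtmp, Assembler::AVX_512bit);   // 8 x 64-bit lanes, qword-typed store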
@@ -3798,16 +3798,24 @@ void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
    if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
      __ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
    }
    __ xorps(dest->as_xmm_float_reg(),
             ExternalAddress((address)float_signflip_pool));

    if (UseAVX > 1) {
      __ vnegatess(dest->as_xmm_float_reg(), dest->as_xmm_float_reg(),
                   ExternalAddress((address)float_signflip_pool));
    } else {
      __ xorps(dest->as_xmm_float_reg(),
               ExternalAddress((address)float_signflip_pool));
    }
  } else if (dest->is_double_xmm()) {
    if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
      __ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
    }
    __ xorpd(dest->as_xmm_double_reg(),
             ExternalAddress((address)double_signflip_pool));

    if (UseAVX > 1) {
      __ vnegatesd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg(),
                   ExternalAddress((address)double_signflip_pool));
    } else {
      __ xorpd(dest->as_xmm_double_reg(),
               ExternalAddress((address)double_signflip_pool));
    }
  } else if (left->is_single_fpu() || left->is_double_fpu()) {
    assert(left->fpu() == 0, "arg must be on TOS");
    assert(dest->fpu() == 0, "dest must be TOS");
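Both branches negate by XOR-ing the value with a sign-flip constant from the constant pool (0x80000000 for float, 0x8000000000000000 for double), which toggles only the IEEE-754 sign bit; the new vnegatess/vnegatesd wrappers perform the same XOR but through a path that also handles destinations in the upper XMM bank on AVX-512 targets. A minimal scalar version of the underlying trick, purely illustrative.

// Illustrative scalar version of the sign-flip trick used above: XOR with a
// mask that has only the sign bit set negates the value without touching the
// exponent or mantissa bits, and without branching.
#include <cstdint>
#include <cstring>

static float negate_via_signflip(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));   // reinterpret the float as raw bits
  bits ^= 0x80000000u;                    // flip the IEEE-754 sign bit
  std::memcpy(&x, &bits, sizeof(bits));
  return x;                               // == -x, including for +/-0.0 and NaNs
}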
@ -401,11 +401,9 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
|
||||
|
||||
} else if (UseSSE == 1) {
|
||||
int xmm_off = xmm_regs_as_doubles_off;
|
||||
for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
|
||||
if (n < xmm_bypass_limit) {
|
||||
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
|
||||
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
|
||||
}
|
||||
for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
|
||||
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
|
||||
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
|
||||
xmm_off += 2;
|
||||
}
|
||||
assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm registers");
|
||||
@ -452,14 +450,11 @@ static OopMap* save_live_registers(StubAssembler* sasm, int num_rt_args,
|
||||
__ frstor(Address(rsp, fpu_state_off * VMRegImpl::stack_slot_size));
|
||||
|
||||
// Save the FPU registers in de-opt-able form
|
||||
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0));
|
||||
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8));
|
||||
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
|
||||
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
|
||||
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
|
||||
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
|
||||
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
|
||||
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
|
||||
int offset = 0;
|
||||
for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
|
||||
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
|
||||
offset += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (UseSSE >= 2) {
|
||||
@ -468,52 +463,26 @@ static OopMap* save_live_registers(StubAssembler* sasm, int num_rt_args,
|
||||
// so always save them as doubles.
|
||||
// note that float values are _not_ converted automatically, so for float values
|
||||
// the second word contains only garbage data.
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0), xmm0);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8), xmm1);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
|
||||
int xmm_bypass_limit = FrameMap::nof_xmm_regs;
|
||||
int offset = 0;
|
||||
#ifdef _LP64
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64), xmm8);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72), xmm9);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80), xmm10);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88), xmm11);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96), xmm12);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15);
|
||||
if (UseAVX > 2) {
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), xmm26);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31);
|
||||
if (UseAVX < 3) {
|
||||
xmm_bypass_limit = xmm_bypass_limit / 2;
|
||||
}
|
||||
#endif
|
||||
for (int n = 0; n < xmm_bypass_limit; n++) {
|
||||
XMMRegister xmm_name = as_XMMRegister(n);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
|
||||
offset += 8;
|
||||
}
|
||||
#endif // _LP64
|
||||
} else if (UseSSE == 1) {
|
||||
// save XMM registers as float because double not supported without SSE2
|
||||
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0), xmm0);
|
||||
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8), xmm1);
|
||||
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
|
||||
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
|
||||
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
|
||||
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
|
||||
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
|
||||
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
|
||||
// save XMM registers as float because double not supported without SSE2(num MMX == num fpu)
|
||||
int offset = 0;
|
||||
for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
|
||||
XMMRegister xmm_name = as_XMMRegister(n);
|
||||
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
|
||||
offset += 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -528,52 +497,26 @@ static void restore_fpu(StubAssembler* sasm, bool restore_fpu_registers = true)
|
||||
if (restore_fpu_registers) {
|
||||
if (UseSSE >= 2) {
|
||||
// restore XMM registers
|
||||
__ movdbl(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0));
|
||||
__ movdbl(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8));
|
||||
__ movdbl(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
|
||||
__ movdbl(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
|
||||
__ movdbl(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
|
||||
__ movdbl(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
|
||||
__ movdbl(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
|
||||
__ movdbl(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
|
||||
int xmm_bypass_limit = FrameMap::nof_xmm_regs;
|
||||
#ifdef _LP64
|
||||
__ movdbl(xmm8, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64));
|
||||
__ movdbl(xmm9, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72));
|
||||
__ movdbl(xmm10, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80));
|
||||
__ movdbl(xmm11, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88));
|
||||
__ movdbl(xmm12, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96));
|
||||
__ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104));
|
||||
__ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112));
|
||||
__ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120));
|
||||
if (UseAVX > 2) {
|
||||
__ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128));
|
||||
__ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136));
|
||||
__ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144));
|
||||
__ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152));
|
||||
__ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160));
|
||||
__ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168));
|
||||
__ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176));
|
||||
__ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184));
|
||||
__ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192));
|
||||
__ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200));
|
||||
__ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208));
|
||||
__ movdbl(xmm27, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216));
|
||||
__ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224));
|
||||
__ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232));
|
||||
__ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240));
|
||||
__ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248));
|
||||
if (UseAVX < 3) {
|
||||
xmm_bypass_limit = xmm_bypass_limit / 2;
|
||||
}
|
||||
#endif
|
||||
int offset = 0;
|
||||
for (int n = 0; n < xmm_bypass_limit; n++) {
|
||||
XMMRegister xmm_name = as_XMMRegister(n);
|
||||
__ movdbl(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
|
||||
offset += 8;
|
||||
}
|
||||
#endif // _LP64
|
||||
} else if (UseSSE == 1) {
|
||||
// restore XMM registers
|
||||
__ movflt(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0));
|
||||
__ movflt(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8));
|
||||
__ movflt(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
|
||||
__ movflt(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
|
||||
__ movflt(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
|
||||
__ movflt(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
|
||||
__ movflt(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
|
||||
__ movflt(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
|
||||
// restore XMM registers(num MMX == num fpu)
|
||||
int offset = 0;
|
||||
for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
|
||||
XMMRegister xmm_name = as_XMMRegister(n);
|
||||
__ movflt(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
|
||||
offset += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (UseSSE < 2) {
|
||||
|
@@ -3751,8 +3751,31 @@ void MacroAssembler::pop_CPU_state() {
}

void MacroAssembler::pop_FPU_state() {
  NOT_LP64(frstor(Address(rsp, 0));)
  LP64_ONLY(fxrstor(Address(rsp, 0));)
#ifndef _LP64
  frstor(Address(rsp, 0));
#else
  // AVX will continue to use the fxsave area.
  // EVEX needs to utilize the xsave area, which is under different
  // management.
  if (VM_Version::supports_evex()) {
    // EDX:EAX describe the XSAVE header and
    // are obtained while fetching info for XCR0 via cpuid.
    // These two registers make up 64-bits in the header for which bits
    // 62:10 are currently reserved for future implementations and unused. Bit 63
    // is unused for our implementation as we do not utilize
    // compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use
    // the functionality for PKRU state and MSR tracing.
    // Ergo we are primarily concerned with bits 7..0, which define
    // which ISA extensions and features are enabled for a given machine and are
    // defined in XemXcr0Eax and is used to map the XSAVE area
    // for restoring registers as described via XCR0.
    movl(rdx, VM_Version::get_xsave_header_upper_segment());
    movl(rax, VM_Version::get_xsave_header_lower_segment());
    xrstor(Address(rsp, 0));
  } else {
    fxrstor(Address(rsp, 0));
  }
#endif
  addptr(rsp, FPUStateSizeInWords * wordSize);
}

@@ -3769,13 +3792,49 @@ void MacroAssembler::push_CPU_state() {
  push_FPU_state();
}

#ifdef _LP64
#define XSTATE_BV 0x200
#endif

void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
  // AVX will continue to use the fxsave area.
  // EVEX needs to utilize the xsave area, which is under different
  // management.
  if (VM_Version::supports_evex()) {
    // Save a copy of EAX and EDX
    push(rax);
    push(rdx);
    // EDX:EAX describe the XSAVE header and
    // are obtained while fetching info for XCR0 via cpuid.
    // These two registers make up 64-bits in the header for which bits
    // 62:10 are currently reserved for future implementations and unused. Bit 63
    // is unused for our implementation as we do not utilize
    // compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use
    // the functionality for PKRU state and MSR tracing.
    // Ergo we are primarily concerned with bits 7..0, which define
    // which ISA extensions and features are enabled for a given machine and are
    // defined in XemXcr0Eax and is used to program XSAVE area
    // for saving the required registers as defined in XCR0.
    int xcr0_edx = VM_Version::get_xsave_header_upper_segment();
    int xcr0_eax = VM_Version::get_xsave_header_lower_segment();
    movl(rdx, xcr0_edx);
    movl(rax, xcr0_eax);
    xsave(Address(rsp, wordSize*2));
    // now apply control bits and clear bytes 8..23 in the header
    pop(rdx);
    pop(rax);
    movl(Address(rsp, XSTATE_BV), xcr0_eax);
    movl(Address(rsp, XSTATE_BV+4), xcr0_edx);
    andq(Address(rsp, XSTATE_BV+8), 0);
    andq(Address(rsp, XSTATE_BV+16), 0);
  } else {
    fxsave(Address(rsp, 0));
  }
#endif // LP64
}

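push_FPU_state writes the full extended state with xsave and then patches the XSAVE header by hand: XSTATE_BV at offset 0x200 of the save area is set to the XCR0-enabled feature bits held in EDX:EAX, and the following header bytes are zeroed so the image stays in the standard (uncompressed) form that the xrstor in pop_FPU_state expects. A hedged layout sketch with byte offsets taken from the code above; the struct is illustrative, not a type defined in the patch.

// Illustrative layout sketch (not a type from the patch); sizes follow the
// FPUStateSizeInWords comment, 512 + 2176 = 2688 bytes in total.
struct XSaveAreaSketch {
  unsigned char      fxsave_image[512];    // legacy x87/SSE image, still used on plain AVX
  unsigned long long xstate_bv;            // +0x200 (XSTATE_BV): component bitmap, written from XCR0 (EDX:EAX)
  unsigned long long xcomp_bv;             // +0x208: zeroed by the first andq, keeping the uncompressed form
  unsigned char      header_tail[48];      // +0x210: first 8 bytes (header bytes 16..23) zeroed by the second andq
  unsigned char      extended[2176 - 64];  // AVX/AVX-512 components laid out as XCR0 describes
};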
@ -4082,6 +4141,84 @@ void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
|
||||
int nds_enc = nds->encoding();
|
||||
int dst_enc = dst->encoding();
|
||||
bool dst_upper_bank = (dst_enc > 15);
|
||||
bool nds_upper_bank = (nds_enc > 15);
|
||||
if (VM_Version::supports_avx512novl() &&
|
||||
(nds_upper_bank || dst_upper_bank)) {
|
||||
if (dst_upper_bank) {
|
||||
subptr(rsp, 64);
|
||||
evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
|
||||
movflt(xmm0, nds);
|
||||
if (reachable(src)) {
|
||||
vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
|
||||
}
|
||||
movflt(dst, xmm0);
|
||||
evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
|
||||
addptr(rsp, 64);
|
||||
} else {
|
||||
movflt(dst, nds);
|
||||
if (reachable(src)) {
|
||||
vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (reachable(src)) {
|
||||
vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vxorps(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
|
||||
int nds_enc = nds->encoding();
|
||||
int dst_enc = dst->encoding();
|
||||
bool dst_upper_bank = (dst_enc > 15);
|
||||
bool nds_upper_bank = (nds_enc > 15);
|
||||
if (VM_Version::supports_avx512novl() &&
|
||||
(nds_upper_bank || dst_upper_bank)) {
|
||||
if (dst_upper_bank) {
|
||||
subptr(rsp, 64);
|
||||
evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
|
||||
movdbl(xmm0, nds);
|
||||
if (reachable(src)) {
|
||||
vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
|
||||
}
|
||||
movdbl(dst, xmm0);
|
||||
evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
|
||||
addptr(rsp, 64);
|
||||
} else {
|
||||
movdbl(dst, nds);
|
||||
if (reachable(src)) {
|
||||
vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (reachable(src)) {
|
||||
vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
|
||||
if (reachable(src)) {
|
||||
vxorpd(dst, nds, as_Address(src), vector_len);
|
||||
@ -4318,7 +4455,6 @@ void MacroAssembler::store_check(Register obj, Address dst) {
|
||||
void MacroAssembler::store_check(Register obj) {
|
||||
// Does a store check for the oop in register obj. The content of
|
||||
// register obj is destroyed afterwards.
|
||||
|
||||
BarrierSet* bs = Universe::heap()->barrier_set();
|
||||
assert(bs->kind() == BarrierSet::CardTableForRS ||
|
||||
bs->kind() == BarrierSet::CardTableExtension,
|
||||
@ -4572,69 +4708,58 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
|
||||
|
||||
// if we are coming from c1, xmm registers may be live
|
||||
int off = 0;
|
||||
int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
|
||||
if (UseAVX > 2) {
|
||||
num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
|
||||
}
|
||||
|
||||
if (UseSSE == 1) {
|
||||
subptr(rsp, sizeof(jdouble)*8);
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
|
||||
for (int n = 0; n < 8; n++) {
|
||||
movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n));
|
||||
}
|
||||
} else if (UseSSE >= 2) {
|
||||
if (UseAVX > 2) {
|
||||
push(rbx);
|
||||
movl(rbx, 0xffff);
|
||||
#ifdef _LP64
|
||||
kmovql(k1, rbx);
|
||||
#else
|
||||
kmovdl(k1, rbx);
|
||||
#endif
|
||||
kmovwl(k1, rbx);
|
||||
pop(rbx);
|
||||
}
|
||||
#ifdef COMPILER2
|
||||
if (MaxVectorSize > 16) {
|
||||
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
|
||||
if(UseAVX > 2) {
|
||||
// Save upper half of ZMM registes
|
||||
subptr(rsp, 32*num_xmm_regs);
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
|
||||
}
|
||||
off = 0;
|
||||
}
|
||||
assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
|
||||
// Save upper half of YMM registes
|
||||
subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
|
||||
vextractf128h(Address(rsp, 0),xmm0);
|
||||
vextractf128h(Address(rsp, 16),xmm1);
|
||||
vextractf128h(Address(rsp, 32),xmm2);
|
||||
vextractf128h(Address(rsp, 48),xmm3);
|
||||
vextractf128h(Address(rsp, 64),xmm4);
|
||||
vextractf128h(Address(rsp, 80),xmm5);
|
||||
vextractf128h(Address(rsp, 96),xmm6);
|
||||
vextractf128h(Address(rsp,112),xmm7);
|
||||
#ifdef _LP64
|
||||
vextractf128h(Address(rsp,128),xmm8);
|
||||
vextractf128h(Address(rsp,144),xmm9);
|
||||
vextractf128h(Address(rsp,160),xmm10);
|
||||
vextractf128h(Address(rsp,176),xmm11);
|
||||
vextractf128h(Address(rsp,192),xmm12);
|
||||
vextractf128h(Address(rsp,208),xmm13);
|
||||
vextractf128h(Address(rsp,224),xmm14);
|
||||
vextractf128h(Address(rsp,240),xmm15);
|
||||
#endif
|
||||
subptr(rsp, 16*num_xmm_regs);
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// Save whole 128bit (16 bytes) XMM regiters
|
||||
subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
|
||||
movdqu(Address(rsp,off++*16),xmm0);
|
||||
movdqu(Address(rsp,off++*16),xmm1);
|
||||
movdqu(Address(rsp,off++*16),xmm2);
|
||||
movdqu(Address(rsp,off++*16),xmm3);
|
||||
movdqu(Address(rsp,off++*16),xmm4);
|
||||
movdqu(Address(rsp,off++*16),xmm5);
|
||||
movdqu(Address(rsp,off++*16),xmm6);
|
||||
movdqu(Address(rsp,off++*16),xmm7);
|
||||
// Save whole 128bit (16 bytes) XMM registers
|
||||
subptr(rsp, 16*num_xmm_regs);
|
||||
off = 0;
|
||||
#ifdef _LP64
|
||||
movdqu(Address(rsp,off++*16),xmm8);
|
||||
movdqu(Address(rsp,off++*16),xmm9);
|
||||
movdqu(Address(rsp,off++*16),xmm10);
|
||||
movdqu(Address(rsp,off++*16),xmm11);
|
||||
movdqu(Address(rsp,off++*16),xmm12);
|
||||
movdqu(Address(rsp,off++*16),xmm13);
|
||||
movdqu(Address(rsp,off++*16),xmm14);
|
||||
movdqu(Address(rsp,off++*16),xmm15);
|
||||
if (VM_Version::supports_avx512novl()) {
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0);
|
||||
}
|
||||
} else {
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
movdqu(Address(rsp, off++*16), as_XMMRegister(n));
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
movdqu(Address(rsp, off++*16), as_XMMRegister(n));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -4689,7 +4814,7 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
|
||||
movsd(Address(rsp, 0), xmm0);
|
||||
fld_d(Address(rsp, 0));
|
||||
#endif // _LP64
|
||||
addptr(rsp, sizeof(jdouble) * nb_args);
|
||||
addptr(rsp, sizeof(jdouble)*nb_args);
|
||||
if (num_fpu_regs_in_use > 1) {
|
||||
// Must save return value to stack and then restore entire FPU
|
||||
// stack except incoming arguments
|
||||
@ -4699,63 +4824,50 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
|
||||
addptr(rsp, sizeof(jdouble));
|
||||
}
|
||||
fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
|
||||
addptr(rsp, sizeof(jdouble) * nb_args);
|
||||
addptr(rsp, sizeof(jdouble)*nb_args);
|
||||
}
|
||||
|
||||
off = 0;
|
||||
if (UseSSE == 1) {
|
||||
movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
|
||||
movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
|
||||
movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
|
||||
movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
|
||||
movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
|
||||
movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
|
||||
movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
|
||||
movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
|
||||
for (int n = 0; n < 8; n++) {
|
||||
movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble)));
|
||||
}
|
||||
addptr(rsp, sizeof(jdouble)*8);
|
||||
} else if (UseSSE >= 2) {
|
||||
// Restore whole 128bit (16 bytes) XMM regiters
|
||||
movdqu(xmm0, Address(rsp,off++*16));
|
||||
movdqu(xmm1, Address(rsp,off++*16));
|
||||
movdqu(xmm2, Address(rsp,off++*16));
|
||||
movdqu(xmm3, Address(rsp,off++*16));
|
||||
movdqu(xmm4, Address(rsp,off++*16));
|
||||
movdqu(xmm5, Address(rsp,off++*16));
|
||||
movdqu(xmm6, Address(rsp,off++*16));
|
||||
movdqu(xmm7, Address(rsp,off++*16));
|
||||
#ifdef _LP64
|
||||
movdqu(xmm8, Address(rsp,off++*16));
|
||||
movdqu(xmm9, Address(rsp,off++*16));
|
||||
movdqu(xmm10, Address(rsp,off++*16));
|
||||
movdqu(xmm11, Address(rsp,off++*16));
|
||||
movdqu(xmm12, Address(rsp,off++*16));
|
||||
movdqu(xmm13, Address(rsp,off++*16));
|
||||
movdqu(xmm14, Address(rsp,off++*16));
|
||||
movdqu(xmm15, Address(rsp,off++*16));
|
||||
if (VM_Version::supports_avx512novl()) {
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
movdqu(as_XMMRegister(n), Address(rsp, off++*16));
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
movdqu(as_XMMRegister(n), Address(rsp, off++ * 16));
|
||||
}
|
||||
#endif
|
||||
addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
|
||||
addptr(rsp, 16*num_xmm_regs);
|
||||
|
||||
#ifdef COMPILER2
|
||||
if (MaxVectorSize > 16) {
|
||||
// Restore upper half of YMM registes.
|
||||
vinsertf128h(xmm0, Address(rsp, 0));
|
||||
vinsertf128h(xmm1, Address(rsp, 16));
|
||||
vinsertf128h(xmm2, Address(rsp, 32));
|
||||
vinsertf128h(xmm3, Address(rsp, 48));
|
||||
vinsertf128h(xmm4, Address(rsp, 64));
|
||||
vinsertf128h(xmm5, Address(rsp, 80));
|
||||
vinsertf128h(xmm6, Address(rsp, 96));
|
||||
vinsertf128h(xmm7, Address(rsp,112));
|
||||
#ifdef _LP64
|
||||
vinsertf128h(xmm8, Address(rsp,128));
|
||||
vinsertf128h(xmm9, Address(rsp,144));
|
||||
vinsertf128h(xmm10, Address(rsp,160));
|
||||
vinsertf128h(xmm11, Address(rsp,176));
|
||||
vinsertf128h(xmm12, Address(rsp,192));
|
||||
vinsertf128h(xmm13, Address(rsp,208));
|
||||
vinsertf128h(xmm14, Address(rsp,224));
|
||||
vinsertf128h(xmm15, Address(rsp,240));
|
||||
#endif
|
||||
addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
|
||||
off = 0;
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
|
||||
}
|
||||
addptr(rsp, 16*num_xmm_regs);
|
||||
if(UseAVX > 2) {
|
||||
off = 0;
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
|
||||
}
|
||||
addptr(rsp, 32*num_xmm_regs);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -7095,11 +7207,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
  Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
  if (UseAVX > 2) {
    movl(rtmp, 0xffff);
#ifdef _LP64
    kmovql(k1, rtmp);
#else
    kmovdl(k1, rtmp);
#endif
    kmovwl(k1, rtmp);
  }
  movdl(xtmp, value);
  if (UseAVX > 2 && UseUnalignedLoadStores) {
@@ -7112,7 +7220,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
    align(16);

    BIND(L_fill_64_bytes_loop);
    evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
    evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
    addptr(to, 64);
    subl(count, 16 << shift);
    jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
@@ -7120,7 +7228,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
    BIND(L_check_fill_32_bytes);
    addl(count, 8 << shift);
    jccb(Assembler::less, L_check_fill_8_bytes);
    evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
    evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit);
    addptr(to, 32);
    subl(count, 8 << shift);

@@ -8399,6 +8507,14 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;

  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  // context for the registers used, where all instructions below are using 128-bit mode
  // On EVEX without VL and BW, these instructions will all be AVX.
  if (VM_Version::supports_avx512vlbw()) {
    movl(tmp, 0xffff);
    kmovwl(k1, tmp);
  }

  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
  notl(crc); // ~crc
  cmpl(len, 16);
@@ -1069,6 +1069,9 @@ public:
  void vsubss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vsubss(dst, nds, src); }
  void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src);
  void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  // AVX Vector instructions

  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
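The vnegatess/vnegatesd helpers declared above exist because, on AVX-512 parts without the VL extension, the 128-bit VEX-encoded xorps/xorpd cannot address the upper register bank (xmm16..xmm31); their bodies in this patch therefore spill xmm0, perform the 128-bit XOR there, and copy the result back when the destination lives in that bank. The sketch below shows the shape of that fallback; the function name and details are illustrative, not the verbatim patch code, and the real implementation also handles the case where the constant is not directly reachable.

// Illustrative shape of the upper-bank fallback (not the verbatim patch code).
// When dst is xmm16..xmm31 and AVX512VL is absent, stage the work in xmm0.
void MacroAssembler::negate_float_sketch(XMMRegister dst, AddressLiteral signflip) {
  if (VM_Version::supports_avx512novl() && dst->encoding() > 15) {
    subptr(rsp, 64);                                          // room for one ZMM register
    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);  // preserve xmm0
    movflt(xmm0, dst);                                        // bring the value into the low bank
    vxorps(xmm0, xmm0, as_Address(signflip), Assembler::AVX_128bit);  // flip the sign bit
    movflt(dst, xmm0);                                        // move the result back
    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);  // restore xmm0
    addptr(rsp, 64);
  } else {
    vxorps(dst, dst, as_Address(signflip), Assembler::AVX_128bit);
  }
}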
@ -115,6 +115,7 @@ class RegisterSaver {
|
||||
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words,
|
||||
int* total_frame_words, bool verify_fpu, bool save_vectors) {
|
||||
int vect_words = 0;
|
||||
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
|
||||
#ifdef COMPILER2
|
||||
if (save_vectors) {
|
||||
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
|
||||
@ -173,59 +174,50 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
__ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
|
||||
}
|
||||
|
||||
int off = st0_off;
|
||||
int delta = st1_off - off;
|
||||
|
||||
// Save the FPU registers in de-opt-able form
|
||||
for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
|
||||
__ fstp_d(Address(rsp, off*wordSize));
|
||||
off += delta;
|
||||
}
|
||||
|
||||
__ fstp_d(Address(rsp, st0_off*wordSize)); // st(0)
|
||||
__ fstp_d(Address(rsp, st1_off*wordSize)); // st(1)
|
||||
__ fstp_d(Address(rsp, st2_off*wordSize)); // st(2)
|
||||
__ fstp_d(Address(rsp, st3_off*wordSize)); // st(3)
|
||||
__ fstp_d(Address(rsp, st4_off*wordSize)); // st(4)
|
||||
__ fstp_d(Address(rsp, st5_off*wordSize)); // st(5)
|
||||
__ fstp_d(Address(rsp, st6_off*wordSize)); // st(6)
|
||||
__ fstp_d(Address(rsp, st7_off*wordSize)); // st(7)
|
||||
|
||||
if( UseSSE == 1 ) { // Save the XMM state
|
||||
__ movflt(Address(rsp,xmm0_off*wordSize),xmm0);
|
||||
__ movflt(Address(rsp,xmm1_off*wordSize),xmm1);
|
||||
__ movflt(Address(rsp,xmm2_off*wordSize),xmm2);
|
||||
__ movflt(Address(rsp,xmm3_off*wordSize),xmm3);
|
||||
__ movflt(Address(rsp,xmm4_off*wordSize),xmm4);
|
||||
__ movflt(Address(rsp,xmm5_off*wordSize),xmm5);
|
||||
__ movflt(Address(rsp,xmm6_off*wordSize),xmm6);
|
||||
__ movflt(Address(rsp,xmm7_off*wordSize),xmm7);
|
||||
} else if( UseSSE >= 2 ) {
|
||||
off = xmm0_off;
|
||||
delta = xmm1_off - off;
|
||||
if(UseSSE == 1) { // Save the XMM state
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ movflt(Address(rsp, off*wordSize), as_XMMRegister(n));
|
||||
off += delta;
|
||||
}
|
||||
} else if(UseSSE >= 2) {
|
||||
// Save whole 128bit (16 bytes) XMM regiters
|
||||
__ movdqu(Address(rsp,xmm0_off*wordSize),xmm0);
|
||||
__ movdqu(Address(rsp,xmm1_off*wordSize),xmm1);
|
||||
__ movdqu(Address(rsp,xmm2_off*wordSize),xmm2);
|
||||
__ movdqu(Address(rsp,xmm3_off*wordSize),xmm3);
|
||||
__ movdqu(Address(rsp,xmm4_off*wordSize),xmm4);
|
||||
__ movdqu(Address(rsp,xmm5_off*wordSize),xmm5);
|
||||
__ movdqu(Address(rsp,xmm6_off*wordSize),xmm6);
|
||||
__ movdqu(Address(rsp,xmm7_off*wordSize),xmm7);
|
||||
if (VM_Version::supports_avx512novl()) {
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ vextractf32x4h(Address(rsp, off*wordSize), as_XMMRegister(n), 0);
|
||||
off += delta;
|
||||
}
|
||||
} else {
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ movdqu(Address(rsp, off*wordSize), as_XMMRegister(n));
|
||||
off += delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (vect_words > 0) {
|
||||
assert(vect_words*wordSize == 128, "");
|
||||
__ subptr(rsp, 128); // Save upper half of YMM registes
|
||||
__ vextractf128h(Address(rsp, 0),xmm0);
|
||||
__ vextractf128h(Address(rsp, 16),xmm1);
|
||||
__ vextractf128h(Address(rsp, 32),xmm2);
|
||||
__ vextractf128h(Address(rsp, 48),xmm3);
|
||||
__ vextractf128h(Address(rsp, 64),xmm4);
|
||||
__ vextractf128h(Address(rsp, 80),xmm5);
|
||||
__ vextractf128h(Address(rsp, 96),xmm6);
|
||||
__ vextractf128h(Address(rsp,112),xmm7);
|
||||
off = 0;
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
|
||||
}
|
||||
if (UseAVX > 2) {
|
||||
__ subptr(rsp, 256); // Save upper half of ZMM registes
|
||||
__ vextractf64x4h(Address(rsp, 0), xmm0);
|
||||
__ vextractf64x4h(Address(rsp, 32), xmm1);
|
||||
__ vextractf64x4h(Address(rsp, 64), xmm2);
|
||||
__ vextractf64x4h(Address(rsp, 96), xmm3);
|
||||
__ vextractf64x4h(Address(rsp, 128), xmm4);
|
||||
__ vextractf64x4h(Address(rsp, 160), xmm5);
|
||||
__ vextractf64x4h(Address(rsp, 192), xmm6);
|
||||
__ vextractf64x4h(Address(rsp, 224), xmm7);
|
||||
off = 0;
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -238,58 +230,40 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
OopMap* map = new OopMap( frame_words, 0 );
|
||||
|
||||
#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_words)
|
||||
|
||||
map->set_callee_saved(STACK_OFFSET( rax_off), rax->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET( rcx_off), rcx->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET( rdx_off), rdx->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET( rbx_off), rbx->as_VMReg());
|
||||
// rbp, location is known implicitly, no oopMap
|
||||
map->set_callee_saved(STACK_OFFSET( rsi_off), rsi->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET( rdi_off), rdi->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(st0_off), as_FloatRegister(0)->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(st1_off), as_FloatRegister(1)->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(st2_off), as_FloatRegister(2)->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(st3_off), as_FloatRegister(3)->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(st4_off), as_FloatRegister(4)->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(st5_off), as_FloatRegister(5)->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(st6_off), as_FloatRegister(6)->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(st7_off), as_FloatRegister(7)->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm0_off), xmm0->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm1_off), xmm1->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm2_off), xmm2->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm3_off), xmm3->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm4_off), xmm4->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm5_off), xmm5->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm6_off), xmm6->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm7_off), xmm7->as_VMReg());
|
||||
// %%% This is really a waste but we'll keep things as they were for now
|
||||
if (true) {
|
||||
#define NEXTREG(x) (x)->as_VMReg()->next()
|
||||
map->set_callee_saved(STACK_OFFSET(st0H_off), NEXTREG(as_FloatRegister(0)));
|
||||
map->set_callee_saved(STACK_OFFSET(st1H_off), NEXTREG(as_FloatRegister(1)));
|
||||
map->set_callee_saved(STACK_OFFSET(st2H_off), NEXTREG(as_FloatRegister(2)));
|
||||
map->set_callee_saved(STACK_OFFSET(st3H_off), NEXTREG(as_FloatRegister(3)));
|
||||
map->set_callee_saved(STACK_OFFSET(st4H_off), NEXTREG(as_FloatRegister(4)));
|
||||
map->set_callee_saved(STACK_OFFSET(st5H_off), NEXTREG(as_FloatRegister(5)));
|
||||
map->set_callee_saved(STACK_OFFSET(st6H_off), NEXTREG(as_FloatRegister(6)));
|
||||
map->set_callee_saved(STACK_OFFSET(st7H_off), NEXTREG(as_FloatRegister(7)));
|
||||
map->set_callee_saved(STACK_OFFSET(xmm0H_off), NEXTREG(xmm0));
|
||||
map->set_callee_saved(STACK_OFFSET(xmm1H_off), NEXTREG(xmm1));
|
||||
map->set_callee_saved(STACK_OFFSET(xmm2H_off), NEXTREG(xmm2));
|
||||
map->set_callee_saved(STACK_OFFSET(xmm3H_off), NEXTREG(xmm3));
|
||||
map->set_callee_saved(STACK_OFFSET(xmm4H_off), NEXTREG(xmm4));
|
||||
map->set_callee_saved(STACK_OFFSET(xmm5H_off), NEXTREG(xmm5));
|
||||
map->set_callee_saved(STACK_OFFSET(xmm6H_off), NEXTREG(xmm6));
|
||||
map->set_callee_saved(STACK_OFFSET(xmm7H_off), NEXTREG(xmm7));
|
||||
|
||||
map->set_callee_saved(STACK_OFFSET(rax_off), rax->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(rcx_off), rcx->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(rdx_off), rdx->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(rbx_off), rbx->as_VMReg());
|
||||
// rbp, location is known implicitly, no oopMap
|
||||
map->set_callee_saved(STACK_OFFSET(rsi_off), rsi->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(rdi_off), rdi->as_VMReg());
|
||||
// %%% This is really a waste but we'll keep things as they were for now for the upper component
|
||||
off = st0_off;
|
||||
delta = st1_off - off;
|
||||
for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
|
||||
FloatRegister freg_name = as_FloatRegister(n);
|
||||
map->set_callee_saved(STACK_OFFSET(off), freg_name->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(freg_name));
|
||||
off += delta;
|
||||
}
|
||||
off = xmm0_off;
|
||||
delta = xmm1_off - off;
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
XMMRegister xmm_name = as_XMMRegister(n);
|
||||
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(xmm_name));
|
||||
off += delta;
|
||||
}
|
||||
#undef NEXTREG
|
||||
#undef STACK_OFFSET
|
||||
}
|
||||
|
||||
return map;
|
||||
|
||||
}
|
||||
|
||||
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
|
||||
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
|
||||
// Recover XMM & FPU state
|
||||
int additional_frame_bytes = 0;
|
||||
#ifdef COMPILER2
|
||||
@ -301,52 +275,43 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
|
||||
#else
|
||||
assert(!restore_vectors, "vectors are generated only by C2");
|
||||
#endif
|
||||
int off = xmm0_off;
|
||||
int delta = xmm1_off - off;
|
||||
|
||||
if (UseSSE == 1) {
|
||||
assert(additional_frame_bytes == 0, "");
|
||||
__ movflt(xmm0,Address(rsp,xmm0_off*wordSize));
|
||||
__ movflt(xmm1,Address(rsp,xmm1_off*wordSize));
|
||||
__ movflt(xmm2,Address(rsp,xmm2_off*wordSize));
|
||||
__ movflt(xmm3,Address(rsp,xmm3_off*wordSize));
|
||||
__ movflt(xmm4,Address(rsp,xmm4_off*wordSize));
|
||||
__ movflt(xmm5,Address(rsp,xmm5_off*wordSize));
|
||||
__ movflt(xmm6,Address(rsp,xmm6_off*wordSize));
|
||||
__ movflt(xmm7,Address(rsp,xmm7_off*wordSize));
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ movflt(as_XMMRegister(n), Address(rsp, off*wordSize));
|
||||
off += delta;
|
||||
}
|
||||
} else if (UseSSE >= 2) {
|
||||
#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes)
|
||||
__ movdqu(xmm0,STACK_ADDRESS(xmm0_off));
|
||||
__ movdqu(xmm1,STACK_ADDRESS(xmm1_off));
|
||||
__ movdqu(xmm2,STACK_ADDRESS(xmm2_off));
|
||||
__ movdqu(xmm3,STACK_ADDRESS(xmm3_off));
|
||||
__ movdqu(xmm4,STACK_ADDRESS(xmm4_off));
|
||||
__ movdqu(xmm5,STACK_ADDRESS(xmm5_off));
|
||||
__ movdqu(xmm6,STACK_ADDRESS(xmm6_off));
|
||||
__ movdqu(xmm7,STACK_ADDRESS(xmm7_off));
|
||||
#undef STACK_ADDRESS
|
||||
if (VM_Version::supports_avx512novl()) {
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ vinsertf32x4h(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes), 0);
|
||||
off += delta;
|
||||
}
|
||||
} else {
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
|
||||
off += delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (restore_vectors) {
|
||||
if (UseAVX > 2) {
|
||||
off = 0;
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
|
||||
}
|
||||
__ addptr(rsp, additional_frame_bytes*2); // Save upper half of ZMM registes
|
||||
}
|
||||
// Restore upper half of YMM registes.
|
||||
assert(additional_frame_bytes == 128, "");
|
||||
__ vinsertf128h(xmm0, Address(rsp, 0));
|
||||
__ vinsertf128h(xmm1, Address(rsp, 16));
|
||||
__ vinsertf128h(xmm2, Address(rsp, 32));
|
||||
__ vinsertf128h(xmm3, Address(rsp, 48));
|
||||
__ vinsertf128h(xmm4, Address(rsp, 64));
|
||||
__ vinsertf128h(xmm5, Address(rsp, 80));
|
||||
__ vinsertf128h(xmm6, Address(rsp, 96));
|
||||
__ vinsertf128h(xmm7, Address(rsp,112));
|
||||
__ addptr(rsp, additional_frame_bytes);
|
||||
if (UseAVX > 2) {
|
||||
additional_frame_bytes = 256;
|
||||
__ vinsertf64x4h(xmm0, Address(rsp, 0));
|
||||
__ vinsertf64x4h(xmm1, Address(rsp, 32));
|
||||
__ vinsertf64x4h(xmm2, Address(rsp, 64));
|
||||
__ vinsertf64x4h(xmm3, Address(rsp, 96));
|
||||
__ vinsertf64x4h(xmm4, Address(rsp, 128));
|
||||
__ vinsertf64x4h(xmm5, Address(rsp, 160));
|
||||
__ vinsertf64x4h(xmm6, Address(rsp, 192));
|
||||
__ vinsertf64x4h(xmm7, Address(rsp, 224));
|
||||
__ addptr(rsp, additional_frame_bytes);
|
||||
off = 0;
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
|
||||
}
|
||||
__ addptr(rsp, additional_frame_bytes); // Save upper half of YMM registes
|
||||
}
|
||||
__ pop_FPU_state();
|
||||
__ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
|
||||
|
@@ -69,7 +69,9 @@ class SimpleRuntimeFrame {
class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
#define HALF_ZMM_BANK_WORDS 128
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + 160/BytesPerInt,            // offset in fxsave save area
@@ -89,23 +91,24 @@ class RegisterSaver {
    DEF_XMM_OFFS(13),
    DEF_XMM_OFFS(14),
    DEF_XMM_OFFS(15),
    DEF_XMM_OFFS(16),
    DEF_XMM_OFFS(17),
    DEF_XMM_OFFS(18),
    DEF_XMM_OFFS(19),
    DEF_XMM_OFFS(20),
    DEF_XMM_OFFS(21),
    DEF_XMM_OFFS(22),
    DEF_XMM_OFFS(23),
    DEF_XMM_OFFS(24),
    DEF_XMM_OFFS(25),
    DEF_XMM_OFFS(26),
    DEF_XMM_OFFS(27),
    DEF_XMM_OFFS(28),
    DEF_XMM_OFFS(29),
    DEF_XMM_OFFS(30),
    DEF_XMM_OFFS(31),
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt),
    zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt),
    DEF_ZMM_OFFS(16),
    DEF_ZMM_OFFS(17),
    DEF_ZMM_OFFS(18),
    DEF_ZMM_OFFS(19),
    DEF_ZMM_OFFS(20),
    DEF_ZMM_OFFS(21),
    DEF_ZMM_OFFS(22),
    DEF_ZMM_OFFS(23),
    DEF_ZMM_OFFS(24),
    DEF_ZMM_OFFS(25),
    DEF_ZMM_OFFS(26),
    DEF_ZMM_OFFS(27),
    DEF_ZMM_OFFS(28),
    DEF_ZMM_OFFS(29),
    DEF_ZMM_OFFS(30),
    DEF_ZMM_OFFS(31),
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
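The enum relies on the two token-pasting macros above: DEF_XMM_OFFS places each 16-byte XMM slot inside the legacy fxsave image, while the new DEF_ZMM_OFFS places the xmm16..xmm31 state in the xsave extension, HALF_ZMM_BANK_WORDS (128 words, i.e. 16 registers times 64 bytes) back from the end of the save area. A worked expansion, under the assumption that BytesPerInt is 4, to make the arithmetic concrete; these are the enumerators the preprocessor generates, not code added by the patch.

// Worked expansion (assuming BytesPerInt == 4):
// DEF_XMM_OFFS(13)  ==>  xmm13_off = xmm_off + 13*16/4, xmm13H_off
//                        (13 * 16 bytes into the fxsave XMM area, counted in jint slots)
// DEF_ZMM_OFFS(16)  ==>  zmm16_off = zmm_off + (16-16)*64/4, zmm16H_off
//                        (first 64-byte slot of the upper-bank ZMM save area)
// DEF_ZMM_OFFS(17)  ==>  zmm17_off = zmm_off + 1*64/4, zmm17H_off   (next 64-byte slot)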
@ -155,9 +158,10 @@ class RegisterSaver {
|
||||
|
||||
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
|
||||
int vect_words = 0;
|
||||
int num_xmm_regs = 16;
|
||||
if (UseAVX > 2) {
|
||||
num_xmm_regs = 32;
|
||||
int off = 0;
|
||||
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
|
||||
if (UseAVX < 3) {
|
||||
num_xmm_regs = num_xmm_regs/2;
|
||||
}
|
||||
#ifdef COMPILER2
|
||||
if (save_vectors) {
|
||||
@ -165,9 +169,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
|
||||
// Save upper half of YMM registers
|
||||
vect_words = 16 * num_xmm_regs / wordSize;
|
||||
additional_frame_words += vect_words;
|
||||
if (UseAVX > 2) {
|
||||
// Save upper half of ZMM registers as well
|
||||
if (UseAVX < 3) {
|
||||
additional_frame_words += vect_words;
|
||||
}
|
||||
}
|
||||
@ -195,77 +197,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
__ enter(); // rsp becomes 16-byte aligned here
|
||||
__ push_CPU_state(); // Push a multiple of 16 bytes
|
||||
|
||||
if (vect_words > 0) {
|
||||
// push cpu state handles this on EVEX enabled targets
|
||||
if ((vect_words > 0) && (UseAVX < 3)) {
|
||||
assert(vect_words*wordSize >= 256, "");
|
||||
__ subptr(rsp, 256); // Save upper half of YMM registes(0..15)
|
||||
__ vextractf128h(Address(rsp, 0), xmm0);
|
||||
__ vextractf128h(Address(rsp, 16), xmm1);
|
||||
__ vextractf128h(Address(rsp, 32), xmm2);
|
||||
__ vextractf128h(Address(rsp, 48), xmm3);
|
||||
__ vextractf128h(Address(rsp, 64), xmm4);
|
||||
__ vextractf128h(Address(rsp, 80), xmm5);
|
||||
__ vextractf128h(Address(rsp, 96), xmm6);
|
||||
__ vextractf128h(Address(rsp, 112), xmm7);
|
||||
__ vextractf128h(Address(rsp, 128), xmm8);
|
||||
__ vextractf128h(Address(rsp, 144), xmm9);
|
||||
__ vextractf128h(Address(rsp, 160), xmm10);
|
||||
__ vextractf128h(Address(rsp, 176), xmm11);
|
||||
__ vextractf128h(Address(rsp, 192), xmm12);
|
||||
__ vextractf128h(Address(rsp, 208), xmm13);
|
||||
__ vextractf128h(Address(rsp, 224), xmm14);
|
||||
__ vextractf128h(Address(rsp, 240), xmm15);
|
||||
if (UseAVX > 2) {
|
||||
__ subptr(rsp, 256); // Save upper half of YMM registes(16..31)
|
||||
__ vextractf128h(Address(rsp, 0), xmm16);
|
||||
__ vextractf128h(Address(rsp, 16), xmm17);
|
||||
__ vextractf128h(Address(rsp, 32), xmm18);
|
||||
__ vextractf128h(Address(rsp, 48), xmm19);
|
||||
__ vextractf128h(Address(rsp, 64), xmm20);
|
||||
__ vextractf128h(Address(rsp, 80), xmm21);
|
||||
__ vextractf128h(Address(rsp, 96), xmm22);
|
||||
__ vextractf128h(Address(rsp, 112), xmm23);
|
||||
__ vextractf128h(Address(rsp, 128), xmm24);
|
||||
__ vextractf128h(Address(rsp, 144), xmm25);
|
||||
__ vextractf128h(Address(rsp, 160), xmm26);
|
||||
__ vextractf128h(Address(rsp, 176), xmm27);
|
||||
__ vextractf128h(Address(rsp, 192), xmm28);
|
||||
__ vextractf128h(Address(rsp, 208), xmm29);
|
||||
__ vextractf128h(Address(rsp, 224), xmm30);
|
||||
__ vextractf128h(Address(rsp, 240), xmm31);
|
||||
// Now handle the ZMM registers (0..31)
|
||||
__ subptr(rsp, 1024); // Save upper half of ZMM registes
|
||||
__ vextractf64x4h(Address(rsp, 0), xmm0);
|
||||
__ vextractf64x4h(Address(rsp, 32), xmm1);
|
||||
__ vextractf64x4h(Address(rsp, 64), xmm2);
|
||||
__ vextractf64x4h(Address(rsp, 96), xmm3);
|
||||
__ vextractf64x4h(Address(rsp, 128), xmm4);
|
||||
__ vextractf64x4h(Address(rsp, 160), xmm5);
|
||||
__ vextractf64x4h(Address(rsp, 192), xmm6);
|
||||
__ vextractf64x4h(Address(rsp, 224), xmm7);
|
||||
__ vextractf64x4h(Address(rsp, 256), xmm8);
|
||||
__ vextractf64x4h(Address(rsp, 288), xmm9);
|
||||
__ vextractf64x4h(Address(rsp, 320), xmm10);
|
||||
__ vextractf64x4h(Address(rsp, 352), xmm11);
|
||||
__ vextractf64x4h(Address(rsp, 384), xmm12);
|
||||
__ vextractf64x4h(Address(rsp, 416), xmm13);
|
||||
__ vextractf64x4h(Address(rsp, 448), xmm14);
|
||||
__ vextractf64x4h(Address(rsp, 480), xmm15);
|
||||
__ vextractf64x4h(Address(rsp, 512), xmm16);
|
||||
__ vextractf64x4h(Address(rsp, 544), xmm17);
|
||||
__ vextractf64x4h(Address(rsp, 576), xmm18);
|
||||
__ vextractf64x4h(Address(rsp, 608), xmm19);
|
||||
__ vextractf64x4h(Address(rsp, 640), xmm20);
|
||||
__ vextractf64x4h(Address(rsp, 672), xmm21);
|
||||
__ vextractf64x4h(Address(rsp, 704), xmm22);
|
||||
__ vextractf64x4h(Address(rsp, 736), xmm23);
|
||||
__ vextractf64x4h(Address(rsp, 768), xmm24);
|
||||
__ vextractf64x4h(Address(rsp, 800), xmm25);
|
||||
__ vextractf64x4h(Address(rsp, 832), xmm26);
|
||||
__ vextractf64x4h(Address(rsp, 864), xmm27);
|
||||
__ vextractf64x4h(Address(rsp, 896), xmm28);
|
||||
__ vextractf64x4h(Address(rsp, 928), xmm29);
|
||||
__ vextractf64x4h(Address(rsp, 960), xmm30);
|
||||
__ vextractf64x4h(Address(rsp, 992), xmm31);
|
||||
// Save upper half of YMM registes(0..num_xmm_regs)
|
||||
__ subptr(rsp, num_xmm_regs*16);
|
||||
for (int n = 0; n < num_xmm_regs; n++) {
|
||||
__ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
|
||||
}
|
||||
}
|
||||
if (frame::arg_reg_save_area_bytes != 0) {
|
||||

@ -299,39 +237,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
if (UseAVX > 2) {
map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg());
// For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
// on EVEX enabled targets, we get it included in the xsave area
off = xmm0_off;
int delta = xmm1_off - off;
for (int n = 0; n < 16; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
off += delta;
}
if(UseAVX > 2) {
// Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
off = zmm16_off;
delta = zmm17_off - off;
for (int n = 16; n < num_xmm_regs; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
off += delta;
}
}

// %%% These should all be a waste but we'll keep things as they were for now

@ -351,39 +274,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
// For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
// on EVEX enabled targets, we get it included in the xsave area
off = xmm0H_off;
delta = xmm1H_off - off;
for (int n = 0; n < 16; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
off += delta;
}
if (UseAVX > 2) {
map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next());
// Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
off = zmm16H_off;
delta = zmm17H_off - off;
for (int n = 16; n < num_xmm_regs; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
off += delta;
}
}
}

@ -391,86 +299,25 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
if (UseAVX < 3) {
num_xmm_regs = num_xmm_regs/2;
}
if (frame::arg_reg_save_area_bytes != 0) {
// Pop arg register save area
__ addptr(rsp, frame::arg_reg_save_area_bytes);
}
#ifdef COMPILER2
if (restore_vectors) {
// Restore upper half of YMM registers (0..15)
assert(UseAVX > 0, "512bit vectors are supported only with AVX");
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
__ vinsertf128h(xmm0, Address(rsp, 0));
__ vinsertf128h(xmm1, Address(rsp, 16));
__ vinsertf128h(xmm2, Address(rsp, 32));
__ vinsertf128h(xmm3, Address(rsp, 48));
__ vinsertf128h(xmm4, Address(rsp, 64));
__ vinsertf128h(xmm5, Address(rsp, 80));
__ vinsertf128h(xmm6, Address(rsp, 96));
__ vinsertf128h(xmm7, Address(rsp,112));
__ vinsertf128h(xmm8, Address(rsp,128));
__ vinsertf128h(xmm9, Address(rsp,144));
__ vinsertf128h(xmm10, Address(rsp,160));
__ vinsertf128h(xmm11, Address(rsp,176));
__ vinsertf128h(xmm12, Address(rsp,192));
__ vinsertf128h(xmm13, Address(rsp,208));
__ vinsertf128h(xmm14, Address(rsp,224));
__ vinsertf128h(xmm15, Address(rsp,240));
__ addptr(rsp, 256);
if (UseAVX > 2) {
// Restore upper half of YMM registers (16..31)
__ vinsertf128h(xmm16, Address(rsp, 0));
__ vinsertf128h(xmm17, Address(rsp, 16));
__ vinsertf128h(xmm18, Address(rsp, 32));
__ vinsertf128h(xmm19, Address(rsp, 48));
__ vinsertf128h(xmm20, Address(rsp, 64));
__ vinsertf128h(xmm21, Address(rsp, 80));
__ vinsertf128h(xmm22, Address(rsp, 96));
__ vinsertf128h(xmm23, Address(rsp,112));
__ vinsertf128h(xmm24, Address(rsp,128));
__ vinsertf128h(xmm25, Address(rsp,144));
__ vinsertf128h(xmm26, Address(rsp,160));
__ vinsertf128h(xmm27, Address(rsp,176));
__ vinsertf128h(xmm28, Address(rsp,192));
__ vinsertf128h(xmm29, Address(rsp,208));
__ vinsertf128h(xmm30, Address(rsp,224));
__ vinsertf128h(xmm31, Address(rsp,240));
__ addptr(rsp, 256);
// Restore upper half of ZMM registers.
__ vinsertf64x4h(xmm0, Address(rsp, 0));
__ vinsertf64x4h(xmm1, Address(rsp, 32));
__ vinsertf64x4h(xmm2, Address(rsp, 64));
__ vinsertf64x4h(xmm3, Address(rsp, 96));
__ vinsertf64x4h(xmm4, Address(rsp, 128));
__ vinsertf64x4h(xmm5, Address(rsp, 160));
__ vinsertf64x4h(xmm6, Address(rsp, 192));
__ vinsertf64x4h(xmm7, Address(rsp, 224));
__ vinsertf64x4h(xmm8, Address(rsp, 256));
__ vinsertf64x4h(xmm9, Address(rsp, 288));
__ vinsertf64x4h(xmm10, Address(rsp, 320));
__ vinsertf64x4h(xmm11, Address(rsp, 352));
__ vinsertf64x4h(xmm12, Address(rsp, 384));
__ vinsertf64x4h(xmm13, Address(rsp, 416));
__ vinsertf64x4h(xmm14, Address(rsp, 448));
__ vinsertf64x4h(xmm15, Address(rsp, 480));
__ vinsertf64x4h(xmm16, Address(rsp, 512));
__ vinsertf64x4h(xmm17, Address(rsp, 544));
__ vinsertf64x4h(xmm18, Address(rsp, 576));
__ vinsertf64x4h(xmm19, Address(rsp, 608));
__ vinsertf64x4h(xmm20, Address(rsp, 640));
__ vinsertf64x4h(xmm21, Address(rsp, 672));
__ vinsertf64x4h(xmm22, Address(rsp, 704));
__ vinsertf64x4h(xmm23, Address(rsp, 736));
__ vinsertf64x4h(xmm24, Address(rsp, 768));
__ vinsertf64x4h(xmm25, Address(rsp, 800));
__ vinsertf64x4h(xmm26, Address(rsp, 832));
__ vinsertf64x4h(xmm27, Address(rsp, 864));
__ vinsertf64x4h(xmm28, Address(rsp, 896));
__ vinsertf64x4h(xmm29, Address(rsp, 928));
__ vinsertf64x4h(xmm30, Address(rsp, 960));
__ vinsertf64x4h(xmm31, Address(rsp, 992));
__ addptr(rsp, 1024);
// On EVEX enabled targets everything is handled in pop fpu state
if ((restore_vectors) && (UseAVX < 3)) {
assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX");
assert(MaxVectorSize == 64, "up to 512bit vectors are supported now");
int off = 0;
// Restore upper half of YMM registers (0..num_xmm_regs)
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
}
__ addptr(rsp, num_xmm_regs*16);
}
#else
assert(!restore_vectors, "vectors are generated only by C2");
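
The save and restore sequences above mirror each other: 16 bytes of scratch per register for a YMM upper half and 32 bytes per register for a ZMM upper half, so with 32 registers the two areas come to 512 and 1024 bytes. A minimal stand-alone sketch of that offset arithmetic (illustrative only, not HotSpot code; all names are made up):

#include <cstdio>

// Illustrative offset math for the vector scratch areas used above.
int main() {
  const int num_xmm_regs = 32;   // xmm0..xmm31 when AVX-512 is available
  const int ymm_hi_bytes = 16;   // upper 128 bits of a YMM register
  const int zmm_hi_bytes = 32;   // upper 256 bits of a ZMM register

  // Each register n lands at a fixed offset from rsp after the subptr().
  for (int n = 0; n < num_xmm_regs; n++) {
    printf("xmm%-2d: ymm-high at rsp+%-4d zmm-high at rsp+%d\n",
           n, n * ymm_hi_bytes, n * zmm_hi_bytes);
  }
  printf("scratch total: %d + %d bytes\n",
         num_xmm_regs * ymm_hi_bytes, num_xmm_regs * zmm_hi_bytes);
}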

@ -795,6 +795,12 @@ class StubGenerator: public StubCodeGenerator {
void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
assert( UseSSE >= 2, "supported cpu only" );
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
if (UseAVX > 2) {
__ push(rbx);
__ movl(rbx, 0xffff);
__ kmovdl(k1, rbx);
__ pop(rbx);
}
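
The 0xffff written into k1 here (and in the stubs below) is a full 16-lane mask: an EVEX instruction working on a 128-bit register at byte granularity has 16 lanes, so an all-ones opmask makes it behave like its unmasked legacy form. A tiny illustrative check, not HotSpot code:

#include <bitset>
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  const uint32_t k1_value = 0xffff;        // value loaded into k1 above
  const std::size_t xmm_byte_lanes = 16;   // a 128-bit register holds 16 byte lanes
  // One opmask bit per byte lane: 0xffff enables every lane, i.e. no masking.
  assert(std::bitset<32>(k1_value).count() == xmm_byte_lanes);
}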
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
__ align(OptoLoopAlignment);

@ -802,8 +808,8 @@ class StubGenerator: public StubCodeGenerator {

if (UseUnalignedLoadStores) {
if (UseAVX > 2) {
__ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit);
__ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
__ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(from, 0));
__ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0);

@ -2217,6 +2223,15 @@ class StubGenerator: public StubCodeGenerator {
const XMMRegister xmm_temp4 = xmm5;

__ enter(); // required for proper stackwalking of RuntimeStub frame

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}

__ movptr(from, from_param);
__ movptr(key, key_param);

@ -2315,6 +2330,15 @@ class StubGenerator: public StubCodeGenerator {
const XMMRegister xmm_temp4 = xmm5;

__ enter(); // required for proper stackwalking of RuntimeStub frame

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}

__ movptr(from, from_param);
__ movptr(key, key_param);

@ -2441,6 +2465,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/);

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}

// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);

@ -2602,6 +2634,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/);

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}

// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);

@ -2782,6 +2822,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter();
handleSOERegisters(true); // Save registers

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}

__ movptr(state, state_param);
__ movptr(subkeyH, subkeyH_param);
__ movptr(data, data_param);

@ -269,12 +269,16 @@ class StubGenerator: public StubCodeGenerator {
__ kmovql(k1, rbx);
}
#ifdef _WIN64
int last_reg = 15;
if (UseAVX > 2) {
for (int i = 6; i <= 31; i++) {
__ movdqu(xmm_save(i), as_XMMRegister(i));
last_reg = 31;
}
if (VM_Version::supports_avx512novl()) {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ vextractf32x4h(xmm_save(i), as_XMMRegister(i), 0);
}
} else {
for (int i = 6; i <= 15; i++) {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ movdqu(xmm_save(i), as_XMMRegister(i));
}
}

@ -386,13 +390,15 @@ class StubGenerator: public StubCodeGenerator {

// restore regs belonging to calling function
#ifdef _WIN64
int xmm_ub = 15;
if (UseAVX > 2) {
xmm_ub = 31;
}
// emit the restores for xmm regs
for (int i = 6; i <= xmm_ub; i++) {
__ movdqu(as_XMMRegister(i), xmm_save(i));
if (VM_Version::supports_avx512novl()) {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ vinsertf32x4h(as_XMMRegister(i), xmm_save(i), 0);
}
} else {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ movdqu(as_XMMRegister(i), xmm_save(i));
}
}
#endif
__ movptr(r15, r15_save);

@ -1342,11 +1348,15 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
if (UseAVX > 2) {
__ movl(to, 0xffff);
__ kmovql(k1, to);
}
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
__ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
__ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
__ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);

@ -1422,11 +1432,15 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
if (UseAVX > 2) {
__ movl(to, 0xffff);
__ kmovql(k1, to);
}
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
__ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
__ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
__ evmovdqul(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);

@ -3106,6 +3120,14 @@ class StubGenerator: public StubCodeGenerator {

__ enter(); // required for proper stackwalking of RuntimeStub frame

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}

// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

@ -3200,6 +3222,14 @@ class StubGenerator: public StubCodeGenerator {

__ enter(); // required for proper stackwalking of RuntimeStub frame

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}

// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

@ -3312,6 +3342,14 @@ class StubGenerator: public StubCodeGenerator {

__ enter(); // required for proper stackwalking of RuntimeStub frame

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}

#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);

@ -3508,6 +3546,14 @@ class StubGenerator: public StubCodeGenerator {

__ enter(); // required for proper stackwalking of RuntimeStub frame

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}

#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);

@ -3746,6 +3792,14 @@ class StubGenerator: public StubCodeGenerator {

__ enter();

// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}

#ifdef _WIN64
// save the xmm registers which must be preserved 6-10
__ subptr(rsp, -rsp_after_call_off * wordSize);

@ -31,7 +31,7 @@

enum platform_dependent_constants {
code_size1 = 9000, // simply increase if too small (assembler will crash if too small)
code_size2 = 22000 // simply increase if too small (assembler will crash if too small)
code_size2 = 30000 // simply increase if too small (assembler will crash if too small)
};

class x86 {

@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _
enum platform_dependent_constants {
code_size1 = 19000, // simply increase if too small (assembler will crash if too small)
code_size2 = 24000 // simply increase if too small (assembler will crash if too small)
code_size2 = 32000 // simply increase if too small (assembler will crash if too small)
};

class x86 {

@ -367,16 +367,12 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
__ movl(rcx, VM_Version::ymm_test_value());
__ movdl(xmm0, rcx);
__ movl(rcx, 0xffff);
#ifdef _LP64
__ kmovql(k1, rcx);
#else
__ kmovdl(k1, rcx);
#endif
__ kmovwl(k1, rcx);
__ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
__ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit);
#ifdef _LP64
__ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit);
__ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm8, xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm31, xmm0, Assembler::AVX_512bit);
#endif
VM_Version::clean_cpuFeatures();
__ jmp(save_restore_except);

@ -427,11 +423,11 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
UseAVX = 3;
UseSSE = 2;
__ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset())));
__ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
__ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
__ evmovdqul(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
__ evmovdqul(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
#ifdef _LP64
__ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
__ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
__ evmovdqul(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
__ evmovdqul(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
#endif
VM_Version::clean_cpuFeatures();
UseAVX = saved_useavx;

@ -227,14 +227,15 @@ public:
union XemXcr0Eax {
uint32_t value;
struct {
uint32_t x87 : 1,
sse : 1,
ymm : 1,
: 2,
opmask : 1,
zmm512 : 1,
zmm32 : 1,
: 24;
uint32_t x87 : 1,
sse : 1,
ymm : 1,
bndregs : 1,
bndcsr : 1,
opmask : 1,
zmm512 : 1,
zmm32 : 1,
: 24;
} bits;
};

@ -703,6 +704,7 @@ public:
static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
static bool supports_avx512novl() { return (supports_evex() && !supports_avx512vl()); }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }

@ -817,6 +819,12 @@ public:
intx count = PrefetchFieldsAhead;
return count >= 0 ? count : 1;
}
static uint32_t get_xsave_header_lower_segment() {
return _cpuid_info.xem_xcr0_eax.value;
}
static uint32_t get_xsave_header_upper_segment() {
return _cpuid_info.xem_xcr0_edx;
}
};

#endif // CPU_X86_VM_VM_VERSION_X86_HPP
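
The XCR0 layout captured by XemXcr0Eax (ymm at bit 2, opmask/zmm512/zmm32 at bits 5-7) is what the VM inspects to decide whether the OS saves EVEX state. A hedged sketch of the same check outside the VM, assuming a compiler and build that provide the _xgetbv intrinsic (e.g. -mxsave on GCC/Clang) and a CPU with XGETBV:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  // XCR0 bits, matching the union above: bit 2 = ymm, bits 5-7 = opmask, zmm512, zmm32.
  uint64_t xcr0 = _xgetbv(0);
  bool ymm_state  = (xcr0 & (1u << 2)) != 0;
  bool evex_state = (xcr0 & 0xE0) == 0xE0;
  printf("OS saves YMM state:  %s\n", ymm_state  ? "yes" : "no");
  printf("OS saves EVEX state: %s\n", evex_state ? "yes" : "no");
}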

@ -1661,50 +1661,55 @@ const bool Matcher::match_rule_supported(int opcode) {
if (!has_match_rule(opcode))
return false;

bool ret_value = true;
switch (opcode) {
case Op_PopCountI:
case Op_PopCountL:
if (!UsePopCountInstruction)
return false;
break;
ret_value = false;
break;
case Op_MulVI:
if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
return false;
break;
ret_value = false;
break;
case Op_MulVL:
case Op_MulReductionVL:
if (VM_Version::supports_avx512dq() == false)
return false;
ret_value = false;
break;
case Op_AddReductionVL:
if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
return false;
ret_value = false;
break;
case Op_AddReductionVI:
if (UseSSE < 3) // requires at least SSE3
return false;
ret_value = false;
break;
case Op_MulReductionVI:
if (UseSSE < 4) // requires at least SSE4
return false;
ret_value = false;
break;
case Op_AddReductionVF:
case Op_AddReductionVD:
case Op_MulReductionVF:
case Op_MulReductionVD:
if (UseSSE < 1) // requires at least SSE
return false;
break;
ret_value = false;
break;
case Op_SqrtVD:
if (UseAVX < 1) // enabled for AVX only
return false;
break;
ret_value = false;
break;
case Op_CompareAndSwapL:
#ifdef _LP64
case Op_CompareAndSwapP:
#endif
if (!VM_Version::supports_cx8())
return false;
break;
ret_value = false;
break;
}

return true; // Per default match rules are supported.
return ret_value; // Per default match rules are supported.
}

// Max vector size in bytes. 0 if not supported.

@ -1725,14 +1730,24 @@ const int Matcher::vector_width_in_bytes(BasicType bt) {
case T_DOUBLE:
case T_LONG:
if (size < 16) return 0;
break;
case T_FLOAT:
case T_INT:
if (size < 8) return 0;
break;
case T_BOOLEAN:
case T_BYTE:
if (size < 4) return 0;
break;
case T_CHAR:
if (size < 4) return 0;
break;
case T_BYTE:
if (size < 4) return 0;
if ((size > 32) && !VM_Version::supports_avx512bw()) return 0;
break;
case T_SHORT:
if (size < 4) return 0;
if ((size > 16) && !VM_Version::supports_avx512bw()) return 0;
break;
default:
ShouldNotReachHere();
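
The added T_BYTE and T_SHORT clauses cap the reported vector width when AVX512BW is missing: byte vectors wider than 32 bytes and short vectors wider than 16 bytes are rejected, and C2 falls back to narrower vectors. A small stand-alone restatement of those two clauses (illustrative only, not the HotSpot routine):

#include <cstdio>

// size_in_bytes: candidate vector width, has_avx512bw: CPU reports AVX512BW.
int byte_vector_width(int size_in_bytes, bool has_avx512bw) {
  if (size_in_bytes < 4) return 0;
  if (size_in_bytes > 32 && !has_avx512bw) return 0;  // 64-byte byte vectors need BW
  return size_in_bytes;
}

int short_vector_width(int size_in_bytes, bool has_avx512bw) {
  if (size_in_bytes < 4) return 0;
  if (size_in_bytes > 16 && !has_avx512bw) return 0;  // wider short vectors need BW
  return size_in_bytes;
}

int main() {
  printf("%d %d\n", byte_vector_width(64, false), byte_vector_width(64, true));    // 0 64
  printf("%d %d\n", short_vector_width(32, false), short_vector_width(32, true));  // 0 32
}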

@ -1804,7 +1819,7 @@ static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo
__ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
break;
case Op_VecZ:
__ evmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
__ evmovdqul(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
break;
default:
ShouldNotReachHere();

@ -1859,7 +1874,7 @@ static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
__ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
break;
case Op_VecZ:
__ evmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
__ evmovdqul(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
break;
default:
ShouldNotReachHere();

@ -1879,7 +1894,7 @@ static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
__ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
break;
case Op_VecZ:
__ evmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
__ evmovdqul(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
break;
default:
ShouldNotReachHere();

@ -1933,9 +1948,40 @@ static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
}
#endif
}
int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
bool is_single_byte = false;
int vec_len = 0;
if ((UseAVX > 2) && (stack_offset != 0)) {
switch (ireg) {
case Op_VecS:
case Op_VecD:
case Op_VecX:
break;
case Op_VecY:
vec_len = 1;
break;
case Op_VecZ:
vec_len = 2;
break;
}
is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, Assembler::EVEX_FVM, Assembler::EVEX_32bit, 0);
}
int offset_size = 0;
int size = 5;
if (UseAVX > 2 ) {
if ((VM_Version::supports_avx512vl() == false) && (vec_len == 2)) {
offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
size += 2; // Need an additional two bytes for EVEX encoding
} else if ((VM_Version::supports_avx512vl() == false) && (vec_len < 2)) {
offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
} else {
offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
size += 2; // Need an additional two bytes for EVEX encoding
}
} else {
offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
}
// VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
return 5+offset_size;
return size+offset_size;
}
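
In other words, the spill-size estimate is: 5 bytes for the VEX-encoded form, 2 more whenever the move must be EVEX-encoded (a 512-bit vector, or any vector when AVX512VL is in use), plus 0, 1 or 4 bytes of displacement, where the 1-byte case under EVEX relies on the compressed disp8*N rule queried above. A small sketch of the same accounting (illustrative only, not the HotSpot routine):

#include <cstdio>

// evex_form:      the move must be EVEX-encoded (512-bit vector, or AVX512VL path)
// disp_fits_byte: displacement fits in one byte (compressed disp8*N for EVEX,
//                 plain disp8 for VEX)
int vec_spill_size(int stack_offset, bool evex_form, bool disp_fits_byte) {
  int size = 5;                 // opcode + 2-byte VEX prefix + modrm/sib
  if (evex_form) size += 2;     // EVEX prefix is two bytes longer than VEX_2bytes
  int offset_size = (stack_offset == 0) ? 0 : (disp_fits_byte ? 1 : 4);
  return size + offset_size;
}

int main() {
  // e.g. a 64-byte spill at rsp+128 on an EVEX target with compressed disp8: 7 + 1 = 8
  printf("%d\n", vec_spill_size(128, /*evex_form=*/true, /*disp_fits_byte=*/true));
}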

static inline jfloat replicate4_imm(int con, int width) {

@ -2679,11 +2725,10 @@ instruct negF_reg_reg(regF dst, regF src) %{
predicate(UseAVX > 0);
match(Set dst (NegF src));
ins_cost(150);
format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
ins_encode %{
int vector_len = 0;
__ vxorps($dst$$XMMRegister, $src$$XMMRegister,
ExternalAddress(float_signflip()), vector_len);
__ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
ExternalAddress(float_signflip()));
%}
ins_pipe(pipe_slow);
%}

@ -2704,12 +2749,11 @@ instruct negD_reg_reg(regD dst, regD src) %{
predicate(UseAVX > 0);
match(Set dst (NegD src));
ins_cost(150);
format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
format %{ "vnegatess  $dst, $src, [0x8000000000000000]\t"
"# neg double by sign flipping" %}
ins_encode %{
int vector_len = 0;
__ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
ExternalAddress(double_signflip()), vector_len);
__ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
ExternalAddress(double_signflip()));
%}
ins_pipe(pipe_slow);
%}

@ -2842,7 +2886,7 @@ instruct loadV64(vecZ dst, memory mem) %{
format %{ "vmovdqu $dst k0,$mem\t! load vector (64 bytes)" %}
ins_encode %{
int vector_len = 2;
__ evmovdqu($dst$$XMMRegister, $mem$$Address, vector_len);
__ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
%}
ins_pipe( pipe_slow );
%}

@ -2899,7 +2943,7 @@ instruct storeV64(memory mem, vecZ src) %{
format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %}
ins_encode %{
int vector_len = 2;
__ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len);
__ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}

@ -3319,6 +3363,37 @@ instruct Repl8F_mem(vecY dst, memory mem) %{
ins_pipe( pipe_slow );
%}

instruct Repl2F_zero(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
match(Set dst (ReplicateF zero));
format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
ins_encode %{
__ xorps($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl4F_zero(vecX dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
match(Set dst (ReplicateF zero));
format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
ins_encode %{
__ xorps($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_zero(vecY dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
match(Set dst (ReplicateF zero));
format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
ins_encode %{
int vector_len = 1;
__ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl2D_mem(vecX dst, memory mem) %{
predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
match(Set dst (ReplicateD (LoadD mem)));

@ -3353,6 +3428,28 @@ instruct Repl4D_mem(vecY dst, memory mem) %{
ins_pipe( pipe_slow );
%}

// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
match(Set dst (ReplicateD zero));
format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
ins_encode %{
__ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_zero(vecY dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
match(Set dst (ReplicateD zero));
format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
ins_encode %{
int vector_len = 1;
__ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

// ====================GENERIC REPLICATE==========================================

// Replicate byte scalar to be vector

@ -3684,38 +3781,6 @@ instruct Repl4F(vecX dst, regF src) %{
ins_pipe( pipe_slow );
%}

// Replicate float (4 byte) scalar zero to be vector
instruct Repl2F_zero(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2);
match(Set dst (ReplicateF zero));
format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
ins_encode %{
__ xorps($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl4F_zero(vecX dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (ReplicateF zero));
format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
ins_encode %{
__ xorps($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_zero(vecY dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (ReplicateF zero));
format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
ins_encode %{
int vector_len = 1;
__ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

// Replicate double (8 bytes) scalar to be vector
instruct Repl2D(vecX dst, regD src) %{
predicate(n->as_Vector()->length() == 2);

@ -3727,28 +3792,6 @@ instruct Repl2D(vecX dst, regD src) %{
ins_pipe( pipe_slow );
%}

// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2);
match(Set dst (ReplicateD zero));
format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
ins_encode %{
__ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_zero(vecY dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (ReplicateD zero));
format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
ins_encode %{
int vector_len = 1;
__ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

// ====================EVEX REPLICATE=============================================

instruct Repl4B_mem_evex(vecS dst, memory mem) %{

@ -3818,7 +3861,7 @@ instruct Repl32B_mem_evex(vecY dst, memory mem) %{
%}

instruct Repl64B_evex(vecZ dst, rRegI src) %{
predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateB src));
format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
ins_encode %{

@ -3829,7 +3872,7 @@ instruct Repl64B_evex(vecZ dst, rRegI src) %{
%}

instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateB (LoadB mem)));
format %{ "vpbroadcastb $dst,$mem\t! replicate64B" %}
ins_encode %{

@ -3866,7 +3909,7 @@ instruct Repl32B_imm_evex(vecY dst, immI con) %{
%}

instruct Repl64B_imm_evex(vecZ dst, immI con) %{
predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateB con));
format %{ "movq    $dst,[$constantaddress]\n\t"
"vpbroadcastb $dst,$dst\t! upper replicate64B" %}

@ -3957,7 +4000,7 @@ instruct Repl16S_mem_evex(vecY dst, memory mem) %{
%}

instruct Repl32S_evex(vecZ dst, rRegI src) %{
predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateS src));
format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
ins_encode %{

@ -3968,7 +4011,7 @@ instruct Repl32S_evex(vecZ dst, rRegI src) %{
%}

instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateS (LoadS mem)));
format %{ "vpbroadcastw $dst,$mem\t! replicate32S" %}
ins_encode %{

@ -4005,7 +4048,7 @@ instruct Repl16S_imm_evex(vecY dst, immI con) %{
%}

instruct Repl32S_imm_evex(vecZ dst, immI con) %{
predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateS con));
format %{ "movq    $dst,[$constantaddress]\n\t"
"vpbroadcastw $dst,$dst\t! replicate32S" %}

@ -4322,13 +4365,50 @@ instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
ins_pipe( pipe_slow );
%}

instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
int vector_len = 2;
__ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

@ -4377,13 +4457,38 @@ instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
ins_pipe( pipe_slow );
%}

instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
ins_encode %{
// Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
ins_encode %{
// Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vxorpd  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
ins_encode %{
// Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
int vector_len = 2;
__ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}

@ -1004,10 +1004,10 @@ static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_off
__ vmovdqu(Address(rsp, dst_offset), xmm0);
__ vmovdqu(xmm0, Address(rsp, -32));
case Op_VecZ:
__ evmovdqu(Address(rsp, -64), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, src_offset), 2);
__ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, -64), 2);
__ evmovdqul(Address(rsp, -64), xmm0, 2);
__ evmovdqul(xmm0, Address(rsp, src_offset), 2);
__ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqul(xmm0, Address(rsp, -64), 2);
break;
default:
ShouldNotReachHere();

@ -1075,10 +1075,10 @@ static void vec_stack_to_stack_helper(CodeBuffer *cbuf, int src_offset,
__ vmovdqu(Address(rsp, dst_offset), xmm0);
__ vmovdqu(xmm0, Address(rsp, -32));
case Op_VecZ:
__ evmovdqu(Address(rsp, -64), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, src_offset), 2);
__ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, -64), 2);
__ evmovdqul(Address(rsp, -64), xmm0, 2);
__ evmovdqul(xmm0, Address(rsp, src_offset), 2);
__ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqul(xmm0, Address(rsp, -64), 2);
break;
default:
ShouldNotReachHere();