8252847: Optimize primitive arrayCopy stubs using AVX-512 masked instructions
Reviewed-by: neliasso, kvn
parent ec41046c5c
commit 4b5ac3abac
@@ -2589,6 +2589,38 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vect
  emit_operand(dst, src);
}

void Assembler::evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type) {
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, "");
  InstructionMark im(this);
  bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG;
  int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3;
  InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attributes.set_embedded_opmask_register_specifier(mask);
  attributes.set_is_evex_instruction();
  vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
  emit_int8(0x6F);
  emit_operand(dst, src);
}

void Assembler::evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type) {
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(src != xnoreg, "sanity");
  assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, "");
  InstructionMark im(this);
  bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG;
  int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3;
  InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attributes.reset_is_clear_context();
  attributes.set_embedded_opmask_register_specifier(mask);
  attributes.set_is_evex_instruction();
  vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
  emit_int8(0x7F);
  emit_operand(src, dst);
}
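
[Editorial note: the two evmovdqu overloads above fold the four EVEX masked-move variants into a single entry point. The SIMD prefix (F2 vs. F3) together with the VEX.W ("wide") bit selects the element width. A small illustrative helper, not part of the patch, showing the mapping this selection encodes:]

#include <cstdio>

// Illustration only: which masked-move instruction Assembler::evmovdqu emits
// for each element type. F2 covers the byte/word forms, F3 the dword/qword
// forms; the "wide" (VEX.W) bit picks the larger element of each pair.
static const char* evex_masked_move(bool f2_prefix, bool wide) {
  if (f2_prefix) return wide ? "evmovdqu16" : "evmovdqu8";  // T_SHORT/T_CHAR vs. T_BYTE
  return wide ? "evmovdqu64" : "evmovdqu32";                // T_LONG vs. T_INT
}

int main() {
  std::printf("T_BYTE  -> %s\n", evex_masked_move(true,  false));
  std::printf("T_SHORT -> %s\n", evex_masked_move(true,  true));
  std::printf("T_INT   -> %s\n", evex_masked_move(false, false));
  std::printf("T_LONG  -> %s\n", evex_masked_move(false, true));
  return 0;
}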

void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
  assert(VM_Version::supports_evex(), "");
  InstructionMark im(this);

@@ -7803,6 +7835,13 @@ void Assembler::shlxq(Register dst, Register src1, Register src2) {
  emit_int16((unsigned char)0xF7, (0xC0 | encode));
}

void Assembler::shrxq(Register dst, Register src1, Register src2) {
  assert(VM_Version::supports_bmi2(), "");
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
  int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes);
  emit_int16((unsigned char)0xF7, (0xC0 | encode));
}

#ifndef _LP64

void Assembler::incl(Register dst) {
@@ -794,7 +794,6 @@ private:

  void decl(Register dst);
  void decl(Address dst);
  void decq(Register dst);
  void decq(Address dst);

  void incl(Register dst);

@@ -879,6 +878,7 @@ private:
  void popa_uncached();
#endif
  void vzeroupper_uncached();
  void decq(Register dst);

  void pusha();
  void popa();

@@ -1487,6 +1487,10 @@ private:
  void evmovdquq(XMMRegister dst, Address src, int vector_len);
  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);

  // Generic move instructions.
  void evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type);
  void evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type);

  // Move lower 64bit to high 64bit in 128bit register
  void movlhps(XMMRegister dst, XMMRegister src);

@@ -1989,6 +1993,8 @@ private:

  void shlxl(Register dst, Register src1, Register src2);
  void shlxq(Register dst, Register src1, Register src2);
  void shrxq(Register dst, Register src1, Register src2);

  //====================VECTOR ARITHMETIC=====================================
@@ -7964,6 +7964,7 @@ void MacroAssembler::cache_wbsync(bool is_pre)
    sfence();
  }
}

#endif // _LP64

Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
@@ -1037,6 +1037,18 @@ public:
                Register rax, Register rcx, Register rdx, Register tmp);
#endif

#ifdef _LP64
  void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                    Register to, Register count, int shift,
                                    Register index, Register temp,
                                    bool use64byteVector, Label& L_entry, Label& L_exit);

  void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                             Register to, Register start_index, Register end_index,
                                             Register count, int shift, Register temp,
                                             bool use64byteVector, Label& L_entry, Label& L_exit);
#endif

private:

  // these are private because users should be doing movflt/movdbl

@@ -1725,6 +1737,23 @@ public:

  void cache_wb(Address line);
  void cache_wbsync(bool is_pre);

  void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0,
                         bool use64byteVector = false);

  void copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0);

  void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  int shift = Address::times_1, int offset = 0);

  void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  bool conjoint, int shift = Address::times_1, int offset = 0,
                  bool use64byteVector = false);

#endif // _LP64

  void vallones(XMMRegister dst, int vector_len);
src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp (new file, 249 lines)
@@ -0,0 +1,249 @@
/*
 * Copyright (c) 2020, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef _LP64

void MacroAssembler::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                  Register to, Register count, int shift,
                                                  Register index, Register temp,
                                                  bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
    /* T_BYTE  */ { 32, 64, 96, 128, 160, 192 },
    /* T_SHORT */ { 16, 32, 48,  64,  80,  96 },
    /* T_INT   */ {  8, 16, 24,  32,  40,  48 },
    /* T_LONG  */ {  4,  8, 12,  16,  20,  24 }
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  cmpq(count, size_mat[shift][0]);
  jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  BIND(L_entry_64);
  cmpq(count, size_mat[shift][1]);
  jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  BIND(L_entry_96);
  cmpq(count, size_mat[shift][2]);
  jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  BIND(L_entry_128);
  cmpq(count, size_mat[shift][3]);
  jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  BIND(L_entry_160);
  cmpq(count, size_mat[shift][4]);
  jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  BIND(L_entry_192);
  cmpq(count, size_mat[shift][5]);
  jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  jmp(L_exit);
}
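
[Editorial note: the ladder above resolves any length of at most 192 bytes into at most three full-width copies plus one masked tail copy, with no loop. A schematic C++ model of the emitted plan, illustration only and not part of the patch; it assumes the byte case (shift == 0):]

#include <cstdint>
#include <cstdio>

// Mirrors cases A-F above: full 64/32-byte copies first, then one masked
// copy of the remainder. "masked64" is split into a full 32-byte copy plus a
// masked 32-byte copy when only YMM (32-byte) vectors are in use.
static void plan_special_case(uint32_t byte_len) {
  if (byte_len <= 32) { std::printf("masked32 copy of %u bytes\n", byte_len); return; }
  if (byte_len <= 64) { std::printf("masked64 copy of %u bytes\n", byte_len); return; }
  uint32_t done;
  if (byte_len > 160)      { std::printf("full64 @0, full64 @64, full32 @128\n"); done = 160; }
  else if (byte_len > 128) { std::printf("full64 @0, full64 @64\n");              done = 128; }
  else if (byte_len > 96)  { std::printf("full64 @0, full32 @64\n");              done = 96;  }
  else                     { std::printf("full64 @0\n");                          done = 64;  }
  std::printf("masked32 copy of %u tail bytes @%u\n", byte_len - done, done);
}

int main() { plan_special_case(125); }  // full64 @0, full32 @64, masked32 of 29 @96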

void MacroAssembler::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                           Register to, Register start_index, Register end_index,
                                                           Register count, int shift, Register temp,
                                                           bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;
  bool avx3 = MaxVectorSize > 32 && AVX3Threshold == 0;

  int size_mat[][6] = {
    /* T_BYTE  */ { 32, 64, 96, 128, 160, 192 },
    /* T_SHORT */ { 16, 32, 48,  64,  80,  96 },
    /* T_INT   */ {  8, 16, 24,  32,  40,  48 },
    /* T_LONG  */ {  4,  8, 12,  16,  20,  24 }
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  cmpq(count, size_mat[shift][0]);
  jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  BIND(L_entry_64);
  cmpq(count, size_mat[shift][1]);
  jccb(Assembler::greater, L_entry_96);
  if (avx3) {
    copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
  } else {
    copy32_avx(to, from, end_index, xmm, shift, -32);
    subq(count, 32 >> shift);
    copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  }
  jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  BIND(L_entry_96);
  cmpq(count, size_mat[shift][2]);
  jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  BIND(L_entry_128);
  cmpq(count, size_mat[shift][3]);
  jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -96);
  subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  BIND(L_entry_160);
  cmpq(count, size_mat[shift][4]);
  jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  BIND(L_entry_192);
  cmpq(count, size_mat[shift][5]);
  jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -160);
  subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);
}

void MacroAssembler::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                       KRegister mask, Register length, Register index,
                                       Register temp, int shift, int offset,
                                       bool use64byteVector) {
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    copy32_avx(dst, src, index, xmm, shift, offset);
    subptr(length, 32 >> shift);
    copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset + 32);
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    assert(MaxVectorSize == 64, "vector length != 64");
    negptr(length);
    addq(length, 64);
    mov64(temp, -1);
    shrxq(temp, temp, length);
    kmovql(mask, temp);
    evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_512bit, type[shift]);
    evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_512bit, type[shift]);
  }
}


void MacroAssembler::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                                       KRegister mask, Register length, Register index,
                                       Register temp, int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  mov64(temp, 1);
  shlxq(temp, temp, length);
  decq(temp);
  kmovql(mask, temp);
  evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_256bit, type[shift]);
  evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_256bit, type[shift]);
}
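
[Editorial note: the two helpers above build their opmasks differently for the same effect, a word with the low `length` bits set. copy32_masked_avx computes (1 << length) - 1 via SHLX/DEC, valid because its element count is always below 64; copy64_masked_avx instead shifts an all-ones word right by (64 - length) via SHRX, which stays well defined even when length is exactly 64. A checkable sketch, not part of the patch:]

#include <cassert>
#include <cstdint>
#include <cstdio>

// copy32_masked_avx form: low n bits set, requires n < 64.
static uint64_t mask_shlx(unsigned n) { assert(n < 64); return (UINT64_C(1) << n) - 1; }

// copy64_masked_avx form: same mask for 1 <= n <= 64; avoids the undefined 1 << 64.
static uint64_t mask_shrx(unsigned n) { assert(n >= 1 && n <= 64); return ~UINT64_C(0) >> (64 - n); }

int main() {
  std::printf("%016llx\n", (unsigned long long)mask_shlx(5));   // 000000000000001f
  std::printf("%016llx\n", (unsigned long long)mask_shrx(64));  // ffffffffffffffff
  return 0;
}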


void MacroAssembler::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                                int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  vmovdqu(xmm, Address(src, index, scale, offset));
  vmovdqu(Address(dst, index, scale, offset), xmm);
}


void MacroAssembler::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                                bool conjoint, int shift, int offset, bool use64byteVector) {
  assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
  if (!use64byteVector) {
    if (conjoint) {
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
      copy32_avx(dst, src, index, xmm, shift, offset);
    } else {
      copy32_avx(dst, src, index, xmm, shift, offset);
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
    }
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
    evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
  }
}

#endif
@@ -1124,59 +1124,28 @@ class StubGenerator: public StubCodeGenerator {
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
        __ jccb(Assembler::less, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ bind(L_loop_avx512);
        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
        __ bind(L_above_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
      __ BIND(L_loop);
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
        __ bind(L_below_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx2);

        __ bind(L_32_byte_head);
        __ subptr(qword_count, 4);  // sub(8) and add(4)
        __ jccb(Assembler::greater, L_end);
      } else {
        __ BIND(L_loop);
        if (UseAVX == 2) {
          __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
          __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
          __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
          __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
        } else {
          __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
          __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
          __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
          __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, -8));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -8), xmm3);
        }

        __ BIND(L_copy_bytes);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop);
        __ subptr(qword_count, 4);  // sub(8) and add(4)
        __ jccb(Assembler::greater, L_end);
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
        __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
        __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, -8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -8), xmm3);
      }

      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 8);
      __ jcc(Assembler::lessEqual, L_loop);
      __ subptr(qword_count, 4);  // sub(8) and add(4)
      __ jccb(Assembler::greater, L_end);
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
@@ -1232,60 +1201,29 @@ class StubGenerator: public StubCodeGenerator {
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (AVX3Threshold / 8));
        __ jccb(Assembler::greater, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ BIND(L_loop_avx512);
        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
        __ bind(L_above_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
      __ BIND(L_loop);
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        __ bind(L_below_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx2);

        __ bind(L_32_byte_head);
        __ addptr(qword_count, 4);  // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
      } else {
        __ BIND(L_loop);
        if (UseAVX == 2) {
          __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
          __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        } else {
          __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
          __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
          __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
          __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
          __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
          __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
          __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
          __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
        }

        __ BIND(L_copy_bytes);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop);

        __ addptr(qword_count, 4);  // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
        __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
        __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
        __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
        __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
        __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
      }

      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 8);
      __ jcc(Assembler::greaterEqual, L_loop);

      __ addptr(qword_count, 4);  // add(8) and sub(4)
      __ jccb(Assembler::less, L_end);
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
@@ -1323,6 +1261,442 @@ class StubGenerator: public StubCodeGenerator {
    __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
  }

#ifndef PRODUCT
  int& get_profile_ctr(int shift) {
    if (0 == shift)
      return SharedRuntime::_jbyte_array_copy_ctr;
    else if (1 == shift)
      return SharedRuntime::_jshort_array_copy_ctr;
    else if (2 == shift)
      return SharedRuntime::_jint_array_copy_ctr;
    else
      return SharedRuntime::_jlong_array_copy_ctr;
  }
#endif

  void setup_argument_regs(BasicType type) {
    if (type == T_BYTE || type == T_SHORT) {
      setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                        // r9 and r10 may be used to save non-volatile registers
    } else {
      setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                     // r9 is used to save r15_thread
    }
  }

  void restore_argument_regs(BasicType type) {
    if (type == T_BYTE || type == T_SHORT) {
      restore_arg_regs();
    } else {
      restore_arg_regs_using_thread();
    }
  }

  // Note: The following rules apply to the AVX3 optimized arraycopy stubs:
  // - If the target supports AVX3 features (BW+VL+F), the implementation uses 32 byte vectors (YMMs)
  //   for both the special cases (various small block sizes) and the aligned copy loop. This is the
  //   default configuration.
  // - If the copy length is above AVX3Threshold, the implementation uses 64 byte vectors (ZMMs)
  //   for the main copy loop (and the subsequent tail), since the bulk of the cycles are consumed there.
  // - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS is seen to give
  //   better performance for disjoint copies. For conjoint/backward copies the vector based
  //   copy performs better.
  // - If the user sets AVX3Threshold=0, the special cases for small block sizes operate over
  //   64 byte vector registers (ZMMs).
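
[Editorial note: schematically, the stubs generated below pick one of three strategies per call. An illustrative C++ rendering of the dispatch, pseudocode only and not part of the patch; the function name is a hypothetical stand-in for the emitted control flow:]

#include <cstdint>
#include <cstdio>

// Mirrors generate_disjoint_copy_avx3_masked: the threshold[] table in the
// stub scales an ~4096-byte limit into an element count per type.
static void disjoint_copy_dispatch(uint64_t byte_len, int max_vector_size,
                                   uint64_t avx3_threshold_bytes) {
  if (byte_len <= 192) {
    std::printf("special cases: branchy masked YMM/ZMM copies, no loop\n");
  } else if (avx3_threshold_bytes != 0 && byte_len >= avx3_threshold_bytes) {
    if (max_vector_size == 64)
      std::printf("align dst to 64, 192-byte ZMM main loop, masked tail\n");
    else
      std::printf("REP MOVS fast path\n");
  } else {
    // With AVX3Threshold == 0 and MaxVectorSize == 64, the YMM roles below
    // are played by ZMM registers throughout.
    std::printf("align dst to 32, 192-byte YMM main loop, masked tail\n");
  }
}

int main() { disjoint_copy_dispatch(8192, 64, 4096); }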

  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  //
  // Side Effects:
  //   disjoint_copy_avx3_masked is set to the no-overlap entry point
  //   used by generate_conjoint_[byte/int/short/long]_copy().
  //

  address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
                                             bool aligned, bool is_oop, bool dest_uninitialized) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
    Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
    Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
    const Register from  = rdi;  // source array address
    const Register to    = rsi;  // destination array address
    const Register count = rdx;  // elements count
    const Register temp1 = r8;
    const Register temp2 = r11;
    const Register temp3 = rax;
    const Register temp4 = rcx;
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
    BasicType type = is_oop ? T_OBJECT : type_vec[shift];

    setup_argument_regs(type);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    {
      // Type(shift)      byte(0), short(1), int(2), long(3)
      int loop_size[] = { 192,     96,       48,     24 };
      int threshold[] = { 4096,    2048,     1024,   512 };

      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // 'from', 'to' and 'count' are now valid

      // temp1 holds the remaining count and temp4 holds the running count used to
      // compute the next address offset of the to/from addresses (temp4 * scale).
      __ mov64(temp4, 0);
      __ movq(temp1, count);

      // Zero length check.
      __ BIND(L_tail);
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      // Special cases using 32 byte [masked] vector copy operations.
      __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                      temp4, temp3, use64byteVector, L_entry, L_exit);

      // PRE-MAIN-POST loop for aligned copy.
      __ BIND(L_entry);

      if (AVX3Threshold != 0) {
        __ cmpq(count, threshold[shift]);
        if (MaxVectorSize == 64) {
          // Copy using 64 byte vectors.
          __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
        } else {
          assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
          // REP MOVS offers a faster copy path.
          __ jcc(Assembler::greaterEqual, L_repmovs);
        }
      }

      if (MaxVectorSize < 64 || AVX3Threshold != 0) {
        // Partial copy to make the dst address 32 byte aligned.
        __ movq(temp2, to);
        __ andq(temp2, 31);
        __ jcc(Assembler::equal, L_main_pre_loop);

        __ negptr(temp2);
        __ addq(temp2, 32);
        if (shift) {
          __ shrq(temp2, shift);
        }
        __ movq(temp3, temp2);
        __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
        __ movq(temp4, temp2);
        __ movq(temp1, count);
        __ subq(temp1, temp2);

        __ cmpq(temp1, loop_size[shift]);
        __ jcc(Assembler::less, L_tail);

        __ BIND(L_main_pre_loop);
        __ subq(temp1, loop_size[shift]);

        // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
        __ BIND(L_main_loop);
        __ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
        __ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
        __ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
        __ addptr(temp4, loop_size[shift]);
        __ subq(temp1, loop_size[shift]);
        __ jcc(Assembler::greater, L_main_loop);

        __ addq(temp1, loop_size[shift]);

        // Tail loop.
        __ jmp(L_tail);

        __ BIND(L_repmovs);
        __ movq(temp2, temp1);
        // Swap to(RSI) and from(RDI) addresses to comply with REP MOVS semantics.
        __ movq(temp3, to);
        __ movq(to, from);
        __ movq(from, temp3);
        // Save to/from for restoration post rep_mov.
        __ movq(temp1, to);
        __ movq(temp3, from);
        if (shift < 3) {
          __ shrq(temp2, 3 - shift); // quad word count
        }
        __ movq(temp4, temp2);       // move quad word count into temp4(RCX).
        __ rep_mov();
        __ shlq(temp2, 3);           // convert quad words into byte count.
        if (shift) {
          __ shrq(temp2, shift);     // type specific count.
        }
        // Restore original addresses in to/from.
        __ movq(to, temp3);
        __ movq(from, temp1);
        __ movq(temp4, temp2);
        __ movq(temp1, count);
        __ subq(temp1, temp2);       // trailing part (less than a quad word's worth).
        __ jmp(L_tail);
      }
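
[Editorial note: the REP MOVS path above rounds the element count down to whole 8-byte units for REP MOVSQ and then hands the sub-qword remainder back to the tail logic. A minimal model of that accounting, illustration only under the register roles noted in the comments:]

#include <cstdint>
#include <cstdio>

// shift: 0 = byte, 1 = short, 2 = int, 3 = long.
static uint64_t rep_movs_tail_elements(uint64_t count, int shift) {
  uint64_t qwords = (shift < 3) ? (count >> (3 - shift)) : count; // whole 8-byte units
  uint64_t moved  = (qwords << 3) >> shift;                       // elements covered by REP MOVSQ
  return count - moved;                                           // tail: less than 8 bytes' worth
}

int main() {
  std::printf("%llu\n", (unsigned long long)rep_movs_tail_elements(1001, 0)); // prints 1
  return 0;
}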

      if (MaxVectorSize > 32) {
        __ BIND(L_pre_main_post_64);
        // Partial copy to make the dst address 64 byte aligned.
        __ movq(temp2, to);
        __ andq(temp2, 63);
        __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

        __ negptr(temp2);
        __ addq(temp2, 64);
        if (shift) {
          __ shrq(temp2, shift);
        }
        __ movq(temp3, temp2);
        __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
        __ movq(temp4, temp2);
        __ movq(temp1, count);
        __ subq(temp1, temp2);

        __ cmpq(temp1, loop_size[shift]);
        __ jcc(Assembler::less, L_tail64);

        __ BIND(L_main_pre_loop_64bytes);
        __ subq(temp1, loop_size[shift]);

        // Main loop with aligned copy block size of 192 bytes at
        // 64 byte copy granularity.
        __ BIND(L_main_loop_64bytes);
        __ copy64_avx(to, from, temp4, xmm1, false, shift, 0, true);
        __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
        __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
        __ addptr(temp4, loop_size[shift]);
        __ subq(temp1, loop_size[shift]);
        __ jcc(Assembler::greater, L_main_loop_64bytes);

        __ addq(temp1, loop_size[shift]);
        // Zero length check.
        __ jcc(Assembler::lessEqual, L_exit);

        __ BIND(L_tail64);

        // Tail handling using 64 byte [masked] vector copy operations.
        use64byteVector = true;
        __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                        temp4, temp3, use64byteVector, L_entry, L_exit);
      }
      __ BIND(L_exit);
    }

    address ucme_exit_pc = __ pc();
    // When called from generic_arraycopy, r11 contains specific values
    // used during the arraycopy epilogue, so re-initialize r11 here.
    if (is_oop) {
      __ movq(r11, shift == 3 ? count : to);
    }
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
    restore_argument_regs(type);
    inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  //
  address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
                                             address nooverlap_target, bool aligned, bool is_oop,
                                             bool dest_uninitialized) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;

    Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
    Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
    const Register from  = rdi;  // source array address
    const Register to    = rsi;  // destination array address
    const Register count = rdx;  // elements count
    const Register temp1 = r8;
    const Register temp2 = rcx;
    const Register temp3 = r11;
    const Register temp4 = rax;
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));

    BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
    BasicType type = is_oop ? T_OBJECT : type_vec[shift];

    setup_argument_regs(type);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
    {
      // Type(shift)      byte(0), short(1), int(2), long(3)
      int loop_size[] = { 192,     96,       48,     24 };
      int threshold[] = { 4096,    2048,     1024,   512 };

      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // 'from', 'to' and 'count' are now valid

      // temp1 holds the remaining count.
      __ movq(temp1, count);

      // Zero length check.
      __ BIND(L_tail);
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      // Special cases using 32 byte [masked] vector copy operations.
      __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                               temp4, use64byteVector, L_entry, L_exit);

      // PRE-MAIN-POST loop for aligned copy.
      __ BIND(L_entry);

      if (MaxVectorSize > 32 && AVX3Threshold != 0) {
        __ cmpq(temp1, threshold[shift]);
        __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
      }

      if (MaxVectorSize < 64 || AVX3Threshold != 0) {
        // Partial copy to make the dst address 32 byte aligned.
        __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
        __ andq(temp2, 31);
        __ jcc(Assembler::equal, L_main_pre_loop);

        if (shift) {
          __ shrq(temp2, shift);
        }
        __ subq(temp1, temp2);
        __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

        __ cmpq(temp1, loop_size[shift]);
        __ jcc(Assembler::less, L_tail);

        __ BIND(L_main_pre_loop);

        // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
        __ BIND(L_main_loop);
        __ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
        __ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
        __ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
        __ subptr(temp1, loop_size[shift]);
        __ cmpq(temp1, loop_size[shift]);
        __ jcc(Assembler::greater, L_main_loop);

        // Tail loop.
        __ jmp(L_tail);
      }

      if (MaxVectorSize > 32) {
        __ BIND(L_pre_main_post_64);
        // Partial copy to make the dst address 64 byte aligned.
        __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
        __ andq(temp2, 63);
        __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

        if (shift) {
          __ shrq(temp2, shift);
        }
        __ subq(temp1, temp2);
        __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0, true);

        __ cmpq(temp1, loop_size[shift]);
        __ jcc(Assembler::less, L_tail64);

        __ BIND(L_main_pre_loop_64bytes);

        // Main loop with aligned copy block size of 192 bytes at
        // 64 byte copy granularity.
        __ BIND(L_main_loop_64bytes);
        __ copy64_avx(to, from, temp1, xmm1, true, shift, -64, true);
        __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
        __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
        __ subq(temp1, loop_size[shift]);
        __ cmpq(temp1, loop_size[shift]);
        __ jcc(Assembler::greater, L_main_loop_64bytes);

        // Zero length check.
        __ cmpq(temp1, 0);
        __ jcc(Assembler::lessEqual, L_exit);

        __ BIND(L_tail64);

        // Tail handling using 64 byte [masked] vector copy operations.
        use64byteVector = true;
        __ mov64(temp2, 0);
        __ movq(temp3, temp1);
        __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                                 temp4, use64byteVector, L_entry, L_exit);
      }
      __ BIND(L_exit);
    }
    address ucme_exit_pc = __ pc();
    // When called from generic_arraycopy, r11 contains specific values
    // used during the arraycopy epilogue, so re-initialize r11 here.
    if (is_oop) {
      __ movq(r11, count);
    }
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
    restore_argument_regs(type);
    inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
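
[Editorial note: in the conjoint generator above, temp1 doubles as the remaining element count and the end index, so the main loop's negative offsets (-64, -128, -192) always address the top of the not-yet-copied region and the copy proceeds high-to-low. That is what makes a forward-overlapping (dst above src) copy safe. A scalar sketch of that loop, not part of the patch; shift == 0 assumed:]

#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for copy64_avx(..., conjoint=true, offset=-off): load the whole
// 64-byte block before storing it, as the stub does through a vector register.
static void copy64_backward(uint8_t* dst, const uint8_t* src, uint64_t end, uint64_t off) {
  uint8_t block[64];                       // plays the role of the ZMM register
  std::memcpy(block, src + end - off, 64);
  std::memcpy(dst + end - off, block, 64);
}

// Mirrors L_main_loop_64bytes: 192 bytes per iteration, walking downward.
static void conjoint_main_loop(uint8_t* dst, const uint8_t* src, uint64_t remaining) {
  while (remaining >= 192) {
    copy64_backward(dst, src, remaining, 64);
    copy64_backward(dst, src, remaining, 128);
    copy64_backward(dst, src, remaining, 192);
    remaining -= 192;
  }
  // the final (under 192 byte) remainder falls back to the masked special cases
}

int main() {
  uint8_t buf[512];
  for (int i = 0; i < 512; i++) buf[i] = (uint8_t)i;
  conjoint_main_loop(buf + 8, buf, 384); // overlapping forward copy of 384 bytes
  std::printf("%d\n", buf[8 + 383]);     // last copied byte; prints 127 (original buf[383])
  return 0;
}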


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored

@@ -1343,6 +1717,10 @@ class StubGenerator: public StubCodeGenerator {
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
      return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
                                                aligned, false, false);
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

@@ -1453,6 +1831,10 @@ class StubGenerator: public StubCodeGenerator {
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
      return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
                                                nooverlap_target, aligned, false, false);
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

@@ -1558,6 +1940,11 @@ class StubGenerator: public StubCodeGenerator {
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
    if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
      return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
                                                aligned, false, false);
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

@@ -1682,6 +2069,10 @@ class StubGenerator: public StubCodeGenerator {
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
      return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
                                                nooverlap_target, aligned, false, false);
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

@@ -1780,6 +2171,11 @@ class StubGenerator: public StubCodeGenerator {
  //
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
                                         const char *name, bool dest_uninitialized = false) {
    if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
      return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
                                                aligned, is_oop, dest_uninitialized);
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

@@ -1884,6 +2280,10 @@ class StubGenerator: public StubCodeGenerator {
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
                                         address *entry, const char *name,
                                         bool dest_uninitialized = false) {
    if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
      return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
                                                nooverlap_target, aligned, is_oop, dest_uninitialized);
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

@@ -1991,6 +2391,10 @@ class StubGenerator: public StubCodeGenerator {
  //
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
      return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
                                                aligned, is_oop, dest_uninitialized);
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

@@ -2095,6 +2499,10 @@ class StubGenerator: public StubCodeGenerator {
  address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
                                          address nooverlap_target, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
      return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
                                                nooverlap_target, aligned, is_oop, dest_uninitialized);
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_

enum platform_dependent_constants {
  code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small)
  code_size2 = 35300 LP64_ONLY(+11400)  // simply increase if too small (assembler will crash if too small)
  code_size2 = 35300 LP64_ONLY(+21400)  // simply increase if too small (assembler will crash if too small)
};

class x86 {

@@ -763,6 +763,8 @@ void VM_Version::get_processor_features() {
  if (is_intel()) { // Intel cpus specific settings
    if (is_knights_family()) {
      _features &= ~CPU_VZEROUPPER;
      _features &= ~CPU_AVX512BW;
      _features &= ~CPU_AVX512VL;
    }
  }
test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyConjoint.java (new file, 269 lines)
@@ -0,0 +1,269 @@
/*
|
||||
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package compiler.arraycopy;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8251871
|
||||
* @summary Optimize arrayCopy using AVX-512 masked instructions.
|
||||
*
|
||||
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
|
||||
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOptions
|
||||
* compiler.arraycopy.TestArrayCopyConjoint
|
||||
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
|
||||
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=64
|
||||
* compiler.arraycopy.TestArrayCopyConjoint
|
||||
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
|
||||
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption
|
||||
* compiler.arraycopy.TestArrayCopyConjoint
|
||||
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
|
||||
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=64
|
||||
* compiler.arraycopy.TestArrayCopyConjoint
|
||||
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
|
||||
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64
|
||||
* compiler.arraycopy.TestArrayCopyConjoint
|
||||
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
|
||||
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption -XX:ArrayCopyLoadStoreMaxElem=16
|
||||
* compiler.arraycopy.TestArrayCopyConjoint
|
||||
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
|
||||
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 -XX:ArrayCopyLoadStoreMaxElem=16
|
||||
* compiler.arraycopy.TestArrayCopyConjoint
|
||||
*
|
||||
*/
|
||||
|
||||
public class TestArrayCopyConjoint {
|
||||
|
||||
public static final int SIZE = 4096;
|
||||
public static byte[] fromByteArr, toByteArr, valByteArr;
|
||||
public static char[] fromCharArr, toCharArr, valCharArr;
|
||||
public static int[] fromIntArr, toIntArr, valIntArr;
|
||||
public static long[] fromLongArr, toLongArr, valLongArr;
|
||||
|
||||
static public void reinit(Class<?> c) {
|
||||
if (c == byte.class) {
|
||||
for (int i = 0 ; i < SIZE ; i++) {
|
||||
fromByteArr[i] = (byte)i;
|
||||
}
|
||||
} else if (c == char.class) {
|
||||
for (int i = 0 ; i < SIZE ; i++) {
|
||||
fromCharArr[i] = (char)i;
|
||||
}
|
||||
} else if (c == int.class) {
|
||||
for (int i = 0 ; i < SIZE ; i++) {
|
||||
fromIntArr[i] = i;
|
||||
}
|
||||
} else {
|
||||
assert c == long.class;
|
||||
for (int i = 0 ; i < SIZE ; i++) {
|
||||
fromLongArr[i] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static public void setup() {
|
||||
// Both positions aligned
|
||||
fromByteArr = new byte[SIZE];
|
||||
valByteArr = new byte[SIZE];
|
||||
toByteArr = fromByteArr;
|
||||
fromCharArr = new char[SIZE];
|
||||
valCharArr = new char[SIZE];
|
||||
toCharArr = fromCharArr;
|
||||
fromIntArr = new int[SIZE];
|
||||
valIntArr = new int[SIZE];
|
||||
toIntArr = fromIntArr;
|
||||
fromLongArr = new long[SIZE];
|
||||
valLongArr = new long[SIZE];
|
||||
toLongArr = fromLongArr;
|
||||
|
||||
for (int i = 0 ; i < SIZE ; i++) {
|
||||
fromByteArr[i] = (byte)i;
|
||||
valByteArr[i] = (byte)i;
|
||||
fromCharArr[i] = (char)i;
|
||||
valCharArr[i] = (char)i;
|
||||
fromIntArr[i] = i;
|
||||
valIntArr[i] = i;
|
||||
fromLongArr[i] = i;
|
||||
valLongArr[i] = i;
|
||||
}
|
||||
}
|
||||
|
||||
public static int validate_ctr = 0;
|
||||
public static <E> void validate(String msg, E arr, int length, int fromPos, int toPos) {
|
||||
validate_ctr++;
|
||||
if (arr instanceof byte []) {
|
||||
byte [] barr = (byte [])arr;
|
||||
for(int i = 0 ; i < length; i++)
|
||||
                if (valByteArr[i+fromPos] != barr[i+toPos]) {
                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
                                       + " expected = " + valByteArr[i+fromPos]
                                       + " actual = " + barr[i+toPos]
                                       + " fromPos = " + fromPos
                                       + " toPos = " + toPos);
                    throw new Error("Fail");
                }
        }
        else if (arr instanceof char []) {
            char [] carr = (char [])arr;
            for (int i = 0; i < length; i++)
                if (valCharArr[i+fromPos] != carr[i+toPos]) {
                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
                                       + " expected = " + valCharArr[i+fromPos]
                                       + " actual = " + carr[i+toPos]
                                       + " fromPos = " + fromPos
                                       + " toPos = " + toPos);
                    throw new Error("Fail");
                }
        }
        else if (arr instanceof int []) {
            int [] iarr = (int [])arr;
            for (int i = 0; i < length; i++)
                if (valIntArr[i+fromPos] != iarr[i+toPos]) {
                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
                                       + " expected = " + valIntArr[i+fromPos]
                                       + " actual = " + iarr[i+toPos]
                                       + " fromPos = " + fromPos
                                       + " toPos = " + toPos);
                    throw new Error("Fail");
                }
        }
        else if (arr instanceof long []) {
            long [] larr = (long [])arr;
            for (int i = 0; i < length; i++)
                if (valLongArr[i+fromPos] != larr[i+toPos]) {
                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
                                       + " expected = " + valLongArr[i+fromPos]
                                       + " actual = " + larr[i+toPos]
                                       + " fromPos = " + fromPos
                                       + " toPos = " + toPos);
                    throw new Error("Fail");
                }
        }
    }

    public static void testByte(int length, int fromPos, int toPos) {
        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
        validate(" Test ByteArr ", toByteArr, length, fromPos, toPos);
    }

    public static void testChar(int length, int fromPos, int toPos) {
        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, length);
        validate(" Test CharArr ", toCharArr, length, fromPos, toPos);
    }

    public static void testInt(int length, int fromPos, int toPos) {
        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, length);
        validate(" Test IntArr ", toIntArr, length, fromPos, toPos);
    }

    public static void testLong(int length, int fromPos, int toPos) {
        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, length);
        validate(" Test LongArr ", toLongArr, length, fromPos, toPos);
    }

    public static void testByte_constant_LT32B(int fromPos, int toPos) {
        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 7);
        validate(" Test Byte constant length 7 ", toByteArr, 7, fromPos, toPos);
    }

    public static void testByte_constant_LT64B(int fromPos, int toPos) {
        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 45);
        validate(" Test Byte constant length 45 ", toByteArr, 45, fromPos, toPos);
    }

    public static void testChar_constant_LT32B(int fromPos, int toPos) {
        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 7);
        validate(" Test Char constant length 7 ", toCharArr, 7, fromPos, toPos);
    }

    public static void testChar_constant_LT64B(int fromPos, int toPos) {
        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 22);
        validate(" Test Char constant length 22 ", toCharArr, 22, fromPos, toPos);
    }

    public static void testInt_constant_LT32B(int fromPos, int toPos) {
        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 7);
        validate(" Test Int constant length 7 ", toIntArr, 7, fromPos, toPos);
    }

    public static void testInt_constant_LT64B(int fromPos, int toPos) {
        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 11);
        validate(" Test Int constant length 11 ", toIntArr, 11, fromPos, toPos);
    }

    public static void testLong_constant_LT32B(int fromPos, int toPos) {
        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 3);
        validate(" Test Long constant length 3 ", toLongArr, 3, fromPos, toPos);
    }

    public static void testLong_constant_LT64B(int fromPos, int toPos) {
        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 6);
        validate(" Test Long constant length 6 ", toLongArr, 6, fromPos, toPos);
    }

    public static void main(String [] args) {
        // Cases to test each new optimized stub's special blocks.
        // Cases to test new PI handling (PI32 and PI64).
        // Cases to test vectorized constant array copies for all primitive types.
        //                 LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2
        int [] lengths = { 29,   59,   89,   125,   159,   189,   194,  1024 };
        Random r = new Random(1024);

        setup();

        try {
            for (int i = 0; i < 1000000; i++) {
                int index = r.nextInt(2048);
                testByte(lengths[i % lengths.length], index, index + 2);
                reinit(byte.class);
                testByte_constant_LT32B(index, index + 2);
                reinit(byte.class);
                testByte_constant_LT64B(index, index + 2);
                reinit(byte.class);

                testChar(lengths[i % lengths.length] >> 1, index, index + 2);
                reinit(char.class);
                testChar_constant_LT32B(index, index + 2);
                reinit(char.class);
                testChar_constant_LT64B(index, index + 2);
                reinit(char.class);

                testInt(lengths[i % lengths.length] >> 2, index, index + 2);
                reinit(int.class);
                testInt_constant_LT32B(index, index + 2);
                reinit(int.class);
                testInt_constant_LT64B(index, index + 2);
                reinit(int.class);

                testLong(lengths[i % lengths.length] >> 3, index, index + 2);
                reinit(long.class);
                testLong_constant_LT32B(index, index + 2);
                reinit(long.class);
                testLong_constant_LT64B(index, index + 2);
                reinit(long.class);
            }
            System.out.println("PASS : " + validate_ctr);
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }
}
226 test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyDisjoint.java Normal file
@@ -0,0 +1,226 @@
/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package compiler.arraycopy;

import java.util.Random;

/**
 * @test
 * @bug 8251871
 * @summary Optimize arrayCopy using AVX-512 masked instructions.
 *
 * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
 *      -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=32
 *      compiler.arraycopy.TestArrayCopyDisjoint
 * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
 *      -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=64
 *      compiler.arraycopy.TestArrayCopyDisjoint
 * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
 *      -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:MaxVectorSize=32
 *      compiler.arraycopy.TestArrayCopyDisjoint
 * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
 *      -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:MaxVectorSize=64
 *      compiler.arraycopy.TestArrayCopyDisjoint
 * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
 *      -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64
 *      compiler.arraycopy.TestArrayCopyDisjoint
 * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
 *      -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:MaxVectorSize=32 -XX:ArrayCopyLoadStoreMaxElem=16
 *      compiler.arraycopy.TestArrayCopyDisjoint
 * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
 *      -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 -XX:ArrayCopyLoadStoreMaxElem=16
 *      compiler.arraycopy.TestArrayCopyDisjoint
 *
 */

public class TestArrayCopyDisjoint {

    public static final int SIZE = 4096;
    public static byte[] fromByteArr, toByteArr;
    public static char[] fromCharArr, toCharArr;
    public static int[] fromIntArr, toIntArr;
    public static long[] fromLongArr, toLongArr;

    static public void setup() {
        // Both positions aligned
        fromByteArr = new byte[SIZE];
        toByteArr = new byte[SIZE];
        fromCharArr = new char[SIZE];
        toCharArr = new char[SIZE];
        fromIntArr = new int[SIZE];
        toIntArr = new int[SIZE];
        fromLongArr = new long[SIZE];
        toLongArr = new long[SIZE];

        for (int i = 0; i < SIZE; i++) {
            fromByteArr[i] = (byte)i;
            fromCharArr[i] = (char)i;
            fromIntArr[i] = i;
            fromLongArr[i] = i;
        }
    }

    public static int validate_ctr = 0;

    public static <E> void validate(String msg, E arr, int length, int fromPos, int toPos) {
        validate_ctr++;
        if (arr instanceof byte []) {
            byte [] barr = (byte [])arr;
            for (int i = 0; i < length; i++)
                if (fromByteArr[i+fromPos] != barr[i+toPos]) {
                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
                                       + " expected = " + fromByteArr[i+fromPos]
                                       + " actual = " + barr[i+toPos]
                                       + " fromPos = " + fromPos
                                       + " toPos = " + toPos);
                    throw new Error("Fail");
                }
        }
        else if (arr instanceof char []) {
            char [] carr = (char [])arr;
            for (int i = 0; i < length; i++)
                if (fromCharArr[i+fromPos] != carr[i+toPos]) {
                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
                                       + " expected = " + fromCharArr[i+fromPos]
                                       + " actual = " + carr[i+toPos]
                                       + " fromPos = " + fromPos
                                       + " toPos = " + toPos);
                    throw new Error("Fail");
                }
        }
        else if (arr instanceof int []) {
            int [] iarr = (int [])arr;
            for (int i = 0; i < length; i++)
                if (fromIntArr[i+fromPos] != iarr[i+toPos]) {
                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
                                       + " expected = " + fromIntArr[i+fromPos]
                                       + " actual = " + iarr[i+toPos]
                                       + " fromPos = " + fromPos
                                       + " toPos = " + toPos);
                    throw new Error("Fail");
                }
        }
        else if (arr instanceof long []) {
            long [] larr = (long [])arr;
            for (int i = 0; i < length; i++)
                if (fromLongArr[i+fromPos] != larr[i+toPos]) {
                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
                                       + " expected = " + fromLongArr[i+fromPos]
                                       + " actual = " + larr[i+toPos]
                                       + " fromPos = " + fromPos
                                       + " toPos = " + toPos);
                    throw new Error("Fail");
                }
        }
    }
    public static void testByte(int length, int fromPos, int toPos) {
        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
        validate(" Test ByteArr ", toByteArr, length, fromPos, toPos);
    }

    public static void testChar(int length, int fromPos, int toPos) {
        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, length);
        validate(" Test CharArr ", toCharArr, length, fromPos, toPos);
    }

    public static void testInt(int length, int fromPos, int toPos) {
        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, length);
        validate(" Test IntArr ", toIntArr, length, fromPos, toPos);
    }

    public static void testLong(int length, int fromPos, int toPos) {
        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, length);
        validate(" Test LongArr ", toLongArr, length, fromPos, toPos);
    }

    public static void testByte_constant_LT32B(int fromPos, int toPos) {
        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 7);
        validate(" Test Byte constant length 7 ", toByteArr, 7, fromPos, toPos);
    }

    public static void testByte_constant_LT64B(int fromPos, int toPos) {
        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 45);
        validate(" Test Byte constant length 45 ", toByteArr, 45, fromPos, toPos);
    }

    public static void testChar_constant_LT32B(int fromPos, int toPos) {
        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 7);
        validate(" Test Char constant length 7 ", toCharArr, 7, fromPos, toPos);
    }

    public static void testChar_constant_LT64B(int fromPos, int toPos) {
        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 22);
        validate(" Test Char constant length 22 ", toCharArr, 22, fromPos, toPos);
    }

    public static void testInt_constant_LT32B(int fromPos, int toPos) {
        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 7);
        validate(" Test Int constant length 7 ", toIntArr, 7, fromPos, toPos);
    }

    public static void testInt_constant_LT64B(int fromPos, int toPos) {
        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 11);
        validate(" Test Int constant length 11 ", toIntArr, 11, fromPos, toPos);
    }

    public static void testLong_constant_LT32B(int fromPos, int toPos) {
        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 3);
        validate(" Test Long constant length 3 ", toLongArr, 3, fromPos, toPos);
    }

    public static void testLong_constant_LT64B(int fromPos, int toPos) {
        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 6);
        validate(" Test Long constant length 6 ", toLongArr, 6, fromPos, toPos);
    }
    public static void main(String [] args) {
        // Cases to test each new optimized stub's special blocks.
        // Cases to test new PI handling (PI32 and PI64).
        // Cases to test vectorized constant array copies for all primitive types.
        //                 LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2
        int [] lengths = { 29,   59,   89,   125,   159,   189,   194,  1024 };
        Random r = new Random(1024);

        setup();

        try {
            for (int i = 0; i < 1000000; i++) {
                testByte(lengths[i % lengths.length], r.nextInt(2048), r.nextInt(2048));
                testByte_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
                testByte_constant_LT64B(r.nextInt(2048), r.nextInt(2048));

                testChar(lengths[i % lengths.length] >> 1, r.nextInt(2048), r.nextInt(2048));
                testChar_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
                testChar_constant_LT64B(r.nextInt(2048), r.nextInt(2048));

                testInt(lengths[i % lengths.length] >> 2, r.nextInt(2048), r.nextInt(2048));
                testInt_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
                testInt_constant_LT64B(r.nextInt(2048), r.nextInt(2048));

                testLong(lengths[i % lengths.length] >> 3, r.nextInt(2048), r.nextInt(2048));
                testLong_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
                testLong_constant_LT64B(r.nextInt(2048), r.nextInt(2048));
            }
            System.out.println("PASS : " + validate_ctr);
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }
}
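
The constant-length cases above (LT32B/LT64B) target the stub blocks where the residual tail of a copy is handled by a single AVX-512 masked load/store instead of a scalar cleanup loop. Below is a minimal conceptual sketch, in Java, of what that masking step computes; the class and helper names (MaskedTailCopySketch, tailMask) are illustrative only and are not part of the stub or of these tests.

// Conceptual model of an AVX-512 masked tail copy (illustrative only).
// A real stub sets a k-register mask and issues one masked load/store;
// here the mask is a long with one bit per byte lane of a 64-byte vector.
class MaskedTailCopySketch {
    static final int VEC_BYTES = 64;

    // Mask with the low 'remaining' bits set, e.g. remaining = 7 -> 0x7F.
    static long tailMask(int remaining) {
        return (remaining >= 64) ? -1L : (1L << remaining) - 1;
    }

    // Copy 'len' bytes: full 64-byte "vectors" first, then one masked tail.
    static void copy(byte[] src, int from, byte[] dst, int to, int len) {
        int i = 0;
        for (; i + VEC_BYTES <= len; i += VEC_BYTES) {
            System.arraycopy(src, from + i, dst, to + i, VEC_BYTES); // full vector
        }
        long mask = tailMask(len - i);
        for (int lane = 0; lane < VEC_BYTES; lane++) {
            if ((mask & (1L << lane)) != 0) {   // store only the enabled lanes
                dst[to + i + lane] = src[from + i + lane];
            }
        }
    }

    public static void main(String[] args) {
        byte[] src = new byte[128], dst = new byte[128];
        for (int i = 0; i < src.length; i++) src[i] = (byte) i;
        copy(src, 3, dst, 5, 45); // 45 bytes: no full vector, one masked tail (like LT64B)
        for (int i = 0; i < 45; i++) {
            if (dst[5 + i] != src[3 + i]) throw new Error("mismatch at i = " + i);
        }
        System.out.println("masked tail copy sketch OK");
    }
}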
121 test/micro/org/openjdk/bench/java/lang/ArrayCopyObject.java Normal file
@@ -0,0 +1,121 @@
/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2020, Arm Limited. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package org.openjdk.bench.vm.compiler;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.results.Result;
import org.openjdk.jmh.results.RunResult;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import org.openjdk.jmh.runner.options.TimeValue;

import java.util.concurrent.TimeUnit;
import java.util.Arrays;

class MyClass {
    public int field1;
    public int field2;
    public int field3;

    public MyClass(int val) {
        field1 = val;
        field2 = val;
        field3 = val;
    }
}

@State(Scope.Benchmark)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class ArrayCopyObject {
    @Param({"31", "63", "127", "2047", "4095", "8191"}) private int size;

    private MyClass [] src;
    private MyClass [] dst;

    @Setup
    public void setup() {
        src = new MyClass[size];
        dst = new MyClass[size];
        for (int i = 0; i < src.length; i++) {
            src[i] = new MyClass(i);
            dst[i] = new MyClass(0);
        }
    }

    @Benchmark
    public void disjoint_micro() {
        System.arraycopy(src, 0, dst, 0, size);
    }

    @Benchmark
    public void conjoint_micro() {
        System.arraycopy(src, 0, src, 10, size - 10);
    }
    public static void main(String[] args) throws RunnerException {
        String [] base_opts =
            { "-XX:+UnlockDiagnosticVMOptions",
              "-XX:+IgnoreUnrecognizedVMOptions",
              "-XX:UseAVX=3" };
        String [] opts_str1 = {"-XX:-UseCompressedOops"};
        String [] opts_str2 = {"-XX:+UseCompressedOops"};

        Options baseOpts = new OptionsBuilder()
            .include(ArrayCopyObject.class.getName())
            .warmupTime(TimeValue.seconds(30))
            .measurementTime(TimeValue.seconds(10))
            .warmupIterations(1)
            .measurementIterations(2)
            .jvmArgs(base_opts)
            .forks(1)
            .build();

        RunResult r1 = new Runner(new OptionsBuilder()
            .parent(baseOpts)
            .jvmArgs(opts_str1)
            .build()).runSingle();

        RunResult r2 = new Runner(new OptionsBuilder()
            .parent(baseOpts)
            .jvmArgs(opts_str2)
            .build()).runSingle();

        System.out.println(r1.getPrimaryResult().getScore() + r2.getPrimaryResult().getScore());
    }
}
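
The main() above runs the whole measurement twice, once per compressed-oops setting, and prints the sum of the two throughput scores so a single number can be tracked across configurations. As a usage sketch (the exact invocation depends on the local build; the target name here is an assumption, not part of this patch), the benchmark can also be driven through the JDK's micro test harness:

    make test TEST="micro:ArrayCopyObject"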