8252847: Optimize primitive arrayCopy stubs using AVX-512 masked instructions

Reviewed-by: neliasso, kvn
Jatin Bhateja 2020-10-10 06:29:38 +00:00
parent ec41046c5c
commit 4b5ac3abac
11 changed files with 1449 additions and 99 deletions

View File

@@ -2589,6 +2589,38 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vect
emit_operand(dst, src);
}
void Assembler::evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type) {
assert(VM_Version::supports_avx512vlbw(), "");
assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, "");
InstructionMark im(this);
bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG;
int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3;
InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
emit_int8(0x6F);
emit_operand(dst, src);
}
void Assembler::evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type) {
assert(VM_Version::supports_avx512vlbw(), "");
assert(src != xnoreg, "sanity");
assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, "");
InstructionMark im(this);
bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG;
int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3;
InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
attributes.reset_is_clear_context();
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
emit_int8(0x7F);
emit_operand(src, dst);
}
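These two overloads select the concrete EVEX encoding from the element type: the F2 prefix encodes vmovdqu8 (VEX.W=0) and vmovdqu16 (VEX.W=1), while F3 encodes vmovdqu32 and vmovdqu64. A minimal usage sketch for a masked tail copy of len bytes (len < 64; the register choices and surrounding sequence are illustrative assumptions, mirroring copy32_masked_avx further down, not part of the patch):
// Illustrative only: build a k-mask with the low len bits set, then do one
// masked 512-bit load/store pair.
__ mov64(tmp, 1);
__ shlxq(tmp, tmp, len);   // tmp = 1 << len
__ decq(tmp);              // tmp = (1 << len) - 1
__ kmovql(k2, tmp);        // k2 enables lanes 0 .. len-1
__ evmovdqu(xmm0, k2, Address(src, 0), Assembler::AVX_512bit, T_BYTE);
__ evmovdqu(Address(dst, 0), k2, xmm0, Assembler::AVX_512bit, T_BYTE);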
void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionMark im(this);
@@ -7803,6 +7835,13 @@ void Assembler::shlxq(Register dst, Register src1, Register src2) {
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
void Assembler::shrxq(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
#ifndef _LP64
void Assembler::incl(Register dst) {

View File

@@ -794,7 +794,6 @@ private:
void decl(Register dst);
void decl(Address dst);
void decq(Register dst);
void decq(Address dst);
void incl(Register dst);
@@ -879,6 +878,7 @@ private:
void popa_uncached();
#endif
void vzeroupper_uncached();
void decq(Register dst);
void pusha();
void popa();
@@ -1487,6 +1487,10 @@ private:
void evmovdquq(XMMRegister dst, Address src, int vector_len);
void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
// Generic move instructions.
void evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type);
void evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type);
// Move lower 64bit to high 64bit in 128bit register
void movlhps(XMMRegister dst, XMMRegister src);
@@ -1989,6 +1993,8 @@ private:
void shlxl(Register dst, Register src1, Register src2);
void shlxq(Register dst, Register src1, Register src2);
void shrxq(Register dst, Register src1, Register src2);
//====================VECTOR ARITHMETIC=====================================

View File

@@ -7964,6 +7964,7 @@ void MacroAssembler::cache_wbsync(bool is_pre)
sfence();
}
}
#endif // _LP64
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {

View File

@@ -1037,6 +1037,18 @@ public:
Register rax, Register rcx, Register rdx, Register tmp);
#endif
#ifdef _LP64
void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
Register to, Register count, int shift,
Register index, Register temp,
bool use64byteVector, Label& L_entry, Label& L_exit);
void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
Register to, Register start_index, Register end_index,
Register count, int shift, Register temp,
bool use64byteVector, Label& L_entry, Label& L_exit);
#endif
private:
// these are private because users should be doing movflt/movdbl
@@ -1725,6 +1737,23 @@ public:
void cache_wb(Address line);
void cache_wbsync(bool is_pre);
void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
KRegister mask, Register length, Register index,
Register temp, int shift = Address::times_1, int offset = 0,
bool use64byteVector = false);
void copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
KRegister mask, Register length, Register index,
Register temp, int shift = Address::times_1, int offset = 0);
void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
int shift = Address::times_1, int offset = 0);
void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
bool conjoint, int shift = Address::times_1, int offset = 0,
bool use64byteVector = false);
#endif // _LP64
void vallones(XMMRegister dst, int vector_len);

View File

@@ -0,0 +1,249 @@
/*
* Copyright (c) 2020, Intel Corporation.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
#ifdef _LP64
void MacroAssembler::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
Register to, Register count, int shift,
Register index, Register temp,
bool use64byteVector, Label& L_entry, Label& L_exit) {
Label L_entry_64, L_entry_96, L_entry_128;
Label L_entry_160, L_entry_192;
int size_mat[][6] = {
/* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
/* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
/* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
/* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
};
// Case A) Special case for length less than or equal to 32 bytes.
cmpq(count, size_mat[shift][0]);
jccb(Assembler::greater, L_entry_64);
copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
jmp(L_exit);
// Case B) Special case for length less than or equal to 64 bytes.
BIND(L_entry_64);
cmpq(count, size_mat[shift][1]);
jccb(Assembler::greater, L_entry_96);
copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
jmp(L_exit);
// Case C) Special case for length less than or equal to 96 bytes.
BIND(L_entry_96);
cmpq(count, size_mat[shift][2]);
jccb(Assembler::greater, L_entry_128);
copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
subq(count, 64 >> shift);
copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
jmp(L_exit);
// Case D) Special case for length less than or equal to 128 bytes.
BIND(L_entry_128);
cmpq(count, size_mat[shift][3]);
jccb(Assembler::greater, L_entry_160);
copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
copy32_avx(to, from, index, xmm, shift, 64);
subq(count, 96 >> shift);
copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
jmp(L_exit);
// Case E) Special case for length less than or equal to 160 bytes.
BIND(L_entry_160);
cmpq(count, size_mat[shift][4]);
jccb(Assembler::greater, L_entry_192);
copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
subq(count, 128 >> shift);
copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
jmp(L_exit);
// Case F) Special case for length less than or equal to 192 bytes.
BIND(L_entry_192);
cmpq(count, size_mat[shift][5]);
jcc(Assembler::greater, L_entry);
copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
copy32_avx(to, from, index, xmm, shift, 128);
subq(count, 160 >> shift);
copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
jmp(L_exit);
}
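Each rung of this ladder compares count against a byte threshold from 32 to 192 scaled into elements (size_mat[shift][i] == (32 * (i + 1)) >> shift), and each case finishes with one masked copy for the odd remainder. A worked trace under assumed inputs:
// T_BYTE (shift = 0), count = 89 (illustrative): 89 > 32 and 89 > 64 but
// 89 <= 96, so Case C runs:
//   copy64_avx(...);                        // bytes [0, 64)
//   subq(count, 64 >> 0);                   // count = 25
//   copy32_masked_avx(..., offset = 64);    // bytes [64, 89) via a 25-bit mask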
void MacroAssembler::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
Register to, Register start_index, Register end_index,
Register count, int shift, Register temp,
bool use64byteVector, Label& L_entry, Label& L_exit) {
Label L_entry_64, L_entry_96, L_entry_128;
Label L_entry_160, L_entry_192;
bool avx3 = MaxVectorSize > 32 && AVX3Threshold == 0;
int size_mat[][6] = {
/* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
/* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
/* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
/* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
};
// Case A) Special case for length less than or equal to 32 bytes.
cmpq(count, size_mat[shift][0]);
jccb(Assembler::greater, L_entry_64);
copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
jmp(L_exit);
// Case B) Special case for length less than or equal to 64 bytes.
BIND(L_entry_64);
cmpq(count, size_mat[shift][1]);
jccb(Assembler::greater, L_entry_96);
if (avx3) {
copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
} else {
copy32_avx(to, from, end_index, xmm, shift, -32);
subq(count, 32 >> shift);
copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
}
jmp(L_exit);
// Case C) Special case for length less than or equal to 96 bytes.
BIND(L_entry_96);
cmpq(count, size_mat[shift][2]);
jccb(Assembler::greater, L_entry_128);
copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
subq(count, 64 >> shift);
copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
jmp(L_exit);
// Case D) Special case for length less than or equal to 128 bytes.
BIND(L_entry_128);
cmpq(count, size_mat[shift][3]);
jccb(Assembler::greater, L_entry_160);
copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
copy32_avx(to, from, end_index, xmm, shift, -96);
subq(count, 96 >> shift);
copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
jmp(L_exit);
// Case E) Special case for length less than or equal to 160 bytes.
BIND(L_entry_160);
cmpq(count, size_mat[shift][4]);
jccb(Assembler::greater, L_entry_192);
copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
subq(count, 128 >> shift);
copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
jmp(L_exit);
// Case F) Special case for length less than or equal to 192 bytes.
BIND(L_entry_192);
cmpq(count, size_mat[shift][5]);
jcc(Assembler::greater, L_entry);
copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
copy32_avx(to, from, end_index, xmm, shift, -160);
subq(count, 160 >> shift);
copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
jmp(L_exit);
}
void MacroAssembler::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
KRegister mask, Register length, Register index,
Register temp, int shift, int offset,
bool use64byteVector) {
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
assert(MaxVectorSize >= 32, "vector length should be >= 32");
if (!use64byteVector) {
copy32_avx(dst, src, index, xmm, shift, offset);
subptr(length, 32 >> shift);
copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
} else {
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
assert(MaxVectorSize == 64, "vector length != 64");
negptr(length);
addq(length, 64);
mov64(temp, -1);
shrxq(temp, temp, length);
kmovql(mask, temp);
evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_512bit, type[shift]);
evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_512bit, type[shift]);
}
}
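The mask arithmetic in the 64-byte branch above leaves exactly the remaining lane count enabled: after negptr/addq, length holds 64 - n, so shifting an all-ones register right by that amount keeps the n low bits set. In plain C++ (the value of n is an assumption for illustration):
// Worked example of the tail-mask computation (not stub code).
uint64_t n    = 37;            // remaining element count (illustrative)
uint64_t amt  = 64 - n;        // negptr(length); addq(length, 64)
uint64_t mask = ~0ULL >> amt;  // mov64(temp, -1); shrxq(temp, temp, length)
// mask has bits 0..36 set, so the masked loads/stores touch only 37 lanes.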
void MacroAssembler::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
KRegister mask, Register length, Register index,
Register temp, int shift, int offset) {
assert(MaxVectorSize >= 32, "vector length should be >= 32");
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
mov64(temp, 1);
shlxq(temp, temp, length);
decq(temp);
kmovql(mask, temp);
evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_256bit, type[shift]);
evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_256bit, type[shift]);
}
void MacroAssembler::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
int shift, int offset) {
assert(MaxVectorSize >= 32, "vector length should be >= 32");
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
vmovdqu(xmm, Address(src, index, scale, offset));
vmovdqu(Address(dst, index, scale, offset), xmm);
}
void MacroAssembler::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
bool conjoint, int shift, int offset, bool use64byteVector) {
assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
if (!use64byteVector) {
if (conjoint) {
copy32_avx(dst, src, index, xmm, shift, offset+32);
copy32_avx(dst, src, index, xmm, shift, offset);
} else {
copy32_avx(dst, src, index, xmm, shift, offset);
copy32_avx(dst, src, index, xmm, shift, offset+32);
}
} else {
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
}
}
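Note the ordering in the conjoint branch above: the upper 32-byte half (offset+32) is copied before the lower half, which keeps a backward-overlapping copy from storing into source bytes it has not yet loaded. Concretely (arguments illustrative):
// copy64_avx(dst, src, index, xmm, /* conjoint */ true, 0, /* offset */ -64):
//   copy32_avx(..., offset = -32);  // bytes [index-32, index) first
//   copy32_avx(..., offset = -64);  // then bytes [index-64, index-32)
// matching the descending-index main loops in the conjoint stubs below.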
#endif

View File

@@ -1124,59 +1124,28 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
if (UseAVX > 2) {
Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
__ BIND(L_copy_bytes);
__ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
__ jccb(Assembler::less, L_above_threshold);
__ jmpb(L_below_threshold);
__ bind(L_loop_avx512);
__ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
__ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
__ bind(L_above_threshold);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop_avx512);
__ jmpb(L_32_byte_head);
__ bind(L_loop_avx2);
__ BIND(L_loop);
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
__ bind(L_below_threshold);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop_avx2);
__ bind(L_32_byte_head);
__ subptr(qword_count, 4); // sub(8) and add(4)
__ jccb(Assembler::greater, L_end);
} else {
__ BIND(L_loop);
if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
} else {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
__ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
__ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
__ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
__ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
__ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
}
__ BIND(L_copy_bytes);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop);
__ subptr(qword_count, 4); // sub(8) and add(4)
__ jccb(Assembler::greater, L_end);
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
__ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
__ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
__ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
__ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
__ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
}
__ BIND(L_copy_bytes);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop);
__ subptr(qword_count, 4); // sub(8) and add(4)
__ jccb(Assembler::greater, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
@@ -1232,60 +1201,29 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
if (UseAVX > 2) {
Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
__ BIND(L_copy_bytes);
__ cmpptr(qword_count, (AVX3Threshold / 8));
__ jccb(Assembler::greater, L_above_threshold);
__ jmpb(L_below_threshold);
__ BIND(L_loop_avx512);
__ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
__ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
__ bind(L_above_threshold);
__ subptr(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_loop_avx512);
__ jmpb(L_32_byte_head);
__ bind(L_loop_avx2);
__ BIND(L_loop);
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
__ bind(L_below_threshold);
__ subptr(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_loop_avx2);
__ bind(L_32_byte_head);
__ addptr(qword_count, 4); // add(8) and sub(4)
__ jccb(Assembler::less, L_end);
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
} else {
__ BIND(L_loop);
if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
} else {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
__ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
__ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
__ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
__ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
__ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
__ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
__ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
}
__ BIND(L_copy_bytes);
__ subptr(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_loop);
__ addptr(qword_count, 4); // add(8) and sub(4)
__ jccb(Assembler::less, L_end);
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
__ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
__ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
__ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
__ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
__ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
__ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
__ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
}
__ BIND(L_copy_bytes);
__ subptr(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_loop);
__ addptr(qword_count, 4); // add(8) and sub(4)
__ jccb(Assembler::less, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
@@ -1323,6 +1261,442 @@ class StubGenerator: public StubCodeGenerator {
__ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
}
#ifndef PRODUCT
int& get_profile_ctr(int shift) {
if ( 0 == shift)
return SharedRuntime::_jbyte_array_copy_ctr;
else if(1 == shift)
return SharedRuntime::_jshort_array_copy_ctr;
else if(2 == shift)
return SharedRuntime::_jint_array_copy_ctr;
else
return SharedRuntime::_jlong_array_copy_ctr;
}
#endif
void setup_argument_regs(BasicType type) {
if (type == T_BYTE || type == T_SHORT) {
setup_arg_regs(); // from => rdi, to => rsi, count => rdx
// r9 and r10 may be used to save non-volatile registers
} else {
setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
// r9 is used to save r15_thread
}
}
void restore_argument_regs(BasicType type) {
if (type == T_BYTE || type == T_SHORT) {
restore_arg_regs();
} else {
restore_arg_regs_using_thread();
}
}
// Note: the following rules apply to the AVX3-optimized arraycopy stubs:
// - If the target supports the AVX3 features (BW+VL+F), the implementation uses 32 byte vectors (YMMs)
// for both the special cases (various small block sizes) and the aligned copy loop. This is the
// default configuration.
// - If the copy length is above AVX3Threshold, the implementation uses 64 byte vectors (ZMMs)
// for the main copy loop (and the subsequent tail), since the bulk of the cycles will be consumed there.
// - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS shows better performance
// for disjoint copies; for conjoint/backward copies the vector-based copy performs better.
// - If the user sets AVX3Threshold=0, the special cases for small block sizes also operate over
// 64 byte vector registers (ZMMs).
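Summarized as pseudocode, the dispatch these rules describe looks roughly like this (a sketch of the control flow in the generators below, not literal stub code; byte_count is shorthand for count scaled by the element size):
// Rough shape of the strategy selection (illustrative sketch only).
bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
if (byte_count <= 192) {
  // Special-case ladder: masked YMM copies (ZMM when use64byteVector).
} else if (AVX3Threshold != 0 && byte_count >= 4096) {  // threshold[shift]
  if (MaxVectorSize == 64) {
    // PRE-MAIN-POST loop over 64 byte (ZMM) copies.
  } else {
    // REP MOVS path, faster for large disjoint copies at MaxVectorSize=32.
  }
} else {
  // PRE-MAIN-POST loop over 32 byte (YMM) copies with a masked tail.
}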
// Inputs:
// c_rarg0 - source array address
// c_rarg1 - destination array address
// c_rarg2 - element count, treated as ssize_t, can be zero
//
//
// Side Effects:
// disjoint_copy_avx3_masked is set to the no-overlap entry point
// used by generate_conjoint_[byte/int/short/long]_copy().
//
address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
bool aligned, bool is_oop, bool dest_uninitialized) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
const Register temp1 = r8;
const Register temp2 = r11;
const Register temp3 = rax;
const Register temp4 = rcx;
// End pointers are inclusive, and if count is not zero they point
// to the last unit copied: end_to[0] := end_from[0]
__ enter(); // required for proper stackwalking of RuntimeStub frame
assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
if (entry != NULL) {
*entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
BasicType type = is_oop ? T_OBJECT : type_vec[shift];
setup_argument_regs(type);
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
if (aligned) {
decorators |= ARRAYCOPY_ALIGNED;
}
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
{
// Type(shift) byte(0), short(1), int(2), long(3)
int loop_size[] = { 192, 96, 48, 24};
int threshold[] = { 4096, 2048, 1024, 512};
// UnsafeCopyMemory page error: continue after ucm
UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
// 'from', 'to' and 'count' are now valid
// temp1 holds remaining count and temp4 holds running count used to compute
// next address offset for start of to/from addresses (temp4 * scale).
__ mov64(temp4, 0);
__ movq(temp1, count);
// Zero length check.
__ BIND(L_tail);
__ cmpq(temp1, 0);
__ jcc(Assembler::lessEqual, L_exit);
// Special cases using 32 byte [masked] vector copy operations.
__ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
temp4, temp3, use64byteVector, L_entry, L_exit);
// PRE-MAIN-POST loop for aligned copy.
__ BIND(L_entry);
if (AVX3Threshold != 0) {
__ cmpq(count, threshold[shift]);
if (MaxVectorSize == 64) {
// Copy using 64 byte vectors.
__ jcc(Assembler::greaterEqual, L_pre_main_post_64);
} else {
assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
// REP MOVS offers a faster copy path.
__ jcc(Assembler::greaterEqual, L_repmovs);
}
}
if (MaxVectorSize < 64 || AVX3Threshold != 0) {
// Partial copy to make dst address 32 byte aligned.
__ movq(temp2, to);
__ andq(temp2, 31);
__ jcc(Assembler::equal, L_main_pre_loop);
__ negptr(temp2);
__ addq(temp2, 32);
if (shift) {
__ shrq(temp2, shift);
}
__ movq(temp3, temp2);
__ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
__ movq(temp4, temp2);
__ movq(temp1, count);
__ subq(temp1, temp2);
__ cmpq(temp1, loop_size[shift]);
__ jcc(Assembler::less, L_tail);
__ BIND(L_main_pre_loop);
__ subq(temp1, loop_size[shift]);
// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
__ BIND(L_main_loop);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
__ addptr(temp4, loop_size[shift]);
__ subq(temp1, loop_size[shift]);
__ jcc(Assembler::greater, L_main_loop);
__ addq(temp1, loop_size[shift]);
// Tail loop.
__ jmp(L_tail);
__ BIND(L_repmovs);
__ movq(temp2, temp1);
// Swap to (RSI) and from (RDI) addresses to comply with REP MOVS semantics.
__ movq(temp3, to);
__ movq(to, from);
__ movq(from, temp3);
// Save to/from for restoration post rep_mov.
__ movq(temp1, to);
__ movq(temp3, from);
if(shift < 3) {
__ shrq(temp2, 3-shift); // quad word count
}
__ movq(temp4, temp2); // move quad word count into temp4 (RCX).
__ rep_mov();
__ shlq(temp2, 3); // convert quad words into byte count.
if(shift) {
__ shrq(temp2, shift); // type specific count.
}
// Restore original addresses in to/from.
__ movq(to, temp3);
__ movq(from, temp1);
__ movq(temp4, temp2);
__ movq(temp1, count);
__ subq(temp1, temp2); // trailing part (less than a quad word).
__ jmp(L_tail);
}
if (MaxVectorSize > 32) {
__ BIND(L_pre_main_post_64);
// Partial copy to make dst address 64 byte aligned.
__ movq(temp2, to);
__ andq(temp2, 63);
__ jcc(Assembler::equal, L_main_pre_loop_64bytes);
__ negptr(temp2);
__ addq(temp2, 64);
if (shift) {
__ shrq(temp2, shift);
}
__ movq(temp3, temp2);
__ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
__ movq(temp4, temp2);
__ movq(temp1, count);
__ subq(temp1, temp2);
__ cmpq(temp1, loop_size[shift]);
__ jcc(Assembler::less, L_tail64);
__ BIND(L_main_pre_loop_64bytes);
__ subq(temp1, loop_size[shift]);
// Main loop with aligned copy block size of 192 bytes at
// 64 byte copy granularity.
__ BIND(L_main_loop_64bytes);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
__ addptr(temp4, loop_size[shift]);
__ subq(temp1, loop_size[shift]);
__ jcc(Assembler::greater, L_main_loop_64bytes);
__ addq(temp1, loop_size[shift]);
// Zero length check.
__ jcc(Assembler::lessEqual, L_exit);
__ BIND(L_tail64);
// Tail handling using 64 byte [masked] vector copy operations.
use64byteVector = true;
__ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
temp4, temp3, use64byteVector, L_entry, L_exit);
}
__ BIND(L_exit);
}
address ucme_exit_pc = __ pc();
// When called from generic_arraycopy, r11 contains specific values used during
// the arraycopy epilogue; re-initialize r11 here.
if (is_oop) {
__ movq(r11, shift == 3 ? count : to);
}
bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
restore_argument_regs(type);
inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
__ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
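The PRE-MAIN-POST shape of this stub: a masked partial copy first brings the destination to 32-byte (or 64-byte) alignment, the main loop then moves 192 bytes per iteration, and the remainder re-enters the special-case ladder through L_tail. A worked trace with assumed inputs:
// T_BYTE (shift = 0), to % 32 == 20, count = 1000 (illustrative):
//   pre : temp2 = 32 - 20 = 12  -> masked copy of 12 bytes, temp4 = 12
//   main: temp1 = 988           -> five 192-byte iterations copy 960 bytes
//   post: temp1 = 28            -> jmp L_tail, Case A masked copy of 28 bytes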
// Inputs:
// c_rarg0 - source array address
// c_rarg1 - destination array address
// c_rarg2 - element count, treated as ssize_t, can be zero
//
//
address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
address nooverlap_target, bool aligned, bool is_oop,
bool dest_uninitialized) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
const Register temp1 = r8;
const Register temp2 = rcx;
const Register temp3 = r11;
const Register temp4 = rax;
// End pointers are inclusive, and if count is not zero they point
// to the last unit copied: end_to[0] := end_from[0]
__ enter(); // required for proper stackwalking of RuntimeStub frame
assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
if (entry != NULL) {
*entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
BasicType type = is_oop ? T_OBJECT : type_vec[shift];
setup_argument_regs(type);
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
if (aligned) {
decorators |= ARRAYCOPY_ALIGNED;
}
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
{
// Type(shift) byte(0), short(1), int(2), long(3)
int loop_size[] = { 192, 96, 48, 24};
int threshold[] = { 4096, 2048, 1024, 512};
// UnsafeCopyMemory page error: continue after ucm
UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
// 'from', 'to' and 'count' are now valid
// temp1 holds remaining count.
__ movq(temp1, count);
// Zero length check.
__ BIND(L_tail);
__ cmpq(temp1, 0);
__ jcc(Assembler::lessEqual, L_exit);
__ mov64(temp2, 0);
__ movq(temp3, temp1);
// Special cases using 32 byte [masked] vector copy operations.
__ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
temp4, use64byteVector, L_entry, L_exit);
// PRE-MAIN-POST loop for aligned copy.
__ BIND(L_entry);
if (MaxVectorSize > 32 && AVX3Threshold != 0) {
__ cmpq(temp1, threshold[shift]);
__ jcc(Assembler::greaterEqual, L_pre_main_post_64);
}
if (MaxVectorSize < 64 || AVX3Threshold != 0) {
// Partial copy to make dst address 32 byte aligned.
__ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
__ andq(temp2, 31);
__ jcc(Assembler::equal, L_main_pre_loop);
if (shift) {
__ shrq(temp2, shift);
}
__ subq(temp1, temp2);
__ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
__ cmpq(temp1, loop_size[shift]);
__ jcc(Assembler::less, L_tail);
__ BIND(L_main_pre_loop);
// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
__ BIND(L_main_loop);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
__ subptr(temp1, loop_size[shift]);
__ cmpq(temp1, loop_size[shift]);
__ jcc(Assembler::greater, L_main_loop);
// Tail loop.
__ jmp(L_tail);
}
if (MaxVectorSize > 32) {
__ BIND(L_pre_main_post_64);
// Partial copy to make dst address 64 byte aligned.
__ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
__ andq(temp2, 63);
__ jcc(Assembler::equal, L_main_pre_loop_64bytes);
if (shift) {
__ shrq(temp2, shift);
}
__ subq(temp1, temp2);
__ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
__ cmpq(temp1, loop_size[shift]);
__ jcc(Assembler::less, L_tail64);
__ BIND(L_main_pre_loop_64bytes);
// Main loop with aligned copy block size of 192 bytes at
// 64 byte copy granularity.
__ BIND(L_main_loop_64bytes);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
__ subq(temp1, loop_size[shift]);
__ cmpq(temp1, loop_size[shift]);
__ jcc(Assembler::greater, L_main_loop_64bytes);
// Zero length check.
__ cmpq(temp1, 0);
__ jcc(Assembler::lessEqual, L_exit);
__ BIND(L_tail64);
// Tail handling using 64 byte [masked] vector copy operations.
use64byteVector = true;
__ mov64(temp2, 0);
__ movq(temp3, temp1);
__ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
temp4, use64byteVector, L_entry, L_exit);
}
__ BIND(L_exit);
}
address ucme_exit_pc = __ pc();
// When called from generic_arraycopy, r11 contains specific values used during
// the arraycopy epilogue; re-initialize r11 here.
if(is_oop) {
__ movq(r11, count);
}
bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
restore_argument_regs(type);
inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
__ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
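In this conjoint stub the main loop runs downward: temp1 doubles as the remaining count and the end index, so each iteration stores the highest uncopied 192-byte block first. One iteration, under assumed values:
// T_BYTE (shift = 0), temp1 = 400 at loop entry (illustrative):
//   copy64_avx(..., temp1, /* conjoint */ true, 0, -64);   // bytes [336, 400)
//   copy64_avx(..., temp1, /* conjoint */ true, 0, -128);  // bytes [272, 336)
//   copy64_avx(..., temp1, /* conjoint */ true, 0, -192);  // bytes [208, 272)
//   subptr(temp1, 192);                                    // temp1 = 208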
// Arguments:
// aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
// ignored
@@ -1343,6 +1717,10 @@ class StubGenerator: public StubCodeGenerator {
// used by generate_conjoint_byte_copy().
//
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
aligned, false, false);
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
@@ -1453,6 +1831,10 @@ class StubGenerator: public StubCodeGenerator {
//
address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
address* entry, const char *name) {
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
nooverlap_target, aligned, false, false);
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
@@ -1558,6 +1940,11 @@ class StubGenerator: public StubCodeGenerator {
// used by generate_conjoint_short_copy().
//
address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
aligned, false, false);
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
@@ -1682,6 +2069,10 @@ class StubGenerator: public StubCodeGenerator {
//
address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
address *entry, const char *name) {
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
nooverlap_target, aligned, false, false);
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
@@ -1780,6 +2171,11 @@ class StubGenerator: public StubCodeGenerator {
//
address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
const char *name, bool dest_uninitialized = false) {
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
aligned, is_oop, dest_uninitialized);
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
@@ -1884,6 +2280,10 @@ class StubGenerator: public StubCodeGenerator {
address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
address *entry, const char *name,
bool dest_uninitialized = false) {
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
nooverlap_target, aligned, is_oop, dest_uninitialized);
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
@@ -1991,6 +2391,10 @@ class StubGenerator: public StubCodeGenerator {
//
address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
const char *name, bool dest_uninitialized = false) {
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
aligned, is_oop, dest_uninitialized);
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
@@ -2095,6 +2499,10 @@ class StubGenerator: public StubCodeGenerator {
address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
address nooverlap_target, address *entry,
const char *name, bool dest_uninitialized = false) {
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
nooverlap_target, aligned, is_oop, dest_uninitialized);
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();

View File

@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_
enum platform_dependent_constants {
code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small)
code_size2 = 35300 LP64_ONLY(+11400) // simply increase if too small (assembler will crash if too small)
code_size2 = 35300 LP64_ONLY(+21400) // simply increase if too small (assembler will crash if too small)
};
class x86 {

View File

@@ -763,6 +763,8 @@ void VM_Version::get_processor_features() {
if (is_intel()) { // Intel cpus specific settings
if (is_knights_family()) {
_features &= ~CPU_VZEROUPPER;
_features &= ~CPU_AVX512BW;
_features &= ~CPU_AVX512VL;
}
}

View File

@@ -0,0 +1,269 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.arraycopy;
import java.util.Random;
/**
* @test
* @bug 8251871
* @summary Optimize arrayCopy using AVX-512 masked instructions.
*
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=32
* compiler.arraycopy.TestArrayCopyConjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=64
* compiler.arraycopy.TestArrayCopyConjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:MaxVectorSize=32
* compiler.arraycopy.TestArrayCopyConjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:MaxVectorSize=64
* compiler.arraycopy.TestArrayCopyConjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64
* compiler.arraycopy.TestArrayCopyConjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:MaxVectorSize=32 -XX:ArrayCopyLoadStoreMaxElem=16
* compiler.arraycopy.TestArrayCopyConjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 -XX:ArrayCopyLoadStoreMaxElem=16
* compiler.arraycopy.TestArrayCopyConjoint
*
*/
public class TestArrayCopyConjoint {
public static final int SIZE = 4096;
public static byte[] fromByteArr, toByteArr, valByteArr;
public static char[] fromCharArr, toCharArr, valCharArr;
public static int[] fromIntArr, toIntArr, valIntArr;
public static long[] fromLongArr, toLongArr, valLongArr;
static public void reinit(Class<?> c) {
if (c == byte.class) {
for (int i = 0 ; i < SIZE ; i++) {
fromByteArr[i] = (byte)i;
}
} else if (c == char.class) {
for (int i = 0 ; i < SIZE ; i++) {
fromCharArr[i] = (char)i;
}
} else if (c == int.class) {
for (int i = 0 ; i < SIZE ; i++) {
fromIntArr[i] = i;
}
} else {
assert c == long.class;
for (int i = 0 ; i < SIZE ; i++) {
fromLongArr[i] = i;
}
}
}
static public void setup() {
// Both positions aligned
fromByteArr = new byte[SIZE];
valByteArr = new byte[SIZE];
toByteArr = fromByteArr;
fromCharArr = new char[SIZE];
valCharArr = new char[SIZE];
toCharArr = fromCharArr;
fromIntArr = new int[SIZE];
valIntArr = new int[SIZE];
toIntArr = fromIntArr;
fromLongArr = new long[SIZE];
valLongArr = new long[SIZE];
toLongArr = fromLongArr;
for (int i = 0 ; i < SIZE ; i++) {
fromByteArr[i] = (byte)i;
valByteArr[i] = (byte)i;
fromCharArr[i] = (char)i;
valCharArr[i] = (char)i;
fromIntArr[i] = i;
valIntArr[i] = i;
fromLongArr[i] = i;
valLongArr[i] = i;
}
}
public static int validate_ctr = 0;
public static <E> void validate(String msg, E arr, int length, int fromPos, int toPos) {
validate_ctr++;
if (arr instanceof byte []) {
byte [] barr = (byte [])arr;
for(int i = 0 ; i < length; i++)
if (valByteArr[i+fromPos] != barr[i+toPos]) {
System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
+ " expected = " + valByteArr[i+fromPos]
+ " actual = " + barr[i+toPos]
+ " fromPos = " + fromPos
+ " toPos = " + toPos);
throw new Error("Fail");
}
}
else if (arr instanceof char []) {
char [] carr = (char [])arr;
for(int i = 0 ; i < length; i++)
if (valCharArr[i+fromPos] != carr[i+toPos]) {
System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
+ " expected = " + valCharArr[i+fromPos]
+ " actual = " + carr[i+toPos]
+ " fromPos = " + fromPos
+ " toPos = " + toPos);
throw new Error("Fail");
}
}
else if (arr instanceof int []) {
int [] iarr = (int [])arr;
for(int i = 0 ; i < length; i++)
if (valIntArr[i+fromPos] != iarr[i+toPos]) {
System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
+ " expected = " + valIntArr[i+fromPos]
+ " actual = " + iarr[i+toPos]
+ " fromPos = " + fromPos
+ " toPos = " + toPos);
throw new Error("Fail");
}
}
else if (arr instanceof long []) {
long [] larr = (long [])arr;
for(int i = 0 ; i < length; i++)
if (valLongArr[i+fromPos] != larr[i+toPos]) {
System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
+ " expected = " + valLongArr[i+fromPos]
+ " actual = " + larr[i+toPos]
+ " fromPos = " + fromPos
+ " toPos = " + toPos);
throw new Error("Fail");
}
}
}
public static void testByte(int length, int fromPos, int toPos) {
System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
validate(" Test ByteArr ", toByteArr, length, fromPos, toPos);
}
public static void testChar(int length, int fromPos, int toPos) {
System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, length);
validate(" Test CharArr ", toCharArr, length, fromPos, toPos);
}
public static void testInt(int length, int fromPos, int toPos) {
System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, length);
validate(" Test IntArr ", toIntArr, length, fromPos, toPos);
}
public static void testLong(int length, int fromPos, int toPos) {
System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, length);
validate(" Test LongArr ", toLongArr, length, fromPos, toPos);
}
public static void testByte_constant_LT32B(int fromPos, int toPos) {
System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 7);
validate(" Test Byte constant length 7 ", toByteArr, 7, fromPos, toPos);
}
public static void testByte_constant_LT64B(int fromPos, int toPos) {
System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 45);
validate(" Test Byte constant length 45 ", toByteArr, 45, fromPos, toPos);
}
public static void testChar_constant_LT32B(int fromPos, int toPos) {
System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 7);
validate(" Test Char constant length 7 ", toCharArr, 7, fromPos, toPos);
}
public static void testChar_constant_LT64B(int fromPos, int toPos) {
System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 22);
validate(" Test Char constant length 22 ", toCharArr, 22, fromPos, toPos);
}
public static void testInt_constant_LT32B(int fromPos, int toPos) {
System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 7);
validate(" Test Int constant length 7 ", toIntArr, 7, fromPos, toPos);
}
public static void testInt_constant_LT64B(int fromPos, int toPos) {
System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 11);
validate(" Test Int constant length 11 ", toIntArr, 11, fromPos, toPos);
}
public static void testLong_constant_LT32B(int fromPos, int toPos) {
System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 3);
validate(" Test Long constant length 3 ", toLongArr, 3, fromPos, toPos);
}
public static void testLong_constant_LT64B(int fromPos, int toPos) {
System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 6);
validate(" Test Long constant length 6 ", toLongArr, 6, fromPos, toPos);
}
public static void main(String [] args) {
// Cases to exercise each special-case block of the new optimized stubs.
// Cases to test the new partial-inlining (PI) handling (PI32 and PI64).
// Cases to test vectorized constant array copies for all primitive types.
// LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2
int [] lengths = { 29, 59, 89, 125, 159, 189, 194, 1024 };
Random r = new Random(1024);
setup();
try {
for (int i = 0 ; i < 1000000 ; i++ ) {
int index = r.nextInt(2048);
testByte(lengths[i % lengths.length], index , index+2);
reinit(byte.class);
testByte_constant_LT32B (index , index+2);
reinit(byte.class);
testByte_constant_LT64B (index , index+2);
reinit(byte.class);
testChar(lengths[i % lengths.length] >> 1, index , index+2);
reinit(char.class);
testChar_constant_LT32B (index , index+2);
reinit(char.class);
testChar_constant_LT64B (index , index+2);
reinit(char.class);
testInt(lengths[i % lengths.length] >> 2, index , index+2);
reinit(int.class);
testInt_constant_LT32B (index , index+2);
reinit(int.class);
testInt_constant_LT64B (index , index+2);
reinit(int.class);
testLong(lengths[i % lengths.length] >> 3, index , index+2);
reinit(long.class);
testLong_constant_LT32B (index , index+2);
reinit(long.class);
testLong_constant_LT64B (index , index+2);
reinit(long.class);
}
System.out.println("PASS : " + validate_ctr);
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
}

View File

@@ -0,0 +1,226 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.arraycopy;
import java.util.Random;
/**
* @test
* @bug 8251871
* @summary Optimize arrayCopy using AVX-512 masked instructions.
*
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=32
* compiler.arraycopy.TestArrayCopyDisjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=64
* compiler.arraycopy.TestArrayCopyDisjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:MaxVectorSize=32
* compiler.arraycopy.TestArrayCopyDisjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:MaxVectorSize=64
* compiler.arraycopy.TestArrayCopyDisjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64
* compiler.arraycopy.TestArrayCopyDisjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:MaxVectorSize=32 -XX:ArrayCopyLoadStoreMaxElem=16
* compiler.arraycopy.TestArrayCopyDisjoint
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions
* -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 -XX:ArrayCopyLoadStoreMaxElem=16
* compiler.arraycopy.TestArrayCopyDisjoint
*
*/
public class TestArrayCopyDisjoint {
public static final int SIZE = 4096;
public static byte[] fromByteArr, toByteArr;
public static char[] fromCharArr, toCharArr;
public static int[] fromIntArr, toIntArr;
public static long[] fromLongArr, toLongArr;
static public void setup() {
// Both positions aligned
fromByteArr = new byte[SIZE];
toByteArr = new byte[SIZE];
fromCharArr = new char[SIZE];
toCharArr = new char[SIZE];
fromIntArr = new int[SIZE];
toIntArr = new int[SIZE];
fromLongArr = new long[SIZE];
toLongArr = new long[SIZE];
for (int i = 0 ; i < SIZE ; i++) {
fromByteArr[i] = (byte)i;
fromCharArr[i] = (char)i;
fromIntArr[i] = i;
fromLongArr[i] = i;
}
}
public static int validate_ctr = 0;
public static <E> void validate(String msg, E arr, int length, int fromPos, int toPos) {
validate_ctr++;
if (arr instanceof byte []) {
byte [] barr = (byte [])arr;
for(int i = 0 ; i < length; i++)
if (fromByteArr[i+fromPos] != barr[i+toPos]) {
System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
+ " expected = " + fromByteArr[i+fromPos]
+ " actual = " + barr[i+toPos]
+ " fromPos = " + fromPos
+ " toPos = " + toPos);
throw new Error("Fail");
}
}
else if (arr instanceof char []) {
char [] carr = (char [])arr;
for(int i = 0 ; i < length; i++)
if (fromCharArr[i+fromPos] != carr[i+toPos]) {
System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
+ " expected = " + fromCharArr[i+fromPos]
+ " actual = " + carr[i+toPos]
+ " fromPos = " + fromPos
+ " toPos = " + toPos);
throw new Error("Fail");
}
}
else if (arr instanceof int []) {
int [] iarr = (int [])arr;
for(int i = 0 ; i < length; i++)
if (fromIntArr[i+fromPos] != iarr[i+toPos]) {
System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
+ " expected = " + fromIntArr[i+fromPos]
+ " actual = " + iarr[i+toPos]
+ " fromPos = " + fromPos
+ " toPos = " + toPos);
throw new Error("Fail");
}
}
else if (arr instanceof long []) {
long [] larr = (long [])arr;
for(int i = 0 ; i < length; i++)
if (fromLongArr[i+fromPos] != larr[i+toPos]) {
System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i
+ " expected = " + fromLongArr[i+fromPos]
+ " actual = " + larr[i+toPos]
+ " fromPos = " + fromPos
+ " toPos = " + toPos);
throw new Error("Fail");
}
}
}
public static void testByte(int length, int fromPos, int toPos) {
System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
validate(" Test ByteArr ", toByteArr, length, fromPos, toPos);
}
public static void testChar(int length, int fromPos, int toPos) {
System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, length);
validate(" Test CharArr ", toCharArr, length, fromPos, toPos);
}
public static void testInt(int length, int fromPos, int toPos) {
System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, length);
validate(" Test IntArr ", toIntArr, length, fromPos, toPos);
}
public static void testLong(int length, int fromPos, int toPos) {
System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, length);
validate(" Test LongArr ", toLongArr, length, fromPos, toPos);
}
public static void testByte_constant_LT32B(int fromPos, int toPos) {
System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 7);
validate(" Test Byte constant length 7 ", toByteArr, 7, fromPos, toPos);
}
public static void testByte_constant_LT64B(int fromPos, int toPos) {
System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 45);
validate(" Test Byte constant length 45 ", toByteArr, 45, fromPos, toPos);
}
public static void testChar_constant_LT32B(int fromPos, int toPos) {
System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 7);
validate(" Test Char constant length 7 ", toCharArr, 7, fromPos, toPos);
}
public static void testChar_constant_LT64B(int fromPos, int toPos) {
System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 22);
validate(" Test Char constant length 22 ", toCharArr, 22, fromPos, toPos);
}
public static void testInt_constant_LT32B(int fromPos, int toPos) {
System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 7);
validate(" Test Int constant length 7 ", toIntArr, 7, fromPos, toPos);
}
public static void testInt_constant_LT64B(int fromPos, int toPos) {
System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 11);
validate(" Test Int constant length 11 ", toIntArr, 11, fromPos, toPos);
}
public static void testLong_constant_LT32B(int fromPos, int toPos) {
System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 3);
validate(" Test Long constant length 3 ", toLongArr, 3, fromPos, toPos);
}
public static void testLong_constant_LT64B(int fromPos, int toPos) {
System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 6);
validate(" Test Long constant length 6 ", toLongArr, 6, fromPos, toPos);
}
public static void main(String [] args) {
// Cases to exercise each special-case block of the new optimized stubs.
// Cases to test the new partial-inlining (PI) handling (PI32 and PI64).
// Cases to test vectorized constant array copies for all primitive types.
// LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2
int [] lengths = { 29, 59, 89, 125, 159, 189, 194, 1024 };
Random r = new Random(1024);
setup();
try {
for (int i = 0 ; i < 1000000 ; i++ ) {
testByte(lengths[i % lengths.length], r.nextInt(2048) , r.nextInt(2048));
testByte_constant_LT32B (r.nextInt(2048) , r.nextInt(2048));
testByte_constant_LT64B (r.nextInt(2048) , r.nextInt(2048));
testChar(lengths[i % lengths.length] >> 1, r.nextInt(2048) , r.nextInt(2048));
testChar_constant_LT32B (r.nextInt(2048) , r.nextInt(2048));
testChar_constant_LT64B (r.nextInt(2048) , r.nextInt(2048));
testInt(lengths[i % lengths.length] >> 2, r.nextInt(2048) , r.nextInt(2048));
testInt_constant_LT32B (r.nextInt(2048) , r.nextInt(2048));
testInt_constant_LT64B (r.nextInt(2048) , r.nextInt(2048));
testLong(lengths[i % lengths.length] >> 3, r.nextInt(2048) , r.nextInt(2048));
testLong_constant_LT32B (r.nextInt(2048) , r.nextInt(2048));
testLong_constant_LT64B (r.nextInt(2048) , r.nextInt(2048));
}
System.out.println("PASS : " + validate_ctr);
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
}

View File

@@ -0,0 +1,121 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, Arm Limited. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.vm.compiler;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.results.Result;
import org.openjdk.jmh.results.RunResult;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import org.openjdk.jmh.runner.options.TimeValue;
import java.util.concurrent.TimeUnit;
import java.util.Arrays;
class MyClass {
public int field1;
public int field2;
public int field3;
public MyClass(int val) {
field1 = val;
field2 = val;
field3 = val;
}
}
@State(Scope.Benchmark)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class ArrayCopyObject {
@Param({"31", "63", "127" , "2047" , "4095", "8191"}) private int size;
private MyClass [] src;
private MyClass [] dst;
@Setup
public void setup() {
src = new MyClass[size];
dst = new MyClass[size];
for (int i = 0; i < src.length ; i++) {
src[i] = new MyClass(i);
dst[i] = new MyClass(0);
}
}
@Benchmark
public void disjoint_micro() {
System.arraycopy(src, 0 , dst, 0 , size);
}
@Benchmark
public void conjoint_micro() {
System.arraycopy(src, 0 , src, 10 , size - 10 );
}
public static void main(String[] args) throws RunnerException {
String [] base_opts =
{ "-XX:+UnlockDiagnosticVMOptions ",
"-XX:+IgnoreUnrecognizedVMOptions ",
"-XX:UseAVX=3" };
String [] opts_str1 = {"-XX:-UseCompressedOops "};
String [] opts_str2 = {"-XX:+UseCompressedOops "};
Options baseOpts = new OptionsBuilder()
.include(ArrayCopyObject.class.getName())
.warmupTime(TimeValue.seconds(30))
.measurementTime(TimeValue.seconds(10))
.warmupIterations(1)
.measurementIterations(2)
.jvmArgs(base_opts)
.forks(1)
.build();
RunResult r1 = new Runner(new OptionsBuilder()
.parent(baseOpts)
.jvmArgs(opts_str1)
.build()).runSingle();
RunResult r2 = new Runner(new OptionsBuilder()
.parent(baseOpts)
.jvmArgs(opts_str2)
.build()).runSingle();
System.out.println(r1.getPrimaryResult().getScore() + r2.getPrimaryResult().getScore());
}
}