From 4b5ac3abacee0a4b06a9ed0ea57377ff903a90c3 Mon Sep 17 00:00:00 2001 From: Jatin Bhateja Date: Sat, 10 Oct 2020 06:29:38 +0000 Subject: [PATCH] 8252847: Optimize primitive arrayCopy stubs using AVX-512 masked instructions Reviewed-by: neliasso, kvn --- src/hotspot/cpu/x86/assembler_x86.cpp | 39 ++ src/hotspot/cpu/x86/assembler_x86.hpp | 8 +- src/hotspot/cpu/x86/macroAssembler_x86.cpp | 1 + src/hotspot/cpu/x86/macroAssembler_x86.hpp | 29 + .../x86/macroAssembler_x86_arrayCopy_avx3.cpp | 249 ++++++++ src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 602 +++++++++++++++--- src/hotspot/cpu/x86/stubRoutines_x86.hpp | 2 +- src/hotspot/cpu/x86/vm_version_x86.cpp | 2 + .../arraycopy/TestArrayCopyConjoint.java | 269 ++++++++ .../arraycopy/TestArrayCopyDisjoint.java | 226 +++++++ .../bench/java/lang/ArrayCopyObject.java | 121 ++++ 11 files changed, 1449 insertions(+), 99 deletions(-) create mode 100644 src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp create mode 100644 test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyConjoint.java create mode 100644 test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyDisjoint.java create mode 100644 test/micro/org/openjdk/bench/java/lang/ArrayCopyObject.java diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index ef04d33c7f4..3168567e069 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -2589,6 +2589,38 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vect emit_operand(dst, src); } +void Assembler::evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type) { + assert(VM_Version::supports_avx512vlbw(), ""); + assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, ""); + InstructionMark im(this); + bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG; + int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3; + InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); + emit_int8(0x6F); + emit_operand(dst, src); +} + +void Assembler::evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type) { + assert(VM_Version::supports_avx512vlbw(), ""); + assert(src != xnoreg, "sanity"); + assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, ""); + InstructionMark im(this); + bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG; + int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? 
VEX_SIMD_F2 : VEX_SIMD_F3; + InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); + emit_int8(0x7F); + emit_operand(src, dst); +} + void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionMark im(this); @@ -7803,6 +7835,13 @@ void Assembler::shlxq(Register dst, Register src1, Register src2) { emit_int16((unsigned char)0xF7, (0xC0 | encode)); } +void Assembler::shrxq(Register dst, Register src1, Register src2) { + assert(VM_Version::supports_bmi2(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xF7, (0xC0 | encode)); +} + #ifndef _LP64 void Assembler::incl(Register dst) { diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 283285dc347..23d8db40164 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -794,7 +794,6 @@ private: void decl(Register dst); void decl(Address dst); - void decq(Register dst); void decq(Address dst); void incl(Register dst); @@ -879,6 +878,7 @@ private: void popa_uncached(); #endif void vzeroupper_uncached(); + void decq(Register dst); void pusha(); void popa(); @@ -1487,6 +1487,10 @@ private: void evmovdquq(XMMRegister dst, Address src, int vector_len); void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len); + // Generic move instructions. 
+ void evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type); + void evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type); + // Move lower 64bit to high 64bit in 128bit register void movlhps(XMMRegister dst, XMMRegister src); @@ -1989,6 +1993,8 @@ private: void shlxl(Register dst, Register src1, Register src2); void shlxq(Register dst, Register src1, Register src2); + void shrxq(Register dst, Register src1, Register src2); + //====================VECTOR ARITHMETIC===================================== diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index 0a8d11f9764..84cb5eb9ad7 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -7964,6 +7964,7 @@ void MacroAssembler::cache_wbsync(bool is_pre) sfence(); } } + #endif // _LP64 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 325bdf912bc..b052985708a 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -1037,6 +1037,18 @@ public: Register rax, Register rcx, Register rdx, Register tmp); #endif +#ifdef _LP64 + void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from, + Register to, Register count, int shift, + Register index, Register temp, + bool use64byteVector, Label& L_entry, Label& L_exit); + + void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from, + Register to, Register start_index, Register end_index, + Register count, int shift, Register temp, + bool use64byteVector, Label& L_entry, Label& L_exit); +#endif + private: // these are private because users should be doing movflt/movdbl @@ -1725,6 +1737,23 @@ public: void cache_wb(Address line); void cache_wbsync(bool is_pre); + + void copy64_masked_avx(Register dst, Register src, XMMRegister xmm, + KRegister mask, Register length, Register index, + Register temp, int shift = Address::times_1, int offset = 0, + bool use64byteVector = false); + + void copy32_masked_avx(Register dst, Register src, XMMRegister xmm, + KRegister mask, Register length, Register index, + Register temp, int shift = Address::times_1, int offset = 0); + + void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm, + int shift = Address::times_1, int offset = 0); + + void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm, + bool conjoint, int shift = Address::times_1, int offset = 0, + bool use64byteVector = false); + #endif // _LP64 void vallones(XMMRegister dst, int vector_len); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp new file mode 100644 index 00000000000..9a6d10db6cc --- /dev/null +++ b/src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp @@ -0,0 +1,249 @@ +/* +* Copyright (c) 2020, Intel Corporation. +* +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code). +* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "asm/macroAssembler.inline.hpp" + +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#else +#define BLOCK_COMMENT(str) block_comment(str) +#endif + +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") + +#ifdef _LP64 + +void MacroAssembler::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from, + Register to, Register count, int shift, + Register index, Register temp, + bool use64byteVector, Label& L_entry, Label& L_exit) { + Label L_entry_64, L_entry_96, L_entry_128; + Label L_entry_160, L_entry_192; + + int size_mat[][6] = { + /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 }, + /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 }, + /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 }, + /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 } + }; + + // Case A) Special case for length less than equal to 32 bytes. + cmpq(count, size_mat[shift][0]); + jccb(Assembler::greater, L_entry_64); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift); + jmp(L_exit); + + // Case B) Special case for length less than equal to 64 bytes. + BIND(L_entry_64); + cmpq(count, size_mat[shift][1]); + jccb(Assembler::greater, L_entry_96); + copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector); + jmp(L_exit); + + // Case C) Special case for length less than equal to 96 bytes. + BIND(L_entry_96); + cmpq(count, size_mat[shift][2]); + jccb(Assembler::greater, L_entry_128); + copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); + subq(count, 64 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64); + jmp(L_exit); + + // Case D) Special case for length less than equal to 128 bytes. + BIND(L_entry_128); + cmpq(count, size_mat[shift][3]); + jccb(Assembler::greater, L_entry_160); + copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); + copy32_avx(to, from, index, xmm, shift, 64); + subq(count, 96 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96); + jmp(L_exit); + + // Case E) Special case for length less than equal to 160 bytes. + BIND(L_entry_160); + cmpq(count, size_mat[shift][4]); + jccb(Assembler::greater, L_entry_192); + copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); + copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector); + subq(count, 128 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128); + jmp(L_exit); + + // Case F) Special case for length less than equal to 192 bytes. 
+ BIND(L_entry_192); + cmpq(count, size_mat[shift][5]); + jcc(Assembler::greater, L_entry); + copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); + copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector); + copy32_avx(to, from, index, xmm, shift, 128); + subq(count, 160 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160); + jmp(L_exit); +} + +void MacroAssembler::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from, + Register to, Register start_index, Register end_index, + Register count, int shift, Register temp, + bool use64byteVector, Label& L_entry, Label& L_exit) { + Label L_entry_64, L_entry_96, L_entry_128; + Label L_entry_160, L_entry_192; + bool avx3 = MaxVectorSize > 32 && AVX3Threshold == 0; + + int size_mat[][6] = { + /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 }, + /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 }, + /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 }, + /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 } + }; + + // Case A) Special case for length less than equal to 32 bytes. + cmpq(count, size_mat[shift][0]); + jccb(Assembler::greater, L_entry_64); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); + + // Case B) Special case for length less than equal to 64 bytes. + BIND(L_entry_64); + cmpq(count, size_mat[shift][1]); + jccb(Assembler::greater, L_entry_96); + if (avx3) { + copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true); + } else { + copy32_avx(to, from, end_index, xmm, shift, -32); + subq(count, 32 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + } + jmp(L_exit); + + // Case C) Special case for length less than equal to 96 bytes. + BIND(L_entry_96); + cmpq(count, size_mat[shift][2]); + jccb(Assembler::greater, L_entry_128); + copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); + subq(count, 64 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); + + // Case D) Special case for length less than equal to 128 bytes. + BIND(L_entry_128); + cmpq(count, size_mat[shift][3]); + jccb(Assembler::greater, L_entry_160); + copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); + copy32_avx(to, from, end_index, xmm, shift, -96); + subq(count, 96 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); + + // Case E) Special case for length less than equal to 160 bytes. + BIND(L_entry_160); + cmpq(count, size_mat[shift][4]); + jccb(Assembler::greater, L_entry_192); + copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); + copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector); + subq(count, 128 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); + + // Case F) Special case for length less than equal to 192 bytes. 
+ BIND(L_entry_192); + cmpq(count, size_mat[shift][5]); + jcc(Assembler::greater, L_entry); + copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); + copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector); + copy32_avx(to, from, end_index, xmm, shift, -160); + subq(count, 160 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); +} + +void MacroAssembler::copy64_masked_avx(Register dst, Register src, XMMRegister xmm, + KRegister mask, Register length, Register index, + Register temp, int shift, int offset, + bool use64byteVector) { + BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + if (!use64byteVector) { + copy32_avx(dst, src, index, xmm, shift, offset); + subptr(length, 32 >> shift); + copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32); + } else { + Address::ScaleFactor scale = (Address::ScaleFactor)(shift); + assert(MaxVectorSize == 64, "vector length != 64"); + negptr(length); + addq(length, 64); + mov64(temp, -1); + shrxq(temp, temp, length); + kmovql(mask, temp); + evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_512bit, type[shift]); + evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_512bit, type[shift]); + } +} + + +void MacroAssembler::copy32_masked_avx(Register dst, Register src, XMMRegister xmm, + KRegister mask, Register length, Register index, + Register temp, int shift, int offset) { + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + Address::ScaleFactor scale = (Address::ScaleFactor)(shift); + mov64(temp, 1); + shlxq(temp, temp, length); + decq(temp); + kmovql(mask, temp); + evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_256bit, type[shift]); + evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_256bit, type[shift]); +} + + +void MacroAssembler::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm, + int shift, int offset) { + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + Address::ScaleFactor scale = (Address::ScaleFactor)(shift); + vmovdqu(xmm, Address(src, index, scale, offset)); + vmovdqu(Address(dst, index, scale, offset), xmm); +} + + +void MacroAssembler::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm, + bool conjoint, int shift, int offset, bool use64byteVector) { + assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch"); + if (!use64byteVector) { + if (conjoint) { + copy32_avx(dst, src, index, xmm, shift, offset+32); + copy32_avx(dst, src, index, xmm, shift, offset); + } else { + copy32_avx(dst, src, index, xmm, shift, offset); + copy32_avx(dst, src, index, xmm, shift, offset+32); + } + } else { + Address::ScaleFactor scale = (Address::ScaleFactor)(shift); + evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit); + evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit); + } +} + +#endif diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 3d2c7671304..e47e3baeea4 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -1124,59 +1124,28 @@ class StubGenerator: public StubCodeGenerator { __ align(OptoLoopAlignment); if (UseUnalignedLoadStores) { Label L_end; - // Copy 64-bytes per iteration - if (UseAVX > 
2) { - Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold; - - __ BIND(L_copy_bytes); - __ cmpptr(qword_count, (-1 * AVX3Threshold / 8)); - __ jccb(Assembler::less, L_above_threshold); - __ jmpb(L_below_threshold); - - __ bind(L_loop_avx512); - __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit); - __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit); - __ bind(L_above_threshold); - __ addptr(qword_count, 8); - __ jcc(Assembler::lessEqual, L_loop_avx512); - __ jmpb(L_32_byte_head); - - __ bind(L_loop_avx2); + __ BIND(L_loop); + if (UseAVX >= 2) { __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); - __ bind(L_below_threshold); - __ addptr(qword_count, 8); - __ jcc(Assembler::lessEqual, L_loop_avx2); - - __ bind(L_32_byte_head); - __ subptr(qword_count, 4); // sub(8) and add(4) - __ jccb(Assembler::greater, L_end); } else { - __ BIND(L_loop); - if (UseAVX == 2) { - __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); - __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); - __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); - __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); - } else { - __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); - __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); - __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40)); - __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1); - __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24)); - __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2); - __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8)); - __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3); - } - - __ BIND(L_copy_bytes); - __ addptr(qword_count, 8); - __ jcc(Assembler::lessEqual, L_loop); - __ subptr(qword_count, 4); // sub(8) and add(4) - __ jccb(Assembler::greater, L_end); + __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); + __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1); + __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2); + __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8)); + __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3); } + + __ BIND(L_copy_bytes); + __ addptr(qword_count, 8); + __ jcc(Assembler::lessEqual, L_loop); + __ subptr(qword_count, 4); // sub(8) and add(4) + __ jccb(Assembler::greater, L_end); // Copy trailing 32 bytes if (UseAVX >= 2) { __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); @@ -1232,60 +1201,29 @@ class StubGenerator: public StubCodeGenerator { __ align(OptoLoopAlignment); if (UseUnalignedLoadStores) { Label L_end; - // Copy 64-bytes per iteration - if (UseAVX > 2) { - Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold; - - __ BIND(L_copy_bytes); - __ cmpptr(qword_count, (AVX3Threshold 
/ 8)); - __ jccb(Assembler::greater, L_above_threshold); - __ jmpb(L_below_threshold); - - __ BIND(L_loop_avx512); - __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit); - __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit); - __ bind(L_above_threshold); - __ subptr(qword_count, 8); - __ jcc(Assembler::greaterEqual, L_loop_avx512); - __ jmpb(L_32_byte_head); - - __ bind(L_loop_avx2); + __ BIND(L_loop); + if (UseAVX >= 2) { __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); - __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); - __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); - __ bind(L_below_threshold); - __ subptr(qword_count, 8); - __ jcc(Assembler::greaterEqual, L_loop_avx2); - - __ bind(L_32_byte_head); - __ addptr(qword_count, 4); // add(8) and sub(4) - __ jccb(Assembler::less, L_end); + __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); + __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); } else { - __ BIND(L_loop); - if (UseAVX == 2) { - __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); - __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); - __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); - __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); - } else { - __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); - __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); - __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); - __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); - __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); - __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); - __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); - __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); - } - - __ BIND(L_copy_bytes); - __ subptr(qword_count, 8); - __ jcc(Assembler::greaterEqual, L_loop); - - __ addptr(qword_count, 4); // add(8) and sub(4) - __ jccb(Assembler::less, L_end); + __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); + __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); + __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); + __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); + __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); + __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); + __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); + __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); } + + __ BIND(L_copy_bytes); + __ subptr(qword_count, 8); + __ jcc(Assembler::greaterEqual, L_loop); + + __ addptr(qword_count, 4); // add(8) and sub(4) + __ jccb(Assembler::less, L_end); // Copy trailing 32 bytes if (UseAVX >= 2) { __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0)); @@ -1323,6 +1261,442 @@ class StubGenerator: public StubCodeGenerator { __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords } +#ifndef PRODUCT + int& get_profile_ctr(int shift) { + if ( 0 == shift) + return SharedRuntime::_jbyte_array_copy_ctr; + else if(1 == shift) + return SharedRuntime::_jshort_array_copy_ctr; + else if(2 == shift) + return SharedRuntime::_jint_array_copy_ctr; + else + return SharedRuntime::_jlong_array_copy_ctr; + } +#endif + + void 
setup_argument_regs(BasicType type) { + if (type == T_BYTE || type == T_SHORT) { + setup_arg_regs(); // from => rdi, to => rsi, count => rdx + // r9 and r10 may be used to save non-volatile registers + } else { + setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx + // r9 is used to save r15_thread + } + } + + void restore_argument_regs(BasicType type) { + if (type == T_BYTE || type == T_SHORT) { + restore_arg_regs(); + } else { + restore_arg_regs_using_thread(); + } + } + + // Note: Following rules apply to AVX3 optimized arraycopy stubs:- + // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs) + // for both special cases (various small block sizes) and aligned copy loop. This is the + // default configuration. + // - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs) + // for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it. + // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a + // better performance for disjoint copies. For conjoint/backward copy vector based + // copy performs better. + // - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over + // 64 byte vector registers (ZMMs). + + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // + // Side Effects: + // disjoint_copy_avx3_masked is set to the no-overlap entry point + // used by generate_conjoint_[byte/int/short/long]_copy(). + // + + address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift, + bool aligned, bool is_oop, bool dest_uninitialized) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0; + Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry; + Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64; + const Register from = rdi; // source array address + const Register to = rsi; // destination array address + const Register count = rdx; // elements count + const Register temp1 = r8; + const Register temp2 = r11; + const Register temp3 = rax; + const Register temp4 = rcx; + // End pointers are inclusive, and if count is not zero they point + // to the last unit copied: end_to[0] := end_from[0] + + __ enter(); // required for proper stackwalking of RuntimeStub frame + assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. + + if (entry != NULL) { + *entry = __ pc(); + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) + BLOCK_COMMENT("Entry:"); + } + + BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + BasicType type = is_oop ? 
T_OBJECT : type_vec[shift]; + + setup_argument_regs(type); + + DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; + if (dest_uninitialized) { + decorators |= IS_DEST_UNINITIALIZED; + } + if (aligned) { + decorators |= ARRAYCOPY_ALIGNED; + } + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->arraycopy_prologue(_masm, decorators, type, from, to, count); + + { + // Type(shift) byte(0), short(1), int(2), long(3) + int loop_size[] = { 192, 96, 48, 24}; + int threshold[] = { 4096, 2048, 1024, 512}; + + // UnsafeCopyMemory page error: continue after ucm + UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); + // 'from', 'to' and 'count' are now valid + + // temp1 holds remaining count and temp4 holds running count used to compute + // next address offset for start of to/from addresses (temp4 * scale). + __ mov64(temp4, 0); + __ movq(temp1, count); + + // Zero length check. + __ BIND(L_tail); + __ cmpq(temp1, 0); + __ jcc(Assembler::lessEqual, L_exit); + + // Special cases using 32 byte [masked] vector copy operations. + __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift, + temp4, temp3, use64byteVector, L_entry, L_exit); + + // PRE-MAIN-POST loop for aligned copy. + __ BIND(L_entry); + + if (AVX3Threshold != 0) { + __ cmpq(count, threshold[shift]); + if (MaxVectorSize == 64) { + // Copy using 64 byte vectors. + __ jcc(Assembler::greaterEqual, L_pre_main_post_64); + } else { + assert(MaxVectorSize < 64, "vector size should be < 64 bytes"); + // REP MOVS offer a faster copy path. + __ jcc(Assembler::greaterEqual, L_repmovs); + } + } + + if (MaxVectorSize < 64 || AVX3Threshold != 0) { + // Partial copy to make dst address 32 byte aligned. + __ movq(temp2, to); + __ andq(temp2, 31); + __ jcc(Assembler::equal, L_main_pre_loop); + + __ negptr(temp2); + __ addq(temp2, 32); + if (shift) { + __ shrq(temp2, shift); + } + __ movq(temp3, temp2); + __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift); + __ movq(temp4, temp2); + __ movq(temp1, count); + __ subq(temp1, temp2); + + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::less, L_tail); + + __ BIND(L_main_pre_loop); + __ subq(temp1, loop_size[shift]); + + // Main loop with aligned copy block size of 192 bytes at 32 byte granularity. + __ BIND(L_main_loop); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 0); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 64); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 128); + __ addptr(temp4, loop_size[shift]); + __ subq(temp1, loop_size[shift]); + __ jcc(Assembler::greater, L_main_loop); + + __ addq(temp1, loop_size[shift]); + + // Tail loop. + __ jmp(L_tail); + + __ BIND(L_repmovs); + __ movq(temp2, temp1); + // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics. + __ movq(temp3, to); + __ movq(to, from); + __ movq(from, temp3); + // Save to/from for restoration post rep_mov. + __ movq(temp1, to); + __ movq(temp3, from); + if(shift < 3) { + __ shrq(temp2, 3-shift); // quad word count + } + __ movq(temp4 , temp2); // move quad ward count into temp4(RCX). + __ rep_mov(); + __ shlq(temp2, 3); // convert quad words into byte count. + if(shift) { + __ shrq(temp2, shift); // type specific count. + } + // Restore original addresses in to/from. + __ movq(to, temp3); + __ movq(from, temp1); + __ movq(temp4, temp2); + __ movq(temp1, count); + __ subq(temp1, temp2); // tailing part (less than a quad ward size). 
+ __ jmp(L_tail); + } + + if (MaxVectorSize > 32) { + __ BIND(L_pre_main_post_64); + // Partial copy to make dst address 64 byte aligned. + __ movq(temp2, to); + __ andq(temp2, 63); + __ jcc(Assembler::equal, L_main_pre_loop_64bytes); + + __ negptr(temp2); + __ addq(temp2, 64); + if (shift) { + __ shrq(temp2, shift); + } + __ movq(temp3, temp2); + __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true); + __ movq(temp4, temp2); + __ movq(temp1, count); + __ subq(temp1, temp2); + + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::less, L_tail64); + + __ BIND(L_main_pre_loop_64bytes); + __ subq(temp1, loop_size[shift]); + + // Main loop with aligned copy block size of 192 bytes at + // 64 byte copy granularity. + __ BIND(L_main_loop_64bytes); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true); + __ addptr(temp4, loop_size[shift]); + __ subq(temp1, loop_size[shift]); + __ jcc(Assembler::greater, L_main_loop_64bytes); + + __ addq(temp1, loop_size[shift]); + // Zero length check. + __ jcc(Assembler::lessEqual, L_exit); + + __ BIND(L_tail64); + + // Tail handling using 64 byte [masked] vector copy operations. + use64byteVector = true; + __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift, + temp4, temp3, use64byteVector, L_entry, L_exit); + } + __ BIND(L_exit); + } + + address ucme_exit_pc = __ pc(); + // When called from generic_arraycopy r11 contains specific values + // used during arraycopy epilogue, re-initializing r11. + if (is_oop) { + __ movq(r11, shift == 3 ? count : to); + } + bs->arraycopy_epilogue(_masm, decorators, type, from, to, count); + restore_argument_regs(type); + inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free + __ xorptr(rax, rax); // return 0 + __ vzeroupper(); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + return start; + } + + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // + address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift, + address nooverlap_target, bool aligned, bool is_oop, + bool dest_uninitialized) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0; + + Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64; + Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry; + const Register from = rdi; // source array address + const Register to = rsi; // destination array address + const Register count = rdx; // elements count + const Register temp1 = r8; + const Register temp2 = rcx; + const Register temp3 = r11; + const Register temp4 = rax; + // End pointers are inclusive, and if count is not zero they point + // to the last unit copied: end_to[0] := end_from[0] + + __ enter(); // required for proper stackwalking of RuntimeStub frame + assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. + + if (entry != NULL) { + *entry = __ pc(); + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) + BLOCK_COMMENT("Entry:"); + } + + array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift)); + + BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + BasicType type = is_oop ? 
T_OBJECT : type_vec[shift]; + + setup_argument_regs(type); + + DecoratorSet decorators = IN_HEAP | IS_ARRAY; + if (dest_uninitialized) { + decorators |= IS_DEST_UNINITIALIZED; + } + if (aligned) { + decorators |= ARRAYCOPY_ALIGNED; + } + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->arraycopy_prologue(_masm, decorators, type, from, to, count); + { + // Type(shift) byte(0), short(1), int(2), long(3) + int loop_size[] = { 192, 96, 48, 24}; + int threshold[] = { 4096, 2048, 1024, 512}; + + // UnsafeCopyMemory page error: continue after ucm + UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); + // 'from', 'to' and 'count' are now valid + + // temp1 holds remaining count. + __ movq(temp1, count); + + // Zero length check. + __ BIND(L_tail); + __ cmpq(temp1, 0); + __ jcc(Assembler::lessEqual, L_exit); + + __ mov64(temp2, 0); + __ movq(temp3, temp1); + // Special cases using 32 byte [masked] vector copy operations. + __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift, + temp4, use64byteVector, L_entry, L_exit); + + // PRE-MAIN-POST loop for aligned copy. + __ BIND(L_entry); + + if (MaxVectorSize > 32 && AVX3Threshold != 0) { + __ cmpq(temp1, threshold[shift]); + __ jcc(Assembler::greaterEqual, L_pre_main_post_64); + } + + if (MaxVectorSize < 64 || AVX3Threshold != 0) { + // Partial copy to make dst address 32 byte aligned. + __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0)); + __ andq(temp2, 31); + __ jcc(Assembler::equal, L_main_pre_loop); + + if (shift) { + __ shrq(temp2, shift); + } + __ subq(temp1, temp2); + __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift); + + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::less, L_tail); + + __ BIND(L_main_pre_loop); + + // Main loop with aligned copy block size of 192 bytes at 32 byte granularity. + __ BIND(L_main_loop); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -64); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -128); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -192); + __ subptr(temp1, loop_size[shift]); + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::greater, L_main_loop); + + // Tail loop. + __ jmp(L_tail); + } + + if (MaxVectorSize > 32) { + __ BIND(L_pre_main_post_64); + // Partial copy to make dst address 64 byte aligned. + __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0)); + __ andq(temp2, 63); + __ jcc(Assembler::equal, L_main_pre_loop_64bytes); + + if (shift) { + __ shrq(temp2, shift); + } + __ subq(temp1, temp2); + __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true); + + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::less, L_tail64); + + __ BIND(L_main_pre_loop_64bytes); + + // Main loop with aligned copy block size of 192 bytes at + // 64 byte copy granularity. + __ BIND(L_main_loop_64bytes); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true); + __ subq(temp1, loop_size[shift]); + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::greater, L_main_loop_64bytes); + + // Zero length check. + __ cmpq(temp1, 0); + __ jcc(Assembler::lessEqual, L_exit); + + __ BIND(L_tail64); + + // Tail handling using 64 byte [masked] vector copy operations. 
+ use64byteVector = true; + __ mov64(temp2, 0); + __ movq(temp3, temp1); + __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift, + temp4, use64byteVector, L_entry, L_exit); + } + __ BIND(L_exit); + } + address ucme_exit_pc = __ pc(); + // When called from generic_arraycopy r11 contains specific values + // used during arraycopy epilogue, re-initializing r11. + if(is_oop) { + __ movq(r11, count); + } + bs->arraycopy_epilogue(_masm, decorators, type, from, to, count); + restore_argument_regs(type); + inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free + __ xorptr(rax, rax); // return 0 + __ vzeroupper(); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + return start; + } + + // Arguments: // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary // ignored @@ -1343,6 +1717,10 @@ class StubGenerator: public StubCodeGenerator { // used by generate_conjoint_byte_copy(). // address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0, + aligned, false, false); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1453,6 +1831,10 @@ class StubGenerator: public StubCodeGenerator { // address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, address* entry, const char *name) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0, + nooverlap_target, aligned, false, false); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1558,6 +1940,11 @@ class StubGenerator: public StubCodeGenerator { // used by generate_conjoint_short_copy(). 
// address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1, + aligned, false, false); + } + __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1682,6 +2069,10 @@ class StubGenerator: public StubCodeGenerator { // address generate_conjoint_short_copy(bool aligned, address nooverlap_target, address *entry, const char *name) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1, + nooverlap_target, aligned, false, false); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1780,6 +2171,11 @@ class StubGenerator: public StubCodeGenerator { // address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry, const char *name, bool dest_uninitialized = false) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2, + aligned, is_oop, dest_uninitialized); + } + __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1884,6 +2280,10 @@ class StubGenerator: public StubCodeGenerator { address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target, address *entry, const char *name, bool dest_uninitialized = false) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2, + nooverlap_target, aligned, is_oop, dest_uninitialized); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1991,6 +2391,10 @@ class StubGenerator: public StubCodeGenerator { // address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry, const char *name, bool dest_uninitialized = false) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3, + aligned, is_oop, dest_uninitialized); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -2095,6 +2499,10 @@ class StubGenerator: public StubCodeGenerator { address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target, address *entry, const char *name, bool dest_uninitialized = false) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3, + nooverlap_target, aligned, is_oop, dest_uninitialized); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp index a23ee3666a6..ff4ed9c737a 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp @@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_ enum platform_dependent_constants { code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small) - code_size2 = 35300 LP64_ONLY(+11400) // simply increase if too small (assembler will crash if too 
small) + code_size2 = 35300 LP64_ONLY(+21400) // simply increase if too small (assembler will crash if too small) }; class x86 { diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp index 089d720e88e..5b4d4dbfc7b 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.cpp +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp @@ -763,6 +763,8 @@ void VM_Version::get_processor_features() { if (is_intel()) { // Intel cpus specific settings if (is_knights_family()) { _features &= ~CPU_VZEROUPPER; + _features &= ~CPU_AVX512BW; + _features &= ~CPU_AVX512VL; } } diff --git a/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyConjoint.java b/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyConjoint.java new file mode 100644 index 00000000000..9a39807b078 --- /dev/null +++ b/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyConjoint.java @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.arraycopy; +import java.util.Random; + +/** + * @test + * @bug 8251871 + * @summary Optimize arrayCopy using AVX-512 masked instructions. 
+ * + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOptions + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption -XX:ArrayCopyLoadStoreMaxElem=16 + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 -XX:ArrayCopyLoadStoreMaxElem=16 + * compiler.arraycopy.TestArrayCopyConjoint + * + */ + +public class TestArrayCopyConjoint { + + public static final int SIZE = 4096; + public static byte[] fromByteArr, toByteArr, valByteArr; + public static char[] fromCharArr, toCharArr, valCharArr; + public static int[] fromIntArr, toIntArr, valIntArr; + public static long[] fromLongArr, toLongArr, valLongArr; + + static public void reinit(Class c) { + if (c == byte.class) { + for (int i = 0 ; i < SIZE ; i++) { + fromByteArr[i] = (byte)i; + } + } else if (c == char.class) { + for (int i = 0 ; i < SIZE ; i++) { + fromCharArr[i] = (char)i; + } + } else if (c == int.class) { + for (int i = 0 ; i < SIZE ; i++) { + fromIntArr[i] = i; + } + } else { + assert c == long.class; + for (int i = 0 ; i < SIZE ; i++) { + fromLongArr[i] = i; + } + } + } + + static public void setup() { + // Both positions aligned + fromByteArr = new byte[SIZE]; + valByteArr = new byte[SIZE]; + toByteArr = fromByteArr; + fromCharArr = new char[SIZE]; + valCharArr = new char[SIZE]; + toCharArr = fromCharArr; + fromIntArr = new int[SIZE]; + valIntArr = new int[SIZE]; + toIntArr = fromIntArr; + fromLongArr = new long[SIZE]; + valLongArr = new long[SIZE]; + toLongArr = fromLongArr; + + for (int i = 0 ; i < SIZE ; i++) { + fromByteArr[i] = (byte)i; + valByteArr[i] = (byte)i; + fromCharArr[i] = (char)i; + valCharArr[i] = (char)i; + fromIntArr[i] = i; + valIntArr[i] = i; + fromLongArr[i] = i; + valLongArr[i] = i; + } + } + + public static int validate_ctr = 0; + public static void validate(String msg, E arr, int length, int fromPos, int toPos) { + validate_ctr++; + if (arr instanceof 
byte []) { + byte [] barr = (byte [])arr; + for(int i = 0 ; i < length; i++) + if (valByteArr[i+fromPos] != barr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + valByteArr[i+fromPos] + + " actual = " + barr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + + } + } + else if (arr instanceof char []) { + char [] carr = (char [])arr; + for(int i = 0 ; i < length; i++) + if (valCharArr[i+fromPos] != carr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + valCharArr[i+fromPos] + + " actual = " + carr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + } + } + else if (arr instanceof int []) { + int [] iarr = (int [])arr; + for(int i = 0 ; i < length; i++) + if (valIntArr[i+fromPos] != iarr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + valIntArr[i+fromPos] + + " actual = " + iarr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + } + } + else if (arr instanceof long []) { + long [] larr = (long [])arr; + for(int i = 0 ; i < length; i++) + if (valLongArr[i+fromPos] != larr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + valLongArr[i+fromPos] + + " actual = " + larr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + } + } + } + + public static void testByte(int length, int fromPos, int toPos) { + System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length); + validate(" Test ByteArr ", toByteArr, length, fromPos, toPos); + } + + public static void testChar(int length, int fromPos, int toPos) { + System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, length); + validate(" Test CharArr ", toCharArr, length, fromPos, toPos); + } + + public static void testInt(int length, int fromPos, int toPos) { + System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, length); + validate(" Test IntArr ", toIntArr, length, fromPos, toPos); + } + + public static void testLong(int length, int fromPos, int toPos) { + System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, length); + validate(" Test LongArr ", toLongArr, length, fromPos, toPos); + } + + public static void testByte_constant_LT32B(int fromPos, int toPos) { + System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 7); + validate(" Test Byte constant length 7 ", toByteArr, 7, fromPos, toPos); + } + public static void testByte_constant_LT64B(int fromPos, int toPos) { + System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 45); + validate(" Test Byte constant length 45 ", toByteArr, 45, fromPos, toPos); + } + + public static void testChar_constant_LT32B(int fromPos, int toPos) { + System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 7); + validate(" Test Char constant length 7 ", toCharArr, 7, fromPos, toPos); + } + public static void testChar_constant_LT64B(int fromPos, int toPos) { + System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 22); + validate(" Test Char constant length 22 ", toCharArr, 22, fromPos, toPos); + } + + public static void testInt_constant_LT32B(int fromPos, int toPos) { + System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 7); + validate(" Test Int constant length 7 ", toIntArr, 7, fromPos, toPos); + } + public static void testInt_constant_LT64B(int fromPos, int toPos) { + 
System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 11); + validate(" Test Int constant length 11 ", toIntArr, 11, fromPos, toPos); + } + + public static void testLong_constant_LT32B(int fromPos, int toPos) { + System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 3); + validate(" Test Long constant length 3 ", toLongArr, 3, fromPos, toPos); + } + public static void testLong_constant_LT64B(int fromPos, int toPos) { + System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 6); + validate(" Test Long constant length 6 ", toLongArr, 6, fromPos, toPos); + } + + + public static void main(String [] args) { + // Cases to test each new optimized stub special blocks. + // Cases to test new PI handling (PI32 and PI64). + // Cases to test vectorized constant array copies for all primitive types. + // LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2 + int [] lengths = { 29, 59, 89, 125, 159, 189, 194, 1024 }; + Random r = new Random(1024); + + setup(); + + try { + for (int i = 0 ; i < 1000000 ; i++ ) { + int index = r.nextInt(2048); + testByte(lengths[i % lengths.length], index , index+2); + reinit(byte.class); + testByte_constant_LT32B (index , index+2); + reinit(byte.class); + testByte_constant_LT64B (index , index+2); + reinit(byte.class); + + testChar(lengths[i % lengths.length] >> 1, index , index+2); + reinit(char.class); + testChar_constant_LT32B (index , index+2); + reinit(char.class); + testChar_constant_LT64B (index , index+2); + reinit(char.class); + + testInt(lengths[i % lengths.length] >> 2, index , index+2); + reinit(int.class); + testInt_constant_LT32B (index , index+2); + reinit(int.class); + testInt_constant_LT64B (index , index+2); + reinit(int.class); + + testLong(lengths[i % lengths.length] >> 3, index , index+2); + reinit(long.class); + testLong_constant_LT32B (index , index+2); + reinit(long.class); + testLong_constant_LT64B (index , index+2); + reinit(long.class); + } + System.out.println("PASS : " + validate_ctr); + } catch (Exception e) { + System.out.println(e.getMessage()); + } + } +} diff --git a/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyDisjoint.java b/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyDisjoint.java new file mode 100644 index 00000000000..7bf676d3cae --- /dev/null +++ b/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyDisjoint.java @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package compiler.arraycopy; +import java.util.Random; + +/** + * @test + * @bug 8251871 + * @summary Optimize arrayCopy using AVX-512 masked instructions. + * + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOptions + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption -XX:ArrayCopyLoadStoreMaxElem=16 + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 -XX:ArrayCopyLoadStoreMaxElem=16 + * compiler.arraycopy.TestArrayCopyDisjoint + * + */ + +public class TestArrayCopyDisjoint { + + public static final int SIZE = 4096; + public static byte[] fromByteArr, toByteArr; + public static char[] fromCharArr, toCharArr; + public static int[] fromIntArr, toIntArr; + public static long[] fromLongArr, toLongArr; + + static public void setup() { + // Both positions aligned + fromByteArr = new byte[SIZE]; + toByteArr = new byte[SIZE]; + fromCharArr = new char[SIZE]; + toCharArr = new char[SIZE]; + fromIntArr = new int[SIZE]; + toIntArr = new int[SIZE]; + fromLongArr = new long[SIZE]; + toLongArr = new long[SIZE]; + + for (int i = 0 ; i < SIZE ; i++) { + fromByteArr[i] = (byte)i; + fromCharArr[i] = (char)i; + fromIntArr[i] = i; + fromLongArr[i] = i; + } + } + + public static int validate_ctr = 0; + public static void validate(String msg, E arr, int length, int fromPos, int toPos) { + validate_ctr++; + if (arr instanceof byte []) { + byte [] barr = (byte [])arr; + for(int i = 0 ; i < length; i++) + if (fromByteArr[i+fromPos] != barr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + fromByteArr[i+fromPos] + + " actual = " + barr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + } + } + else if (arr instanceof char []) { + char [] carr = (char [])arr; + for(int i = 0 ; i < length; i++) + if (fromCharArr[i+fromPos] != carr[i+toPos]) { 
+                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i +
+                                       " expected = " + fromCharArr[i + fromPos] +
+                                       " actual = " + carr[i + toPos] +
+                                       " fromPos = " + fromPos +
+                                       " toPos = " + toPos);
+                    throw new Error("Fail");
+                }
+        }
+        else if (arr instanceof int[]) {
+            int[] iarr = (int[]) arr;
+            for (int i = 0; i < length; i++)
+                if (fromIntArr[i + fromPos] != iarr[i + toPos]) {
+                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i +
+                                       " expected = " + fromIntArr[i + fromPos] +
+                                       " actual = " + iarr[i + toPos] +
+                                       " fromPos = " + fromPos +
+                                       " toPos = " + toPos);
+                    throw new Error("Fail");
+                }
+        }
+        else if (arr instanceof long[]) {
+            long[] larr = (long[]) arr;
+            for (int i = 0; i < length; i++)
+                if (fromLongArr[i + fromPos] != larr[i + toPos]) {
+                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i +
+                                       " expected = " + fromLongArr[i + fromPos] +
+                                       " actual = " + larr[i + toPos] +
+                                       " fromPos = " + fromPos +
+                                       " toPos = " + toPos);
+                    throw new Error("Fail");
+                }
+        }
+    }
+
+    public static void testByte(int length, int fromPos, int toPos) {
+        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
+        validate(" Test ByteArr ", toByteArr, length, fromPos, toPos);
+    }
+
+    public static void testChar(int length, int fromPos, int toPos) {
+        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, length);
+        validate(" Test CharArr ", toCharArr, length, fromPos, toPos);
+    }
+
+    public static void testInt(int length, int fromPos, int toPos) {
+        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, length);
+        validate(" Test IntArr ", toIntArr, length, fromPos, toPos);
+    }
+
+    public static void testLong(int length, int fromPos, int toPos) {
+        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, length);
+        validate(" Test LongArr ", toLongArr, length, fromPos, toPos);
+    }
+
+    public static void testByte_constant_LT32B(int fromPos, int toPos) {
+        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 7);
+        validate(" Test Byte constant length 7 ", toByteArr, 7, fromPos, toPos);
+    }
+
+    public static void testByte_constant_LT64B(int fromPos, int toPos) {
+        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 45);
+        validate(" Test Byte constant length 45 ", toByteArr, 45, fromPos, toPos);
+    }
+
+    public static void testChar_constant_LT32B(int fromPos, int toPos) {
+        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 7);
+        validate(" Test Char constant length 7 ", toCharArr, 7, fromPos, toPos);
+    }
+
+    public static void testChar_constant_LT64B(int fromPos, int toPos) {
+        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 22);
+        validate(" Test Char constant length 22 ", toCharArr, 22, fromPos, toPos);
+    }
+
+    public static void testInt_constant_LT32B(int fromPos, int toPos) {
+        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 7);
+        validate(" Test Int constant length 7 ", toIntArr, 7, fromPos, toPos);
+    }
+
+    public static void testInt_constant_LT64B(int fromPos, int toPos) {
+        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 11);
+        validate(" Test Int constant length 11 ", toIntArr, 11, fromPos, toPos);
+    }
+
+    public static void testLong_constant_LT32B(int fromPos, int toPos) {
+        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 3);
+        validate(" Test Long constant length 3 ", toLongArr, 3, fromPos, toPos);
+    }
+
+    public static void testLong_constant_LT64B(int fromPos, int toPos) {
+        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 6);
+        validate(" Test Long constant length 6 ", toLongArr, 6, fromPos, toPos);
+    }
+
+    public static void main(String[] args) {
+        // Cases to test each special block of the new optimized stubs.
+        // Cases to test the new partial-inline handling (PI32 and PI64).
+        // Cases to test vectorized constant-length array copies for all primitive types.
+        //                LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2
+        int[] lengths = { 29,   59,   89,   125,   159,   189,   194,  1024 };
+        Random r = new Random(1024);
+
+        setup();
+
+        try {
+            for (int i = 0; i < 1000000; i++) {
+                testByte(lengths[i % lengths.length], r.nextInt(2048), r.nextInt(2048));
+                testByte_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
+                testByte_constant_LT64B(r.nextInt(2048), r.nextInt(2048));
+
+                testChar(lengths[i % lengths.length] >> 1, r.nextInt(2048), r.nextInt(2048));
+                testChar_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
+                testChar_constant_LT64B(r.nextInt(2048), r.nextInt(2048));
+
+                testInt(lengths[i % lengths.length] >> 2, r.nextInt(2048), r.nextInt(2048));
+                testInt_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
+                testInt_constant_LT64B(r.nextInt(2048), r.nextInt(2048));
+
+                testLong(lengths[i % lengths.length] >> 3, r.nextInt(2048), r.nextInt(2048));
+                testLong_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
+                testLong_constant_LT64B(r.nextInt(2048), r.nextInt(2048));
+            }
+            System.out.println("PASS : " + validate_ctr);
+        } catch (Exception e) {
+            // Propagate unexpected exceptions so the test fails instead of passing silently.
+            throw new Error(e.getMessage(), e);
+        }
+    }
+}
diff --git a/test/micro/org/openjdk/bench/java/lang/ArrayCopyObject.java b/test/micro/org/openjdk/bench/java/lang/ArrayCopyObject.java
new file mode 100644
index 00000000000..ff33d451fdb
--- /dev/null
+++ b/test/micro/org/openjdk/bench/java/lang/ArrayCopyObject.java
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, Arm Limited. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package org.openjdk.bench.java.lang;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.results.Result;
+import org.openjdk.jmh.results.RunResult;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.openjdk.jmh.runner.options.TimeValue;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Arrays;
+
+class MyClass {
+    public int field1;
+    public int field2;
+    public int field3;
+
+    public MyClass(int val) {
+        field1 = val;
+        field2 = val;
+        field3 = val;
+    }
+}
+
+@State(Scope.Benchmark)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+public class ArrayCopyObject {
+    @Param({"31", "63", "127", "2047", "4095", "8191"}) private int size;
+
+    private MyClass[] src;
+    private MyClass[] dst;
+
+    @Setup
+    public void setup() {
+        src = new MyClass[size];
+        dst = new MyClass[size];
+        for (int i = 0; i < src.length; i++) {
+            src[i] = new MyClass(i);
+            dst[i] = new MyClass(0);
+        }
+    }
+
+    @Benchmark
+    public void disjoint_micro() {
+        System.arraycopy(src, 0, dst, 0, size);
+    }
+
+    @Benchmark
+    public void conjoint_micro() {
+        System.arraycopy(src, 0, src, 10, size - 10);
+    }
+
+    public static void main(String[] args) throws RunnerException {
+        String[] base_opts =
+            { "-XX:+UnlockDiagnosticVMOptions",
+              "-XX:+IgnoreUnrecognizedVMOptions",
+              "-XX:UseAVX=3" };
+        String[] opts_str1 = {"-XX:-UseCompressedOops"};
+        String[] opts_str2 = {"-XX:+UseCompressedOops"};
+
+        Options baseOpts = new OptionsBuilder()
+            .include(ArrayCopyObject.class.getName())
+            .warmupTime(TimeValue.seconds(30))
+            .measurementTime(TimeValue.seconds(10))
+            .warmupIterations(1)
+            .measurementIterations(2)
+            .jvmArgs(base_opts)
+            .forks(1)
+            .build();
+
+        RunResult r1 = new Runner(new OptionsBuilder()
+            .parent(baseOpts)
+            .jvmArgs(opts_str1)
+            .build()).runSingle();
+
+        RunResult r2 = new Runner(new OptionsBuilder()
+            .parent(baseOpts)
+            .jvmArgs(opts_str2)
+            .build()).runSingle();
+
+        System.out.println(r1.getPrimaryResult().getScore() + r2.getPrimaryResult().getScore());
+    }
+}
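---

Note on the length buckets in TestArrayCopyDisjoint: the element counts { 29, 59, 89, 125, 159, 189, 194, 1024 } are right-shifted per element type (>> 1 for char, >> 2 for int, >> 3 for long), so copies of every primitive type land in roughly the same byte range. The mapping of each bucket name (LT32B, LT64B, ...) to a particular region of the new masked copy stubs is an assumption drawn from the comment in main(); the 32/64-byte boundaries are taken to correspond to the YMM/ZMM copy widths. The following standalone sketch (not part of the patch; the class name CopyLengthBuckets is hypothetical) prints the byte footprint each test length produces per type:

public class CopyLengthBuckets {
    // Element counts used in TestArrayCopyDisjoint.main(); the bucket names
    // follow the comment there: LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2.
    static final int[] LENGTHS = { 29, 59, 89, 125, 159, 189, 194, 1024 };

    public static void main(String[] args) {
        for (int len : LENGTHS) {
            // Byte footprint per primitive type after the per-type right shift.
            System.out.printf("len=%4d -> byte=%4dB char=%4dB int=%4dB long=%4dB%n",
                              len,
                              len,              // 1 byte per element
                              (len >> 1) * 2,   // 2 bytes per element
                              (len >> 2) * 4,   // 4 bytes per element
                              (len >> 3) * 8);  // 8 bytes per element
        }
    }
}

For example, the 29-element bucket copies 29, 28, 28, and 24 bytes for byte/char/int/long respectively, keeping every type under the 32-byte boundary, while 1024 elements exercise the main copy loop for all types.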