8310159: Bulk copy with Unsafe::arrayCopy is slower compared to memcpy

Co-authored-by: Maurizio Cimadamore <mcimadamore@openjdk.org>
Reviewed-by: thartmann, jbhateja, sviswanathan
Author: steveatgh, 2023-11-27 17:35:39 +00:00; committed by Sandhya Viswanathan
Parent: f0a12c571b
Commit: 82967f45db
5 changed files with 259 additions and 0 deletions

src/hotspot/cpu/x86/assembler_x86.cpp
@@ -3417,6 +3417,27 @@ void Assembler::evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge
  emit_operand(dst, src, 0);
}
void Assembler::evmovntdquq(Address dst, XMMRegister src, int vector_len) {
  // Unmasked instruction
  evmovntdquq(dst, k0, src, /*merge*/ true, vector_len);
}

void Assembler::evmovntdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
  assert(VM_Version::supports_evex(), "");
  assert(src != xnoreg, "sanity");
  InstructionMark im(this);
  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attributes.set_embedded_opmask_register_specifier(mask);
  if (merge) {
    attributes.reset_is_clear_context();
  }
  attributes.set_is_evex_instruction();
  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
  emit_int8(0xE7);
  emit_operand(src, dst, 0);
}
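Note: VMOVNTDQ (opcode 0xE7, emitted above) is a non-temporal store. It writes the 64-byte vector through write-combining buffers, bypassing the cache hierarchy, which is what keeps a huge copy from evicting the caller's working set. A minimal standalone sketch of the same operation using compiler intrinsics (not HotSpot code; function names are illustrative; assumes AVX-512F and a 64-byte-aligned destination, which VMOVNTDQ requires):

```cpp
#include <immintrin.h>

// Hypothetical helper: one 64-byte non-temporal copy step.
void stream_store_64(void* dst, const void* src) {
  __m512i v = _mm512_loadu_si512(src);    // normal (cached) 64-byte load
  _mm512_stream_si512((__m512i*)dst, v);  // non-temporal store, bypasses caches
}

// After the last streaming store, an sfence orders the writes with respect
// to subsequent stores (see the sfence in arraycopy_avx3_large below).
void stream_done() {
  _mm_sfence();
}
```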
void Assembler::evmovdquq(Address dst, XMMRegister src, int vector_len) {
  // Unmasked instruction
  evmovdquq(dst, k0, src, /*merge*/ true, vector_len);

src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1615,6 +1615,9 @@ private:
  void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
  void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);

  void evmovntdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
  void evmovntdquq(Address dst, XMMRegister src, int vector_len);

  void evmovdquq(Address dst, XMMRegister src, int vector_len);
  void evmovdquq(XMMRegister dst, Address src, int vector_len);
  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);

src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
@@ -187,11 +187,20 @@ class StubGenerator: public StubCodeGenerator {
                                     Register index, Register temp,
                                     bool use64byteVector, Label& L_entry, Label& L_exit);

  void arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                        Register to, Register count, int shift,
                                        Register index, Register temp, Label& L_exit);

  void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                             Register to, Register start_index, Register end_index,
                                             Register count, int shift, Register temp,
                                             bool use64byteVector, Label& L_entry, Label& L_exit);

  void arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                            Register temp3, Register temp4, Register count,
                            XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                            XMMRegister xmm4, int shift);

  void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  int shift = Address::times_1, int offset = 0);
@@ -199,6 +208,9 @@ class StubGenerator: public StubCodeGenerator {
                  bool conjoint, int shift = Address::times_1, int offset = 0,
                  bool use64byteVector = false);

  void copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1, XMMRegister xmm2,
                    XMMRegister xmm3, XMMRegister xmm4, int shift, int offset = 0);

  void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0,

src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp
@@ -515,8 +515,10 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
  const int large_threshold = 2621440; // 2.5 MB

  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_copy_large, L_finish;
  const Register from  = rdi;  // source array address
  const Register to    = rsi;  // destination array address
  const Register count = rdx;  // elements count
@@ -577,6 +579,12 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
  // PRE-MAIN-POST loop for aligned copy.
  __ BIND(L_entry);

  if (MaxVectorSize == 64) {
    __ movq(temp2, temp1);
    __ shlq(temp2, shift);
    __ cmpq(temp2, large_threshold);
    __ jcc(Assembler::greaterEqual, L_copy_large);
  }
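The guard above converts the element count (held in temp1 at this point) into a byte count and routes copies of at least 2.5 MB to the non-temporal path. A hypothetical scalar rendering of the same decision:

```cpp
// Hypothetical scalar form of the dispatch above: shift is log2 of the
// element size, and 2621440 bytes (2.5 MB) is the cutover to the
// non-temporal large-copy path.
bool take_large_copy_path(long element_count, int shift) {
  long byte_count = element_count << shift;  // movq + shlq above
  return byte_count >= 2621440;              // cmpq + jcc(greaterEqual)
}
```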
  if (avx3threshold != 0) {
    __ cmpq(count, threshold[shift]);
    if (MaxVectorSize == 64) {
@@ -703,6 +711,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
    __ BIND(L_exit);
  }

  __ BIND(L_finish);
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
@@ -717,9 +726,77 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  if (MaxVectorSize == 64) {
    __ BIND(L_copy_large);
    arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
    __ jmp(L_finish);
  }
  return start;
}
void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                                         Register temp3, Register temp4, Register count,
                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                                         XMMRegister xmm4, int shift) {
  // Type(shift)       byte(0), short(1), int(2), long(3)
  int loop_size[]   = { 256,     128,      64,     32};
  int threshold[]   = { 4096,    2048,     1024,   512};

  Label L_main_loop_large;
  Label L_tail_large;
  Label L_exit_large;
  Label L_entry_large;
  Label L_main_pre_loop_large;
  Label L_pre_main_post_large;

  assert(MaxVectorSize == 64, "vector length != 64");

  __ BIND(L_entry_large);

  __ BIND(L_pre_main_post_large);
  // Partial copy to make dst address 64 byte aligned.
  __ movq(temp2, to);
  __ andq(temp2, 63);
  __ jcc(Assembler::equal, L_main_pre_loop_large);

  __ negptr(temp2);
  __ addq(temp2, 64);
  if (shift) {
    __ shrq(temp2, shift);
  }
  __ movq(temp3, temp2);
  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
  __ movq(temp4, temp2);
  __ movq(temp1, count);
  __ subq(temp1, temp2);

  __ cmpq(temp1, loop_size[shift]);
  __ jcc(Assembler::less, L_tail_large);

  __ BIND(L_main_pre_loop_large);
  __ subq(temp1, loop_size[shift]);

  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
  __ align32();
  __ BIND(L_main_loop_large);
  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
  __ addptr(temp4, loop_size[shift]);
  __ subq(temp1, loop_size[shift]);
  __ jcc(Assembler::greater, L_main_loop_large);

  // fence needed because copy256_avx3 uses non-temporal stores
  __ sfence();

  __ addq(temp1, loop_size[shift]);

  // Zero length check.
  __ jcc(Assembler::lessEqual, L_exit_large);

  __ BIND(L_tail_large);
  // Tail handling using 64 byte [masked] vector copy operations.
  __ cmpq(temp1, 0);
  __ jcc(Assembler::lessEqual, L_exit_large);
  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, L_exit_large);

  __ BIND(L_exit_large);
}
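Taken together, the generated code above has the following shape: a masked partial copy to 64-byte-align the destination, a main loop moving 256 bytes per iteration with non-temporal stores, an sfence, then a masked tail. A standalone intrinsics sketch for byte elements (shift == 0); large_copy_bytes is a hypothetical name, and the sketch assumes AVX-512F/BW, non-overlapping arrays, and a length large enough to cover the alignment head (the stub only enters this path at 2.5 MB and above):

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstring>

void large_copy_bytes(uint8_t* dst, const uint8_t* src, size_t n) {
  // Partial masked copy so that dst becomes 64-byte aligned.
  size_t head = (64 - ((uintptr_t)dst & 63)) & 63;   // 0..63 bytes
  if (head != 0) {
    __mmask64 m = ((__mmask64)1 << head) - 1;
    _mm512_mask_storeu_epi8(dst, m, _mm512_maskz_loadu_epi8(m, src));
    dst += head; src += head; n -= head;
  }
  // Main loop: 256 bytes per iteration, four cached loads feeding four
  // non-temporal 64-byte stores to the now-aligned destination.
  while (n >= 256) {
    __m512i a = _mm512_loadu_si512(src);
    __m512i b = _mm512_loadu_si512(src + 64);
    __m512i c = _mm512_loadu_si512(src + 128);
    __m512i d = _mm512_loadu_si512(src + 192);
    _mm512_stream_si512((__m512i*)(dst),       a);
    _mm512_stream_si512((__m512i*)(dst + 64),  b);
    _mm512_stream_si512((__m512i*)(dst + 128), c);
    _mm512_stream_si512((__m512i*)(dst + 192), d);
    src += 256; dst += 256; n -= 256;
  }
  _mm_sfence();          // fence because the loop used non-temporal stores
  memcpy(dst, src, n);   // tail; the stub uses 64-byte [masked] vector copies
}
```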
// Inputs:
// c_rarg0 - source array address
@@ -965,6 +1042,55 @@ void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask
  __ jmp(L_exit);
}
void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                                     Register to, Register count, int shift, Register index,
                                                     Register temp, Label& L_exit) {
  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;

  int size_mat[][4] = {
    /* T_BYTE  */ {64, 128, 192, 256},
    /* T_SHORT */ {32,  64,  96, 128},
    /* T_INT   */ {16,  32,  48,  64},
    /* T_LONG  */ { 8,  16,  24,  32}
  };

  assert(MaxVectorSize == 64, "vector length != 64");

  // Case A) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  __ subq(count, 64 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][2]);
  __ jcc(Assembler::greater, L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  __ subq(count, 128 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 256 bytes.
  __ BIND(L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  copy64_avx(to, from, index, xmm, false, shift, 128, true);
  __ subq(count, 192 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
  __ jmp(L_exit);
}
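The four cases above handle a tail of at most 256 bytes with up to three full 64-byte vector copies plus one masked copy, so no scalar cleanup loop is needed. A hypothetical scalar rendering for byte elements (shift == 0; assumes AVX-512BW; copy_tail_256 is a made-up name):

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstddef>

void copy_tail_256(uint8_t* dst, const uint8_t* src, size_t n) {
  size_t done = 0;
  while (n - done > 64) {   // full 64-byte chunks (cases B, C, D)
    _mm512_storeu_si512(dst + done, _mm512_loadu_si512(src + done));
    done += 64;
  }
  size_t rem = n - done;    // final 1..64 bytes, masked (every case ends here)
  __mmask64 m = (rem == 64) ? ~(__mmask64)0 : (((__mmask64)1 << rem) - 1);
  _mm512_mask_storeu_epi8(dst + done, m, _mm512_maskz_loadu_epi8(m, src + done));
}
```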
void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                          Register to, Register start_index, Register end_index,
                                                          Register count, int shift, Register temp,
@@ -1040,6 +1166,33 @@ void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask,
  __ jmp(L_exit);
}
void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                                 int shift, int offset) {
  if (MaxVectorSize == 64) {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ prefetcht0(Address(src, index, scale, offset + 0x200));
    __ prefetcht0(Address(src, index, scale, offset + 0x240));
    __ prefetcht0(Address(src, index, scale, offset + 0x280));
    __ prefetcht0(Address(src, index, scale, offset + 0x2C0));

    __ prefetcht0(Address(src, index, scale, offset + 0x400));
    __ prefetcht0(Address(src, index, scale, offset + 0x440));
    __ prefetcht0(Address(src, index, scale, offset + 0x480));
    __ prefetcht0(Address(src, index, scale, offset + 0x4C0));

    __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
    __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
    __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);

    __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
  }
}
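Each 256-byte block thus prefetches the source two and four blocks ahead (offsets 0x200-0x2C0 and 0x400-0x4C0, T0 hint) before doing four cached 64-byte loads and four non-temporal 64-byte stores, hiding memory latency for upcoming iterations. A sketch of just the prefetch pattern with intrinsics (prefetch_for_block is a made-up name; distances taken from the offsets above):

```cpp
#include <immintrin.h>
#include <cstdint>

// For the 256-byte block at src + off, touch the blocks 512 and 1024 bytes
// further ahead so they are in L1 by the time the copy loop reaches them.
void prefetch_for_block(const uint8_t* src, long off) {
  for (long d = 0x200; d <= 0x2C0; d += 0x40)
    _mm_prefetch((const char*)(src + off + d), _MM_HINT_T0);
  for (long d = 0x400; d <= 0x4C0; d += 0x40)
    _mm_prefetch((const char*)(src + off + d), _MM_HINT_T0);
}
```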
void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset,

test/micro/org/openjdk/bench/java/lang/ArrayCopyAlignedLarge.java
@@ -0,0 +1,70 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.lang;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

import java.util.concurrent.TimeUnit;

/**
 * Benchmark measuring aligned System.arraycopy.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
@Warmup(iterations = 10, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(value = 3)
public class ArrayCopyAlignedLarge {

    @Param({"100000", "1000000", "2000000", "5000000", "10000000"})
    int length;

    int fromPos, toPos;
    byte[] fromByteArr, toByteArr;

    @Setup
    public void setup() {
        // Both positions aligned
        fromPos = 0;
        toPos = 0;

        fromByteArr = new byte[length];
        toByteArr = new byte[length];
    }

    @Benchmark
    public void testByte() {
        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
    }
}
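Note: JMH microbenchmarks in the JDK tree are conventionally run through the build system, e.g. make test TEST="micro:java.lang.ArrayCopyAlignedLarge" (per the standard OpenJDK testing docs; the exact invocation depends on the local build setup). The parameter values span copies below and well above the 2.5 MB non-temporal threshold, so the benchmark exercises both paths.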