8310159: Bulk copy with Unsafe::arrayCopy is slower compared to memcpy
Co-authored-by: Maurizio Cimadamore <mcimadamore@openjdk.org>
Reviewed-by: thartmann, jbhateja, sviswanathan
parent f0a12c571b
commit 82967f45db
src/hotspot/cpu/x86
test/micro/org/openjdk/bench/java/lang
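In outline: the patch adds an EVEX encoding for the non-temporal store `vmovntdq` (`Assembler::evmovntdquq`, opcode 0xE7) and, for disjoint copies of at least 2.5 MB when MaxVectorSize == 64, routes the AVX3 copy stub to a new `arraycopy_avx3_large` path. That path aligns the destination to 64 bytes with a masked copy, streams 256 bytes per iteration with prefetching and non-temporal stores, issues an `sfence`, and finishes the tail with masked 64-byte copies. A JMH benchmark, `ArrayCopyAlignedLarge`, is added to track large aligned `System.arraycopy` calls.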
@@ -3417,6 +3417,27 @@ void Assembler::evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge
  emit_operand(dst, src, 0);
}

void Assembler::evmovntdquq(Address dst, XMMRegister src, int vector_len) {
  // Unmasked instruction
  evmovntdquq(dst, k0, src, /*merge*/ true, vector_len);
}

void Assembler::evmovntdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
  assert(VM_Version::supports_evex(), "");
  assert(src != xnoreg, "sanity");
  InstructionMark im(this);
  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attributes.set_embedded_opmask_register_specifier(mask);
  if (merge) {
    attributes.reset_is_clear_context();
  }
  attributes.set_is_evex_instruction();
  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
  emit_int8(0xE7);
  emit_operand(src, dst, 0);
}

void Assembler::evmovdquq(Address dst, XMMRegister src, int vector_len) {
  // Unmasked instruction
  evmovdquq(dst, k0, src, /*merge*/ true, vector_len);
@@ -1615,6 +1615,9 @@ private:
  void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
  void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);

  void evmovntdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
  void evmovntdquq(Address dst, XMMRegister src, int vector_len);

  void evmovdquq(Address dst, XMMRegister src, int vector_len);
  void evmovdquq(XMMRegister dst, Address src, int vector_len);
  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
@@ -187,11 +187,20 @@ class StubGenerator: public StubCodeGenerator {
                                Register index, Register temp,
                                bool use64byteVector, Label& L_entry, Label& L_exit);

  void arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                        Register to, Register count, int shift,
                                        Register index, Register temp, Label& L_exit);

  void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                             Register to, Register start_index, Register end_index,
                                             Register count, int shift, Register temp,
                                             bool use64byteVector, Label& L_entry, Label& L_exit);

  void arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                            Register temp3, Register temp4, Register count,
                            XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                            XMMRegister xmm4, int shift);

  void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  int shift = Address::times_1, int offset = 0);

@@ -199,6 +208,9 @@ class StubGenerator: public StubCodeGenerator {
                  bool conjoint, int shift = Address::times_1, int offset = 0,
                  bool use64byteVector = false);

  void copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1, XMMRegister xmm2,
                    XMMRegister xmm3, XMMRegister xmm4, int shift, int offset = 0);

  void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0,
@@ -515,8 +515,10 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
  const int large_threshold = 2621440; // 2.5 MB
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_copy_large, L_finish;
  const Register from = rdi;  // source array address
  const Register to = rsi;    // destination array address
  const Register count = rdx; // elements count
@@ -577,6 +579,12 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
  // PRE-MAIN-POST loop for aligned copy.
  __ BIND(L_entry);

  if (MaxVectorSize == 64) {
    __ movq(temp2, temp1);
    __ shlq(temp2, shift);
    __ cmpq(temp2, large_threshold);
    __ jcc(Assembler::greaterEqual, L_copy_large);
  }
  if (avx3threshold != 0) {
    __ cmpq(count, threshold[shift]);
    if (MaxVectorSize == 64) {
@@ -703,6 +711,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
    __ BIND(L_exit);
  }

  __ BIND(L_finish);
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
@@ -717,9 +726,77 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  if (MaxVectorSize == 64) {
    __ BIND(L_copy_large);
    arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
    __ jmp(L_finish);
  }
  return start;
}

void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                                         Register temp3, Register temp4, Register count,
                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                                         XMMRegister xmm4, int shift) {

  // Type(shift)     byte(0), short(1), int(2), long(3)
  int loop_size[] = { 256,    128,      64,     32 };
  int threshold[] = { 4096,   2048,     1024,   512 };

  Label L_main_loop_large;
  Label L_tail_large;
  Label L_exit_large;
  Label L_entry_large;
  Label L_main_pre_loop_large;
  Label L_pre_main_post_large;

  assert(MaxVectorSize == 64, "vector length != 64");
  __ BIND(L_entry_large);

  __ BIND(L_pre_main_post_large);
  // Partial copy to make dst address 64 byte aligned.
  __ movq(temp2, to);
  __ andq(temp2, 63);
  __ jcc(Assembler::equal, L_main_pre_loop_large);

  __ negptr(temp2);
  __ addq(temp2, 64);
  if (shift) {
    __ shrq(temp2, shift);
  }
  __ movq(temp3, temp2);
  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
  __ movq(temp4, temp2);
  __ movq(temp1, count);
  __ subq(temp1, temp2);

  __ cmpq(temp1, loop_size[shift]);
  __ jcc(Assembler::less, L_tail_large);

  __ BIND(L_main_pre_loop_large);
  __ subq(temp1, loop_size[shift]);

  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
  __ align32();
  __ BIND(L_main_loop_large);
  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
  __ addptr(temp4, loop_size[shift]);
  __ subq(temp1, loop_size[shift]);
  __ jcc(Assembler::greater, L_main_loop_large);
  // fence needed because copy256_avx3 uses non-temporal stores
  __ sfence();

  __ addq(temp1, loop_size[shift]);
  // Zero length check.
  __ jcc(Assembler::lessEqual, L_exit_large);
  __ BIND(L_tail_large);
  // Tail handling using 64 byte [masked] vector copy operations.
  __ cmpq(temp1, 0);
  __ jcc(Assembler::lessEqual, L_exit_large);
  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, L_exit_large);
  __ BIND(L_exit_large);
}

// Inputs:
//   c_rarg0 - source array address
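For reference, the alignment peel at the top of `arraycopy_avx3_large` (the `andq`/`negptr`/`addq`/`shrq` sequence) computes the element count shown in this scalar C++ sketch; the function name and types are illustrative, not part of the patch:

```cpp
#include <cstddef>
#include <cstdint>

// Scalar equivalent of the peel count computed into temp2:
//   mis = to & 63; if (mis != 0) peel = (64 - mis) >> shift;
// 'shift' encodes the element size: byte = 0, short = 1, int = 2, long = 3.
static size_t peel_elements(uintptr_t to, int shift) {
  uintptr_t mis = to & 63;     // bytes past the previous 64-byte boundary
  if (mis == 0) return 0;      // destination already aligned, nothing to peel
  return (64 - mis) >> shift;  // bytes up to the next boundary, in elements
}
```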
@@ -965,6 +1042,55 @@ void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask
  __ jmp(L_exit);
}

void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                                     Register to, Register count, int shift, Register index,
                                                     Register temp, Label& L_exit) {
  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;

  int size_mat[][4] = {
    /* T_BYTE */ {64, 128, 192, 256},
    /* T_SHORT*/ {32, 64 , 96 , 128},
    /* T_INT  */ {16, 32 , 48 , 64},
    /* T_LONG */ { 8, 16 , 24 , 32}
  };

  assert(MaxVectorSize == 64, "vector length != 64");
  // Case A) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  __ subq(count, 64 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][2]);
  __ jcc(Assembler::greater, L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  __ subq(count, 128 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 256 bytes.
  __ BIND(L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  copy64_avx(to, from, index, xmm, false, shift, 128, true);
  __ subq(count, 192 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
  __ jmp(L_exit);
}

void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                          Register to, Register start_index, Register end_index,
                                                          Register count, int shift, Register temp,
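The `size_mat` rows are just the byte cut-offs {64, 128, 192, 256} converted to element counts per type (`elements = bytes >> shift`). A standalone check, with hypothetical names, reproduces the table:

```cpp
#include <cstdio>

// Recompute size_mat from the byte cut-offs; each row should match the
// table above (e.g. T_INT, shift = 2: 16, 32, 48, 64).
int main() {
  const int cutoff_bytes[4] = {64, 128, 192, 256};
  const char* type_names[4] = {"T_BYTE", "T_SHORT", "T_INT", "T_LONG"};
  for (int shift = 0; shift < 4; shift++) {
    printf("%-8s", type_names[shift]);
    for (int i = 0; i < 4; i++) {
      printf(" %4d", cutoff_bytes[i] >> shift); // elements per cut-off
    }
    printf("\n");
  }
  return 0;
}
```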
@@ -1040,6 +1166,33 @@ void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegi
  __ jmp(L_exit);
}

void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                                 int shift, int offset) {
  if (MaxVectorSize == 64) {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ prefetcht0(Address(src, index, scale, offset + 0x200));
    __ prefetcht0(Address(src, index, scale, offset + 0x240));
    __ prefetcht0(Address(src, index, scale, offset + 0x280));
    __ prefetcht0(Address(src, index, scale, offset + 0x2C0));

    __ prefetcht0(Address(src, index, scale, offset + 0x400));
    __ prefetcht0(Address(src, index, scale, offset + 0x440));
    __ prefetcht0(Address(src, index, scale, offset + 0x480));
    __ prefetcht0(Address(src, index, scale, offset + 0x4C0));

    __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
    __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
    __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);

    __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
  }
}

void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset,
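For readers who don't speak MacroAssembler, here is roughly what `copy256_avx3` plus the surrounding loop amounts to, as a hedged C++ intrinsics sketch. It assumes AVX-512F and non-overlapping buffers, and is an illustration of the technique, not the stub's exact code; the real stub uses masked vector copies for the peel and tail, and eight prefetches per iteration.

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstring>

// Illustrative large-copy loop: align destination to 64 bytes, stream
// 256 bytes per iteration with non-temporal stores, fence, then copy the tail.
void copy_large_nt(void* dst, const void* src, size_t len) {
  char* d = static_cast<char*>(dst);
  const char* s = static_cast<const char*>(src);

  // Peel until dst is 64-byte aligned (vmovntdq requires alignment).
  size_t peel = (64 - (reinterpret_cast<uintptr_t>(d) & 63)) & 63;
  if (peel > len) peel = len;
  memcpy(d, s, peel);
  d += peel; s += peel; len -= peel;

  while (len >= 256) {
    // Prefetch well ahead, mirroring the 0x200/0x400 distances in the stub.
    _mm_prefetch(s + 0x200, _MM_HINT_T0);
    _mm_prefetch(s + 0x400, _MM_HINT_T0);

    __m512i x0 = _mm512_loadu_si512(s);
    __m512i x1 = _mm512_loadu_si512(s + 64);
    __m512i x2 = _mm512_loadu_si512(s + 128);
    __m512i x3 = _mm512_loadu_si512(s + 192);

    // Non-temporal stores bypass the cache, like evmovntdquq above.
    _mm512_stream_si512(reinterpret_cast<__m512i*>(d), x0);
    _mm512_stream_si512(reinterpret_cast<__m512i*>(d + 64), x1);
    _mm512_stream_si512(reinterpret_cast<__m512i*>(d + 128), x2);
    _mm512_stream_si512(reinterpret_cast<__m512i*>(d + 192), x3);

    d += 256; s += 256; len -= 256;
  }
  _mm_sfence(); // order the streaming stores, as the stub's sfence does

  memcpy(d, s, len); // tail
}
```

The point of the non-temporal stores is that a copy larger than the cache would otherwise evict useful data while writing destination lines it will not read again; streaming stores avoid that, at the cost of the explicit `sfence`.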
@@ -0,0 +1,70 @@
/*
 * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package org.openjdk.bench.java.lang;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

import java.util.concurrent.TimeUnit;

/**
 * Benchmark measuring aligned System.arraycopy.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
@Warmup(iterations = 10, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(value = 3)
public class ArrayCopyAlignedLarge {

    @Param({"100000", "1000000", "2000000", "5000000", "10000000"})
    int length;

    int fromPos, toPos;
    byte[] fromByteArr, toByteArr;

    @Setup
    public void setup() {
        // Both positions aligned
        fromPos = 0;
        toPos = 0;

        fromByteArr = new byte[length];
        toByteArr = new byte[length];
    }

    @Benchmark
    public void testByte() {
        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
    }
}
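Usage note (assuming the standard OpenJDK micro-benchmark workflow): the benchmark should be runnable from a JMH-enabled build with something like `make test TEST="micro:java.lang.ArrayCopyAlignedLarge"`. Of the `length` values, only 5000000 and 10000000 exceed the 2,621,440-byte `large_threshold`, so those two parameters are the ones that exercise the new non-temporal path on an AVX-512 machine.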