8302780: Add support for vectorized arraycopy GC barriers

Co-authored-by: Yadong Wang <yadongwang@openjdk.org>
Reviewed-by: ayang, fyang, rcastanedalo, aph
Erik Österlund 2023-03-06 07:58:23 +00:00
parent d00a767047
commit 5f153e056b
11 changed files with 938 additions and 305 deletions
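For orientation: the change routes the raw loads and stores of the arraycopy stubs through new virtual BarrierSetAssembler::copy_load_at / copy_store_at hooks (shown in the hunks below), including the paired 16- and 32-byte vector paths, so that a collector's barrier set can interpose on them. As a rough illustration of the intent (not code from this change), a hypothetical collector-specific barrier set could override one of the hooks, mirroring the AArch64 declaration below; ExampleBarrierSetAssembler and emit_load_barrier are made-up names:

class ExampleBarrierSetAssembler : public BarrierSetAssembler {
public:
  void copy_load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
                    size_t bytes, Register dst1, Register dst2, Address src,
                    Register tmp) override {
    // Delegate the raw (possibly paired) load to the shared implementation...
    BarrierSetAssembler::copy_load_at(masm, decorators, type, bytes,
                                      dst1, dst2, src, tmp);
    // ...then apply a collector-specific load barrier to reference elements.
    if (is_reference_type(type)) {
      emit_load_barrier(masm, dst1, tmp);  // illustrative helper, not a real API
    }
  }
};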

@ -119,6 +119,111 @@ void BarrierSetAssembler::store_at(MacroAssembler* masm, DecoratorSet decorators
}
}
void BarrierSetAssembler::copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Register dst1,
Register dst2,
Address src,
Register tmp) {
if (bytes == 1) {
assert(dst2 == noreg, "invariant");
__ ldrb(dst1, src);
} else if (bytes == 2) {
assert(dst2 == noreg, "invariant");
__ ldrh(dst1, src);
} else if (bytes == 4) {
assert(dst2 == noreg, "invariant");
__ ldrw(dst1, src);
} else if (bytes == 8) {
assert(dst2 == noreg, "invariant");
__ ldr(dst1, src);
} else if (bytes == 16) {
assert(dst2 != noreg, "invariant");
assert(dst2 != dst1, "invariant");
__ ldp(dst1, dst2, src);
} else {
// Not the right size
ShouldNotReachHere();
}
if ((decorators & ARRAYCOPY_CHECKCAST) != 0 && UseCompressedOops) {
__ decode_heap_oop(dst1);
}
}
void BarrierSetAssembler::copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
Register src1,
Register src2,
Register tmp1,
Register tmp2,
Register tmp3) {
if ((decorators & ARRAYCOPY_CHECKCAST) != 0 && UseCompressedOops) {
__ encode_heap_oop(src1);
}
if (bytes == 1) {
assert(src2 == noreg, "invariant");
__ strb(src1, dst);
} else if (bytes == 2) {
assert(src2 == noreg, "invariant");
__ strh(src1, dst);
} else if (bytes == 4) {
assert(src2 == noreg, "invariant");
__ strw(src1, dst);
} else if (bytes == 8) {
assert(src2 == noreg, "invariant");
__ str(src1, dst);
} else if (bytes == 16) {
assert(src2 != noreg, "invariant");
assert(src2 != src1, "invariant");
__ stp(src1, src2, dst);
} else {
// Not the right size
ShouldNotReachHere();
}
}
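The decode_heap_oop / encode_heap_oop calls above take effect only under ARRAYCOPY_CHECKCAST, because the checkcast stub needs an uncompressed oop in the register for its per-element klass check before storing. Conceptually, the element loop's data flow looks like this (an illustrative sketch, not code from this change; names are approximate):

// Illustration only: per-element data flow of the checkcast arraycopy loop
// with compressed oops enabled.
//   narrowOop n = *from++;                // copy_load_at: raw 4-byte load
//   oop o = CompressedOops::decode(n);    // decoded so the subtype check can inspect o->klass()
//   if (o != nullptr && !o->klass()->is_subtype_of(dst_elem_klass)) return partially_copied;
//   *to++ = CompressedOops::encode(o);    // copy_store_at: re-encode before the raw 4-byte store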
void BarrierSetAssembler::copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
FloatRegister dst1,
FloatRegister dst2,
Address src,
Register tmp1,
Register tmp2,
FloatRegister vec_tmp) {
if (bytes == 32) {
__ ldpq(dst1, dst2, src);
} else {
ShouldNotReachHere();
}
}
void BarrierSetAssembler::copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
FloatRegister src1,
FloatRegister src2,
Register tmp1,
Register tmp2,
Register tmp3,
FloatRegister vec_tmp1,
FloatRegister vec_tmp2,
FloatRegister vec_tmp3) {
if (bytes == 32) {
__ stpq(src1, src2, dst);
} else {
ShouldNotReachHere();
}
}
void BarrierSetAssembler::try_resolve_jobject_in_native(MacroAssembler* masm, Register jni_env,
Register obj, Register tmp, Label& slowpath) {
// If mask changes we need to ensure that the inverse is still encodable as an immediate

@ -48,6 +48,52 @@ public:
Register src, Register dst, Register count, RegSet saved_regs) {}
virtual void arraycopy_epilogue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop,
Register start, Register count, Register tmp, RegSet saved_regs) {}
virtual void copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Register dst1,
Register dst2,
Address src,
Register tmp);
virtual void copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
Register src1,
Register src2,
Register tmp1,
Register tmp2,
Register tmp3);
virtual void copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
FloatRegister dst1,
FloatRegister dst2,
Address src,
Register tmp1,
Register tmp2,
FloatRegister vec_tmp);
virtual void copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
FloatRegister src1,
FloatRegister src2,
Register tmp1,
Register tmp2,
Register tmp3,
FloatRegister vec_tmp1,
FloatRegister vec_tmp2,
FloatRegister vec_tmp3);
virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Register dst, Address src, Register tmp1, Register tmp2);
virtual void store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,

@ -696,6 +696,79 @@ class StubGenerator: public StubCodeGenerator {
copy_backwards = -1
} copy_direction;
// Helper object to reduce noise when telling the GC barriers how to perform loads and stores
// for arraycopy stubs.
class ArrayCopyBarrierSetHelper : StackObj {
BarrierSetAssembler* _bs_asm;
MacroAssembler* _masm;
DecoratorSet _decorators;
BasicType _type;
Register _gct1;
Register _gct2;
Register _gct3;
FloatRegister _gcvt1;
FloatRegister _gcvt2;
FloatRegister _gcvt3;
public:
ArrayCopyBarrierSetHelper(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
Register gct1,
Register gct2,
Register gct3,
FloatRegister gcvt1,
FloatRegister gcvt2,
FloatRegister gcvt3)
: _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
_masm(masm),
_decorators(decorators),
_type(type),
_gct1(gct1),
_gct2(gct2),
_gct3(gct3),
_gcvt1(gcvt1),
_gcvt2(gcvt2),
_gcvt3(gcvt3) {
}
void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
_bs_asm->copy_load_at(_masm, _decorators, _type, 32,
dst1, dst2, src,
_gct1, _gct2, _gcvt1);
}
void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
_bs_asm->copy_store_at(_masm, _decorators, _type, 32,
dst, src1, src2,
_gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
}
void copy_load_at_16(Register dst1, Register dst2, Address src) {
_bs_asm->copy_load_at(_masm, _decorators, _type, 16,
dst1, dst2, src,
_gct1);
}
void copy_store_at_16(Address dst, Register src1, Register src2) {
_bs_asm->copy_store_at(_masm, _decorators, _type, 16,
dst, src1, src2,
_gct1, _gct2, _gct3);
}
void copy_load_at_8(Register dst, Address src) {
_bs_asm->copy_load_at(_masm, _decorators, _type, 8,
dst, noreg, src,
_gct1);
}
void copy_store_at_8(Address dst, Register src) {
_bs_asm->copy_store_at(_masm, _decorators, _type, 8,
dst, src, noreg,
_gct1, _gct2, _gct3);
}
};
// Bulk copy of blocks of 8 words.
//
// count is a count of words.
@ -709,17 +782,20 @@ class StubGenerator: public StubCodeGenerator {
//
// s and d are adjusted to point to the remaining words to copy
//
void generate_copy_longs(Label &start, Register s, Register d, Register count,
void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
copy_direction direction) {
int unit = wordSize * direction;
int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
t4 = r7, t5 = r10, t6 = r11, t7 = r12;
const Register stride = r13;
t4 = r7, t5 = r11, t6 = r12, t7 = r13;
const Register stride = r14;
const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v8;
ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
assert_different_registers(s, d, count, rscratch1);
assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
assert_different_registers(s, d, count, rscratch1, rscratch2);
Label again, drain;
const char *stub_name;
@ -757,13 +833,13 @@ class StubGenerator: public StubCodeGenerator {
// Fill 8 registers
if (UseSIMDForMemoryOps) {
__ ldpq(v0, v1, Address(s, 4 * unit));
__ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
} else {
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
}
__ subs(count, count, 16);
@ -783,19 +859,19 @@ class StubGenerator: public StubCodeGenerator {
__ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
if (UseSIMDForMemoryOps) {
__ stpq(v0, v1, Address(d, 4 * unit));
__ ldpq(v0, v1, Address(s, 4 * unit));
__ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
__ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
} else {
__ stp(t0, t1, Address(d, 2 * unit));
__ ldp(t0, t1, Address(s, 2 * unit));
__ stp(t2, t3, Address(d, 4 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ stp(t4, t5, Address(d, 6 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ stp(t6, t7, Address(__ pre(d, 8 * unit)));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
}
__ subs(count, count, 8);
@ -804,26 +880,26 @@ class StubGenerator: public StubCodeGenerator {
// Drain
__ bind(drain);
if (UseSIMDForMemoryOps) {
__ stpq(v0, v1, Address(d, 4 * unit));
__ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
} else {
__ stp(t0, t1, Address(d, 2 * unit));
__ stp(t2, t3, Address(d, 4 * unit));
__ stp(t4, t5, Address(d, 6 * unit));
__ stp(t6, t7, Address(__ pre(d, 8 * unit)));
bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
}
{
Label L1, L2;
__ tbz(count, exact_log2(4), L1);
if (UseSIMDForMemoryOps) {
__ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
__ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
} else {
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
__ stp(t0, t1, Address(d, 2 * unit));
__ stp(t2, t3, Address(__ pre(d, 4 * unit)));
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
}
__ bind(L1);
@ -833,8 +909,8 @@ class StubGenerator: public StubCodeGenerator {
}
__ tbz(count, 1, L2);
__ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
__ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
__ bind(L2);
}
@ -893,10 +969,10 @@ class StubGenerator: public StubCodeGenerator {
// t4 at offset -48, t5 at offset -40
// t6 at offset -64, t7 at offset -56
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
__ subs(count, count, 16);
__ br(Assembler::LO, drain);
@ -925,15 +1001,15 @@ class StubGenerator: public StubCodeGenerator {
// t5 at offset 40, t6 at offset 48
// t7 at offset 56
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ ldp(t0, t1, Address(s, 2 * unit));
__ stp(t3, t4, Address(d, 4 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ stp(t5, t6, Address(d, 6 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ str(t7, Address(__ pre(d, 8 * unit)));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
bs.copy_store_at_8(Address(d, 1 * unit), t0);
bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
} else {
// d was not offset when we started so the registers are
// written into the 64 bit block preceding d with the following
@ -948,15 +1024,15 @@ class StubGenerator: public StubCodeGenerator {
// note that this matches the offsets previously noted for the
// loads
__ str(t1, Address(d, 1 * unit));
__ stp(t3, t0, Address(d, 3 * unit));
__ ldp(t0, t1, Address(s, 2 * unit));
__ stp(t5, t2, Address(d, 5 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ stp(t7, t4, Address(d, 7 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ str(t6, Address(__ pre(d, 8 * unit)));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
bs.copy_store_at_8(Address(d, 1 * unit), t1);
bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
}
__ subs(count, count, 8);
@ -968,17 +1044,17 @@ class StubGenerator: public StubCodeGenerator {
// as above
__ bind(drain);
if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ stp(t3, t4, Address(d, 4 * unit));
__ stp(t5, t6, Address(d, 6 * unit));
__ str(t7, Address(__ pre(d, 8 * unit)));
bs.copy_store_at_8(Address(d, 1 * unit), t0);
bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
} else {
__ str(t1, Address(d, 1 * unit));
__ stp(t3, t0, Address(d, 3 * unit));
__ stp(t5, t2, Address(d, 5 * unit));
__ stp(t7, t4, Address(d, 7 * unit));
__ str(t6, Address(__ pre(d, 8 * unit)));
bs.copy_store_at_8(Address(d, 1 * unit), t1);
bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
}
// now we need to copy any remaining partial block, which may
// include a 4-word subblock and/or a 2-word subblock.
@ -991,16 +1067,16 @@ class StubGenerator: public StubCodeGenerator {
// with only one intervening stp between the str instructions
// but note that the offsets and registers still follow the
// same pattern
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ str(t3, Address(__ pre(d, 4 * unit)));
bs.copy_store_at_8(Address(d, 1 * unit), t0);
bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
} else {
__ str(t1, Address(d, 1 * unit));
__ stp(t3, t0, Address(d, 3 * unit));
__ str(t2, Address(__ pre(d, 4 * unit)));
bs.copy_store_at_8(Address(d, 1 * unit), t1);
bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
}
__ bind(L1);
@ -1009,13 +1085,13 @@ class StubGenerator: public StubCodeGenerator {
// there is no intervening stp between the str instructions
// but note that the offset and register patterns are still
// the same
__ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ str(t1, Address(__ pre(d, 2 * unit)));
bs.copy_store_at_8(Address(d, 1 * unit), t0);
bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
} else {
__ str(t1, Address(d, 1 * unit));
__ str(t0, Address(__ pre(d, 2 * unit)));
bs.copy_store_at_8(Address(d, 1 * unit), t1);
bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
}
__ bind(L2);
@ -1038,18 +1114,19 @@ class StubGenerator: public StubCodeGenerator {
// NB: Ignores all of the bits of count which represent more than 15
// bytes, so a caller doesn't have to mask them.
void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
bool is_backwards = step < 0;
size_t granularity = uabs(step);
int direction = is_backwards ? -1 : 1;
int unit = wordSize * direction;
Label Lword, Lint, Lshort, Lbyte;
assert(granularity
&& granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
const Register t0 = r3;
const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
// ??? I don't know if this bit-test-and-branch is the right thing
// to do. It does a lot of jumping, resulting in several
@ -1057,33 +1134,35 @@ class StubGenerator: public StubCodeGenerator {
// with something like Duff's device with a single computed branch.
__ tbz(count, 3 - exact_log2(granularity), Lword);
__ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
__ str(tmp, Address(__ adjust(d, unit, is_backwards)));
bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
__ bind(Lword);
if (granularity <= sizeof (jint)) {
__ tbz(count, 2 - exact_log2(granularity), Lint);
__ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
__ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
__ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
__ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
__ bind(Lint);
}
if (granularity <= sizeof (jshort)) {
__ tbz(count, 1 - exact_log2(granularity), Lshort);
__ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
__ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
__ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
__ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
__ bind(Lshort);
}
if (granularity <= sizeof (jbyte)) {
__ tbz(count, 0, Lbyte);
__ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
__ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
__ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
__ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
__ bind(Lbyte);
}
}
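For intuition, the tbz-driven tail handling in copy_memory_small above amounts to testing the remaining byte count one power of two at a time (an illustrative C-level sketch, not the emitted code):

// Illustration only: count is in elements, granularity is the element size in bytes.
//   size_t bytes_left = count * granularity;
//   if (bytes_left & 8) copy_8_bytes();   // tbz count, 3 - exact_log2(granularity)
//   if (bytes_left & 4) copy_4_bytes();   // emitted only when granularity <= sizeof(jint)
//   if (bytes_left & 2) copy_2_bytes();   // emitted only when granularity <= sizeof(jshort)
//   if (bytes_left & 1) copy_1_byte();    // emitted only when granularity == sizeof(jbyte)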
Label copy_f, copy_b;
Label copy_obj_f, copy_obj_b;
Label copy_obj_uninit_f, copy_obj_uninit_b;
// All-singing all-dancing memory copy.
//
@ -1092,8 +1171,8 @@ class StubGenerator: public StubCodeGenerator {
// of copy. If is_aligned is false, we align the source address.
//
void copy_memory(bool is_aligned, Register s, Register d,
Register count, Register tmp, int step) {
void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
Register s, Register d, Register count, int step) {
copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
bool is_backwards = step < 0;
unsigned int granularity = uabs(step);
@ -1102,9 +1181,12 @@ class StubGenerator: public StubCodeGenerator {
// <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
// load all the data before writing anything
Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
const Register send = r17, dend = r16;
const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v8;
ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
if (PrefetchCopyIntervalInBytes > 0)
__ prfm(Address(s, 0), PLDL1KEEP);
@ -1125,37 +1207,38 @@ class StubGenerator: public StubCodeGenerator {
// 33..64 bytes
if (UseSIMDForMemoryOps) {
__ ldpq(v0, v1, Address(s, 0));
__ ldpq(v2, v3, Address(send, -32));
__ stpq(v0, v1, Address(d, 0));
__ stpq(v2, v3, Address(dend, -32));
bs.copy_load_at_32(v0, v1, Address(s, 0));
bs.copy_load_at_32(v2, v3, Address(send, -32));
bs.copy_store_at_32(Address(d, 0), v0, v1);
bs.copy_store_at_32(Address(dend, -32), v2, v3);
} else {
__ ldp(t0, t1, Address(s, 0));
__ ldp(t2, t3, Address(s, 16));
__ ldp(t4, t5, Address(send, -32));
__ ldp(t6, t7, Address(send, -16));
bs.copy_load_at_16(t0, t1, Address(s, 0));
bs.copy_load_at_16(t2, t3, Address(s, 16));
bs.copy_load_at_16(t4, t5, Address(send, -32));
bs.copy_load_at_16(t6, t7, Address(send, -16));
__ stp(t0, t1, Address(d, 0));
__ stp(t2, t3, Address(d, 16));
__ stp(t4, t5, Address(dend, -32));
__ stp(t6, t7, Address(dend, -16));
bs.copy_store_at_16(Address(d, 0), t0, t1);
bs.copy_store_at_16(Address(d, 16), t2, t3);
bs.copy_store_at_16(Address(dend, -32), t4, t5);
bs.copy_store_at_16(Address(dend, -16), t6, t7);
}
__ b(finish);
// 17..32 bytes
__ bind(copy32);
__ ldp(t0, t1, Address(s, 0));
__ ldp(t2, t3, Address(send, -16));
__ stp(t0, t1, Address(d, 0));
__ stp(t2, t3, Address(dend, -16));
bs.copy_load_at_16(t0, t1, Address(s, 0));
bs.copy_load_at_16(t6, t7, Address(send, -16));
bs.copy_store_at_16(Address(d, 0), t0, t1);
bs.copy_store_at_16(Address(dend, -16), t6, t7);
__ b(finish);
// 65..80/96 bytes
// (96 bytes if SIMD because we do 32 bytes per instruction)
__ bind(copy80);
if (UseSIMDForMemoryOps) {
__ ldpq(v0, v1, Address(s, 0));
__ ldpq(v2, v3, Address(s, 32));
bs.copy_load_at_32(v0, v1, Address(s, 0));
bs.copy_load_at_32(v2, v3, Address(s, 32));
// Unaligned pointers can be an issue for copying.
// The issue is more likely to occur when the granularity of the data is
// less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
@ -1167,32 +1250,34 @@ class StubGenerator: public StubCodeGenerator {
Label copy96;
__ cmp(count, u1(80/granularity));
__ br(Assembler::HI, copy96);
__ ldp(t0, t1, Address(send, -16));
bs.copy_load_at_16(t0, t1, Address(send, -16));
__ stpq(v0, v1, Address(d, 0));
__ stpq(v2, v3, Address(d, 32));
__ stp(t0, t1, Address(dend, -16));
bs.copy_store_at_32(Address(d, 0), v0, v1);
bs.copy_store_at_32(Address(d, 32), v2, v3);
bs.copy_store_at_16(Address(dend, -16), t0, t1);
__ b(finish);
__ bind(copy96);
}
__ ldpq(v4, v5, Address(send, -32));
bs.copy_load_at_32(v4, v5, Address(send, -32));
__ stpq(v0, v1, Address(d, 0));
__ stpq(v2, v3, Address(d, 32));
__ stpq(v4, v5, Address(dend, -32));
bs.copy_store_at_32(Address(d, 0), v0, v1);
bs.copy_store_at_32(Address(d, 32), v2, v3);
bs.copy_store_at_32(Address(dend, -32), v4, v5);
} else {
__ ldp(t0, t1, Address(s, 0));
__ ldp(t2, t3, Address(s, 16));
__ ldp(t4, t5, Address(s, 32));
__ ldp(t6, t7, Address(s, 48));
__ ldp(t8, t9, Address(send, -16));
bs.copy_load_at_16(t0, t1, Address(s, 0));
bs.copy_load_at_16(t2, t3, Address(s, 16));
bs.copy_load_at_16(t4, t5, Address(s, 32));
bs.copy_load_at_16(t6, t7, Address(s, 48));
bs.copy_load_at_16(t8, t9, Address(send, -16));
__ stp(t0, t1, Address(d, 0));
__ stp(t2, t3, Address(d, 16));
__ stp(t4, t5, Address(d, 32));
__ stp(t6, t7, Address(d, 48));
__ stp(t8, t9, Address(dend, -16));
bs.copy_store_at_16(Address(d, 0), t0, t1);
bs.copy_store_at_16(Address(d, 16), t2, t3);
bs.copy_store_at_16(Address(d, 32), t4, t5);
bs.copy_store_at_16(Address(d, 48), t6, t7);
bs.copy_store_at_16(Address(dend, -16), t8, t9);
}
__ b(finish);
@ -1202,10 +1287,10 @@ class StubGenerator: public StubCodeGenerator {
__ br(Assembler::LO, copy8);
// 8..16 bytes
__ ldr(t0, Address(s, 0));
__ ldr(t1, Address(send, -8));
__ str(t0, Address(d, 0));
__ str(t1, Address(dend, -8));
bs.copy_load_at_8(t0, Address(s, 0));
bs.copy_load_at_8(t1, Address(send, -8));
bs.copy_store_at_8(Address(d, 0), t0);
bs.copy_store_at_8(Address(dend, -8), t1);
__ b(finish);
if (granularity < 8) {
@ -1252,26 +1337,31 @@ class StubGenerator: public StubCodeGenerator {
// Now we've got the small case out of the way we can align the
// source address on a 2-word boundary.
// Here we will materialize a count in r15, which is used by copy_memory_small
// and the various generate_copy_longs stubs that we use for the 2-word-aligned bulk copies.
// Up until here we have used t9, which aliases r15, but from here on that register
// cannot be used as a temp register, as it contains the count.
Label aligned;
if (is_aligned) {
// We may have to adjust by 1 word to get s 2-word-aligned.
__ tbz(s, exact_log2(wordSize), aligned);
__ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
__ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
__ sub(count, count, wordSize/granularity);
} else {
if (is_backwards) {
__ andr(rscratch2, s, 2 * wordSize - 1);
__ andr(r15, s, 2 * wordSize - 1);
} else {
__ neg(rscratch2, s);
__ andr(rscratch2, rscratch2, 2 * wordSize - 1);
__ neg(r15, s);
__ andr(r15, r15, 2 * wordSize - 1);
}
// rscratch2 is the byte adjustment needed to align s.
__ cbz(rscratch2, aligned);
// r15 is the byte adjustment needed to align s.
__ cbz(r15, aligned);
int shift = exact_log2(granularity);
if (shift) __ lsr(rscratch2, rscratch2, shift);
__ sub(count, count, rscratch2);
if (shift) __ lsr(r15, r15, shift);
__ sub(count, count, r15);
#if 0
// ?? This code is only correct for a disjoint copy. It may or
@ -1283,14 +1373,14 @@ class StubGenerator: public StubCodeGenerator {
// Align s and d, adjust count
if (is_backwards) {
__ sub(s, s, rscratch2);
__ sub(d, d, rscratch2);
__ sub(s, s, r15);
__ sub(d, d, r15);
} else {
__ add(s, s, rscratch2);
__ add(d, d, rscratch2);
__ add(s, s, r15);
__ add(d, d, r15);
}
#else
copy_memory_small(s, d, rscratch2, rscratch1, step);
copy_memory_small(decorators, type, s, d, r15, step);
#endif
}
@ -1300,14 +1390,27 @@ class StubGenerator: public StubCodeGenerator {
// We have a count of units and some trailing bytes. Adjust the
// count and do a bulk copy of words.
__ lsr(rscratch2, count, exact_log2(wordSize/granularity));
if (direction == copy_forwards)
__ bl(copy_f);
else
__ bl(copy_b);
__ lsr(r15, count, exact_log2(wordSize/granularity));
if (direction == copy_forwards) {
if (type != T_OBJECT) {
__ bl(copy_f);
} else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
__ bl(copy_obj_uninit_f);
} else {
__ bl(copy_obj_f);
}
} else {
if (type != T_OBJECT) {
__ bl(copy_b);
} else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
__ bl(copy_obj_uninit_b);
} else {
__ bl(copy_obj_b);
}
}
// And the tail.
copy_memory_small(s, d, count, tmp, step);
copy_memory_small(decorators, type, s, d, count, step);
if (granularity >= 8) __ bind(copy8);
if (granularity >= 4) __ bind(copy4);
@ -1402,7 +1505,7 @@ class StubGenerator: public StubCodeGenerator {
// UnsafeCopyMemory page error: continue after ucm
bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
UnsafeCopyMemoryMark ucmm(this, add_entry, true);
copy_memory(aligned, s, d, count, rscratch1, size);
copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
}
if (is_oop) {
@ -1473,7 +1576,7 @@ class StubGenerator: public StubCodeGenerator {
// UnsafeCopyMemory page error: continue after ucm
bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
UnsafeCopyMemoryMark ucmm(this, add_entry, true);
copy_memory(aligned, s, d, count, rscratch1, -size);
copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
}
if (is_oop) {
__ pop(RegSet::of(d, count), sp);
@ -1764,6 +1867,9 @@ class StubGenerator: public StubCodeGenerator {
const Register start_to = r20; // destination array start address
const Register r19_klass = r19; // oop._klass
// Registers used as gc temps (r5, r6, r7 are save-on-call)
const Register gct1 = r5, gct2 = r6, gct3 = r7;
//---------------------------------------------------------------
// Assembler stub will be used for this call to arraycopy
// if the two arrays are subtypes of Object[] but the
@ -1816,6 +1922,7 @@ class StubGenerator: public StubCodeGenerator {
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
bool is_oop = true;
int element_size = UseCompressedOops ? 4 : 8;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
@ -1841,13 +1948,17 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
__ BIND(L_store_element);
__ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW); // store the oop
bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
__ post(to, element_size), copied_oop, noreg,
gct1, gct2, gct3);
__ sub(count, count, 1);
__ cbz(count, L_do_card_marks);
// ======== loop entry is here ========
__ BIND(L_load_element);
__ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
copied_oop, noreg, __ post(from, element_size),
gct1);
__ cbz(copied_oop, L_store_element);
__ load_klass(r19_klass, copied_oop);// query the object klass
@ -2444,8 +2555,14 @@ class StubGenerator: public StubCodeGenerator {
address entry_jlong_arraycopy;
address entry_checkcast_arraycopy;
generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();

@ -1,6 +1,6 @@
/*
* Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
* Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -119,6 +119,57 @@ void BarrierSetAssembler::store_at(MacroAssembler* masm, DecoratorSet decorators
}
void BarrierSetAssembler::copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Register dst,
Address src,
Register tmp) {
if (bytes == 1) {
__ lbu(dst, src);
} else if (bytes == 2) {
__ lhu(dst, src);
} else if (bytes == 4) {
__ lwu(dst, src);
} else if (bytes == 8) {
__ ld(dst, src);
} else {
// Not the right size
ShouldNotReachHere();
}
if ((decorators & ARRAYCOPY_CHECKCAST) != 0 && UseCompressedOops) {
__ decode_heap_oop(dst);
}
}
void BarrierSetAssembler::copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
Register src,
Register tmp1,
Register tmp2,
Register tmp3) {
if ((decorators & ARRAYCOPY_CHECKCAST) != 0 && UseCompressedOops) {
__ encode_heap_oop(src);
}
if (bytes == 1) {
__ sb(src, dst);
} else if (bytes == 2) {
__ sh(src, dst);
} else if (bytes == 4) {
__ sw(src, dst);
} else if (bytes == 8) {
__ sd(src, dst);
} else {
// Not the right size
ShouldNotReachHere();
}
}
void BarrierSetAssembler::try_resolve_jobject_in_native(MacroAssembler* masm, Register jni_env,
Register obj, Register tmp, Label& slowpath) {
// If mask changes we need to ensure that the inverse is still encodable as an immediate

@ -1,6 +1,6 @@
/*
* Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2021, Huawei Technologies Co., Ltd. All rights reserved.
* Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -49,6 +49,27 @@ public:
Register src, Register dst, Register count, RegSet saved_regs) {}
virtual void arraycopy_epilogue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop,
Register start, Register count, Register tmp, RegSet saved_regs) {}
virtual void copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Register dst,
Address src,
Register tmp);
virtual void copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
Register src,
Register tmp1,
Register tmp2,
Register tmp3);
virtual bool supports_rvv_arraycopy() { return true; }
virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Register dst, Address src, Register tmp1, Register tmp2);
virtual void store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,

@ -941,9 +941,10 @@ class StubGenerator: public StubCodeGenerator {
}
}
void copy_memory(bool is_aligned, Register s, Register d,
Register count, Register tmp, int step) {
if (UseRVV) {
void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
Register s, Register d, Register count, Register tmp, int step) {
BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
return copy_memory_v(s, d, count, tmp, step);
}
@ -951,32 +952,11 @@ class StubGenerator: public StubCodeGenerator {
int granularity = uabs(step);
const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
const Register gct1 = x28, gct2 = x29, gct3 = t2;
Label same_aligned;
Label copy_big, copy32_loop, copy8_loop, copy_small, done;
copy_insn ld_arr = NULL, st_arr = NULL;
switch (granularity) {
case 1 :
ld_arr = (copy_insn)&MacroAssembler::lbu;
st_arr = (copy_insn)&MacroAssembler::sb;
break;
case 2 :
ld_arr = (copy_insn)&MacroAssembler::lhu;
st_arr = (copy_insn)&MacroAssembler::sh;
break;
case 4 :
ld_arr = (copy_insn)&MacroAssembler::lwu;
st_arr = (copy_insn)&MacroAssembler::sw;
break;
case 8 :
ld_arr = (copy_insn)&MacroAssembler::ld;
st_arr = (copy_insn)&MacroAssembler::sd;
break;
default :
ShouldNotReachHere();
}
__ beqz(count, done);
__ slli(cnt, count, exact_log2(granularity));
if (is_backwards) {
@ -1008,8 +988,8 @@ class StubGenerator: public StubCodeGenerator {
__ addi(src, src, step);
__ addi(dst, dst, step);
}
(_masm->*ld_arr)(tmp3, Address(src), t0);
(_masm->*st_arr)(tmp3, Address(dst), t0);
bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
if (!is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
@ -1028,14 +1008,15 @@ class StubGenerator: public StubCodeGenerator {
__ addi(dst, dst, -wordSize * 4);
}
// we first load 32 bytes, then write it, so the direction here doesn't matter
__ ld(tmp3, Address(src));
__ ld(tmp4, Address(src, 8));
__ ld(tmp5, Address(src, 16));
__ ld(tmp6, Address(src, 24));
__ sd(tmp3, Address(dst));
__ sd(tmp4, Address(dst, 8));
__ sd(tmp5, Address(dst, 16));
__ sd(tmp6, Address(dst, 24));
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8), gct1);
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8), tmp4, gct1, gct2, gct3);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
if (!is_backwards) {
__ addi(src, src, wordSize * 4);
@ -1055,8 +1036,9 @@ class StubGenerator: public StubCodeGenerator {
__ addi(src, src, -wordSize);
__ addi(dst, dst, -wordSize);
}
__ ld(tmp3, Address(src));
__ sd(tmp3, Address(dst));
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
if (!is_backwards) {
__ addi(src, src, wordSize);
__ addi(dst, dst, wordSize);
@ -1072,8 +1054,10 @@ class StubGenerator: public StubCodeGenerator {
__ addi(src, src, step);
__ addi(dst, dst, step);
}
(_masm->*ld_arr)(tmp3, Address(src), t0);
(_masm->*st_arr)(tmp3, Address(dst), t0);
bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
if (!is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
@ -1160,7 +1144,7 @@ class StubGenerator: public StubCodeGenerator {
// UnsafeCopyMemory page error: continue after ucm
bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
UnsafeCopyMemoryMark ucmm(this, add_entry, true);
copy_memory(aligned, s, d, count, t0, size);
copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, t0, size);
}
if (is_oop) {
@ -1211,7 +1195,10 @@ class StubGenerator: public StubCodeGenerator {
// use fwd copy when (d-s) above_equal (count*size)
__ sub(t0, d, s);
__ slli(t1, count, exact_log2(size));
__ bgeu(t0, t1, nooverlap_target);
Label L_continue;
__ bltu(t0, t1, L_continue);
__ j(nooverlap_target);
__ bind(L_continue);
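The bltu plus unconditional j above preserves the original test of "use the forward (disjoint) copy when (d - s) >= count * size"; routing the far jump through j is presumably needed because nooverlap_target can now lie beyond the reach of a RISC-V conditional branch once the barrier calls grow the stubs (an inference, not stated in the change). Equivalent control flow, as an illustration only:

//   if ((d - s) < count * size) goto L_continue;  // bltu: overlap possible, fall through to backward copy
//   goto nooverlap_target;                        // j: safe to use the forward copy
// L_continue: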
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
if (dest_uninitialized) {
@ -1233,7 +1220,7 @@ class StubGenerator: public StubCodeGenerator {
// UnsafeCopyMemory page error: continue after ucm
bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
UnsafeCopyMemoryMark ucmm(this, add_entry, true);
copy_memory(aligned, s, d, count, t0, -size);
copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, t0, -size);
}
if (is_oop) {
@ -1523,6 +1510,9 @@ class StubGenerator: public StubCodeGenerator {
const Register copied_oop = x7; // actual oop copied
const Register r9_klass = x9; // oop._klass
// Registers used as gc temps (x15, x16, x17 are save-on-call)
const Register gct1 = x15, gct2 = x16, gct3 = x17;
//---------------------------------------------------------------
// Assembler stub will be used for this call to arraycopy
// if the two arrays are subtypes of Object[] but the
@ -1564,11 +1554,13 @@ class StubGenerator: public StubCodeGenerator {
#endif //ASSERT
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
bool is_oop = true;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
bool is_oop = true;
int element_size = UseCompressedOops ? 4 : 8;
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
@ -1591,14 +1583,18 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
__ BIND(L_store_element);
__ store_heap_oop(Address(to, 0), copied_oop, noreg, noreg, noreg, AS_RAW); // store the oop
bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
Address(to, 0), copied_oop,
gct1, gct2, gct3);
__ add(to, to, UseCompressedOops ? 4 : 8);
__ sub(count, count, 1);
__ beqz(count, L_do_card_marks);
// ======== loop entry is here ========
__ BIND(L_load_element);
__ load_heap_oop(copied_oop, Address(from, 0), noreg, noreg, AS_RAW); // load the oop
bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
copied_oop, Address(from, 0),
gct1);
__ add(from, from, UseCompressedOops ? 4 : 8);
__ beqz(copied_oop, L_store_element);

@ -195,6 +195,113 @@ void BarrierSetAssembler::store_at(MacroAssembler* masm, DecoratorSet decorators
}
}
void BarrierSetAssembler::copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Register dst,
Address src,
Register tmp) {
assert(bytes <= 8, "can only deal with non-vector registers");
switch (bytes) {
case 1:
__ movb(dst, src);
break;
case 2:
__ movw(dst, src);
break;
case 4:
__ movl(dst, src);
break;
case 8:
#ifdef _LP64
__ movq(dst, src);
#else
fatal("No support for 8 bytes copy");
#endif
break;
default:
fatal("Unexpected size");
}
#ifdef _LP64
if ((decorators & ARRAYCOPY_CHECKCAST) != 0 && UseCompressedOops) {
__ decode_heap_oop(dst);
}
#endif
}
void BarrierSetAssembler::copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
Register src,
Register tmp) {
#ifdef _LP64
if ((decorators & ARRAYCOPY_CHECKCAST) != 0 && UseCompressedOops) {
__ encode_heap_oop(src);
}
#endif
assert(bytes <= 8, "can only deal with non-vector registers");
switch (bytes) {
case 1:
__ movb(dst, src);
break;
case 2:
__ movw(dst, src);
break;
case 4:
__ movl(dst, src);
break;
case 8:
#ifdef _LP64
__ movq(dst, src);
#else
fatal("No support for 8 bytes copy");
#endif
break;
default:
fatal("Unexpected size");
}
}
void BarrierSetAssembler::copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
XMMRegister dst,
Address src,
Register tmp,
XMMRegister xmm_tmp) {
assert(bytes > 8, "can only deal with vector registers");
if (bytes == 16) {
__ movdqu(dst, src);
} else if (bytes == 32) {
__ vmovdqu(dst, src);
} else {
fatal("No support for >32 bytes copy");
}
}
void BarrierSetAssembler::copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
XMMRegister src,
Register tmp1,
Register tmp2,
XMMRegister xmm_tmp) {
assert(bytes > 8, "can only deal with vector registers");
if (bytes == 16) {
__ movdqu(dst, src);
} else if (bytes == 32) {
__ vmovdqu(dst, src);
} else {
fatal("No support for >32 bytes copy");
}
}
void BarrierSetAssembler::try_resolve_jobject_in_native(MacroAssembler* masm, Register jni_env,
Register obj, Register tmp, Label& slowpath) {
__ clear_jobject_tag(obj);

@ -49,6 +49,46 @@ public:
virtual void store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Address dst, Register val, Register tmp1, Register tmp2, Register tmp3);
// The copy_[load/store]_at functions are used by arraycopy stubs. Be careful to only use
// r10 (aka rscratch1) in a context where restore_arg_regs_using_thread has been used instead
// of the looser setup_arg_regs. Currently this is done when using type T_OBJECT.
virtual void copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Register dst,
Address src,
Register tmp);
virtual void copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
Register src,
Register tmp);
virtual void copy_load_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
XMMRegister dst,
Address src,
Register tmp,
XMMRegister xmm_tmp);
virtual void copy_store_at(MacroAssembler* masm,
DecoratorSet decorators,
BasicType type,
size_t bytes,
Address dst,
XMMRegister src,
Register tmp1,
Register tmp2,
XMMRegister xmm_tmp);
virtual bool supports_avx3_masked_arraycopy() { return true; }
// Support for jniFastGetField to try resolving a jobject/jweak in native
virtual void try_resolve_jobject_in_native(MacroAssembler* masm, Register jni_env,
Register obj, Register tmp, Label& slowpath);

@ -1162,15 +1162,17 @@ void StubGenerator::setup_arg_regs(int nargs) {
#ifdef _WIN64
assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
"unexpected argument registers");
if (nargs >= 4)
if (nargs == 4) {
__ mov(rax, r9); // r9 is also saved_rdi
}
__ movptr(saved_rdi, rdi);
__ movptr(saved_rsi, rsi);
__ mov(rdi, rcx); // c_rarg0
__ mov(rsi, rdx); // c_rarg1
__ mov(rdx, r8); // c_rarg2
if (nargs >= 4)
if (nargs == 4) {
__ mov(rcx, rax); // c_rarg3 (via rax)
}
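The ordering of the Win64 shuffle above matters: c_rarg3 (r9) doubles as saved_rdi, so a fourth argument has to be parked in rax before rdi is saved into r9, and only then moved into rcx. A rough trace of the register contents (illustration only; saved_rsi is assumed to be r10, per the surrounding stub code):

//   rax <- r9                        // park c_rarg3 before r9 is reused as saved_rdi
//   r9  <- rdi, r10 <- rsi           // save the callee-saved rdi/rsi
//   rdi <- rcx, rsi <- rdx, rdx <- r8
//   rcx <- rax                       // the fourth argument lands in Linux-order c_rarg3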
#else
assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
"unexpected argument registers");
@ -1192,9 +1194,13 @@ void StubGenerator::restore_arg_regs() {
// This is used in places where r10 is a scratch register, and can
// be adapted if r9 is needed also.
void StubGenerator::setup_arg_regs_using_thread() {
void StubGenerator::setup_arg_regs_using_thread(int nargs) {
const Register saved_r15 = r9;
assert(nargs == 3 || nargs == 4, "else fix");
#ifdef _WIN64
if (nargs == 4) {
__ mov(rax, r9); // r9 is also saved_r15
}
__ mov(saved_r15, r15); // r15 is callee saved and needs to be restored
__ get_thread(r15_thread);
assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
@ -1205,6 +1211,9 @@ void StubGenerator::setup_arg_regs_using_thread() {
__ mov(rdi, rcx); // c_rarg0
__ mov(rsi, rdx); // c_rarg1
__ mov(rdx, r8); // c_rarg2
if (nargs == 4) {
__ mov(rcx, rax); // c_rarg3 (via rax)
}
#else
assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
"unexpected argument registers");

@ -140,19 +140,23 @@ class StubGenerator: public StubCodeGenerator {
// This is used in places where r10 is a scratch register, and can
// be adapted if r9 is needed also.
void setup_arg_regs_using_thread();
void setup_arg_regs_using_thread(int nargs = 3);
void restore_arg_regs_using_thread();
// Copy big chunks forward
void copy_bytes_forward(Register end_from, Register end_to,
Register qword_count, Register to,
Label& L_copy_bytes, Label& L_copy_8_bytes);
Register qword_count, Register tmp1,
Register tmp2, Label& L_copy_bytes,
Label& L_copy_8_bytes, DecoratorSet decorators,
BasicType type);
// Copy big chunks backward
void copy_bytes_backward(Register from, Register dest,
Register qword_count, Register to,
Label& L_copy_bytes, Label& L_copy_8_bytes);
Register qword_count, Register tmp1,
Register tmp2, Label& L_copy_bytes,
Label& L_copy_8_bytes, DecoratorSet decorators,
BasicType type);
void setup_argument_regs(BasicType type);

@ -231,13 +231,16 @@ void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, A
// end_from - source arrays end address
// end_to - destination array end address
// qword_count - 64-bits element count, negative
// to - scratch
// tmp1 - scratch
// L_copy_bytes - entry label
// L_copy_8_bytes - exit label
//
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
Register qword_count, Register to,
Label& L_copy_bytes, Label& L_copy_8_bytes) {
Register qword_count, Register tmp1,
Register tmp2, Label& L_copy_bytes,
Label& L_copy_8_bytes, DecoratorSet decorators,
BasicType type) {
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
__ align(OptoLoopAlignment);
@ -245,49 +248,102 @@ void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
Label L_end;
__ BIND(L_loop);
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
bs->copy_load_at(_masm, decorators, type, 32,
xmm0, Address(end_from, qword_count, Address::times_8, -56),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 32,
Address(end_to, qword_count, Address::times_8, -56), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 32,
xmm0, Address(end_from, qword_count, Address::times_8, -24),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 32,
Address(end_to, qword_count, Address::times_8, -24), xmm0,
tmp1, tmp2, xmm1);
} else {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
__ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
__ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
__ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
__ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
__ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(end_from, qword_count, Address::times_8, -56),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(end_to, qword_count, Address::times_8, -56), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(end_from, qword_count, Address::times_8, -40),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(end_to, qword_count, Address::times_8, -40), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(end_from, qword_count, Address::times_8, -24),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(end_to, qword_count, Address::times_8, -24), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(end_from, qword_count, Address::times_8, -8),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(end_to, qword_count, Address::times_8, -8), xmm0,
tmp1, tmp2, xmm1);
}
__ BIND(L_copy_bytes);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop);
__ subptr(qword_count, 4); // sub(8) and add(4)
__ jccb(Assembler::greater, L_end);
__ jcc(Assembler::greater, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
bs->copy_load_at(_masm, decorators, type, 32,
xmm0, Address(end_from, qword_count, Address::times_8, -24),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 32,
Address(end_to, qword_count, Address::times_8, -24), xmm0,
tmp1, tmp2, xmm1);
} else {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
__ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
__ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
__ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(end_from, qword_count, Address::times_8, -24),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(end_to, qword_count, Address::times_8, -24), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(end_from, qword_count, Address::times_8, -8),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(end_to, qword_count, Address::times_8, -8), xmm0,
tmp1, tmp2, xmm1);
}
__ addptr(qword_count, 4);
__ BIND(L_end);
} else {
// Copy 32-bytes per iteration
__ BIND(L_loop);
__ movq(to, Address(end_from, qword_count, Address::times_8, -24));
__ movq(Address(end_to, qword_count, Address::times_8, -24), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, -16));
__ movq(Address(end_to, qword_count, Address::times_8, -16), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
__ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
__ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
bs->copy_load_at(_masm, decorators, type, 8,
tmp1, Address(end_from, qword_count, Address::times_8, -24),
tmp2);
bs->copy_store_at(_masm, decorators, type, 8,
Address(end_to, qword_count, Address::times_8, -24), tmp1,
tmp2);
bs->copy_load_at(_masm, decorators, type, 8,
tmp1, Address(end_from, qword_count, Address::times_8, -16),
tmp2);
bs->copy_store_at(_masm, decorators, type, 8,
Address(end_to, qword_count, Address::times_8, -16), tmp1,
tmp2);
bs->copy_load_at(_masm, decorators, type, 8,
tmp1, Address(end_from, qword_count, Address::times_8, -8),
tmp2);
bs->copy_store_at(_masm, decorators, type, 8,
Address(end_to, qword_count, Address::times_8, -8), tmp1,
tmp2);
bs->copy_load_at(_masm, decorators, type, 8,
tmp1, Address(end_from, qword_count, Address::times_8, 0),
tmp2);
bs->copy_store_at(_masm, decorators, type, 8,
Address(end_to, qword_count, Address::times_8, 0), tmp1,
tmp2);
__ BIND(L_copy_bytes);
__ addptr(qword_count, 4);
@ -304,13 +360,16 @@ void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
// from - source arrays address
// dest - destination array address
// qword_count - 64-bits element count
// to - scratch
// tmp1 - scratch
// L_copy_bytes - entry label
// L_copy_8_bytes - exit label
//
void StubGenerator::copy_bytes_backward(Register from, Register dest,
Register qword_count, Register to,
Label& L_copy_bytes, Label& L_copy_8_bytes) {
Register qword_count, Register tmp1,
Register tmp2, Label& L_copy_bytes,
Label& L_copy_8_bytes, DecoratorSet decorators,
BasicType type) {
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
__ align(OptoLoopAlignment);
@ -318,19 +377,43 @@ void StubGenerator::copy_bytes_backward(Register from, Register dest,
Label L_end;
__ BIND(L_loop);
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
bs->copy_load_at(_masm, decorators, type, 32,
xmm0, Address(from, qword_count, Address::times_8, 32),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 32,
Address(dest, qword_count, Address::times_8, 32), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 32,
xmm0, Address(from, qword_count, Address::times_8, 0),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 32,
Address(dest, qword_count, Address::times_8, 0), xmm0,
tmp1, tmp2, xmm1);
} else {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
__ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
__ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
__ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
__ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
__ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
__ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
__ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(from, qword_count, Address::times_8, 48),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(dest, qword_count, Address::times_8, 48), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(from, qword_count, Address::times_8, 32),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(dest, qword_count, Address::times_8, 32), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(from, qword_count, Address::times_8, 16),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(dest, qword_count, Address::times_8, 16), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(from, qword_count, Address::times_8, 0),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(dest, qword_count, Address::times_8, 0), xmm0,
tmp1, tmp2, xmm1);
}
__ BIND(L_copy_bytes);
@ -338,30 +421,58 @@ void StubGenerator::copy_bytes_backward(Register from, Register dest,
__ jcc(Assembler::greaterEqual, L_loop);
__ addptr(qword_count, 4); // add(8) and sub(4)
__ jccb(Assembler::less, L_end);
__ jcc(Assembler::less, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
bs->copy_load_at(_masm, decorators, type, 32,
xmm0, Address(from, qword_count, Address::times_8, 0),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 32,
Address(dest, qword_count, Address::times_8, 0), xmm0,
tmp1, tmp2, xmm1);
} else {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
__ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
__ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(from, qword_count, Address::times_8, 16),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(dest, qword_count, Address::times_8, 16), xmm0,
tmp1, tmp2, xmm1);
bs->copy_load_at(_masm, decorators, type, 16,
xmm0, Address(from, qword_count, Address::times_8, 0),
tmp1, xmm1);
bs->copy_store_at(_masm, decorators, type, 16,
Address(dest, qword_count, Address::times_8, 0), xmm0,
tmp1, tmp2, xmm1);
}
__ subptr(qword_count, 4);
__ BIND(L_end);
} else {
// Copy 32 bytes per iteration
__ BIND(L_loop);
__ movq(to, Address(from, qword_count, Address::times_8, 24));
__ movq(Address(dest, qword_count, Address::times_8, 24), to);
__ movq(to, Address(from, qword_count, Address::times_8, 16));
__ movq(Address(dest, qword_count, Address::times_8, 16), to);
__ movq(to, Address(from, qword_count, Address::times_8, 8));
__ movq(Address(dest, qword_count, Address::times_8, 8), to);
__ movq(to, Address(from, qword_count, Address::times_8, 0));
__ movq(Address(dest, qword_count, Address::times_8, 0), to);
bs->copy_load_at(_masm, decorators, type, 8,
tmp1, Address(from, qword_count, Address::times_8, 24),
tmp2);
bs->copy_store_at(_masm, decorators, type, 8,
Address(dest, qword_count, Address::times_8, 24), tmp1,
tmp2);
bs->copy_load_at(_masm, decorators, type, 8,
tmp1, Address(from, qword_count, Address::times_8, 16),
tmp2);
bs->copy_store_at(_masm, decorators, type, 8,
Address(dest, qword_count, Address::times_8, 16), tmp1,
tmp2);
bs->copy_load_at(_masm, decorators, type, 8,
tmp1, Address(from, qword_count, Address::times_8, 8),
tmp2);
bs->copy_store_at(_masm, decorators, type, 8,
Address(dest, qword_count, Address::times_8, 8), tmp1,
tmp2);
bs->copy_load_at(_masm, decorators, type, 8,
tmp1, Address(from, qword_count, Address::times_8, 0),
tmp2);
bs->copy_store_at(_masm, decorators, type, 8,
Address(dest, qword_count, Address::times_8, 0), tmp1,
tmp2);
__ BIND(L_copy_bytes);
__ subptr(qword_count, 4);
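
// [Illustrative sketch, not part of this change] Routing even the scalar qword
// copies through the BarrierSetAssembler is what lets a collector interpose on
// every element. A hypothetical override (class and helper names invented)
// could delegate to the shared default move and then apply its load barrier
// whenever the copied elements are references:

void MyBarrierSetAssembler::copy_load_at(MacroAssembler* masm, DecoratorSet decorators,
                                         BasicType type, size_t bytes,
                                         Register dst, Address src, Register tmp) {
  // Plain 1/2/4/8-byte load, exactly what the shared default provides.
  BarrierSetAssembler::copy_load_at(masm, decorators, type, bytes, dst, src, tmp);
  if (is_reference_type(type)) {
    // GC-specific healing of the loaded reference; hypothetical helper.
    load_barrier(masm, dst, tmp);
  }
}
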
@ -1024,6 +1135,7 @@ address StubGenerator::generate_disjoint_byte_copy(bool aligned, address* entry,
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
Label L_copy_byte, L_exit;
@ -1108,7 +1220,7 @@ __ BIND(L_exit);
{
UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
// Copy in multi-byte chunks
copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
__ jmp(L_copy_4_bytes);
}
return start;
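
// [Illustrative note, not part of this change] The DecoratorSet threaded into
// copy_bytes_forward/backward is built per stub following the usual HotSpot
// pattern; a condensed sketch (the flags shown are real decorators, the local
// variables are placeholders):

DecoratorSet decorators = IN_HEAP | IS_ARRAY;
if (disjoint) {
  decorators |= ARRAYCOPY_DISJOINT;      // source and destination cannot overlap
}
if (dest_uninitialized) {
  decorators |= IS_DEST_UNINITIALIZED;   // destination holds no old values to pre-barrier
}
BasicType type = is_oop ? T_OBJECT : T_BYTE;  // element type chosen per stub
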
@ -1141,6 +1253,7 @@ address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverl
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
const Register from = rdi; // source array address
@ -1211,7 +1324,7 @@ address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverl
// UnsafeCopyMemory page error: continue after ucm
UnsafeCopyMemoryMark ucmm(this, !aligned, true);
// Copy in multi-byte chunks
copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
}
restore_arg_regs();
INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
@ -1254,6 +1367,7 @@ address StubGenerator::generate_disjoint_short_copy(bool aligned, address *entry
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
const Register from = rdi; // source array address
@ -1330,7 +1444,7 @@ __ BIND(L_exit);
{
UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
// Copy in multi-byte chunks
copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
__ jmp(L_copy_4_bytes);
}
@ -1388,6 +1502,7 @@ address StubGenerator::generate_conjoint_short_copy(bool aligned, address noover
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
const Register from = rdi; // source array address
@ -1450,7 +1565,7 @@ address StubGenerator::generate_conjoint_short_copy(bool aligned, address noover
// UnsafeCopyMemory page error: continue after ucm
UnsafeCopyMemoryMark ucmm(this, !aligned, true);
// Copy in multi-byte chunks
copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
}
restore_arg_regs();
INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
@ -1484,8 +1599,9 @@ address StubGenerator::generate_conjoint_short_copy(bool aligned, address noover
//
address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
const char *name, bool dest_uninitialized) {
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
aligned, is_oop, dest_uninitialized);
}
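
// [Illustrative sketch, not part of this change] The extra guard asks the
// barrier set whether the AVX-512 masked copy stubs are compatible with its
// barriers when oops are involved. The expected shape is a simple virtual
// query on the x86 BarrierSetAssembler (sketch; the default value shown is an
// assumption):

// Shared x86 BarrierSetAssembler: no objection by default.
virtual bool supports_avx3_masked_arraycopy() { return true; }

// A collector whose oop barriers cannot be expressed inside the masked copy
// loops would override this to force the element-wise stubs below instead:
virtual bool supports_avx3_masked_arraycopy() { return false; }
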
@ -1527,7 +1643,6 @@ address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop,
}
BasicType type = is_oop ? T_OBJECT : T_INT;
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
{
@ -1570,7 +1685,7 @@ __ BIND(L_exit);
{
UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
// Copy in multi-byte chunks
copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
__ jmp(L_copy_4_bytes);
}
@ -1596,8 +1711,9 @@ __ BIND(L_exit);
address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
address *entry, const char *name,
bool dest_uninitialized) {
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
nooverlap_target, aligned, is_oop, dest_uninitialized);
}
@ -1635,7 +1751,6 @@ address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop,
}
BasicType type = is_oop ? T_OBJECT : T_INT;
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
// no registers are destroyed by this call
bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
@ -1677,7 +1792,7 @@ address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop,
// UnsafeCopyMemory page error: continue after ucm
UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
// Copy in multi-byte chunks
copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
}
__ BIND(L_exit);
@ -1710,8 +1825,9 @@ __ BIND(L_exit);
//
address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
const char *name, bool dest_uninitialized) {
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
aligned, is_oop, dest_uninitialized);
}
@ -1753,7 +1869,6 @@ address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop
}
BasicType type = is_oop ? T_OBJECT : T_LONG;
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
{
// UnsafeCopyMemory page error: continue after ucm
@ -1767,8 +1882,12 @@ address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
__ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
__ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
bs->copy_load_at(_masm, decorators, type, 8,
rax, Address(end_from, qword_count, Address::times_8, 8),
r10);
bs->copy_store_at(_masm, decorators, type, 8,
Address(end_to, qword_count, Address::times_8, 8), rax,
r10);
__ increment(qword_count);
__ jcc(Assembler::notZero, L_copy_8_bytes);
}
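
// [Illustrative note, not part of this change] Even the single trailing qwords
// now go through the barrier hooks because the element may be an oop; the extra
// register (r10 above) is just a scratch register the barrier implementation is
// allowed to clobber. With no barriers installed, the pair degenerates to the
// movq/movq sequence it replaces, along the lines of:

static void plain_copy_qword(MacroAssembler* masm, Register elem,
                             Address src, Address dst) {
  masm->movq(elem, src);   // plain 8-byte load of the element
  masm->movq(dst, elem);   // plain 8-byte store of the element
}
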
@ -1787,7 +1906,7 @@ address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop
// UnsafeCopyMemory page error: continue after ucm
UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
// Copy in multi-byte chunks
copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
}
__ BIND(L_exit);
@ -1819,8 +1938,9 @@ address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop
address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
address *entry, const char *name,
bool dest_uninitialized) {
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
nooverlap_target, aligned, is_oop, dest_uninitialized);
}
@ -1858,7 +1978,6 @@ address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop
}
BasicType type = is_oop ? T_OBJECT : T_LONG;
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
{
// UnsafeCopyMemory page error: continue after ucm
@ -1868,8 +1987,12 @@ address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
__ movq(rax, Address(from, qword_count, Address::times_8, -8));
__ movq(Address(to, qword_count, Address::times_8, -8), rax);
bs->copy_load_at(_masm, decorators, type, 8,
rax, Address(from, qword_count, Address::times_8, -8),
r10);
bs->copy_store_at(_masm, decorators, type, 8,
Address(to, qword_count, Address::times_8, -8), rax,
r10);
__ decrement(qword_count);
__ jcc(Assembler::notZero, L_copy_8_bytes);
}
@ -1888,7 +2011,7 @@ address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop
UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
// Copy in multi-byte chunks
copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
}
__ BIND(L_exit);
bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
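
// [Illustrative note, not part of this change] Every copy stub now follows the
// same bracket-plus-hooks shape: the bulk arraycopy_prologue/epilogue still
// surround the copy, while the accesses inside the loop are delegated to the
// barrier set. Condensed (loop control elided, register names schematic):
//
//   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
//   // per chunk:
//   //   bs->copy_load_at (_masm, decorators, type, chunk_bytes, reg, Address(from, ...), tmp);
//   //   bs->copy_store_at(_masm, decorators, type, chunk_bytes, Address(to, ...), reg, tmp);
//   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
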
@ -1987,9 +2110,9 @@ address StubGenerator::generate_checkcast_copy(const char *name, address *entry,
}
#endif //ASSERT
setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
// ckoff => rcx, ckval => r8
// r9 and r10 may be used to save non-volatile registers
setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
// ckoff => rcx, ckval => r8
// r9 is used to save r15_thread
#ifdef _WIN64
// last argument (#4) is on stack on Win64
__ movptr(ckval, Address(rsp, 6 * wordSize));
@ -2052,6 +2175,8 @@ address StubGenerator::generate_checkcast_copy(const char *name, address *entry,
}
BasicType type = T_OBJECT;
size_t element_size = UseCompressedOops ? 4 : 8;
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
@ -2075,13 +2200,25 @@ address StubGenerator::generate_checkcast_copy(const char *name, address *entry,
__ align(OptoLoopAlignment);
__ BIND(L_store_element);
__ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, noreg, AS_RAW); // store the oop
bs->copy_store_at(_masm,
decorators,
type,
element_size,
to_element_addr,
rax_oop,
r10);
__ increment(count); // increment the count toward zero
__ jcc(Assembler::zero, L_do_card_marks);
// ======== loop entry is here ========
__ BIND(L_load_element);
__ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
bs->copy_load_at(_masm,
decorators,
type,
element_size,
rax_oop,
from_element_addr,
r10);
__ testptr(rax_oop, rax_oop);
__ jcc(Assembler::zero, L_store_element);
@ -2113,7 +2250,7 @@ address StubGenerator::generate_checkcast_copy(const char *name, address *entry,
__ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
__ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
__ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
restore_arg_regs();
restore_arg_regs_using_thread();
INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@ -2529,7 +2666,7 @@ __ BIND(L_checkcast_copy);
// the checkcast_copy loop needs two extra arguments:
assert(c_rarg3 == sco_temp, "#3 already in place");
// Set up arguments for checkcast_copy_entry.
setup_arg_regs(4);
setup_arg_regs_using_thread(4);
__ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
__ jump(RuntimeAddress(checkcast_copy_entry));
}