7039731: arraycopy could use prefetch on SPARC
Use BIS and prefetch in arraycopy stubs for Sparc (BIS for T4 only). Reviewed-by: never, iveresov
This commit is contained in:
parent
1038fed51d
commit
f7d7a6071a
@ -1124,6 +1124,126 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Generate main code for disjoint arraycopy
|
||||||
|
//
|
||||||
|
typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
|
||||||
|
Label& L_loop, bool use_prefetch, bool use_bis);
|
||||||
|
|
||||||
|
void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
|
||||||
|
int iter_size, CopyLoopFunc copy_loop_func) {
|
||||||
|
Label L_copy;
|
||||||
|
|
||||||
|
assert(log2_elem_size <= 3, "the following code should be changed");
|
||||||
|
int count_dec = 16>>log2_elem_size;
|
||||||
|
|
||||||
|
int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
|
||||||
|
assert(prefetch_dist < 4096, "invalid value");
|
||||||
|
prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
|
||||||
|
int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
|
||||||
|
|
||||||
|
if (UseBlockCopy) {
|
||||||
|
Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
|
||||||
|
|
||||||
|
// 64 bytes tail + bytes copied in one loop iteration
|
||||||
|
int tail_size = 64 + iter_size;
|
||||||
|
int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
|
||||||
|
// Use BIS copy only for big arrays since it requires membar.
|
||||||
|
__ set(block_copy_count, O4);
|
||||||
|
__ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
|
||||||
|
// This code is for disjoint source and destination:
|
||||||
|
// to <= from || to >= from+count
|
||||||
|
// but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
|
||||||
|
__ sub(from, to, O4);
|
||||||
|
__ srax(O4, 4, O4); // divide by 16 since following short branch have only 5 bits for imm.
|
||||||
|
__ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
|
||||||
|
|
||||||
|
__ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
|
||||||
|
// BIS should not be used to copy tail (64 bytes+iter_size)
|
||||||
|
// to avoid zeroing of following values.
|
||||||
|
__ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
|
||||||
|
|
||||||
|
if (prefetch_count > 0) { // rounded up to one iteration count
|
||||||
|
// Do prefetching only if copy size is bigger
|
||||||
|
// than prefetch distance.
|
||||||
|
__ set(prefetch_count, O4);
|
||||||
|
__ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
|
||||||
|
__ sub(count, prefetch_count, count);
|
||||||
|
|
||||||
|
(this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
|
||||||
|
__ add(count, prefetch_count, count); // restore count
|
||||||
|
|
||||||
|
} // prefetch_count > 0
|
||||||
|
|
||||||
|
(this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
|
||||||
|
__ add(count, (tail_size>>log2_elem_size), count); // restore count
|
||||||
|
|
||||||
|
__ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
|
||||||
|
// BIS needs membar.
|
||||||
|
__ membar(Assembler::StoreLoad);
|
||||||
|
// Copy tail
|
||||||
|
__ ba_short(L_copy);
|
||||||
|
|
||||||
|
__ BIND(L_skip_block_copy);
|
||||||
|
} // UseBlockCopy
|
||||||
|
|
||||||
|
if (prefetch_count > 0) { // rounded up to one iteration count
|
||||||
|
// Do prefetching only if copy size is bigger
|
||||||
|
// than prefetch distance.
|
||||||
|
__ set(prefetch_count, O4);
|
||||||
|
__ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
|
||||||
|
__ sub(count, prefetch_count, count);
|
||||||
|
|
||||||
|
Label L_copy_prefetch;
|
||||||
|
(this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
|
||||||
|
__ add(count, prefetch_count, count); // restore count
|
||||||
|
|
||||||
|
} // prefetch_count > 0
|
||||||
|
|
||||||
|
(this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Helper methods for copy_16_bytes_forward_with_shift()
|
||||||
|
//
|
||||||
|
//
// Emits the 16-bytes-per-iteration loop used when the source words have to be
// shifted before they can be stored.
//
// Each iteration loads two 8-byte words from 'from', combines them with the
// part carried over in O3 from the previous step, stores two 8-byte words
// below the already advanced 'to', and leaves the left-shifted second word in
// O3 for the next iteration (done in the branch delay slot).
//
//   from, to      - address registers, both advanced by 16 per iteration
//   count         - element counter, decremented by 'count_dec'; the loop
//                   repeats while the result is not negative
//   L_loop        - label bound at the loop head
//   use_prefetch  - emit source/destination prefetches (each only when its
//                   distance flag is > 0)
//   use_bis       - store with stxa instead of stx
//
// Expects the shift amounts in G1 (left) and G5 (right) and the carried bits
// in O3. O4, G3 and G4 are used as temporaries.
//
void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
                              Label& L_loop, bool use_prefetch, bool use_bis) {
  const Register lshift = G1;   // left shift bit counter
  const Register rshift = G5;   // right shift bit counter

  __ align(OptoLoopAlignment);
  __ BIND(L_loop);

  if (use_prefetch && ArraycopySrcPrefetchDistance > 0) {
    __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
  }
  if (use_prefetch && ArraycopyDstPrefetchDistance > 0) {
    __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
  }

  __ ldx(from, 0, O4);
  __ ldx(from, 8, G4);
  __ inc(to, 16);
  __ inc(from, 16);
  // Sets the condition codes tested by the branch at the end of the loop:
  // can another iteration follow this one?
  __ deccc(count, count_dec);

  // First output word: carried bits in O3 plus the upper part of O4.
  __ srlx(O4, rshift, G3);
  __ bset(G3, O3);
  // Second output word: rest of O4 plus the upper part of G4.
  __ sllx(O4, lshift, O4);
  __ srlx(G4, rshift, G3);
  __ bset(G3, O4);

  if (use_bis) {
    __ stxa(O3, to, -16);
    __ stxa(O4, to, -8);
  } else {
    __ stx(O3, to, -16);
    __ stx(O4, to, -8);
  }

  __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  __ delayed()->sllx(G4, lshift, O3);   // carry for the next iteration
}
|
||||||
|
|
||||||
// Copy big chunks forward with shift
|
// Copy big chunks forward with shift
|
||||||
//
|
//
|
||||||
@ -1135,64 +1255,51 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
// L_copy_bytes - copy exit label
|
// L_copy_bytes - copy exit label
|
||||||
//
|
//
|
||||||
void copy_16_bytes_forward_with_shift(Register from, Register to,
|
void copy_16_bytes_forward_with_shift(Register from, Register to,
|
||||||
Register count, int count_dec, Label& L_copy_bytes) {
|
Register count, int log2_elem_size, Label& L_copy_bytes) {
|
||||||
Label L_loop, L_aligned_copy, L_copy_last_bytes;
|
Label L_aligned_copy, L_copy_last_bytes;
|
||||||
|
assert(log2_elem_size <= 3, "the following code should be changed");
|
||||||
|
int count_dec = 16>>log2_elem_size;
|
||||||
|
|
||||||
// if both arrays have the same alignment mod 8, do 8 bytes aligned copy
|
// if both arrays have the same alignment mod 8, do 8 bytes aligned copy
|
||||||
__ andcc(from, 7, G1); // misaligned bytes
|
__ andcc(from, 7, G1); // misaligned bytes
|
||||||
__ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
|
__ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
|
||||||
__ delayed()->nop();
|
__ delayed()->nop();
|
||||||
|
|
||||||
const Register left_shift = G1; // left shift bit counter
|
const Register left_shift = G1; // left shift bit counter
|
||||||
const Register right_shift = G5; // right shift bit counter
|
const Register right_shift = G5; // right shift bit counter
|
||||||
|
|
||||||
__ sll(G1, LogBitsPerByte, left_shift);
|
__ sll(G1, LogBitsPerByte, left_shift);
|
||||||
__ mov(64, right_shift);
|
__ mov(64, right_shift);
|
||||||
__ sub(right_shift, left_shift, right_shift);
|
__ sub(right_shift, left_shift, right_shift);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Load 2 aligned 8-bytes chunks and use one from previous iteration
|
// Load 2 aligned 8-bytes chunks and use one from previous iteration
|
||||||
// to form 2 aligned 8-bytes chunks to store.
|
// to form 2 aligned 8-bytes chunks to store.
|
||||||
//
|
//
|
||||||
__ deccc(count, count_dec); // Pre-decrement 'count'
|
__ dec(count, count_dec); // Pre-decrement 'count'
|
||||||
__ andn(from, 7, from); // Align address
|
__ andn(from, 7, from); // Align address
|
||||||
__ ldx(from, 0, O3);
|
__ ldx(from, 0, O3);
|
||||||
__ inc(from, 8);
|
__ inc(from, 8);
|
||||||
__ align(OptoLoopAlignment);
|
__ sllx(O3, left_shift, O3);
|
||||||
__ BIND(L_loop);
|
|
||||||
__ ldx(from, 0, O4);
|
|
||||||
__ deccc(count, count_dec); // Can we do next iteration after this one?
|
|
||||||
__ ldx(from, 8, G4);
|
|
||||||
__ inc(to, 16);
|
|
||||||
__ inc(from, 16);
|
|
||||||
__ sllx(O3, left_shift, O3);
|
|
||||||
__ srlx(O4, right_shift, G3);
|
|
||||||
__ bset(G3, O3);
|
|
||||||
__ stx(O3, to, -16);
|
|
||||||
__ sllx(O4, left_shift, O4);
|
|
||||||
__ srlx(G4, right_shift, G3);
|
|
||||||
__ bset(G3, O4);
|
|
||||||
__ stx(O4, to, -8);
|
|
||||||
__ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
|
|
||||||
__ delayed()->mov(G4, O3);
|
|
||||||
|
|
||||||
__ inccc(count, count_dec>>1 ); // + 8 bytes
|
disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop);
|
||||||
__ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
|
|
||||||
__ delayed()->inc(count, count_dec>>1); // restore 'count'
|
|
||||||
|
|
||||||
// copy 8 bytes, part of them already loaded in O3
|
__ inccc(count, count_dec>>1 ); // + 8 bytes
|
||||||
__ ldx(from, 0, O4);
|
__ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
|
||||||
__ inc(to, 8);
|
__ delayed()->inc(count, count_dec>>1); // restore 'count'
|
||||||
__ inc(from, 8);
|
|
||||||
__ sllx(O3, left_shift, O3);
|
// copy 8 bytes, part of them already loaded in O3
|
||||||
__ srlx(O4, right_shift, G3);
|
__ ldx(from, 0, O4);
|
||||||
__ bset(O3, G3);
|
__ inc(to, 8);
|
||||||
__ stx(G3, to, -8);
|
__ inc(from, 8);
|
||||||
|
__ srlx(O4, right_shift, G3);
|
||||||
|
__ bset(O3, G3);
|
||||||
|
__ stx(G3, to, -8);
|
||||||
|
|
||||||
__ BIND(L_copy_last_bytes);
|
__ BIND(L_copy_last_bytes);
|
||||||
__ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
|
__ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
|
||||||
__ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
|
__ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
|
||||||
__ delayed()->sub(from, right_shift, from); // restore address
|
__ delayed()->sub(from, right_shift, from); // restore address
|
||||||
|
|
||||||
__ BIND(L_aligned_copy);
|
__ BIND(L_aligned_copy);
|
||||||
}
|
}
|
||||||
@ -1348,7 +1455,7 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
// The compare above (count >= 23) guarantees 'count' >= 16 bytes.
|
// The compare above (count >= 23) guarantees 'count' >= 16 bytes.
|
||||||
// Also jump over aligned copy after the copy with shift completed.
|
// Also jump over aligned copy after the copy with shift completed.
|
||||||
|
|
||||||
copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
|
copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Both arrays are 8 bytes aligned, copy 16 bytes at a time
|
// Both arrays are 8 bytes aligned, copy 16 bytes at a time
|
||||||
@ -1576,7 +1683,7 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
// The compare above (count >= 11) guarantees 'count' >= 16 bytes.
|
// The compare above (count >= 11) guarantees 'count' >= 16 bytes.
|
||||||
// Also jump over aligned copy after the copy with shift completed.
|
// Also jump over aligned copy after the copy with shift completed.
|
||||||
|
|
||||||
copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
|
copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Both arrays are 8 bytes aligned, copy 16 bytes at a time
|
// Both arrays are 8 bytes aligned, copy 16 bytes at a time
|
||||||
@ -1949,6 +2056,45 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Helper methods for generate_disjoint_int_copy_core()
|
||||||
|
//
|
||||||
|
//
// Emits the 16-bytes-per-iteration loop for the int copy with shift.
//
// The source is read as 8-byte words at offsets 4 and 12 and the halves are
// recombined with fixed 32-bit shifts: the upper half of each stored word
// comes from the previously loaded word (carried in O3), the lower half from
// the next one.
//
//   from, to      - address registers, both advanced by 16 per iteration
//   count         - element counter, decremented by 4 per iteration; the
//                   loop repeats while the result is not negative
//   count_dec     - not used by this loop (the decrement is fixed at 4)
//   L_loop        - label bound at the loop head
//   use_prefetch  - emit source/destination prefetches (each only when its
//                   distance flag is > 0)
//   use_bis       - store with stxa instead of stx
//
// Expects the carried, already shifted word in O3. O4, G3 and G4 are used as
// temporaries.
//
void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
                        Label& L_loop, bool use_prefetch, bool use_bis) {
  __ align(OptoLoopAlignment);
  __ BIND(L_loop);

  if (use_prefetch && ArraycopySrcPrefetchDistance > 0) {
    __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
  }
  if (use_prefetch && ArraycopyDstPrefetchDistance > 0) {
    __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
  }

  __ ldx(from, 4, O4);
  __ ldx(from, 12, G4);
  __ inc(to, 16);
  __ inc(from, 16);
  // Sets the condition codes tested by the branch at the end of the loop:
  // can another iteration follow this one?
  __ deccc(count, 4);

  // First output word: carried half in O3 plus the upper half of O4.
  __ srlx(O4, 32, G3);
  __ bset(G3, O3);
  // Second output word: lower half of O4 plus the upper half of G4.
  __ sllx(O4, 32, O4);
  __ srlx(G4, 32, G3);
  __ bset(G3, O4);

  if (use_bis) {
    __ stxa(O3, to, -16);
    __ stxa(O4, to, -8);
  } else {
    __ stx(O3, to, -16);
    __ stx(O4, to, -8);
  }

  __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  __ delayed()->sllx(G4, 32, O3);   // carry for the next iteration
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Generate core code for disjoint int copy (and oop copy on 32-bit).
|
// Generate core code for disjoint int copy (and oop copy on 32-bit).
|
||||||
// If "aligned" is true, the "from" and "to" addresses are assumed
|
// If "aligned" is true, the "from" and "to" addresses are assumed
|
||||||
@ -1962,7 +2108,7 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
void generate_disjoint_int_copy_core(bool aligned) {
|
void generate_disjoint_int_copy_core(bool aligned) {
|
||||||
|
|
||||||
Label L_skip_alignment, L_aligned_copy;
|
Label L_skip_alignment, L_aligned_copy;
|
||||||
Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
|
Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
|
||||||
|
|
||||||
const Register from = O0; // source array address
|
const Register from = O0; // source array address
|
||||||
const Register to = O1; // destination array address
|
const Register to = O1; // destination array address
|
||||||
@ -2013,30 +2159,16 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
|
|
||||||
// copy with shift 4 elements (16 bytes) at a time
|
// copy with shift 4 elements (16 bytes) at a time
|
||||||
__ dec(count, 4); // The cmp at the beginning guaranty count >= 4
|
__ dec(count, 4); // The cmp at the beginning guaranty count >= 4
|
||||||
|
__ sllx(O3, 32, O3);
|
||||||
|
|
||||||
__ align(OptoLoopAlignment);
|
disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop);
|
||||||
__ BIND(L_copy_16_bytes);
|
|
||||||
__ ldx(from, 4, O4);
|
|
||||||
__ deccc(count, 4); // Can we do next iteration after this one?
|
|
||||||
__ ldx(from, 12, G4);
|
|
||||||
__ inc(to, 16);
|
|
||||||
__ inc(from, 16);
|
|
||||||
__ sllx(O3, 32, O3);
|
|
||||||
__ srlx(O4, 32, G3);
|
|
||||||
__ bset(G3, O3);
|
|
||||||
__ stx(O3, to, -16);
|
|
||||||
__ sllx(O4, 32, O4);
|
|
||||||
__ srlx(G4, 32, G3);
|
|
||||||
__ bset(G3, O4);
|
|
||||||
__ stx(O4, to, -8);
|
|
||||||
__ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
|
|
||||||
__ delayed()->mov(G4, O3);
|
|
||||||
|
|
||||||
__ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
|
__ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
|
||||||
__ delayed()->inc(count, 4); // restore 'count'
|
__ delayed()->inc(count, 4); // restore 'count'
|
||||||
|
|
||||||
__ BIND(L_aligned_copy);
|
__ BIND(L_aligned_copy);
|
||||||
}
|
} // !aligned
|
||||||
|
|
||||||
// copy 4 elements (16 bytes) at a time
|
// copy 4 elements (16 bytes) at a time
|
||||||
__ and3(count, 1, G4); // Save
|
__ and3(count, 1, G4); // Save
|
||||||
__ srl(count, 1, count);
|
__ srl(count, 1, count);
|
||||||
@ -2222,6 +2354,38 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Helper methods for generate_disjoint_long_copy_core()
|
||||||
|
//
|
||||||
|
//
// Emits the copy loop used by the disjoint long copy: every iteration moves
// 64 bytes as four pairs of 8-byte loads and stores.
//
//   from, to      - address registers, both advanced by 64 per iteration
//                   ('to' in the branch delay slot)
//   count         - element counter, decremented by 8 per iteration; the
//                   loop repeats while the result is not negative
//   count_dec     - not used by this loop (the decrement is fixed at 8)
//   L_loop        - label bound at the loop head
//   use_prefetch  - emit source/destination prefetches (each only when its
//                   distance flag is > 0)
//   use_bis       - store with stxa instead of stx
//
// O4 and O5 are used as temporaries.
//
void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
                        Label& L_loop, bool use_prefetch, bool use_bis) {
  // Upper bound already asserted for the prefetch distance flags; presumably
  // the limit of the instruction's signed 13-bit displacement.
  const int disp_limit = 4096;

  __ align(OptoLoopAlignment);
  __ BIND(L_loop);
  for (int off = 0; off < 64; off += 16) {
    if (use_prefetch && (off & 31) == 0) {
      // Prefetches are emitted for off == 0 and off == 32. The displacement
      // has to include 'off'; otherwise both would target the same address
      // and the second half of the 64-byte chunk would never be prefetched.
      // If adding 'off' would reach the displacement limit, fall back to the
      // plain distance.
      if (ArraycopySrcPrefetchDistance > 0) {
        const int src_dist = (int)ArraycopySrcPrefetchDistance;
        const int src_disp = (src_dist + off < disp_limit) ? (src_dist + off) : src_dist;
        __ prefetch(from, src_disp, Assembler::severalReads);
      }
      if (ArraycopyDstPrefetchDistance > 0) {
        const int dst_dist = (int)ArraycopyDstPrefetchDistance;
        const int dst_disp = (dst_dist + off < disp_limit) ? (dst_dist + off) : dst_dist;
        __ prefetch(to, dst_disp, Assembler::severalWritesAndPossiblyReads);
      }
    }
    __ ldx(from, off+0, O4);
    __ ldx(from, off+8, O5);
    if (use_bis) {
      __ stxa(O4, to, off+0);
      __ stxa(O5, to, off+8);
    } else {
      __ stx(O4, to, off+0);
      __ stx(O5, to, off+8);
    }
  }
  __ deccc(count, 8);
  __ inc(from, 64);
  __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
  __ delayed()->inc(to, 64);
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Generate core code for disjoint long copy (and oop copy on 64-bit).
|
// Generate core code for disjoint long copy (and oop copy on 64-bit).
|
||||||
// "aligned" is ignored, because we must make the stronger
|
// "aligned" is ignored, because we must make the stronger
|
||||||
@ -2261,38 +2425,28 @@ class StubGenerator: public StubCodeGenerator {
|
|||||||
const Register offset0 = O4; // element offset
|
const Register offset0 = O4; // element offset
|
||||||
const Register offset8 = O5; // next element offset
|
const Register offset8 = O5; // next element offset
|
||||||
|
|
||||||
__ deccc(count, 2);
|
__ deccc(count, 2);
|
||||||
__ mov(G0, offset0); // offset from start of arrays (0)
|
__ mov(G0, offset0); // offset from start of arrays (0)
|
||||||
__ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
|
__ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
|
||||||
__ delayed()->add(offset0, 8, offset8);
|
__ delayed()->add(offset0, 8, offset8);
|
||||||
|
|
||||||
// Copy by 64 bytes chunks
|
// Copy by 64 bytes chunks
|
||||||
Label L_copy_64_bytes;
|
|
||||||
const Register from64 = O3; // source address
|
const Register from64 = O3; // source address
|
||||||
const Register to64 = G3; // destination address
|
const Register to64 = G3; // destination address
|
||||||
__ subcc(count, 6, O3);
|
__ subcc(count, 6, O3);
|
||||||
__ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
|
__ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
|
||||||
__ delayed()->mov(to, to64);
|
__ delayed()->mov(to, to64);
|
||||||
// Now we can use O4(offset0), O5(offset8) as temps
|
// Now we can use O4(offset0), O5(offset8) as temps
|
||||||
__ mov(O3, count);
|
__ mov(O3, count);
|
||||||
__ mov(from, from64);
|
// count >= 0 (original count - 8)
|
||||||
|
__ mov(from, from64);
|
||||||
|
|
||||||
__ align(OptoLoopAlignment);
|
disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop);
|
||||||
__ BIND(L_copy_64_bytes);
|
|
||||||
for( int off = 0; off < 64; off += 16 ) {
|
|
||||||
__ ldx(from64, off+0, O4);
|
|
||||||
__ ldx(from64, off+8, O5);
|
|
||||||
__ stx(O4, to64, off+0);
|
|
||||||
__ stx(O5, to64, off+8);
|
|
||||||
}
|
|
||||||
__ deccc(count, 8);
|
|
||||||
__ inc(from64, 64);
|
|
||||||
__ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
|
|
||||||
__ delayed()->inc(to64, 64);
|
|
||||||
|
|
||||||
// Restore O4(offset0), O5(offset8)
|
// Restore O4(offset0), O5(offset8)
|
||||||
__ sub(from64, from, offset0);
|
__ sub(from64, from, offset0);
|
||||||
__ inccc(count, 6);
|
__ inccc(count, 6); // restore count
|
||||||
__ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
|
__ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
|
||||||
__ delayed()->add(offset0, 8, offset8);
|
__ delayed()->add(offset0, 8, offset8);
|
||||||
|
|
||||||
|
@ -75,6 +75,24 @@ void VM_Version::initialize() {
|
|||||||
FLAG_SET_DEFAULT(AllocatePrefetchStyle, 1);
|
FLAG_SET_DEFAULT(AllocatePrefetchStyle, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (has_v9()) {
|
||||||
|
assert(ArraycopySrcPrefetchDistance < 4096, "invalid value");
|
||||||
|
if (ArraycopySrcPrefetchDistance >= 4096)
|
||||||
|
ArraycopySrcPrefetchDistance = 4064;
|
||||||
|
assert(ArraycopyDstPrefetchDistance < 4096, "invalid value");
|
||||||
|
if (ArraycopyDstPrefetchDistance >= 4096)
|
||||||
|
ArraycopyDstPrefetchDistance = 4064;
|
||||||
|
} else {
|
||||||
|
if (ArraycopySrcPrefetchDistance > 0) {
|
||||||
|
warning("prefetch instructions are not available on this CPU");
|
||||||
|
FLAG_SET_DEFAULT(ArraycopySrcPrefetchDistance, 0);
|
||||||
|
}
|
||||||
|
if (ArraycopyDstPrefetchDistance > 0) {
|
||||||
|
warning("prefetch instructions are not available on this CPU");
|
||||||
|
FLAG_SET_DEFAULT(ArraycopyDstPrefetchDistance, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
UseSSE = 0; // Only on x86 and x64
|
UseSSE = 0; // Only on x86 and x64
|
||||||
|
|
||||||
_supports_cx8 = has_v9();
|
_supports_cx8 = has_v9();
|
||||||
@ -180,6 +198,16 @@ void VM_Version::initialize() {
|
|||||||
FLAG_SET_DEFAULT(UseBlockZeroing, false);
|
FLAG_SET_DEFAULT(UseBlockZeroing, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
assert(BlockCopyLowLimit > 0, "invalid value");
|
||||||
|
if (has_block_zeroing()) { // has_blk_init() && is_T4(): core's local L2 cache
|
||||||
|
if (FLAG_IS_DEFAULT(UseBlockCopy)) {
|
||||||
|
FLAG_SET_DEFAULT(UseBlockCopy, true);
|
||||||
|
}
|
||||||
|
} else if (UseBlockCopy) {
|
||||||
|
warning("BIS instructions are not available or expensive on this CPU");
|
||||||
|
FLAG_SET_DEFAULT(UseBlockCopy, false);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef COMPILER2
|
#ifdef COMPILER2
|
||||||
// T4 and newer Sparc cpus have fast RDPC.
|
// T4 and newer Sparc cpus have fast RDPC.
|
||||||
if (has_fast_rdpc() && FLAG_IS_DEFAULT(UseRDPCForConstantTableBase)) {
|
if (has_fast_rdpc() && FLAG_IS_DEFAULT(UseRDPCForConstantTableBase)) {
|
||||||
|
@ -1985,6 +1985,12 @@ class CommandLineFlags {
|
|||||||
product(intx, BlockZeroingLowLimit, 2048, \
|
product(intx, BlockZeroingLowLimit, 2048, \
|
||||||
"Minimum size in bytes when block zeroing will be used") \
|
"Minimum size in bytes when block zeroing will be used") \
|
||||||
\
|
\
|
||||||
|
product(bool, UseBlockCopy, false, \
|
||||||
|
"Use special cpu instructions for block copy") \
|
||||||
|
\
|
||||||
|
product(intx, BlockCopyLowLimit, 2048, \
|
||||||
|
"Minimum size in bytes when block copy will be used") \
|
||||||
|
\
|
||||||
product(bool, PrintRevisitStats, false, \
|
product(bool, PrintRevisitStats, false, \
|
||||||
"Print revisit (klass and MDO) stack related information") \
|
"Print revisit (klass and MDO) stack related information") \
|
||||||
\
|
\
|
||||||
@ -2918,6 +2924,12 @@ class CommandLineFlags {
|
|||||||
product(intx, ReadPrefetchInstr, 0, \
|
product(intx, ReadPrefetchInstr, 0, \
|
||||||
"Prefetch instruction to prefetch ahead") \
|
"Prefetch instruction to prefetch ahead") \
|
||||||
\
|
\
|
||||||
|
product(uintx, ArraycopySrcPrefetchDistance, 0, \
|
||||||
|
"Distance to prefetch source array in arracopy") \
|
||||||
|
\
|
||||||
|
product(uintx, ArraycopyDstPrefetchDistance, 0, \
|
||||||
|
"Distance to prefetch destination array in arracopy") \
|
||||||
|
\
|
||||||
/* deoptimization */ \
|
/* deoptimization */ \
|
||||||
develop(bool, TraceDeoptimization, false, \
|
develop(bool, TraceDeoptimization, false, \
|
||||||
"Trace deoptimization") \
|
"Trace deoptimization") \
|
||||||
|
Loading…
Reference in New Issue
Block a user