8158232: PPC64: improve byte, int and long array copy stubs by using VSX instructions
Reviewed-by: goetz, mdoerr
This commit is contained in:
parent
c6c73deca4
commit
2ade029123
@ -1123,7 +1123,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
Register tmp3 = R8_ARG6;
|
||||
Register tmp4 = R9_ARG7;
|
||||
|
||||
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
|
||||
VectorSRegister tmp_vsr1 = VSR1;
|
||||
VectorSRegister tmp_vsr2 = VSR2;
|
||||
|
||||
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
|
||||
|
||||
// Don't try anything fancy if arrays don't have many elements.
|
||||
__ li(tmp3, 0);
|
||||
@ -1178,6 +1181,8 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ andi_(R5_ARG3, R5_ARG3, 31);
|
||||
__ mtctr(tmp1);
|
||||
|
||||
if (!VM_Version::has_vsx()) {
|
||||
|
||||
__ bind(l_8);
|
||||
// Use unrolled version for mass copying (copy 32 elements a time)
|
||||
// Load feeding store gets zero latency on Power6, however not on Power5.
|
||||
@ -1193,7 +1198,44 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ addi(R3_ARG1, R3_ARG1, 32);
|
||||
__ addi(R4_ARG2, R4_ARG2, 32);
|
||||
__ bdnz(l_8);
|
||||
}
|
||||
|
||||
} else { // Processor supports VSX, so use it to mass copy.
|
||||
|
||||
// Prefetch the data into the L2 cache.
|
||||
__ dcbt(R3_ARG1, 0);
|
||||
|
||||
// If supported set DSCR pre-fetch to deepest.
|
||||
if (VM_Version::has_mfdscr()) {
|
||||
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
|
||||
__ mtdscr(tmp2);
|
||||
}
|
||||
|
||||
__ li(tmp1, 16);
|
||||
|
||||
// Backbranch target aligned to 32-byte. Not 16-byte align as
|
||||
// loop contains < 8 instructions that fit inside a single
|
||||
// i-cache sector.
|
||||
__ align(32);
|
||||
|
||||
__ bind(l_10);
|
||||
// Use loop with VSX load/store instructions to
|
||||
// copy 32 elements a time.
|
||||
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
|
||||
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
|
||||
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
|
||||
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
|
||||
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
|
||||
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
|
||||
__ bdnz(l_10); // Dec CTR and loop if not zero.
|
||||
|
||||
// Restore DSCR pre-fetch value.
|
||||
if (VM_Version::has_mfdscr()) {
|
||||
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
|
||||
__ mtdscr(tmp2);
|
||||
}
|
||||
|
||||
} // VSX
|
||||
} // FasterArrayCopy
|
||||
|
||||
__ bind(l_6);
|
||||
|
||||
@ -1557,7 +1599,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
Register tmp3 = R8_ARG6;
|
||||
Register tmp4 = R0;
|
||||
|
||||
Label l_1, l_2, l_3, l_4, l_5, l_6;
|
||||
VectorSRegister tmp_vsr1 = VSR1;
|
||||
VectorSRegister tmp_vsr2 = VSR2;
|
||||
|
||||
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
|
||||
|
||||
// for short arrays, just do single element copy
|
||||
__ li(tmp3, 0);
|
||||
@ -1593,6 +1638,8 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ andi_(R5_ARG3, R5_ARG3, 7);
|
||||
__ mtctr(tmp1);
|
||||
|
||||
if (!VM_Version::has_vsx()) {
|
||||
|
||||
__ bind(l_6);
|
||||
// Use unrolled version for mass copying (copy 8 elements a time).
|
||||
// Load feeding store gets zero latency on power6, however not on power 5.
|
||||
@ -1608,7 +1655,44 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ addi(R3_ARG1, R3_ARG1, 32);
|
||||
__ addi(R4_ARG2, R4_ARG2, 32);
|
||||
__ bdnz(l_6);
|
||||
}
|
||||
|
||||
} else { // Processor supports VSX, so use it to mass copy.
|
||||
|
||||
// Prefetch the data into the L2 cache.
|
||||
__ dcbt(R3_ARG1, 0);
|
||||
|
||||
// If supported set DSCR pre-fetch to deepest.
|
||||
if (VM_Version::has_mfdscr()) {
|
||||
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
|
||||
__ mtdscr(tmp2);
|
||||
}
|
||||
|
||||
__ li(tmp1, 16);
|
||||
|
||||
// Backbranch target aligned to 32-byte. Not 16-byte align as
|
||||
// loop contains < 8 instructions that fit inside a single
|
||||
// i-cache sector.
|
||||
__ align(32);
|
||||
|
||||
__ bind(l_7);
|
||||
// Use loop with VSX load/store instructions to
|
||||
// copy 8 elements a time.
|
||||
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
|
||||
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
|
||||
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
|
||||
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
|
||||
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
|
||||
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
|
||||
__ bdnz(l_7); // Dec CTR and loop if not zero.
|
||||
|
||||
// Restore DSCR pre-fetch value.
|
||||
if (VM_Version::has_mfdscr()) {
|
||||
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
|
||||
__ mtdscr(tmp2);
|
||||
}
|
||||
|
||||
} // VSX
|
||||
} // FasterArrayCopy
|
||||
|
||||
// copy 1 element at a time
|
||||
__ bind(l_2);
|
||||
@ -1757,7 +1841,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
Register tmp3 = R8_ARG6;
|
||||
Register tmp4 = R0;
|
||||
|
||||
Label l_1, l_2, l_3, l_4;
|
||||
Label l_1, l_2, l_3, l_4, l_5;
|
||||
|
||||
VectorSRegister tmp_vsr1 = VSR1;
|
||||
VectorSRegister tmp_vsr2 = VSR2;
|
||||
|
||||
{ // FasterArrayCopy
|
||||
__ cmpwi(CCR0, R5_ARG3, 3);
|
||||
@ -1767,6 +1854,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ andi_(R5_ARG3, R5_ARG3, 3);
|
||||
__ mtctr(tmp1);
|
||||
|
||||
if (!VM_Version::has_vsx()) {
|
||||
__ bind(l_4);
|
||||
// Use unrolled version for mass copying (copy 4 elements a time).
|
||||
// Load feeding store gets zero latency on Power6, however not on Power5.
|
||||
@ -1782,7 +1870,44 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ addi(R3_ARG1, R3_ARG1, 32);
|
||||
__ addi(R4_ARG2, R4_ARG2, 32);
|
||||
__ bdnz(l_4);
|
||||
}
|
||||
|
||||
} else { // Processor supports VSX, so use it to mass copy.
|
||||
|
||||
// Prefetch the data into the L2 cache.
|
||||
__ dcbt(R3_ARG1, 0);
|
||||
|
||||
// If supported set DSCR pre-fetch to deepest.
|
||||
if (VM_Version::has_mfdscr()) {
|
||||
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
|
||||
__ mtdscr(tmp2);
|
||||
}
|
||||
|
||||
__ li(tmp1, 16);
|
||||
|
||||
// Backbranch target aligned to 32-byte. Not 16-byte align as
|
||||
// loop contains < 8 instructions that fit inside a single
|
||||
// i-cache sector.
|
||||
__ align(32);
|
||||
|
||||
__ bind(l_5);
|
||||
// Use loop with VSX load/store instructions to
|
||||
// copy 4 elements a time.
|
||||
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
|
||||
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
|
||||
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
|
||||
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
|
||||
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
|
||||
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
|
||||
__ bdnz(l_5); // Dec CTR and loop if not zero.
|
||||
|
||||
// Restore DSCR pre-fetch value.
|
||||
if (VM_Version::has_mfdscr()) {
|
||||
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
|
||||
__ mtdscr(tmp2);
|
||||
}
|
||||
|
||||
} // VSX
|
||||
} // FasterArrayCopy
|
||||
|
||||
// copy 1 element at a time
|
||||
__ bind(l_3);
|
||||
|
Loading…
x
Reference in New Issue
Block a user