8158232: PPC64: improve byte, int and long array copy stubs by using VSX instructions

Reviewed-by: goetz, mdoerr
Author: Michihiro Horie, 2016-06-15 13:47:17 +02:00
Committed by: Martin Doerr
Parent: c6c73deca4
Commit: 2ade029123

@@ -1123,7 +1123,10 @@ class StubGenerator: public StubCodeGenerator {
Register tmp3 = R8_ARG6;
Register tmp4 = R9_ARG7;
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
// Don't try anything fancy if arrays don't have many elements.
__ li(tmp3, 0);
@@ -1178,6 +1181,8 @@ class StubGenerator: public StubCodeGenerator {
__ andi_(R5_ARG3, R5_ARG3, 31);
__ mtctr(tmp1);
if (!VM_Version::has_vsx()) {
__ bind(l_8);
// Use unrolled version for mass copying (copy 32 elements at a time).
// Load feeding store gets zero latency on Power6, but not on Power5.
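
For context, the new fast path is only taken when VM_Version::has_vsx() reports VSX support, which HotSpot determines once at VM startup. Outside the JVM, the usual way to make the same run-time decision on Linux/PPC64 is the auxiliary vector. A minimal stand-alone sketch, not part of this patch (getauxval, AT_HWCAP and PPC_FEATURE_HAS_VSX are standard glibc/Linux facilities):

    #include <sys/auxv.h>   // getauxval, AT_HWCAP (glibc)
    #include <cstdio>

    // Bit the Linux kernel uses to advertise VSX in AT_HWCAP
    // (mirrors PPC_FEATURE_HAS_VSX from <asm/cputable.h>).
    #ifndef PPC_FEATURE_HAS_VSX
    #define PPC_FEATURE_HAS_VSX 0x00000080
    #endif

    static bool cpu_has_vsx() {
      return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_VSX) != 0;
    }

    int main() {
      // Choose the copy loop the same way the stub generator does:
      // scalar 8-byte ld/std without VSX, lxvd2x/stxvd2x with it.
      std::printf("VSX available: %s\n", cpu_has_vsx() ? "yes" : "no");
      return 0;
    }
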
@@ -1193,7 +1198,44 @@ class StubGenerator: public StubCodeGenerator {
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_8);
}
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32 bytes rather than 16, because the
// loop contains < 8 instructions and therefore fits inside a single
// i-cache sector.
__ align(32);
__ bind(l_10);
// Use a loop with VSX load/store instructions to
// copy 32 elements at a time.
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
__ bdnz(l_10); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // VSX
} // FasterArrayCopy
__ bind(l_6);
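
The VSX path added above copies 32 bytes per iteration: two 16-byte lxvd2x loads and two stxvd2x stores, with CTR holding the iteration count and any remaining elements handled by the existing element-wise tail after l_6. A rough user-level equivalent of that inner loop, written with GCC's vec_xl/vec_xst VSX intrinsics rather than the stub generator's assembler calls (a sketch assuming a VSX-enabled compiler, not the code this patch emits):

    #include <altivec.h>   // __vector types, vec_xl / vec_xst (GCC/Clang, -mvsx)
    #include <cstddef>

    // Copy n bytes, 32 bytes per iteration, in the spirit of the stub's
    // lxvd2x/stxvd2x loop; the remainder is copied one byte at a time.
    void vsx_copy_bytes(unsigned char* src, unsigned char* dst, size_t n) {
      size_t iters = n >> 5;   // 32-byte iterations (the stub feeds this to CTR)
      size_t tail  = n & 31;   // leftover elements (andi_ R5_ARG3, R5_ARG3, 31 above)
      for (size_t i = 0; i < iters; i++) {
        __vector unsigned char v0 = vec_xl(0,  src);   // load src
        __vector unsigned char v1 = vec_xl(16, src);   // load src + 16
        vec_xst(v0, 0,  dst);                          // store to dst
        vec_xst(v1, 16, dst);                          // store to dst + 16
        src += 32;
        dst += 32;
      }
      for (size_t i = 0; i < tail; i++) {              // element-wise tail
        dst[i] = src[i];
      }
    }
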
@@ -1557,7 +1599,10 @@ class StubGenerator: public StubCodeGenerator {
Register tmp3 = R8_ARG6;
Register tmp4 = R0;
Label l_1, l_2, l_3, l_4, l_5, l_6;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
// for short arrays, just do single element copy
__ li(tmp3, 0);
@@ -1593,6 +1638,8 @@ class StubGenerator: public StubCodeGenerator {
__ andi_(R5_ARG3, R5_ARG3, 7);
__ mtctr(tmp1);
if (!VM_Version::has_vsx()) {
__ bind(l_6);
// Use unrolled version for mass copying (copy 8 elements at a time).
// Load feeding store gets zero latency on Power6, but not on Power5.
@@ -1608,7 +1655,44 @@ class StubGenerator: public StubCodeGenerator {
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_6);
}
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32 bytes rather than 16, because the
// loop contains < 8 instructions and therefore fits inside a single
// i-cache sector.
__ align(32);
__ bind(l_7);
// Use a loop with VSX load/store instructions to
// copy 8 elements at a time.
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
__ bdnz(l_7); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // VSX
} // FasterArrayCopy
// copy 1 element at a time
__ bind(l_2);
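
In all three stubs the VSX loop is bracketed by mtdscr writes, guarded by VM_Version::has_mfdscr(): before the loop the Data Stream Control Register's low-order default-prefetch-depth (DPFD) field is ORed to 7, the deepest setting, and afterwards the saved VM_Version::_dscr_val is written back. A minimal illustration of the same bracketing from problem state, using GCC inline assembly against the user-visible DSCR (SPR 3, defined by Power ISA 2.07); the helper names are made up for this sketch:

    #include <cstdint>

    // Problem-state DSCR is SPR 3; its low 3 bits are the default
    // prefetch depth (DPFD), where 7 requests the deepest prefetch.
    static inline uint64_t read_dscr() {
      uint64_t v;
      asm volatile("mfspr %0, 3" : "=r"(v));
      return v;
    }

    static inline void write_dscr(uint64_t v) {
      asm volatile("mtspr 3, %0" : : "r"(v));
    }

    void copy_with_deep_prefetch() {
      uint64_t saved = read_dscr();
      write_dscr(saved | 7);   // same idea as mtdscr(_dscr_val | 7) before the loop
      // ... streaming VSX copy loop would run here ...
      write_dscr(saved);       // restore, matching the trailing mtdscr
    }
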
@@ -1757,7 +1841,10 @@ class StubGenerator: public StubCodeGenerator {
Register tmp3 = R8_ARG6;
Register tmp4 = R0;
Label l_1, l_2, l_3, l_4;
Label l_1, l_2, l_3, l_4, l_5;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
{ // FasterArrayCopy
__ cmpwi(CCR0, R5_ARG3, 3);
@@ -1767,6 +1854,7 @@ class StubGenerator: public StubCodeGenerator {
__ andi_(R5_ARG3, R5_ARG3, 3);
__ mtctr(tmp1);
if (!VM_Version::has_vsx()) {
__ bind(l_4);
// Use unrolled version for mass copying (copy 4 elements at a time).
// Load feeding store gets zero latency on Power6, but not on Power5.
@@ -1782,7 +1870,44 @@ class StubGenerator: public StubCodeGenerator {
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_4);
}
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32 bytes rather than 16, because the
// loop contains < 8 instructions and therefore fits inside a single
// i-cache sector.
__ align(32);
__ bind(l_5);
// Use a loop with VSX load/store instructions to
// copy 4 elements at a time.
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
__ bdnz(l_5); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // VSX
} // FasterArrayCopy
// copy 1 element at a time
__ bind(l_3);
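
Across the byte, int and long variants the VSX loop body above is identical; only the element size changes how the 32-byte iteration count and the tail are derived from the element count in R5_ARG3, which is what the differing andi_ masks (31, 7 and 3) reflect. A small generic sketch of that bookkeeping (plain C++, with the per-element split inferred from those masks rather than quoted from the patch):

    #include <cstddef>

    // 32 bytes are copied per VSX iteration, so the iteration/tail split
    // follows directly from the element size.
    struct LoopSplit { size_t iterations; size_t tail_elements; };

    static LoopSplit split_copy(size_t elements, size_t element_size) {
      size_t per_iter = 32 / element_size;   // 32 bytes, 8 ints or 4 longs per iteration
      return { elements / per_iter, elements % per_iter };
    }

    // byte copy: split_copy(n, 1) -> tail mask 31 (andi_ ..., 31 above)
    // int  copy: split_copy(n, 4) -> tail mask 7  (andi_ ..., 7)
    // long copy: split_copy(n, 8) -> tail mask 3  (andi_ ..., 3)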