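// Web-view residue from the page this file was copied from (not part of the Java source):
// 767 lines
// 28 KiB
// Java
// Raw Normal View History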

/*
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
import jdk.test.lib.Utils;
import jdk.test.whitebox.WhiteBox;
import java.lang.reflect.Array;
import java.util.Map;
import java.util.HashMap;
import java.util.Random;
import java.nio.ByteOrder;
/*
* @test
* @bug 8326139
* @summary Test splitting packs in SuperWord
* @library /test/lib /
* @requires vm.compiler2.enabled
* @run driver compiler.loopopts.superword.TestSplitPacks
*/
public class TestSplitPacks {
// Length of every generated input array; also the basis of the loop bounds in the tests.
static int RANGE = 1024 * 8;
// Same value as RANGE. Not referenced in the visible part of this file, and (despite
// its name) not declared final.
static int RANGE_FINAL = 1024 * 8;
// Single source of randomness for all generated inputs.
private static final Random RANDOM = Utils.getRandomInstance();

// Inputs: two arrays per element type, plus one scalar operand per type.
byte[] aB;
byte[] bB;
byte mB = (byte) 31;

short[] aS;
short[] bS;
short mS = (short) 0xF0F0;

int[] aI;
int[] bI;
int mI = 0xF0F0F0F0;

long[] aL;
long[] bL;
long mL = 0xF0F0F0F0F0F0F0F0L;

// Test name -> closure that invokes the test method on clones of the inputs.
Map<String, TestFunction> tests = new HashMap<>();

// Test name -> "gold" outputs, recorded by the constructor on the first run of each test.
Map<String, Object[]> golds = new HashMap<>();
interface TestFunction {
Object[] run();
}
/**
 * Entry point: hands control to the IR test framework, passing a single
 * extra VM flag that sets LoopUnrollLimit to 1000.
 */
public static void main(String[] args) {
    final String unrollLimitFlag = "-XX:LoopUnrollLimit=1000";
    TestFramework.runWithFlags(unrollLimitFlag);
}
/**
 * Generates the shared inputs, registers every test under its method name and
 * records the reference ("gold") outputs of each test.
 */
public TestSplitPacks() {
    // Inputs are generated exactly once; every test invocation works on clones,
    // so all runs of a test see identical data.
    aB = generateB();
    bB = generateB();
    aS = generateS();
    bS = generateS();
    aI = generateI();
    bI = generateI();
    aL = generateL();
    bL = generateL();

    // int tests with a scalar mask operand
    tests.put("test0",  () -> test0(aI.clone(), bI.clone(), mI));
    tests.put("test1a", () -> test1a(aI.clone(), bI.clone(), mI));
    tests.put("test1b", () -> test1b(aI.clone(), bI.clone(), mI));
    tests.put("test1c", () -> test1c(aI.clone(), bI.clone(), mI));
    tests.put("test1d", () -> test1d(aI.clone(), bI.clone(), mI));
    tests.put("test2a", () -> test2a(aI.clone(), bI.clone(), mI));
    tests.put("test2b", () -> test2b(aI.clone(), bI.clone(), mI));
    tests.put("test2c", () -> test2c(aI.clone(), bI.clone(), mI));
    tests.put("test2d", () -> test2d(aI.clone(), bI.clone(), mI));

    // short tests
    tests.put("test3a", () -> test3a(aS.clone(), bS.clone(), mS));
    tests.put("test4a", () -> test4a(aS.clone(), bS.clone()));
    tests.put("test4b", () -> test4b(aS.clone(), bS.clone()));
    tests.put("test4c", () -> test4c(aS.clone(), bS.clone()));
    tests.put("test4d", () -> test4d(aS.clone(), bS.clone()));
    tests.put("test4e", () -> test4e(aS.clone(), bS.clone()));
    tests.put("test4f", () -> test4f(aS.clone(), bS.clone()));
    tests.put("test4g", () -> test4g(aS.clone(), bS.clone()));
    tests.put("test5a", () -> test5a(aS.clone(), bS.clone(), mS));

    // int tests without a scalar operand
    tests.put("test6a", () -> test6a(aI.clone(), bI.clone()));
    tests.put("test7a", () -> test7a(aI.clone(), bI.clone()));

    // Run every registered test once now, at construction time, and keep its
    // output as the gold value that runTests() compares against later.
    tests.forEach((name, test) -> golds.put(name, test.run()));
}
/**
 * Runs every registered test once and checks its output against the gold
 * value recorded by the constructor. A mismatch surfaces as the
 * RuntimeException thrown by verify().
 */
@Warmup(100)
@Run(test = {"test0",
             "test1a", "test1b", "test1c", "test1d",
             "test2a", "test2b", "test2c", "test2d",
             "test3a",
             "test4a", "test4b", "test4c", "test4d", "test4e", "test4f", "test4g",
             "test5a",
             "test6a",
             "test7a"})
public void runTests() {
    tests.forEach((name, test) -> {
        // Reference output recorded at construction time
        Object[] expected = golds.get(name);
        // Fresh output from the current state of the test method
        Object[] actual = test.run();
        verify(name, expected, actual);
    });
}
/**
 * Builds a byte array of length RANGE; each element is one RANDOM.nextInt()
 * value narrowed to byte.
 */
static byte[] generateB() {
    final byte[] data = new byte[RANGE];
    for (int idx = 0; idx != data.length; ++idx) {
        data[idx] = (byte) RANDOM.nextInt();
    }
    return data;
}
/**
 * Builds a short array of length RANGE; each element is one RANDOM.nextInt()
 * value narrowed to short.
 */
static short[] generateS() {
    final short[] data = new short[RANGE];
    for (int idx = 0; idx != data.length; ++idx) {
        data[idx] = (short) RANDOM.nextInt();
    }
    return data;
}
/**
 * Builds an int array of length RANGE filled with consecutive
 * RANDOM.nextInt() values.
 */
static int[] generateI() {
    final int[] data = new int[RANGE];
    for (int idx = 0; idx != data.length; ++idx) {
        data[idx] = RANDOM.nextInt();
    }
    return data;
}
/**
 * Builds a long array of length RANGE filled with consecutive
 * RANDOM.nextLong() values.
 */
static long[] generateL() {
    final long[] data = new long[RANGE];
    for (int idx = 0; idx != data.length; ++idx) {
        data[idx] = RANDOM.nextLong();
    }
    return data;
}
/**
 * Compares the outputs of two runs of the same test.
 *
 * Both parameters are lists of primitive arrays. Each pair gold[i] / result[i]
 * must be two distinct array objects of the same type and length with equal
 * contents. Supported element types are byte, short, int and long.
 *
 * @param name   test name, used as a prefix in every failure message
 * @param gold   reference outputs
 * @param result outputs to check
 * @throws RuntimeException on the first difference found, or if an output is
 *         null, is not an array, or has an unsupported element type
 */
static void verify(String name, Object[] gold, Object[] result) {
    if (gold.length != result.length) {
        throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
                                   gold.length + ", result.length = " + result.length);
    }
    for (int i = 0; i < gold.length; i++) {
        Object g = gold[i];
        Object r = result[i];
        // Report a null output with the test name instead of failing with a bare NPE below.
        if (g == null || r == null) {
            throw new RuntimeException("verify " + name + ": outputs must not be null:" +
                                       " gold[" + i + "] is " + (g == null ? "null" : "non-null") +
                                       " result[" + i + "] is " + (r == null ? "null" : "non-null"));
        }
        // Once the classes are known to be identical, checking g for array-ness covers r as well.
        if (g.getClass() != r.getClass() || !g.getClass().isArray()) {
            throw new RuntimeException("verify " + name + ": must both be array of same type:" +
                                       " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
                                       " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
        }
        // The same object on both sides would make the content comparison meaningless.
        if (g == r) {
            throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
                                       " gold[" + i + "] == result[" + i + "]");
        }
        if (Array.getLength(g) != Array.getLength(r)) {
            throw new RuntimeException("verify " + name + ": arrays must have same length:" +
                                       " gold[" + i + "].length = " + Array.getLength(g) +
                                       " result[" + i + "].length = " + Array.getLength(r));
        }
        // Dispatch on the element type to the matching typed comparison.
        Class<?> c = g.getClass().getComponentType();
        if (c == byte.class) {
            verifyB(name, i, (byte[])g, (byte[])r);
        } else if (c == short.class) {
            verifyS(name, i, (short[])g, (short[])r);
        } else if (c == int.class) {
            verifyI(name, i, (int[])g, (int[])r);
        } else if (c == long.class) {
            verifyL(name, i, (long[])g, (long[])r);
        } else {
            throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
                                       " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
                                       " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
        }
    }
}
/**
 * Element-wise comparison of two byte arrays over the length of g.
 *
 * @param name test name used in the failure message
 * @param i    index of this array pair within the test's outputs
 * @param g    gold array
 * @param r    result array
 * @throws RuntimeException at the first index where the arrays differ
 */
static void verifyB(String name, int i, byte[] g, byte[] r) {
    final int count = g.length;
    for (int j = 0; j < count; j++) {
        if (g[j] == r[j]) {
            continue;
        }
        String where = "[" + i + "][" + j + "] = ";
        throw new RuntimeException("verify " + name + ": arrays must have same content:"
                                   + " gold" + where + g[j]
                                   + " result" + where + r[j]);
    }
}
/**
 * Element-wise comparison of two short arrays over the length of g.
 *
 * @param name test name used in the failure message
 * @param i    index of this array pair within the test's outputs
 * @param g    gold array
 * @param r    result array
 * @throws RuntimeException at the first index where the arrays differ
 */
static void verifyS(String name, int i, short[] g, short[] r) {
    final int count = g.length;
    for (int j = 0; j < count; j++) {
        if (g[j] == r[j]) {
            continue;
        }
        String where = "[" + i + "][" + j + "] = ";
        throw new RuntimeException("verify " + name + ": arrays must have same content:"
                                   + " gold" + where + g[j]
                                   + " result" + where + r[j]);
    }
}
/**
 * Element-wise comparison of two int arrays over the length of g.
 *
 * @param name test name used in the failure message
 * @param i    index of this array pair within the test's outputs
 * @param g    gold array
 * @param r    result array
 * @throws RuntimeException at the first index where the arrays differ
 */
static void verifyI(String name, int i, int[] g, int[] r) {
    final int count = g.length;
    for (int j = 0; j < count; j++) {
        if (g[j] == r[j]) {
            continue;
        }
        String where = "[" + i + "][" + j + "] = ";
        throw new RuntimeException("verify " + name + ": arrays must have same content:"
                                   + " gold" + where + g[j]
                                   + " result" + where + r[j]);
    }
}
/**
 * Element-wise comparison of two long arrays over the length of g.
 *
 * @param name test name used in the failure message
 * @param i    index of this array pair within the test's outputs
 * @param g    gold array
 * @param r    result array
 * @throws RuntimeException at the first index where the arrays differ
 */
static void verifyL(String name, int i, long[] g, long[] r) {
    final int count = g.length;
    for (int j = 0; j < count; j++) {
        if (g[j] == r[j]) {
            continue;
        }
        String where = "[" + i + "][" + j + "] = ";
        throw new RuntimeException("verify " + name + ": arrays must have same content:"
                                   + " gold" + where + g[j]
                                   + " result" + where + r[j]);
    }
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Load and store are already split
//
//  0 1 - - 4 5 6 7
//  | |     | | | |
//  0 1 - - 4 5 6 7
//
// Per group of 8 ints, elements 0, 1 and 4..7 of a are ANDed with mask and
// written to the same positions of b. Positions 2 and 3 are neither read nor
// written. The IR rule above requires both 2-element and 4-element int vector
// loads and ANDs.
// Returns { a, b } for comparison against the gold run.
static Object[] test0(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
int b1 = a[i+1] & mask;
int b4 = a[i+4] & mask;
int b5 = a[i+5] & mask;
int b6 = a[i+6] & mask;
int b7 = a[i+7] & mask;
b[i+0] = b0;
b[i+1] = b1;
b[i+4] = b4;
b[i+5] = b5;
b[i+6] = b6;
b[i+7] = b7;
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Adjacent Load and Store, but split by Add/Mul
//
// Per group of 8 ints, elements 0..5 are loaded and stored at adjacent
// positions, but elements 0..3 use an add and elements 4..5 a multiply.
// Elements 6 and 7 are untouched. The IR rule requires a 4-element vector add
// and a 2-element vector multiply.
// Returns { a, b } for comparison against the gold run.
static Object[] test1a(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
b[i+0] = a[i+0] + mask; // Add
b[i+1] = a[i+1] + mask;
b[i+2] = a[i+2] + mask;
b[i+3] = a[i+3] + mask;
b[i+4] = a[i+4] * mask; // Mul
b[i+5] = a[i+5] * mask;
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Adjacent Load and Store, but split by Add/Mul
//
// Per group of 8 ints, elements 0..3 use a multiply and elements 4..5 an add.
// Elements 6 and 7 are untouched. The IR rule requires a 4-element vector
// multiply and a 2-element vector add.
// Returns { a, b } for comparison against the gold run.
static Object[] test1b(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
b[i+0] = a[i+0] * mask; // Mul
b[i+1] = a[i+1] * mask;
b[i+2] = a[i+2] * mask;
b[i+3] = a[i+3] * mask;
b[i+4] = a[i+4] + mask; // Add
b[i+5] = a[i+5] + mask;
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
// Adjacent Load and Store, but split by Add/Mul
//
// Per group of 8 ints, elements 0..1 use an add and elements 2..5 a multiply.
// Elements 6 and 7 are untouched. The IR rule requires a 2-element vector add
// and a 4-element vector multiply.
// NOTE(review): this rule requires avx2 on x86, whereas test1a/test1b require
// only sse4.1. The reason is not visible in this file.
// Returns { a, b } for comparison against the gold run.
static Object[] test1c(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
b[i+0] = a[i+0] + mask; // Add
b[i+1] = a[i+1] + mask;
b[i+2] = a[i+2] * mask; // Mul
b[i+3] = a[i+3] * mask;
b[i+4] = a[i+4] * mask;
b[i+5] = a[i+5] * mask;
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
// Adjacent Load and Store, but split by Add/Mul
//
// Per group of 8 ints, elements 0..1 use a multiply and elements 2..5 an add.
// Elements 6 and 7 are untouched. The IR rule requires a 2-element vector
// multiply and a 4-element vector add.
// NOTE(review): this rule requires avx2 on x86, whereas test1a/test1b require
// only sse4.1. The reason is not visible in this file.
// Returns { a, b } for comparison against the gold run.
static Object[] test1d(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
b[i+0] = a[i+0] * mask; // Mul
b[i+1] = a[i+1] * mask;
b[i+2] = a[i+2] + mask; // Add
b[i+3] = a[i+3] + mask;
b[i+4] = a[i+4] + mask;
b[i+5] = a[i+5] + mask;
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Split the load
//
//  0 1 2 3 4 5 - -
//  | |  \ \ \ \
//  | |   \ \ \ \
//  | |    \ \ \ \
//  0 1 - - 4 5 6 7
//
// The 4-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
//
// Per group of 8 ints, a[0..5] are loaded and ANDed with mask. The first two
// results go to b[0..1], the remaining four to b[4..7]. The IR rule requires
// 2-element vectors and forbids 4-element ones.
// Returns { a, b } for comparison against the gold run.
static Object[] test2a(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
int b1 = a[i+1] & mask;
int b2 = a[i+2] & mask;
int b3 = a[i+3] & mask;
int b4 = a[i+4] & mask;
int b5 = a[i+5] & mask;
b[i+0] = b0;
b[i+1] = b1;
b[i+4] = b2;
b[i+5] = b3;
b[i+6] = b4;
b[i+7] = b5;
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "= 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Split the load
//
//  0 1 2 3 4 5 - -
//  | | | |  \ \
//  | | | |   \ \
//  | | | |    \ \
//  0 1 2 3 - - 6 7
//
// The 2-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
//
// Per group of 8 ints, a[0..5] are loaded and ANDed with mask. The first four
// results go to b[0..3], the remaining two to b[6..7]. The IR rule requires
// 4-element vectors and forbids 2-element ones.
// Returns { a, b } for comparison against the gold run.
static Object[] test2b(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
int b1 = a[i+1] & mask;
int b2 = a[i+2] & mask;
int b3 = a[i+3] & mask;
int b4 = a[i+4] & mask;
int b5 = a[i+5] & mask;
b[i+0] = b0;
b[i+1] = b1;
b[i+2] = b2;
b[i+3] = b3;
b[i+6] = b4;
b[i+7] = b5;
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Split the load
//
//  0 1 - - 4 5 6 7
//  | |    / / / /
//  | |   / / / /
//  | |  / / / /
//  0 1 2 3 4 5 - -
//
// The 4-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
//
// Per group of 8 ints, a[0..1] and a[4..7] are loaded and ANDed with mask. The
// results are stored contiguously in b[0..5]. The IR rule requires 2-element
// vectors and forbids 4-element ones.
// Returns { a, b } for comparison against the gold run.
static Object[] test2c(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
int b1 = a[i+1] & mask;
int b4 = a[i+4] & mask;
int b5 = a[i+5] & mask;
int b6 = a[i+6] & mask;
int b7 = a[i+7] & mask;
b[i+0] = b0;
b[i+1] = b1;
b[i+2] = b4;
b[i+3] = b5;
b[i+4] = b6;
b[i+5] = b7;
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "= 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Split the load
//
//  0 1 2 3 - - 6 7
//  | | | |    / /
//  | | | |   / /
//  | | | |  / /
//  0 1 2 3 4 5 - -
//
// The 2-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
//
// Per group of 8 ints, a[0..3] and a[6..7] are loaded and ANDed with mask. The
// results are stored contiguously in b[0..5]. The IR rule requires 4-element
// vectors and forbids 2-element ones.
// Returns { a, b } for comparison against the gold run.
static Object[] test2d(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
int b1 = a[i+1] & mask;
int b2 = a[i+2] & mask;
int b3 = a[i+3] & mask;
int b6 = a[i+6] & mask;
int b7 = a[i+7] & mask;
b[i+0] = b0;
b[i+1] = b1;
b[i+2] = b2;
b[i+3] = b3;
b[i+4] = b6;
b[i+5] = b7;
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Top row: loads from a. Bottom row: stores to b. 'v' marks a store of the
// scalar val, '+' marks loads that only feed the scalar sum.
//
//  0 1 2 3 4 5 6 7 -
//  | | | | | | | |
//  | + + + | | | |
//  |       | | | |
//  |     v | | | | v
//  |     | | | | | |
//  0 - - 3 4 5 6 7 8
//
// The loop advances by 16 shorts per iteration and touches only a[i+0..i+7]
// and b[i+0], b[i+3..i+8]. The IR rule requires a 4-element short vector load.
// Returns { a, b, { sum } } so that the scalar sum is verified as well.
static Object[] test3a(short[] a, short[] b, short val) {
int sum = 0;
for (int i = 0; i < RANGE; i+=16) {
short a0 = a[i+0]; // required for alignment / offsets, technical limitation.
short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off
short a2 = a[i+2];
short a3 = a[i+3];
short a4 = a[i+4]; // 4-pack
short a5 = a[i+5];
short a6 = a[i+6];
short a7 = a[i+7];
b[i+0] = a0; // required for alignment / offsets, technical limitation.
sum += a1 + a2 + a3; // not packed
b[i+3] = val; // adjacent to 4-pack but needs to be split off
b[i+4] = a4; // 4-pack
b[i+5] = a5;
b[i+6] = a6;
b[i+7] = a7;
b[i+8] = val; // adjacent to 4-pack but needs to be split off
}
return new Object[]{ a, b, new int[]{ sum } };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true"})
// Cyclic dependency with distance 2 -> split into 2-packs
//
// Copies a[i] to b[i+2] for every i in [0, RANGE-64). The IR rule requires
// 2-element short vector loads and is applied only on 64-bit platforms with
// sse4.1.
// Returns { a, b } for comparison against the gold run.
static Object[] test4a(short[] a, short[] b) {
for (int i = 0; i < RANGE-64; i++) {
b[i+2] = a[i+0];
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"AlignVector", "false"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true"})
// Cyclic dependency with distance 3 -> split into 2-packs
//
// Copies a[i] to b[i+3] for every i in [0, RANGE-64). The IR rule requires
// 2-element short vector loads and is applied only when AlignVector is false.
// Returns { a, b } for comparison against the gold run.
static Object[] test4b(short[] a, short[] b) {
for (int i = 0; i < RANGE-64; i++) {
b[i+3] = a[i+0];
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=8"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Cyclic dependency with distance 4 -> split into 4-packs
//
// Copies a[i] to b[i+4] for every i in [0, RANGE-64). The IR rule requires
// 4-element short vector loads and is applied only when MaxVectorSize >= 8.
// Returns { a, b } for comparison against the gold run.
static Object[] test4c(short[] a, short[] b) {
for (int i = 0; i < RANGE-64; i++) {
b[i+4] = a[i+0];
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Cyclic dependency with distance 5 -> split into 4-packs
//
// Copies a[i] to b[i+5] for every i in [0, RANGE-64). The IR rule requires
// 4-element short vector loads and is applied only when MaxVectorSize >= 8 and
// AlignVector is false.
// Returns { a, b } for comparison against the gold run.
static Object[] test4d(short[] a, short[] b) {
for (int i = 0; i < RANGE-64; i++) {
b[i+5] = a[i+0];
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Cyclic dependency with distance 6 -> split into 4-packs
//
// Copies a[i] to b[i+6] for every i in [0, RANGE-64). The IR rule requires
// 4-element short vector loads and is applied only when MaxVectorSize >= 8 and
// AlignVector is false.
// Returns { a, b } for comparison against the gold run.
static Object[] test4e(short[] a, short[] b) {
for (int i = 0; i < RANGE-64; i++) {
b[i+6] = a[i+0];
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Cyclic dependency with distance 7 -> split into 4-packs
//
// Copies a[i] to b[i+7] for every i in [0, RANGE-64). The IR rule requires
// 4-element short vector loads and is applied only when MaxVectorSize >= 8 and
// AlignVector is false.
// Returns { a, b } for comparison against the gold run.
static Object[] test4f(short[] a, short[] b) {
for (int i = 0; i < RANGE-64; i++) {
b[i+7] = a[i+0];
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Cyclic dependency with distance 8 -> split into 8-packs
//
// Copies a[i] to b[i+8] for every i in [0, RANGE-64). The IR rule requires
// 8-element short vector loads and is applied only when MaxVectorSize >= 32.
// Returns { a, b } for comparison against the gold run.
static Object[] test4g(short[] a, short[] b) {
for (int i = 0; i < RANGE-64; i++) {
b[i+8] = a[i+0];
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0",
IRNode.ADD_VS, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.ADD_VS, IRNode.VECTOR_SIZE_8, "> 0",
IRNode.ADD_VS, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Split pack into power-of-2 sizes
//
// Per group of 16 shorts, 15 adjacent elements (0..14) get val added; element
// 15 is untouched. The IR rule requires short vector loads and adds of sizes
// 8, 4 and 2. Element 14 is the one left over after the 8-, 4- and 2-element
// groups marked below.
// Returns { a, b } for comparison against the gold run.
static Object[] test5a(short[] a, short[] b, short val) {
for (int i = 0; i < RANGE; i+=16) {
b[i+ 0] = (short)(a[i+ 0] + val); // 8 pack
b[i+ 1] = (short)(a[i+ 1] + val);
b[i+ 2] = (short)(a[i+ 2] + val);
b[i+ 3] = (short)(a[i+ 3] + val);
b[i+ 4] = (short)(a[i+ 4] + val);
b[i+ 5] = (short)(a[i+ 5] + val);
b[i+ 6] = (short)(a[i+ 6] + val);
b[i+ 7] = (short)(a[i+ 7] + val);
b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack
b[i+ 9] = (short)(a[i+ 9] + val);
b[i+10] = (short)(a[i+10] + val);
b[i+11] = (short)(a[i+11] + val);
b[i+12] = (short)(a[i+12] + val); // 2-pack
b[i+13] = (short)(a[i+13] + val);
b[i+14] = (short)(a[i+14] + val);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
IRNode.ADD_REDUCTION_V, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Split packs including reductions
//
// Per group of 8 ints, the products a*b of elements 0..3 and the bitwise ANDs
// a&b of elements 4..7 are all accumulated into the single scalar s. Neither
// array is written. The IR rule requires 4-element vector multiply, AND and
// add, plus an add-reduction node.
// Returns { a, b, { s } } so that the accumulated value is verified as well.
static Object[] test6a(int[] a, int[] b) {
int s = 0;
for (int i = 0; i < RANGE; i+=8) {
s += a[i+0] * b[i+0];
s += a[i+1] * b[i+1];
s += a[i+2] * b[i+2];
s += a[i+3] * b[i+3];
s += a[i+4] & b[i+4];
s += a[i+5] & b[i+5];
s += a[i+6] & b[i+6];
s += a[i+7] & b[i+7];
}
return new Object[]{ a, b, new int[]{ s } };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.MUL_VI, "> 0",
IRNode.POPULATE_INDEX, "> 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"avx2", "true", "sve", "true"})
// Index Populate:
// There can be an issue when all the (iv + 1), (iv + 2), ...
// get packed, but not (iv). Then we have a pack that is one element
// too short, and we start splitting everything in a bad way.
//
// Writes a[i] = b[i] * i for every i in [0, RANGE), so the loop index itself
// is an operand of the multiply. The IR rule requires vector int loads,
// vector multiplies and a POPULATE_INDEX node, without constraining the vector
// size, and is applied only with avx2 or sve.
// Returns { a, b } for comparison against the gold run.
static Object[] test7a(int[] a, int[] b) {
for (int i = 0; i < RANGE; i++) {
a[i] = b[i] * i;
}
return new Object[]{ a, b };
}
}