diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorThroughputForIterationCount.java b/test/micro/org/openjdk/bench/vm/compiler/VectorThroughputForIterationCount.java new file mode 100644 index 00000000000..79f693b16d9 --- /dev/null +++ b/test/micro/org/openjdk/bench/vm/compiler/VectorThroughputForIterationCount.java @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.vm.compiler; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.*; + +import java.util.concurrent.TimeUnit; +import java.util.Random; + +// Note: I commented out the short, char, float and double benchmarks, so it only takes 5h instead of 12h. +// The goal is to track the performance of various loop sizes, and see the effect of pre/post loops. + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Thread) +@Warmup(iterations = 2, time = 500, timeUnit = TimeUnit.MILLISECONDS) +@Measurement(iterations = 3, time = 500, timeUnit = TimeUnit.MILLISECONDS) +@Fork(value = 1) +public abstract class VectorThroughputForIterationCount { + @Param({ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", + "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", + "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", + "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", + "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", + "70", "71", "72", "73", "74", "75", "76", "77", "78", "79", + "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", + "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", + "100","101","102","103","104","105","106","107","108","109", + "110","111","112","113","114","115","116","117","118","119", + "120","121","122","123","124","125","126","127","128","129", + "130","131","132","133","134","135","136","137","138","139", + "140","141","142","143","144","145","146","147","148","149", + "150","151","152","153","154","155","156","157","158","159", + "160","161","162","163","164","165","166","167","168","169", + "170","171","172","173","174","175","176","177","178","179", + "180","181","182","183","184","185","186","187","188","189", + "190","191","192","193","194","195","196","197","198","199", + "200","201","202","203","204","205","206","207","208","209", + "210","211","212","213","214","215","216","217","218","219", + "220","221","222","223","224","225","226","227","228","229", + "230","231","232","233","234","235","236","237","238","239", + "240","241","242","243","244","245","246","247","248","249", + "250","251","252","253","254","255","256","257","258","259", + "260","261","262","263","264","265","266","267","268","269", + "270","271","272","273","274","275","276","277","278","279", + "280","281","282","283","284","285","286","287","288","289", + "290","291","292","293","294","295","296","297","298","299", + "300", + // Above, the "small loops". + // Below, some "medium" to "large" loops. + "1000", "3000", "10000"}) + // Number of iterations spent in a loop. + public static int ITERATION_COUNT; + + // Add enough slack so we can play with offsets / alignment. + public static int CONTAINER_SIZE = 20_000; + + private byte[] aB; + private byte[] bB; + private byte[] rB; + + private short[] aS; + private short[] bS; + private short[] rS; + + private char[] aC; + private char[] bC; + private char[] rC; + + private int[] aI; + private int[] bI; + private int[] rI; + + private long[] aL; + private long[] bL; + private long[] rL; + + private float[] aF; + private float[] bF; + private float[] rF; + + private double[] aD; + private double[] bD; + private double[] rD; + + @Param({"1024"}) + // Number of times we run the loop, possibly with different offsets. + public static int REPETITIONS; + + @Param({"true", "false"}) + public static boolean RANDOMIZE_OFFSETS; + + @Param({"0"}) + // If RANDOMIZE_OFFSETS is disabled, use this offset: + public static int FIXED_OFFSET; + + // A different offset for each repetition of the loop. Depending on + // RANDOMIZE_OFFSETS, the values are random or all FIXED_OFFSET. + private int[] offsets; + + @Param("42") + private int seed; + private Random r = new Random(seed); + + @Setup + public void init() { + aI = new int[CONTAINER_SIZE]; + bI = new int[CONTAINER_SIZE]; + rI = new int[CONTAINER_SIZE]; + + aL = new long[CONTAINER_SIZE]; + bL = new long[CONTAINER_SIZE]; + rL = new long[CONTAINER_SIZE]; + + aS = new short[CONTAINER_SIZE]; + bS = new short[CONTAINER_SIZE]; + rS = new short[CONTAINER_SIZE]; + + aC = new char[CONTAINER_SIZE]; + bC = new char[CONTAINER_SIZE]; + rC = new char[CONTAINER_SIZE]; + + aB = new byte[CONTAINER_SIZE]; + bB = new byte[CONTAINER_SIZE]; + rB = new byte[CONTAINER_SIZE]; + + aF = new float[CONTAINER_SIZE]; + bF = new float[CONTAINER_SIZE]; + rF = new float[CONTAINER_SIZE]; + + aD = new double[CONTAINER_SIZE]; + bD = new double[CONTAINER_SIZE]; + rD = new double[CONTAINER_SIZE]; + + for (int i = 0; i < CONTAINER_SIZE; i++) { + aB[i] = (byte) r.nextInt(); + bB[i] = (byte) r.nextInt(); + + aS[i] = (short) r.nextInt(); + bS[i] = (short) r.nextInt(); + + aC[i] = (char) r.nextInt(); + bC[i] = (char) r.nextInt(); + + aI[i] = r.nextInt(); + bI[i] = r.nextInt(); + + aL[i] = r.nextLong(); + bL[i] = r.nextLong(); + + aF[i] = r.nextFloat(); + bF[i] = r.nextFloat(); + + aD[i] = r.nextDouble(); + bD[i] = r.nextDouble(); + } + + offsets = new int[REPETITIONS]; + if (RANDOMIZE_OFFSETS) { + for (int i = 0; i < REPETITIONS; i++) { + // Make sure it is predictable and uniform. + offsets[i] = i % 64; + } + } else { + for (int i = 0; i < REPETITIONS; i++) { + offsets[i] = FIXED_OFFSET; + } + } + } + + @Benchmark + public void bench001B_aligned_computeBound() { + for (int r = 0; r < REPETITIONS; r++) { + int init = offsets[r]; + int limit = init + ITERATION_COUNT; + for (int i = init; i < limit; i++) { + // Have multiple MUL operations to make loop compute bound (more compute than load/store) + rB[i] = (byte)(aB[i] * aB[i] * aB[i] * aB[i]); + } + } + } + + @Benchmark + public void bench011B_aligned_memoryBound() { + for (int r = 0; r < REPETITIONS; r++) { + int init = offsets[r]; + int limit = init + ITERATION_COUNT; + for (int i = init; i < limit; i++) { + rB[i] = (byte)(aB[i] + bB[i]); + } + } + } + + @Benchmark + public void bench021B_unaligned_memoryBound() { + for (int r = 0; r < REPETITIONS; r++) { + int init = offsets[r]; + int limit = init + ITERATION_COUNT; + for (int i = init; i < limit; i++) { + rB[i] = (byte)(aB[i+1] + bB[i+2]); + } + } + } + +// @Benchmark +// public void bench002S_aligned_computeBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// // Have multiple MUL operations to make loop compute bound (more compute than load/store) +// rS[i] = (short)(aS[i] * aS[i] * aS[i] * aS[i]); +// } +// } +// } +// +// @Benchmark +// public void bench012S_aligned_memoryBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// rS[i] = (short)(aS[i] + bS[i]); +// } +// } +// } +// +// @Benchmark +// public void bench022S_unaligned_memoryBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// rS[i] = (short)(aS[i+1] + bS[i+2]); +// } +// } +// } +// +// @Benchmark +// public void bench003C_aligned_computeBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// // Have multiple MUL operations to make loop compute bound (more compute than load/store) +// rC[i] = (char)(aC[i] * aC[i] * aC[i] * aC[i]); +// } +// } +// } +// +// @Benchmark +// public void bench013C_aligned_memoryBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// rC[i] = (char)(aC[i] + bC[i]); +// } +// } +// } +// +// @Benchmark +// public void bench023C_unaligned_memoryBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// rC[i] = (char)(aC[i+1] + bC[i+2]); +// } +// } +// } + + @Benchmark + public void bench004I_aligned_computeBound() { + for (int r = 0; r < REPETITIONS; r++) { + int init = offsets[r]; + int limit = init + ITERATION_COUNT; + for (int i = init; i < limit; i++) { + // Have multiple MUL operations to make loop compute bound (more compute than load/store) + rI[i] = (int)(aI[i] * aI[i] * aI[i] * aI[i]); + } + } + } + + @Benchmark + public void bench014I_aligned_memoryBound() { + for (int r = 0; r < REPETITIONS; r++) { + int init = offsets[r]; + int limit = init + ITERATION_COUNT; + for (int i = init; i < limit; i++) { + rI[i] = (int)(aI[i] + bI[i]); + } + } + } + + @Benchmark + public void bench024I_unaligned_memoryBound() { + for (int r = 0; r < REPETITIONS; r++) { + int init = offsets[r]; + int limit = init + ITERATION_COUNT; + for (int i = init; i < limit; i++) { + rI[i] = (int)(aI[i+1] + bI[i+2]); + } + } + } + + @Benchmark + public void bench005L_aligned_computeBound() { + for (int r = 0; r < REPETITIONS; r++) { + int init = offsets[r]; + int limit = init + ITERATION_COUNT; + for (int i = init; i < limit; i++) { + // Have multiple MUL operations to make loop compute bound (more compute than load/store) + rL[i] = (long)(aL[i] * aL[i] * aL[i] * aL[i]); + } + } + } + + @Benchmark + public void bench015L_aligned_memoryBound() { + for (int r = 0; r < REPETITIONS; r++) { + int init = offsets[r]; + int limit = init + ITERATION_COUNT; + for (int i = init; i < limit; i++) { + rL[i] = (long)(aL[i] + bL[i]); + } + } + } + + @Benchmark + public void bench025L_unaligned_memoryBound() { + for (int r = 0; r < REPETITIONS; r++) { + int init = offsets[r]; + int limit = init + ITERATION_COUNT; + for (int i = init; i < limit; i++) { + rL[i] = (long)(aL[i+1] + bL[i+2]); + } + } + } + +// @Benchmark +// public void bench006F_aligned_computeBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// // Have multiple MUL operations to make loop compute bound (more compute than load/store) +// rF[i] = (float)(aF[i] * aF[i] * aF[i] * aF[i]); +// } +// } +// } +// +// @Benchmark +// public void bench016F_aligned_memoryBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// rF[i] = (float)(aF[i] + bF[i]); +// } +// } +// } +// +// @Benchmark +// public void bench026F_unaligned_memoryBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// rF[i] = (float)(aF[i+1] + bF[i+2]); +// } +// } +// } +// +// @Benchmark +// public void bench007D_aligned_computeBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// // Have multiple MUL operations to make loop compute bound (more compute than load/store) +// rD[i] = (double)(aD[i] * aD[i] * aD[i] * aD[i]); +// } +// } +// } +// +// @Benchmark +// public void bench017D_aligned_memoryBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// rD[i] = (double)(aD[i] + bD[i]); +// } +// } +// } +// +// @Benchmark +// public void bench027D_unaligned_memoryBound() { +// for (int r = 0; r < REPETITIONS; r++) { +// int init = offsets[r]; +// int limit = init + ITERATION_COUNT; +// for (int i = init; i < limit; i++) { +// rD[i] = (double)(aD[i+1] + bD[i+2]); +// } +// } +// } + + @Fork(value = 1, jvmArgs = { + "-XX:+UseSuperWord" + }) + public static class SuperWord extends VectorThroughputForIterationCount {} +}