8279258: Auto-vectorization enhancement for two-dimensional array operations

Reviewed-by: neliasso, kvn
This commit is contained in:
Jie Fu 2022-01-07 00:04:51 +00:00
parent 8d0f385fd0
commit 6a42fbaf9b
5 changed files with 218 additions and 8 deletions
src/hotspot/share/opto
test
hotspot/jtreg/compiler
micro/org/openjdk/bench/vm/compiler

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -899,15 +899,27 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
return false;
}
bool should_unroll = true;
// When unroll count is greater than LoopUnrollMin, don't unroll if:
// the residual iterations are more than 10% of the trip count
// and rounds of "unroll,optimize" are not making significant progress
// Progress defined as current size less than 20% larger than previous size.
if (UseSuperWord && cl->node_count_before_unroll() > 0 &&
future_unroll_cnt > LoopUnrollMin &&
(future_unroll_cnt - 1) * (100.0 / LoopPercentProfileLimit) > cl->profile_trip_cnt() &&
is_residual_iters_large(future_unroll_cnt, cl) &&
1.2 * cl->node_count_before_unroll() < (double)_body.size()) {
return false;
if ((cl->slp_max_unroll() == 0) && !is_residual_iters_large(cl->unrolled_count(), cl)) {
// cl->slp_max_unroll() == 0 means that no previous slp analysis has passed.
// slp analysis may fail because the loop IR is too complicated, especially during the early
// stages of loop unrolling analysis. After several rounds of loop unrolling and other
// optimizations, the loop IR may become simple enough to pass the slp analysis.
// So we don't return immediately, in the hope that the next slp analysis can succeed.
should_unroll = false;
future_unroll_cnt = cl->unrolled_count();
} else {
return false;
}
}
Node *init_n = cl->init_trip();
@ -985,7 +997,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
}
// Only attempt slp analysis when user controls do not prohibit it
if (LoopMaxUnroll > _local_loop_unroll_factor) {
if (!cl->range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) {
// Once policy_slp_analysis succeeds, mark the loop with the
// maximal unroll factor so that we minimize analysis passes
if (future_unroll_cnt >= _local_loop_unroll_factor) {
@ -1003,7 +1015,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
if (cl->has_passed_slp()) {
if (slp_max_unroll_factor >= future_unroll_cnt) {
return phase->may_require_nodes(estimate);
return should_unroll && phase->may_require_nodes(estimate);
}
return false; // Loop too big.
}
@ -1011,7 +1023,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
// Check for being too big
if (body_size > (uint)_local_loop_unroll_limit) {
if ((cl->is_subword_loop() || xors_in_loop >= 4) && body_size < 4u * LoopUnrollLimit) {
return phase->may_require_nodes(estimate);
return should_unroll && phase->may_require_nodes(estimate);
}
return false; // Loop too big.
}
@ -1024,7 +1036,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
}
// Unroll once! (Each trip will soon do double iterations)
return phase->may_require_nodes(estimate);
return should_unroll && phase->may_require_nodes(estimate);
}
void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_cnt) {
@ -3528,6 +3540,8 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
if (should_rce) {
if (phase->do_range_check(this, old_new) != 0) {
cl->mark_has_range_checks();
} else {
cl->clear_has_range_checks();
}
} else if (PostLoopMultiversioning) {
phase->has_range_checks(this);

@ -1,5 +1,5 @@
/*
* Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -114,6 +114,7 @@ public:
// Loop-flag mutators: each mark_* sets one bit in _loop_flags and each
// clear_* removes it again. No other state is touched.
// Set the VectorizedLoop bit.
void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
// Set the HasAtomicPostLoop bit.
void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
// Set the HasRangeChecks bit.
void mark_has_range_checks() { _loop_flags |= HasRangeChecks; }
// Clear the HasRangeChecks bit (inverse of mark_has_range_checks).
void clear_has_range_checks() { _loop_flags &= ~HasRangeChecks; }
// Set the IsMultiversioned bit.
void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
// Set the StripMined bit.
void mark_strip_mined() { _loop_flags |= StripMined; }
// Clear the StripMined bit (inverse of mark_strip_mined).
void clear_strip_mined() { _loop_flags &= ~StripMined; }
@ -773,6 +774,12 @@ public:
// Estimate the number of nodes resulting from control and data flow merge.
uint est_loop_flow_merge_sz() const;
// Decide whether unrolling by unroll_cnt would leave a large number of
// residual iterations relative to the loop's profiled trip count.
// Returns true when (unroll_cnt - 1), scaled by 100 / LoopPercentProfileLimit,
// exceeds cl->profile_trip_cnt(). Upstream describes this as the residual
// iterations being more than 10% of the trip count.
bool is_residual_iters_large(int unroll_cnt, CountedLoopNode *cl) const {
const double percent_scale = 100.0 / LoopPercentProfileLimit;
const double scaled_residual = (unroll_cnt - 1) * percent_scale;
return scaled_residual > cl->profile_trip_cnt();
}
};
// -----------------------------PhaseIdealLoop---------------------------------

@ -0,0 +1,63 @@
/*
* Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.c2.irTests;
import compiler.lib.ir_framework.*;
/*
* @test
* @bug 8279258
* @summary Auto-vectorization enhancement for two-dimensional array operations
* @library /test/lib /
* @run driver compiler.c2.irTests.TestAutoVectorization2DArray
*/
/**
 * IR-framework test for JDK-8279258: checks that an element-wise addition over
 * two-dimensional double arrays is auto-vectorized, by requiring vector load,
 * vector add and vector store nodes in the IR of {@link #testDouble}.
 */
public class TestAutoVectorization2DArray {
// Row and column count of the square matrices used as test inputs.
final private static int NUM = 64;
// Destination matrix (a) and the two source matrices (b, c); all NUM x NUM.
private static double[][] a = new double[NUM][NUM];
private static double[][] b = new double[NUM][NUM];
private static double[][] c = new double[NUM][NUM];
// Entry point: delegates to the IR test framework, which presumably
// discovers the @Test/@Run methods of this class — see TestFramework.
public static void main(String[] args) {
TestFramework.run();
}
// Computes a[i][j] = b[i][j] + c[i][j] for every element.
// The @IR rules require more than zero LoadVector, AddVD and StoreVector
// nodes, i.e. the inner loop must have been vectorized.
// NOTE: the loop shape is the subject of the test; do not restructure it.
@Test
@IR(counts = { IRNode.LOAD_VECTOR, " >0 " })
@IR(counts = { IRNode.ADD_VD, " >0 " })
@IR(counts = { IRNode.STORE_VECTOR, " >0 " })
private static void testDouble(double[][] a , double[][] b, double[][] c) {
for(int i = 0; i < a.length; i++) {
// The inner bound is taken from row 0, so every row is expected to be
// at least that long (true for the NUM x NUM inputs used here).
for (int j = 0; j < a[0].length; j++) {
a[i][j] = b[i][j] + c[i][j];
}
}
}
// Custom runner for testDouble: supplies the static matrices as arguments.
@Run(test = "testDouble")
private void testDouble_runner() {
testDouble(a, b, c);
}
}

@ -74,6 +74,7 @@ public class IRNode {
public static final String STORE_D = START + "StoreD" + MID + END;
public static final String STORE_P = START + "StoreP" + MID + END;
public static final String STORE_N = START + "StoreN" + MID + END;
// Pattern for "StoreVector" nodes, built like the scalar STORE_* patterns above.
public static final String STORE_VECTOR = START + "StoreVector" + MID + END;
public static final String STORE_OF_CLASS = COMPOSITE_PREFIX + START + "Store(B|C|S|I|L|F|D|P|N)" + MID + "@\\S*" + IS_REPLACED + STORE_OF_CLASS_POSTFIX;
public static final String STORE_B_OF_CLASS = COMPOSITE_PREFIX + START + "StoreB" + MID + "@\\S*" + IS_REPLACED + STORE_OF_CLASS_POSTFIX;
public static final String STORE_C_OF_CLASS = COMPOSITE_PREFIX + START + "StoreC" + MID + "@\\S*" + IS_REPLACED + STORE_OF_CLASS_POSTFIX;
@ -96,6 +97,7 @@ public class IRNode {
public static final String LOAD_D = START + "LoadD" + MID + END;
public static final String LOAD_P = START + "LoadP" + MID + END;
public static final String LOAD_N = START + "LoadN" + MID + END;
// Pattern for "LoadVector" nodes, built like the scalar LOAD_* patterns above.
public static final String LOAD_VECTOR = START + "LoadVector" + MID + END;
public static final String LOAD_OF_CLASS = COMPOSITE_PREFIX + START + "Load(B|UB|S|US|I|L|F|D|P|N)" + MID + "@\\S*"+ IS_REPLACED + LOAD_OF_CLASS_POSTFIX;
public static final String LOAD_B_OF_CLASS = COMPOSITE_PREFIX + START + "LoadB" + MID + "@\\S*" + IS_REPLACED + LOAD_OF_CLASS_POSTFIX;
public static final String LOAD_UB_OF_CLASS = COMPOSITE_PREFIX + START + "LoadUB" + MID + "@\\S*" + IS_REPLACED + LOAD_OF_CLASS_POSTFIX;
@ -139,6 +141,7 @@ public class IRNode {
public static final String LSHIFT_L = START + "LShiftL" + MID + END;
public static final String ADD_I = START + "AddI" + MID + END;
public static final String ADD_L = START + "AddL" + MID + END;
// Pattern for "AddVD" nodes (presumably vector double addition — confirm against C2 node names).
public static final String ADD_VD = START + "AddVD" + MID + END;
public static final String SUB_I = START + "SubI" + MID + END;
public static final String SUB_L = START + "SubL" + MID + END;
public static final String MUL_I = START + "MulI" + MID + END;

@ -0,0 +1,123 @@
/*
* Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.vm.compiler;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.*;
import java.util.concurrent.TimeUnit;
/**
 * JMH benchmark for element-wise addition over square two-dimensional arrays
 * of byte, int and double, with the array side length given by {@code LEN}.
 * Each benchmark method calls its non-inlined kernel 100 times per invocation.
 * Configuration: 3 warmup and 3 measurement iterations of 5 seconds each,
 * throughput mode reported per second, one fork, per-thread state.
 */
@Warmup(iterations = 3, time = 5, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 3, time = 5, timeUnit = TimeUnit.SECONDS)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Thread)
@Fork(value=1)
public class AutoVectorization2DArray {
// Side length of every LEN x LEN matrix; benchmarked at 16, 32 and 64.
@Param({"16", "32", "64"})
private int LEN;
// For each element type: a_* is the destination, b_* and c_* are the sources.
private byte[][] a_byte;
private byte[][] b_byte;
private byte[][] c_byte;
private int[][] a_int;
private int[][] b_int;
private int[][] c_int;
private double[][] a_double;
private double[][] b_double;
private double[][] c_double;
// Allocates all nine LEN x LEN matrices; elements keep Java's default zero value.
@Setup
public void init() {
a_byte = new byte[LEN][LEN];
b_byte = new byte[LEN][LEN];
c_byte = new byte[LEN][LEN];
a_int = new int[LEN][LEN];
b_int = new int[LEN][LEN];
c_int = new int[LEN][LEN];
a_double = new double[LEN][LEN];
b_double = new double[LEN][LEN];
c_double = new double[LEN][LEN];
}
// Byte kernel: a[i][j] = (byte)(b[i][j] + c[i][j]) for all elements, then
// returns the diagonal element a[count][count] (widened to int) so the
// result of the computation is observed by the caller.
// Kept out of line via DONT_INLINE; the loop shape is what is being measured.
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private int run_byte(int count, byte[][] a , byte[][] b, byte[][] c) {
for(int i = 0; i < a.length; i++) {
// The inner bound comes from row 0; the matrices built in init() are square.
for (int j = 0; j < a[0].length; j++) {
a[i][j] = (byte)(b[i][j] + c[i][j]);
}
}
return a[count][count];
}
// Runs the byte kernel 100 times, cycling the diagonal index through the
// valid rows, and feeds the accumulated result to the Blackhole.
@Benchmark
public void test_run_byte(Blackhole bh) {
int r = 0;
for(int i = 0 ; i < 100; i++) {
r += run_byte(i % a_byte.length, a_byte, b_byte, c_byte);
}
bh.consume(r);
}
// Int kernel: a[i][j] = b[i][j] + c[i][j] for all elements, then returns
// the diagonal element a[count][count].
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private int run_int(int count, int[][] a, int[][] b, int[][] c) {
for(int i = 0; i < a.length; i++) {
// The inner bound comes from row 0; the matrices built in init() are square.
for (int j = 0; j < a[0].length; j++) {
a[i][j] = b[i][j] + c[i][j];
}
}
return a[count][count];
}
// Runs the int kernel 100 times, cycling the diagonal index through the
// valid rows, and feeds the accumulated result to the Blackhole.
@Benchmark
public void test_run_int(Blackhole bh) {
int r = 0;
for(int i = 0 ; i < 100; i++) {
r += run_int(i % a_int.length, a_int, b_int, c_int);
}
bh.consume(r);
}
// Double kernel: a[i][j] = b[i][j] + c[i][j] for all elements, then returns
// the diagonal element a[count][count].
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private double run_double(int count, double[][] a, double[][] b, double[][] c) {
for(int i = 0; i < a.length; i++) {
// The inner bound comes from row 0; the matrices built in init() are square.
for (int j = 0; j < a[0].length; j++) {
a[i][j] = b[i][j] + c[i][j];
}
}
return a[count][count];
}
// Runs the double kernel 100 times, cycling the diagonal index through the
// valid rows, and feeds the accumulated result to the Blackhole.
@Benchmark
public void test_run_double(Blackhole bh) {
double r = 0;
for(int i = 0 ; i < 100; i++) {
r += run_double(i % a_double.length, a_double, b_double, c_double);
}
bh.consume(r);
}
}