8279258: Auto-vectorization enhancement for two-dimensional array operations
Reviewed-by: neliasso, kvn
This commit is contained in:
parent
8d0f385fd0
commit
6a42fbaf9b
src/hotspot/share/opto
test
hotspot/jtreg/compiler
micro/org/openjdk/bench/vm/compiler
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2000, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -899,15 +899,27 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool should_unroll = true;
|
||||
|
||||
// When unroll count is greater than LoopUnrollMin, don't unroll if:
|
||||
// the residual iterations are more than 10% of the trip count
|
||||
// and rounds of "unroll,optimize" are not making significant progress
|
||||
// Progress defined as current size less than 20% larger than previous size.
|
||||
if (UseSuperWord && cl->node_count_before_unroll() > 0 &&
|
||||
future_unroll_cnt > LoopUnrollMin &&
|
||||
(future_unroll_cnt - 1) * (100.0 / LoopPercentProfileLimit) > cl->profile_trip_cnt() &&
|
||||
is_residual_iters_large(future_unroll_cnt, cl) &&
|
||||
1.2 * cl->node_count_before_unroll() < (double)_body.size()) {
|
||||
return false;
|
||||
if ((cl->slp_max_unroll() == 0) && !is_residual_iters_large(cl->unrolled_count(), cl)) {
|
||||
// cl->slp_max_unroll() = 0 means that the previous slp analysis never passed.
|
||||
// slp analysis may fail because the loop IR is too complicated, especially during the early stage
|
||||
// of loop unrolling analysis. But after several rounds of loop unrolling and other optimizations,
|
||||
// it's possible that the loop IR becomes simple enough to pass the slp analysis.
|
||||
// So we don't return immediately, in the hope that the next slp analysis can succeed.
|
||||
should_unroll = false;
|
||||
future_unroll_cnt = cl->unrolled_count();
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Node *init_n = cl->init_trip();
|
||||
@ -985,7 +997,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
}
|
||||
|
||||
// Only attempt slp analysis when user controls do not prohibit it
|
||||
if (LoopMaxUnroll > _local_loop_unroll_factor) {
|
||||
if (!cl->range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) {
|
||||
// Once policy_slp_analysis succeeds, mark the loop with the
|
||||
// maximal unroll factor so that we minimize analysis passes
|
||||
if (future_unroll_cnt >= _local_loop_unroll_factor) {
|
||||
@ -1003,7 +1015,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
|
||||
if (cl->has_passed_slp()) {
|
||||
if (slp_max_unroll_factor >= future_unroll_cnt) {
|
||||
return phase->may_require_nodes(estimate);
|
||||
return should_unroll && phase->may_require_nodes(estimate);
|
||||
}
|
||||
return false; // Loop too big.
|
||||
}
|
||||
@ -1011,7 +1023,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
// Check for being too big
|
||||
if (body_size > (uint)_local_loop_unroll_limit) {
|
||||
if ((cl->is_subword_loop() || xors_in_loop >= 4) && body_size < 4u * LoopUnrollLimit) {
|
||||
return phase->may_require_nodes(estimate);
|
||||
return should_unroll && phase->may_require_nodes(estimate);
|
||||
}
|
||||
return false; // Loop too big.
|
||||
}
|
||||
@ -1024,7 +1036,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
}
|
||||
|
||||
// Unroll once! (Each trip will soon do double iterations)
|
||||
return phase->may_require_nodes(estimate);
|
||||
return should_unroll && phase->may_require_nodes(estimate);
|
||||
}
|
||||
|
||||
void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_cnt) {
|
||||
@ -3528,6 +3540,8 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
|
||||
if (should_rce) {
|
||||
if (phase->do_range_check(this, old_new) != 0) {
|
||||
cl->mark_has_range_checks();
|
||||
} else {
|
||||
cl->clear_has_range_checks();
|
||||
}
|
||||
} else if (PostLoopMultiversioning) {
|
||||
phase->has_range_checks(this);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1998, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -114,6 +114,7 @@ public:
|
||||
// Set the VectorizedLoop flag in _loop_flags.
void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
|
||||
// Set the HasAtomicPostLoop flag in _loop_flags.
void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
|
||||
// Set the HasRangeChecks flag in _loop_flags.
void mark_has_range_checks() { _loop_flags |= HasRangeChecks; }
|
||||
// Clear the HasRangeChecks flag in _loop_flags, leaving all other flags untouched.
void clear_has_range_checks() { _loop_flags &= ~HasRangeChecks; }
|
||||
// Set the IsMultiversioned flag in _loop_flags.
void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
|
||||
// Set the StripMined flag in _loop_flags.
void mark_strip_mined() { _loop_flags |= StripMined; }
|
||||
// Clear the StripMined flag in _loop_flags, leaving all other flags untouched.
void clear_strip_mined() { _loop_flags &= ~StripMined; }
|
||||
@ -773,6 +774,12 @@ public:
|
||||
|
||||
// Estimate the number of nodes resulting from control and data flow merge.
|
||||
uint est_loop_flow_merge_sz() const;
|
||||
|
||||
// Check if the number of residual iterations is large with unroll_cnt.
|
||||
// Return true if the residual iterations are more than 10% of the trip count.
|
||||
bool is_residual_iters_large(int unroll_cnt, CountedLoopNode *cl) const {
|
||||
return (unroll_cnt - 1) * (100.0 / LoopPercentProfileLimit) > cl->profile_trip_cnt();
|
||||
}
|
||||
};
|
||||
|
||||
// -----------------------------PhaseIdealLoop---------------------------------
|
||||
|
@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package compiler.c2.irTests;
|
||||
|
||||
import compiler.lib.ir_framework.*;
|
||||
|
||||
/*
|
||||
* @test
|
||||
* @bug 8279258
|
||||
* @summary Auto-vectorization enhancement for two-dimensional array operations
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.c2.irTests.TestAutoVectorization2DArray
|
||||
*/
|
||||
|
||||
public class TestAutoVectorization2DArray {
|
||||
final private static int NUM = 64;
|
||||
|
||||
private static double[][] a = new double[NUM][NUM];
|
||||
private static double[][] b = new double[NUM][NUM];
|
||||
private static double[][] c = new double[NUM][NUM];
|
||||
|
||||
public static void main(String[] args) {
|
||||
TestFramework.run();
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = { IRNode.LOAD_VECTOR, " >0 " })
|
||||
@IR(counts = { IRNode.ADD_VD, " >0 " })
|
||||
@IR(counts = { IRNode.STORE_VECTOR, " >0 " })
|
||||
private static void testDouble(double[][] a , double[][] b, double[][] c) {
|
||||
for(int i = 0; i < a.length; i++) {
|
||||
for (int j = 0; j < a[0].length; j++) {
|
||||
a[i][j] = b[i][j] + c[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Run(test = "testDouble")
|
||||
private void testDouble_runner() {
|
||||
testDouble(a, b, c);
|
||||
}
|
||||
}
|
@ -74,6 +74,7 @@ public class IRNode {
|
||||
public static final String STORE_D = START + "StoreD" + MID + END;
|
||||
public static final String STORE_P = START + "StoreP" + MID + END;
|
||||
public static final String STORE_N = START + "StoreN" + MID + END;
|
||||
public static final String STORE_VECTOR = START + "StoreVector" + MID + END;
|
||||
public static final String STORE_OF_CLASS = COMPOSITE_PREFIX + START + "Store(B|C|S|I|L|F|D|P|N)" + MID + "@\\S*" + IS_REPLACED + STORE_OF_CLASS_POSTFIX;
|
||||
public static final String STORE_B_OF_CLASS = COMPOSITE_PREFIX + START + "StoreB" + MID + "@\\S*" + IS_REPLACED + STORE_OF_CLASS_POSTFIX;
|
||||
public static final String STORE_C_OF_CLASS = COMPOSITE_PREFIX + START + "StoreC" + MID + "@\\S*" + IS_REPLACED + STORE_OF_CLASS_POSTFIX;
|
||||
@ -96,6 +97,7 @@ public class IRNode {
|
||||
public static final String LOAD_D = START + "LoadD" + MID + END;
|
||||
public static final String LOAD_P = START + "LoadP" + MID + END;
|
||||
public static final String LOAD_N = START + "LoadN" + MID + END;
|
||||
public static final String LOAD_VECTOR = START + "LoadVector" + MID + END;
|
||||
public static final String LOAD_OF_CLASS = COMPOSITE_PREFIX + START + "Load(B|UB|S|US|I|L|F|D|P|N)" + MID + "@\\S*"+ IS_REPLACED + LOAD_OF_CLASS_POSTFIX;
|
||||
public static final String LOAD_B_OF_CLASS = COMPOSITE_PREFIX + START + "LoadB" + MID + "@\\S*" + IS_REPLACED + LOAD_OF_CLASS_POSTFIX;
|
||||
public static final String LOAD_UB_OF_CLASS = COMPOSITE_PREFIX + START + "LoadUB" + MID + "@\\S*" + IS_REPLACED + LOAD_OF_CLASS_POSTFIX;
|
||||
@ -139,6 +141,7 @@ public class IRNode {
|
||||
public static final String LSHIFT_L = START + "LShiftL" + MID + END;
|
||||
public static final String ADD_I = START + "AddI" + MID + END;
|
||||
public static final String ADD_L = START + "AddL" + MID + END;
|
||||
public static final String ADD_VD = START + "AddVD" + MID + END;
|
||||
public static final String SUB_I = START + "SubI" + MID + END;
|
||||
public static final String SUB_L = START + "SubL" + MID + END;
|
||||
public static final String MUL_I = START + "MulI" + MID + END;
|
||||
|
@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package org.openjdk.bench.vm.compiler;
|
||||
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
import org.openjdk.jmh.infra.*;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@Warmup(iterations = 3, time = 5, timeUnit = TimeUnit.SECONDS)
|
||||
@Measurement(iterations = 3, time = 5, timeUnit = TimeUnit.SECONDS)
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.SECONDS)
|
||||
@State(Scope.Thread)
|
||||
@Fork(value=1)
|
||||
public class AutoVectorization2DArray {
|
||||
@Param({"16", "32", "64"})
|
||||
private int LEN;
|
||||
|
||||
private byte[][] a_byte;
|
||||
private byte[][] b_byte;
|
||||
private byte[][] c_byte;
|
||||
|
||||
private int[][] a_int;
|
||||
private int[][] b_int;
|
||||
private int[][] c_int;
|
||||
|
||||
private double[][] a_double;
|
||||
private double[][] b_double;
|
||||
private double[][] c_double;
|
||||
|
||||
@Setup
|
||||
public void init() {
|
||||
a_byte = new byte[LEN][LEN];
|
||||
b_byte = new byte[LEN][LEN];
|
||||
c_byte = new byte[LEN][LEN];
|
||||
|
||||
a_int = new int[LEN][LEN];
|
||||
b_int = new int[LEN][LEN];
|
||||
c_int = new int[LEN][LEN];
|
||||
|
||||
a_double = new double[LEN][LEN];
|
||||
b_double = new double[LEN][LEN];
|
||||
c_double = new double[LEN][LEN];
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private int run_byte(int count, byte[][] a , byte[][] b, byte[][] c) {
|
||||
for(int i = 0; i < a.length; i++) {
|
||||
for (int j = 0; j < a[0].length; j++) {
|
||||
a[i][j] = (byte)(b[i][j] + c[i][j]);
|
||||
}
|
||||
}
|
||||
return a[count][count];
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void test_run_byte(Blackhole bh) {
|
||||
int r = 0;
|
||||
for(int i = 0 ; i < 100; i++) {
|
||||
r += run_byte(i % a_byte.length, a_byte, b_byte, c_byte);
|
||||
}
|
||||
bh.consume(r);
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private int run_int(int count, int[][] a, int[][] b, int[][] c) {
|
||||
for(int i = 0; i < a.length; i++) {
|
||||
for (int j = 0; j < a[0].length; j++) {
|
||||
a[i][j] = b[i][j] + c[i][j];
|
||||
}
|
||||
}
|
||||
return a[count][count];
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void test_run_int(Blackhole bh) {
|
||||
int r = 0;
|
||||
for(int i = 0 ; i < 100; i++) {
|
||||
r += run_int(i % a_int.length, a_int, b_int, c_int);
|
||||
}
|
||||
bh.consume(r);
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private double run_double(int count, double[][] a, double[][] b, double[][] c) {
|
||||
for(int i = 0; i < a.length; i++) {
|
||||
for (int j = 0; j < a[0].length; j++) {
|
||||
a[i][j] = b[i][j] + c[i][j];
|
||||
}
|
||||
}
|
||||
return a[count][count];
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void test_run_double(Blackhole bh) {
|
||||
double r = 0;
|
||||
for(int i = 0 ; i < 100; i++) {
|
||||
r += run_double(i % a_double.length, a_double, b_double, c_double);
|
||||
}
|
||||
bh.consume(r);
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user