diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index c242e5a4ef7..4c4cdd074a1 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -3106,14 +3106,37 @@ VStatus VLoopBody::construct() { } else if (!post_visited.test(bb_idx(n))) { // cross or back arc const int old_length = stack.length(); + + // If a Load depends on the same memory state as a Store, we must make sure that + // the Load is ordered before the Store. + // + // mem + // | + // +--+--+ + // | | + // | Load (n) + // | + // Store (mem_use) + // + if (n->is_Load()) { + Node* mem = n->in(MemNode::Memory); + for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { + Node* mem_use = mem->fast_out(i); + if (mem_use->is_Store() && _vloop.in_bb(mem_use) && !visited.test(bb_idx(mem_use))) { + stack.push(mem_use); // Ordering edge: Load (n) -> Store (mem_use) + } + } + } + for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { Node* use = n->fast_out(i); if (_vloop.in_bb(use) && !visited.test(bb_idx(use)) && // Don't go around backedge (!use->is_Phi() || n == _vloop.cl())) { - stack.push(use); + stack.push(use); // Ordering edge: n -> use } } + if (stack.length() == old_length) { // There were no additional uses, post visit node now stack.pop(); // Remove node from stack diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 0ad87256b27..7197b3a58a5 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -292,23 +292,40 @@ void VLoopDependencyGraph::add_node(MemNode* n, GrowableArray& memory_pred_ _dependency_nodes.at_put_grow(_body.bb_idx(n), dn, nullptr); } +int VLoopDependencyGraph::find_max_pred_depth(const Node* n) const { + int max_pred_depth = 0; + if (!n->is_Phi()) { // ignore backedge + for (PredsIterator it(*this, n); !it.done(); it.next()) { + Node* pred = it.current(); + if (_vloop.in_bb(pred)) { + 
max_pred_depth = MAX2(max_pred_depth, depth(pred)); + } + } + } + return max_pred_depth; +} + // We iterate over the body, which is already ordered by the dependencies, i.e. pred comes // before use. With a single pass, we can compute the depth of every node, since we can // assume that the depth of all preds is already computed when we compute the depth of use. void VLoopDependencyGraph::compute_depth() { for (int i = 0; i < _body.body().length(); i++) { Node* n = _body.body().at(i); - int max_pred_depth = 0; - if (n->is_Phi()) { - for (PredsIterator it(*this, n); !it.done(); it.next()) { - Node* pred = it.current(); - if (_vloop.in_bb(pred)) { - max_pred_depth = MAX2(max_pred_depth, depth(pred)); - } - } - } - set_depth(n, max_pred_depth + 1); + set_depth(n, find_max_pred_depth(n) + 1); } + +#ifdef ASSERT + for (int i = 0; i < _body.body().length(); i++) { + Node* n = _body.body().at(i); + int max_pred_depth = find_max_pred_depth(n); + if (depth(n) != max_pred_depth + 1) { + print(); + tty->print_cr("Incorrect depth: %d vs %d", depth(n), max_pred_depth + 1); + n->dump(); + } + assert(depth(n) == max_pred_depth + 1, "must have correct depth"); + } +#endif } #ifndef PRODUCT diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 9dc029efb6b..6840b01bb93 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -553,6 +553,7 @@ private: void add_node(MemNode* n, GrowableArray& memory_pred_edges); int depth(const Node* n) const { return _depths.at(_body.bb_idx(n)); } void set_depth(const Node* n, int d) { _depths.at_put(_body.bb_idx(n), d); } + int find_max_pred_depth(const Node* n) const; void compute_depth(); NOT_PRODUCT( void print() const; ) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeCompilation.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeCompilation.java new file mode 100644 index 00000000000..afe19fa1cd8 --- /dev/null +++ 
b/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeCompilation.java @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.loopopts.superword; + +/* + * @test + * @bug 8327978 + * @summary Test compile time for a large compilation, where SuperWord takes an especially long time.
+ * @requires vm.compiler2.enabled + * @run main/othervm/timeout=30 -XX:LoopUnrollLimit=1000 -Xbatch + * -XX:CompileCommand=compileonly,compiler.loopopts.superword.TestLargeCompilation::test* + * compiler.loopopts.superword.TestLargeCompilation + */ + +import java.util.Random; + +public class TestLargeCompilation { + private static final Random random = new Random(); + static final int RANGE_CON = 1024 * 8; + + static int init = 593436; + static int limit = 599554; + static int offset1 = -592394; + static int offset2 = -592386; + static final int offset3 = -592394; + static final int stride = 4; + static final int scale = 1; + static final int hand_unrolling1 = 2; + static final int hand_unrolling2 = 8; + static final int hand_unrolling3 = 15; + + public static void main(String[] args) { + byte[] aB = generateB(); + byte[] bB = generateB(); + byte[] cB = generateB(); + + for (int i = 1; i < 100; i++) { + testUUBBBH(aB, bB, cB); + } + } + + static byte[] generateB() { + byte[] a = new byte[RANGE_CON]; + for (int i = 0; i < a.length; i++) { + a[i] = (byte)random.nextInt(); + } + return a; + } + + static Object[] testUUBBBH(byte[] a, byte[] b, byte[] c) { + int h1 = hand_unrolling1; + int h2 = hand_unrolling2; + int h3 = hand_unrolling3; + + for (int i = init; i < limit; i += stride) { + if (h1 >= 1) { a[offset1 + i * scale + 0]++; } + if (h1 >= 2) { a[offset1 + i * scale + 1]++; } + if (h1 >= 3) { a[offset1 + i * scale + 2]++; } + if (h1 >= 4) { a[offset1 + i * scale + 3]++; } + if (h1 >= 5) { a[offset1 + i * scale + 4]++; } + if (h1 >= 6) { a[offset1 + i * scale + 5]++; } + if (h1 >= 7) { a[offset1 + i * scale + 6]++; } + if (h1 >= 8) { a[offset1 + i * scale + 7]++; } + if (h1 >= 9) { a[offset1 + i * scale + 8]++; } + if (h1 >= 10) { a[offset1 + i * scale + 9]++; } + if (h1 >= 11) { a[offset1 + i * scale + 10]++; } + if (h1 >= 12) { a[offset1 + i * scale + 11]++; } + if (h1 >= 13) { a[offset1 + i * scale + 12]++; } + if (h1 >= 14) { a[offset1 + i * scale + 13]++; } 
+ if (h1 >= 15) { a[offset1 + i * scale + 14]++; } + if (h1 >= 16) { a[offset1 + i * scale + 15]++; } + + if (h2 >= 1) { b[offset2 + i * scale + 0]++; } + if (h2 >= 2) { b[offset2 + i * scale + 1]++; } + if (h2 >= 3) { b[offset2 + i * scale + 2]++; } + if (h2 >= 4) { b[offset2 + i * scale + 3]++; } + if (h2 >= 5) { b[offset2 + i * scale + 4]++; } + if (h2 >= 6) { b[offset2 + i * scale + 5]++; } + if (h2 >= 7) { b[offset2 + i * scale + 6]++; } + if (h2 >= 8) { b[offset2 + i * scale + 7]++; } + if (h2 >= 9) { b[offset2 + i * scale + 8]++; } + if (h2 >= 10) { b[offset2 + i * scale + 9]++; } + if (h2 >= 11) { b[offset2 + i * scale + 10]++; } + if (h2 >= 12) { b[offset2 + i * scale + 11]++; } + if (h2 >= 13) { b[offset2 + i * scale + 12]++; } + if (h2 >= 14) { b[offset2 + i * scale + 13]++; } + if (h2 >= 15) { b[offset2 + i * scale + 14]++; } + if (h2 >= 16) { b[offset2 + i * scale + 15]++; } + + if (h3 >= 1) { c[offset3 + i * scale + 0]++; } + if (h3 >= 2) { c[offset3 + i * scale + 1]++; } + if (h3 >= 3) { c[offset3 + i * scale + 2]++; } + if (h3 >= 4) { c[offset3 + i * scale + 3]++; } + if (h3 >= 5) { c[offset3 + i * scale + 4]++; } + if (h3 >= 6) { c[offset3 + i * scale + 5]++; } + if (h3 >= 7) { c[offset3 + i * scale + 6]++; } + if (h3 >= 8) { c[offset3 + i * scale + 7]++; } + if (h3 >= 9) { c[offset3 + i * scale + 8]++; } + if (h3 >= 10) { c[offset3 + i * scale + 9]++; } + if (h3 >= 11) { c[offset3 + i * scale + 10]++; } + if (h3 >= 12) { c[offset3 + i * scale + 11]++; } + if (h3 >= 13) { c[offset3 + i * scale + 12]++; } + if (h3 >= 14) { c[offset3 + i * scale + 13]++; } + if (h3 >= 15) { c[offset3 + i * scale + 14]++; } + if (h3 >= 16) { c[offset3 + i * scale + 15]++; } + } + return new Object[]{ a, b, c }; + } +}