diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestEliminateAllocationWithCastP2XUse.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestEliminateAllocationWithCastP2XUse.java new file mode 100644 index 00000000000..416781d1b00 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestEliminateAllocationWithCastP2XUse.java @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.loopopts.superword; + +/* + * @test + * @bug 8342498 + * @summary Test SuperWord, when it aligns to field-store, and the corresponding allocation is eliminated. + * @run driver compiler.loopopts.superword.TestEliminateAllocationWithCastP2XUse + * @run main/othervm -Xbatch + * -XX:-SplitIfBlocks -XX:LoopMaxUnroll=8 + * -XX:+UnlockDiagnosticVMOptions -XX:DominatorSearchLimit=45 + * compiler.loopopts.superword.TestEliminateAllocationWithCastP2XUse + */ + +public class TestEliminateAllocationWithCastP2XUse { + public static void main(String args[]) { + byte[] a = new byte[10_000]; + for (int i = 0; i < 10000; i++) { + test(a); + } + } + + // Summary: + // - Some B allocations are detected as NoEscape, but cannot be removed because of a field load. + // - The field loads cannot be LoadNode::split_through_phi because DominatorSearchLimit is too low + // for the dominates query to look through some IfNode / IfProj path. + // - We go into loop-opts. + // - In theory, the Stores of B::offset would be moved out of the loop. But we disable + // PhaseIdealLoop::try_move_store_after_loop by setting -XX:-SplitIfBlocks. + // - The field loads are folded away because of some MaxUnroll trick, where the val constant folds to 1. + // - SuperWord eventually kicks in, and vectorizes the array stores. + // - Since some vectorization has happened, SuperWord wants to align the main loop with a memory reference + // in the loop. The code here is not very smart, and just picks the memory reference that occurs the + // most often. But the B::offset stores occur more often than the array stores, and so we align to + // one of the B::offset stores. This inserts a CastP2X under the CheckCastPP of the B allocation. + // - Once loop opts is over, we eventually go into macro expansion. + // - During macro expansion, we now discover that the Allocations were marked NoEscape, and that by now + // there are no field loads any more: yay, we can remove the allocation! + // - ... except that there is the CastP2X from SuperWord alignment ... + // - The Allocation removal code wants to pattern match the CastP2X as part of a GC barrier, but then + // the pattern does not conform to the expecatation - it is after all from SuperWord. This leads to + // an assert, and SIGSEGV in product, at least with G1GC. + public static long test(byte[] a) { + // Delay val == 1 until loop-opts, with MaxUnroll trick. + int val = 0; + for (int i = 0; i < 4; i++) { + if ((i % 2) == 0) { + val = 1; + } + } + // during loop opts, we learn val == 1 + // But we don't know that during EscapeAnalysis (EA) yet. + + // 9 Allocations, discovered as NoEscape during EA. + B b1 = new B(); + B b2 = new B(); + B b3 = new B(); + B b4 = new B(); + B b5 = new B(); + B b6 = new B(); + B b7 = new B(); + B b8 = new B(); + B b9 = new B(); + + // Some path of IfNode / IfProj. + // Only folds away once we know val == 1 + // This delays the LoadNode::split_through_phi, because it needs a dominates call + // to succeed, but it cannot look through this path because we set -XX:DominatorSearchLimit=45 + // i.e. just a little too low to be able to look through. + // Without the LoadNode::split_through_phi before the end of EA, the Allocation cannot yet be + // removed, due to a "Field load", i.e. that Load for B::offset. + // But later, this path can actually fold away, when we know that val == 1. At that point, + // also the Load from B::offset folds away because LoadNode::split_through_phi succeeds + // At that point the B allocations have no Loads any more, and can be removed... but this only + // happens at macro expansion, after all loop opts. + if (val == 1010) { throw new RuntimeException("never"); } + if (val == 1020) { throw new RuntimeException("never"); } + if (val == 1030) { throw new RuntimeException("never"); } + if (val == 1040) { throw new RuntimeException("never"); } + if (val == 1060) { throw new RuntimeException("never"); } + if (val == 1070) { throw new RuntimeException("never"); } + if (val == 1080) { throw new RuntimeException("never"); } + if (val == 1090) { throw new RuntimeException("never"); } + + if (val == 2010) { throw new RuntimeException("never"); } + if (val == 2020) { throw new RuntimeException("never"); } + if (val == 2030) { throw new RuntimeException("never"); } + if (val == 2040) { throw new RuntimeException("never"); } + if (val == 2060) { throw new RuntimeException("never"); } + if (val == 2070) { throw new RuntimeException("never"); } + if (val == 2080) { throw new RuntimeException("never"); } + if (val == 2090) { throw new RuntimeException("never"); } + + if (val == 3010) { throw new RuntimeException("never"); } + if (val == 3020) { throw new RuntimeException("never"); } + if (val == 3030) { throw new RuntimeException("never"); } + if (val == 3040) { throw new RuntimeException("never"); } + if (val == 3060) { throw new RuntimeException("never"); } + if (val == 3070) { throw new RuntimeException("never"); } + if (val == 3080) { throw new RuntimeException("never"); } + if (val == 3090) { throw new RuntimeException("never"); } + + if (val == 4010) { throw new RuntimeException("never"); } + if (val == 4020) { throw new RuntimeException("never"); } + if (val == 4030) { throw new RuntimeException("never"); } + if (val == 4040) { throw new RuntimeException("never"); } + if (val == 4060) { throw new RuntimeException("never"); } + if (val == 4070) { throw new RuntimeException("never"); } + if (val == 4080) { throw new RuntimeException("never"); } + if (val == 4090) { throw new RuntimeException("never"); } + + long mulVal = 1; + for (int i = 0; i < a.length; i++) { + mulVal *= 3; + // We do some vector store, so that SuperWord succeeds, and creates the + // alignment code, which emits the CastP2X. + a[i]++; + // But we also have 9 Stores for the B::offset. + // SuperWord now sees more of these stores than of the array stores, and picks + // one of the B::offset stores as the alignment reference... creating a CastP2X + // for the CheckCastPP of the B allocation. + b1.offset = mulVal; + b2.offset = mulVal; + b3.offset = mulVal; + b4.offset = mulVal; + b5.offset = mulVal; + b6.offset = mulVal; + b7.offset = mulVal; + b8.offset = mulVal; + b9.offset = mulVal; + } + + // This folds the loads away, once we know val == 1 + // That happens during loop-opts, so after EA, but before macro expansion. + long ret = 0; + if (val == 42) { + ret = b1.offset + + b2.offset + + b3.offset + + b4.offset + + b5.offset + + b6.offset + + b7.offset + + b8.offset + + b9.offset; + } + + return ret; + } + + static class B { + // Add padding so that the old SuperWord::can_create_pairs accepts the field store to B.offset + long pad1 = 0; // at 16 + long pad2 = 0; // at 24 + long pad3 = 0; // at 32 + long pad4 = 0; // at 40 + long pad5 = 0; // at 48 + long pad6 = 0; // at 56 + long offset = 0; // offset at 64 bytes + } +}