From 886ac1c261a1b7e91e3981e32810c405a0d90329 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 21 Jun 2023 06:40:50 +0000 Subject: [PATCH] 8308606: C2 SuperWord: remove alignment checks when not required Reviewed-by: fgao, kvn, pli --- src/hotspot/share/opto/superword.cpp | 190 +++++------ src/hotspot/share/opto/superword.hpp | 14 +- .../superword/TestDependencyOffsets.java | 76 ++--- .../runner/LoopArrayIndexComputeTest.java | 89 ++++- .../bench/vm/compiler/VectorAlignment.java | 315 ++++++++++++++++++ 5 files changed, 517 insertions(+), 167 deletions(-) create mode 100644 test/micro/org/openjdk/bench/vm/compiler/VectorAlignment.java diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index faf2f7e51d9..f665089d0bf 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -724,9 +724,9 @@ void SuperWord::find_adjacent_refs() { } } - if (can_create_pairs(mem_ref, iv_adjustment, align_to_ref_p, - best_align_to_mem_ref, best_iv_adjustment, - align_to_refs)) { + if (mem_ref_has_no_alignment_violation(mem_ref, iv_adjustment, align_to_ref_p, + best_align_to_mem_ref, best_iv_adjustment, + align_to_refs)) { // Create initial pack pairs of memory operations for which alignment was set. for (uint i = 0; i < memops.size(); i++) { Node* s1 = memops.at(i); @@ -836,93 +836,45 @@ void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best } #endif -// Check if we can create the pack pairs for mem_ref: -// If required, enforce strict alignment requirements of hardware. -// Else, only enforce alignment within a memory slice, so that there cannot be any -// memory-dependence between different vector "lanes". 
-bool SuperWord::can_create_pairs(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p, - MemNode* best_align_to_mem_ref, int best_iv_adjustment, - Node_List &align_to_refs) { - bool is_aligned_with_best = memory_alignment(mem_ref, best_iv_adjustment) == 0; - - if (vectors_should_be_aligned()) { - // All vectors need to be memory aligned, modulo their vector_width. This is more strict - // than the hardware probably requires. Most hardware at most requires 4-byte alignment. - // - // In the pre-loop, we align best_align_to_mem_ref to its vector_length. To ensure that - // all mem_ref's are memory aligned modulo their vector_width, we only need to check that - // they are all aligned to best_align_to_mem_ref, modulo their vector_width. For that, - // we check the following 3 conditions. - - // (1) All packs are aligned with best_align_to_mem_ref. - if (!is_aligned_with_best) { - return false; - } - // (2) All other vectors have vector_size less or equal to that of best_align_to_mem_ref. - int vw = vector_width(mem_ref); - int vw_best = vector_width(best_align_to_mem_ref); - if (vw > vw_best) { - // We only align to vector_width of best_align_to_mem_ref during pre-loop. - // A mem_ref with a larger vector_width might thus not be vector_width aligned. - return false; - } - // (3) Ensure that all vectors have the same invariant. We model memory accesses like this - // address = base + k*iv + constant [+ invar] - // memory_alignment ignores the invariant. - SWPointer p2(best_align_to_mem_ref, this, nullptr, false); - if (!align_to_ref_p.invar_equals(p2)) { - // Do not vectorize memory accesses with different invariants - // if unaligned memory accesses are not allowed. - return false; - } +// If strict memory alignment is required (vectors_should_be_aligned), then check if +// mem_ref is aligned with best_align_to_mem_ref. 
+bool SuperWord::mem_ref_has_no_alignment_violation(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p, + MemNode* best_align_to_mem_ref, int best_iv_adjustment, + Node_List &align_to_refs) { + if (!vectors_should_be_aligned()) { + // Alignment is not required by the hardware. No violation possible. return true; - } else { - // Alignment is not required by the hardware. - - // However, we need to ensure that the pack for mem_ref is independent, i.e. all members - // of the pack are mutually independent. - - if (_do_vector_loop) { - // Wait until combine_packs to check independence of packs. For now we just know that - // the adjacent pairs are independent. This allows us to vectorize when we do not have - // alignment modulo vector_width. For example (forward read): - // for (int i ...) { v[i] = v[i + 1] + 5; } - // The following will be filtered out in combine_packs (forward write): - // for (int i ...) { v[i + 1] = v[i] + 5; } - return true; - } - - // If all mem_ref's are modulo vector_width aligned with all other mem_ref's of their - // memory slice, then the VectorLoad / VectorStore regions are either exactly overlapping - // or completely non-overlapping. This ensures that there cannot be memory-dependencies - // between different vector "lanes". - // During SuperWord::filter_packs -> SuperWord::profitable -> SuperWord::is_vector_use, - // we check that all inputs are vectors that match on every element (with some reasonable - // exceptions). This ensures that every "lane" is isomorpic and independent to all other - // "lanes". This allows us to vectorize these cases: - // for (int i ...) { v[i] = v[i] + 5; } // same alignment - // for (int i ...) 
{ v[i] = v[i + 32] + 5; } // alignment modulo vector_width - if (same_memory_slice(mem_ref, best_align_to_mem_ref)) { - return is_aligned_with_best; - } else { - return is_mem_ref_aligned_with_same_memory_slice(mem_ref, iv_adjustment, align_to_refs); - } } -} -// Check if alignment of mem_ref is consistent with the other packs of the same memory slice -bool SuperWord::is_mem_ref_aligned_with_same_memory_slice(MemNode* mem_ref, int iv_adjustment, - Node_List &align_to_refs) { - for (uint i = 0; i < align_to_refs.size(); i++) { - MemNode* mr = align_to_refs.at(i)->as_Mem(); - if (mr != mem_ref && - same_memory_slice(mr, mem_ref) && - memory_alignment(mr, iv_adjustment) != 0) { - // mem_ref is misaligned with mr, another ref of the same memory slice. - return false; - } + // All vectors need to be memory aligned, modulo their vector_width. This is more strict + // than the hardware probably requires. Most hardware at most requires 4-byte alignment. + // + // In the pre-loop, we align best_align_to_mem_ref to its vector_length. To ensure that + // all mem_ref's are memory aligned modulo their vector_width, we only need to check that + // they are all aligned to best_align_to_mem_ref, modulo their vector_width. For that, + // we check the following 3 conditions. + + // (1) All packs are aligned with best_align_to_mem_ref. + if (memory_alignment(mem_ref, best_iv_adjustment) != 0) { + return false; + } + // (2) All other vectors have vector_size less or equal to that of best_align_to_mem_ref. + int vw = vector_width(mem_ref); + int vw_best = vector_width(best_align_to_mem_ref); + if (vw > vw_best) { + // We only align to vector_width of best_align_to_mem_ref during pre-loop. + // A mem_ref with a larger vector_width might thus not be vector_width aligned. + return false; + } + // (3) Ensure that all vectors have the same invariant. We model memory accesses like this + // address = base + k*iv + constant [+ invar] + // memory_alignment ignores the invariant. 
+ SWPointer p2(best_align_to_mem_ref, this, nullptr, false); + if (!align_to_ref_p.invar_equals(p2)) { + // Do not vectorize memory accesses with different invariants + // if unaligned memory accesses are not allowed. + return false; } - // No misalignment found. return true; } @@ -1901,9 +1853,14 @@ void SuperWord::combine_packs() { assert(is_power_of_2(max_vlen), "sanity"); uint psize = p1->size(); if (!is_power_of_2(psize)) { - // Skip pack which can't be vector. - // case1: for(...) { a[i] = i; } elements values are different (i+x) - // case2: for(...) { a[i] = b[i+1]; } can't align both, load and store + // We currently only support power-of-2 sizes for vectors. +#ifndef PRODUCT + if (TraceSuperWord) { + tty->cr(); + tty->print_cr("WARNING: Removed pack[%d] with size that is not a power of 2:", i); + print_pack(p1); + } +#endif _packset.at_put(i, nullptr); continue; } @@ -1922,28 +1879,41 @@ void SuperWord::combine_packs() { } } - if (_do_vector_loop) { - // Since we did not enforce exact alignment of the packsets, we only know that there - // is no dependence with distance 1, because we have checked independent(s1, s2) for - // all adjacent memops. But there could be a dependence of a different distance. - // Hence: remove the pack if there is a dependence. - for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); - if (p != nullptr) { - Node* dependence = find_dependence(p); - if (dependence != nullptr) { + // We know that the nodes in a pair pack were independent - this gives us independence + // at distance 1. But now that we may have more than 2 nodes in a pack, we need to check + // if they are all mutually independent. If there is a dependence we remove the pack. + // This is better than giving up completely - we can have partial vectorization if some + // are rejected and others still accepted. + // + // Examples with dependence at distance 1 (pack pairs are not created): + // for (int i ...) 
{ v[i + 1] = v[i] + 5; } + // for (int i ...) { v[i] = v[i - 1] + 5; } + // + // Example with independence at distance 1, but dependence at distance 2 (pack pairs are + // created and we need to filter them out now): + // for (int i ...) { v[i + 2] = v[i] + 5; } + // for (int i ...) { v[i] = v[i - 2] + 5; } + // + // Note: dependencies are created when a later load may reference the same memory location + // as an earlier store. This happens in "read backward" or "store forward" cases. On the + // other hand, "read forward" or "store backward" cases do not have such dependencies: + // for (int i ...) { v[i] = v[i + 1] + 5; } + // for (int i ...) { v[i - 1] = v[i] + 5; } + for (int i = 0; i < _packset.length(); i++) { + Node_List* p = _packset.at(i); + if (p != nullptr) { + Node* dependence = find_dependence(p); + if (dependence != nullptr) { #ifndef PRODUCT - if (TraceSuperWord) { - tty->cr(); - tty->print_cr("WARNING: Found dependency."); - tty->print_cr("Cannot vectorize despite compile directive Vectorize."); - dependence->dump(); - tty->print_cr("In pack[%d]", i); - print_pack(p); - } -#endif - _packset.at_put(i, nullptr); + if (TraceSuperWord) { + tty->cr(); + tty->print_cr("WARNING: Found dependency at distance greater than 1."); + dependence->dump(); + tty->print_cr("In pack[%d]", i); + print_pack(p); } +#endif + _packset.at_put(i, nullptr); } } } @@ -3757,7 +3727,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) { int off_mod = off_rem >= 0 ? 
off_rem : off_rem + vw; #ifndef PRODUCT if ((TraceSuperWord && Verbose) || is_trace_alignment()) { - tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d", off_rem, off_mod); + tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d (offset = %d)", off_rem, off_mod, offset); } #endif return off_mod; diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 6e2689b19ad..2fcc169f8af 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -513,15 +513,11 @@ private: void find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment); void print_loop(bool whole); #endif - // Check if we can create the pack pairs for mem_ref: - // If required, enforce strict alignment requirements of hardware. - // Else, only enforce alignment within a memory slice, so that there cannot be any - // memory-dependence between different vector "lanes". - bool can_create_pairs(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p, - MemNode* best_align_to_mem_ref, int best_iv_adjustment, - Node_List &align_to_refs); - // Check if alignment of mem_ref is consistent with the other packs of the same memory slice. - bool is_mem_ref_aligned_with_same_memory_slice(MemNode* mem_ref, int iv_adjustment, Node_List &align_to_refs); + // If strict memory alignment is required (vectors_should_be_aligned), then check if + // mem_ref is aligned with best_align_to_mem_ref. + bool mem_ref_has_no_alignment_violation(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p, + MemNode* best_align_to_mem_ref, int best_iv_adjustment, + Node_List &align_to_refs); // Find a memory reference to align the loop induction variable to. MemNode* find_align_to_ref(Node_List &memops, int &idx); // Calculate loop's iv adjustment for this memory ops. 
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java index 9130571a406..14033f9712a 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java @@ -25,10 +25,9 @@ * Summary: * Test SuperWord vectorization with different access offsets * and various MaxVectorSize values, and +- AlignVector. - * Note: CompileCommand Option Vectorize is enabled. * * Note: this test is auto-generated. Please modify / generate with script: - * https://bugs.openjdk.org/browse/JDK-8298935 + * https://bugs.openjdk.org/browse/JDK-8308606 * * Types: int, long, short, char, byte, float, double * Offsets: 0, -1, 1, -2, 2, -3, 3, -4, 4, -7, 7, -8, 8, -14, 14, -16, 16, -18, 18, -20, 20, -31, 31, -32, 32, -63, 63, -64, 64, -65, 65, -128, 128, -129, 129, -192, 192 @@ -91,7 +90,7 @@ /* * @test id=vanilla-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @library /test/lib / @@ -100,7 +99,7 @@ /* * @test id=vanilla-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @library /test/lib / @@ -109,7 +108,7 @@ /* * @test id=sse4-v016-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -120,7 +119,7 @@ /* * @test id=sse4-v016-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. 
* @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -131,7 +130,7 @@ /* * @test id=sse4-v008-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -142,7 +141,7 @@ /* * @test id=sse4-v008-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -153,7 +152,7 @@ /* * @test id=sse4-v004-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -164,7 +163,7 @@ /* * @test id=sse4-v004-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -175,7 +174,7 @@ /* * @test id=sse4-v002-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -186,7 +185,7 @@ /* * @test id=sse4-v002-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -197,7 +196,7 @@ /* * @test id=avx1-v032-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. 
* @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -208,7 +207,7 @@ /* * @test id=avx1-v032-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -219,7 +218,7 @@ /* * @test id=avx1-v016-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -230,7 +229,7 @@ /* * @test id=avx1-v016-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -241,7 +240,7 @@ /* * @test id=avx2-v032-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -252,7 +251,7 @@ /* * @test id=avx2-v032-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -263,7 +262,7 @@ /* * @test id=avx2-v016-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -274,7 +273,7 @@ /* * @test id=avx2-v016-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. 
* @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -285,7 +284,7 @@ /* * @test id=avx512-v064-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -296,7 +295,7 @@ /* * @test id=avx512-v064-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -307,7 +306,7 @@ /* * @test id=avx512-v032-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -318,7 +317,7 @@ /* * @test id=avx512-v032-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -329,7 +328,7 @@ /* * @test id=avx512bw-v064-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -340,7 +339,7 @@ /* * @test id=avx512bw-v064-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -351,7 +350,7 @@ /* * @test id=avx512bw-v032-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. 
* @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -362,7 +361,7 @@ /* * @test id=avx512bw-v032-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64") @@ -373,7 +372,7 @@ /* * @test id=vec-v064-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -383,7 +382,7 @@ /* * @test id=vec-v064-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -393,7 +392,7 @@ /* * @test id=vec-v032-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -403,7 +402,7 @@ /* * @test id=vec-v032-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -413,7 +412,7 @@ /* * @test id=vec-v016-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -423,7 +422,7 @@ /* * @test id=vec-v016-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. 
* @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -433,7 +432,7 @@ /* * @test id=vec-v008-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -443,7 +442,7 @@ /* * @test id=vec-v008-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -453,7 +452,7 @@ /* * @test id=vec-v004-A - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. * @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -463,7 +462,7 @@ /* * @test id=vec-v004-U - * @bug 8298935 + * @bug 8298935 8308606 * @summary Test SuperWord: vector size, offsets, dependencies, alignment. 
* @requires vm.compiler2.enabled * @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64") @@ -1262,7 +1261,6 @@ public class TestDependencyOffsets { public static void main(String args[]) { TestFramework framework = new TestFramework(TestDependencyOffsets.class); framework.addFlags("-XX:-TieredCompilation", - "-XX:CompileCommand=option,compiler.loopopts.superword.TestDependencyOffsets::test*,Vectorize", "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestDependencyOffsets::init", "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestDependencyOffsets::test*", "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestDependencyOffsets::verify", diff --git a/test/hotspot/jtreg/compiler/vectorization/runner/LoopArrayIndexComputeTest.java b/test/hotspot/jtreg/compiler/vectorization/runner/LoopArrayIndexComputeTest.java index e09c58d8dbd..cc35213e5db 100644 --- a/test/hotspot/jtreg/compiler/vectorization/runner/LoopArrayIndexComputeTest.java +++ b/test/hotspot/jtreg/compiler/vectorization/runner/LoopArrayIndexComputeTest.java @@ -161,9 +161,11 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner { } @Test - // Note that this case cannot be vectorized due to data dependence. - @IR(failOn = {IRNode.STORE_VECTOR}) - public int[] indexWithDifferentConstants() { + // No true dependency in read-forward case. + @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"AlignVector", "false"}, + counts = {IRNode.STORE_VECTOR, ">0"}) + public int[] indexWithDifferentConstantsPos() { int[] res = new int[SIZE]; for (int i = 0; i < SIZE / 4; i++) { res[i] = ints[i + 1]; @@ -171,6 +173,17 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner { return res; } + @Test + // Note that this case cannot be vectorized due to data dependence. 
+ @IR(failOn = {IRNode.STORE_VECTOR}) + public int[] indexWithDifferentConstantsNeg() { + int[] res = new int[SIZE]; + for (int i = 1; i < SIZE / 4; i++) { + res[i] = ints[i - 1]; + } + return res; + } + @Test // Note that this case cannot be vectorized due to data dependence. @IR(failOn = {IRNode.STORE_VECTOR}) @@ -246,10 +259,13 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner { } // ---------------- Subword Type Arrays ---------------- + @Test - // Note that this case cannot be vectorized due to data dependence. - @IR(failOn = {IRNode.STORE_VECTOR}) - public short[] shortArrayWithDependence() { + // No true dependency in read-forward case. + @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"AlignVector", "false"}, + counts = {IRNode.STORE_VECTOR, ">0"}) + public short[] shortArrayWithDependencePos() { short[] res = new short[SIZE]; System.arraycopy(shorts, 0, res, 0, SIZE); for (int i = 0; i < SIZE / 2; i++) { @@ -261,7 +277,21 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner { @Test // Note that this case cannot be vectorized due to data dependence. @IR(failOn = {IRNode.STORE_VECTOR}) - public char[] charArrayWithDependence() { + public short[] shortArrayWithDependenceNeg() { + short[] res = new short[SIZE]; + System.arraycopy(shorts, 0, res, 0, SIZE); + for (int i = 1; i < SIZE / 2; i++) { + res[i] *= shorts[i - 1]; + } + return res; + } + + @Test + // No true dependency in read-forward case. + @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"AlignVector", "false"}, + counts = {IRNode.STORE_VECTOR, ">0"}) + public char[] charArrayWithDependencePos() { char[] res = new char[SIZE]; System.arraycopy(chars, 0, res, 0, SIZE); for (int i = 0; i < SIZE / 2; i++) { @@ -273,7 +303,21 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner { @Test // Note that this case cannot be vectorized due to data dependence. 
@IR(failOn = {IRNode.STORE_VECTOR}) - public byte[] byteArrayWithDependence() { + public char[] charArrayWithDependenceNeg() { + char[] res = new char[SIZE]; + System.arraycopy(chars, 0, res, 0, SIZE); + for (int i = 2; i < SIZE / 2; i++) { + res[i] *= chars[i - 2]; + } + return res; + } + + @Test + // No true dependency in read-forward case. + @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"AlignVector", "false"}, + counts = {IRNode.STORE_VECTOR, ">0"}) + public byte[] byteArrayWithDependencePos() { byte[] res = new byte[SIZE]; System.arraycopy(bytes, 0, res, 0, SIZE); for (int i = 0; i < SIZE / 2; i++) { @@ -282,10 +326,25 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner { return res; } + @Test // Note that this case cannot be vectorized due to data dependence. @IR(failOn = {IRNode.STORE_VECTOR}) - public boolean[] booleanArrayWithDependence() { + public byte[] byteArrayWithDependenceNeg() { + byte[] res = new byte[SIZE]; + System.arraycopy(bytes, 0, res, 0, SIZE); + for (int i = 3; i < SIZE / 2; i++) { + res[i] *= bytes[i - 3]; + } + return res; + } + + @Test + // No true dependency in read-forward case. + @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + applyIf = {"AlignVector", "false"}, + counts = {IRNode.STORE_VECTOR, ">0"}) + public boolean[] booleanArrayWithDependencePos() { boolean[] res = new boolean[SIZE]; System.arraycopy(booleans, 0, res, 0, SIZE); for (int i = 0; i < SIZE / 2; i++) { @@ -294,6 +353,18 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner { return res; } + @Test + // Note that this case cannot be vectorized due to data dependence. 
+ @IR(failOn = {IRNode.STORE_VECTOR}) + public boolean[] booleanArrayWithDependenceNeg() { + boolean[] res = new boolean[SIZE]; + System.arraycopy(booleans, 0, res, 0, SIZE); + for (int i = 4; i < SIZE / 2; i++) { + res[i] |= booleans[i - 4]; + } + return res; + } + // ---------------- Multiple Operations ---------------- @Test @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorAlignment.java b/test/micro/org/openjdk/bench/vm/compiler/VectorAlignment.java new file mode 100644 index 00000000000..7fff4952c8e --- /dev/null +++ b/test/micro/org/openjdk/bench/vm/compiler/VectorAlignment.java @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+
+/**
+ * JMH micro-benchmark made of simple loops over int, long, short, char, byte,
+ * float and double arrays. The loops load and store either at the same index
+ * ("control"), at indices shifted by a constant ("misaligned"), or are unrolled
+ * by hand.
+ *
+ * <p>The class is abstract: the nested static subclasses at the end of the file
+ * inherit every benchmark method and differ only in the JVM flags given to the
+ * forked VM (UseSuperWord on/off, AlignVector, the Vectorize compile option),
+ * so the same loops can be compared across those configurations.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 1, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 1, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 1)
+public abstract class VectorAlignment {
+    // Length of every array; also the upper bound used by the benchmark loops.
+    @Param({/*"512", "1024", */ "2048"})
+    public int COUNT;
+
+    // Naming: a* and b* are inputs filled with random data in init(), r* receives
+    // the results. The suffix is the element type (I=int, L=long, S=short, C=char,
+    // B=byte, F=float, D=double).
+    private int[] aI;
+    private int[] bI;
+    private int[] rI;
+
+    private long[] aL;
+    private long[] bL;
+    private long[] rL;
+
+    private short[] aS;
+    private short[] bS;
+    private short[] rS;
+
+    private char[] aC;
+    private char[] bC;
+    private char[] rC;
+
+    private byte[] aB;
+    private byte[] bB;
+    private byte[] rB;
+
+    private float[] aF;
+    private float[] bF;
+    private float[] rF;
+
+    private double[] aD;
+    private double[] bD;
+    private double[] rD;
+
+    // Seed for the random input data. JMH assigns @Param fields only after the
+    // state object has been constructed, so the generator must not be created in
+    // a field initializer: it would always be seeded with the field's default
+    // value 0 and any seed given on the command line would be ignored.
+    @Param("0")
+    private int seed;
+    private Random r;
+
+    /**
+     * Allocates all arrays with COUNT elements and fills the a* and b* input
+     * arrays with pseudo-random values drawn from a generator seeded with
+     * {@code seed}.
+     */
+    @Setup
+    public void init() {
+        // Create the generator here, where the injected seed value is visible.
+        r = new Random(seed);
+
+        aI = new int[COUNT];
+        bI = new int[COUNT];
+        rI = new int[COUNT];
+
+        aL = new long[COUNT];
+        bL = new long[COUNT];
+        rL = new long[COUNT];
+
+        aS = new short[COUNT];
+        bS = new short[COUNT];
+        rS = new short[COUNT];
+
+        aC = new char[COUNT];
+        bC = new char[COUNT];
+        rC = new char[COUNT];
+
+        aB = new byte[COUNT];
+        bB = new byte[COUNT];
+        rB = new byte[COUNT];
+
+        aF = new float[COUNT];
+        bF = new float[COUNT];
+        rF = new float[COUNT];
+
+        aD = new double[COUNT];
+        bD = new double[COUNT];
+        rD = new double[COUNT];
+
+        // The order of the calls below determines which random value lands in
+        // which array element; keep it stable so runs with equal seeds match.
+        for (int i = 0; i < COUNT; i++) {
+            aI[i] = r.nextInt();
+            bI[i] = r.nextInt();
+
+            aL[i] = r.nextLong();
+            bL[i] = r.nextLong();
+
+            aS[i] = (short) r.nextInt();
+            bS[i] = (short) r.nextInt();
+
+            aC[i] = (char) r.nextInt();
+            bC[i] = (char) r.nextInt();
+
+            aB[i] = (byte) r.nextInt();
+            bB[i] = (byte) r.nextInt();
+
+            aF[i] = r.nextFloat();
+            bF[i] = r.nextFloat();
+
+            aD[i] = r.nextDouble();
+            bD[i] = r.nextDouble();
+        }
+    }
+
+    @Benchmark
+    // Control: should always vectorize with SuperWord
+    public void bench000I_control() {
+        for (int i = 0; i < COUNT; i++) {
+            // Have multiple MUL operations to make loop compute bound (more compute than load/store)
+            rI[i] = aI[i] * aI[i] * aI[i] * aI[i];
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench000I_control, on long arrays
+    public void bench000L_control() {
+        for (int i = 0; i < COUNT; i++) {
+            rL[i] = aL[i] * aL[i] * aL[i] * aL[i];
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench000I_control, on short arrays
+    public void bench000S_control() {
+        for (int i = 0; i < COUNT; i++) {
+            rS[i] = (short)(aS[i] * aS[i] * aS[i] * aS[i]);
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench000I_control, on char arrays
+    public void bench000C_control() {
+        for (int i = 0; i < COUNT; i++) {
+            rC[i] = (char)(aC[i] * aC[i] * aC[i] * aC[i]);
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench000I_control, on byte arrays
+    public void bench000B_control() {
+        for (int i = 0; i < COUNT; i++) {
+            rB[i] = (byte)(aB[i] * aB[i] * aB[i] * aB[i]);
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench000I_control, on float arrays
+    public void bench000F_control() {
+        for (int i = 0; i < COUNT; i++) {
+            rF[i] = aF[i] * aF[i] * aF[i] * aF[i];
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench000I_control, on double arrays
+    public void bench000D_control() {
+        for (int i = 0; i < COUNT; i++) {
+            rD[i] = aD[i] * aD[i] * aD[i] * aD[i];
+        }
+    }
+
+    @Benchmark
+    // Control: should always vectorize with SuperWord
+    public void bench001_control() {
+        for (int i = 0; i < COUNT; i++) {
+            // Have multiple MUL operations to make loop compute bound (more compute than load/store)
+            rI[i] = aI[i] * aI[i] * aI[i] * aI[i] + bI[i];
+        }
+    }
+
+    @Benchmark
+    // Vectorizes without AlignVector
+    // (the load index i+1 is shifted by one element relative to the store index i)
+    public void bench100I_misaligned_load() {
+        for (int i = 0; i < COUNT-1; i++) {
+            rI[i] = aI[i+1] * aI[i+1] * aI[i+1] * aI[i+1];
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench100I_misaligned_load, on long arrays
+    public void bench100L_misaligned_load() {
+        for (int i = 0; i < COUNT-1; i++) {
+            rL[i] = aL[i+1] * aL[i+1] * aL[i+1] * aL[i+1];
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench100I_misaligned_load, on short arrays
+    public void bench100S_misaligned_load() {
+        for (int i = 0; i < COUNT-1; i++) {
+            rS[i] = (short)(aS[i+1] * aS[i+1] * aS[i+1] * aS[i+1]);
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench100I_misaligned_load, on char arrays
+    public void bench100C_misaligned_load() {
+        for (int i = 0; i < COUNT-1; i++) {
+            rC[i] = (char)(aC[i+1] * aC[i+1] * aC[i+1] * aC[i+1]);
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench100I_misaligned_load, on byte arrays
+    public void bench100B_misaligned_load() {
+        for (int i = 0; i < COUNT-1; i++) {
+            rB[i] = (byte)(aB[i+1] * aB[i+1] * aB[i+1] * aB[i+1]);
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench100I_misaligned_load, on float arrays
+    public void bench100F_misaligned_load() {
+        for (int i = 0; i < COUNT-1; i++) {
+            rF[i] = aF[i+1] * aF[i+1] * aF[i+1] * aF[i+1];
+        }
+    }
+
+    @Benchmark
+    // Same loop as bench100I_misaligned_load, on double arrays
+    public void bench100D_misaligned_load() {
+        for (int i = 0; i < COUNT-1; i++) {
+            rD[i] = aD[i+1] * aD[i+1] * aD[i+1] * aD[i+1];
+        }
+    }
+
+    @Benchmark
+    // Only without "Vectorize" (confused by hand-unrolling)
+    public void bench200_hand_unrolled_aligned() {
+        for (int i = 0; i < COUNT-10; i+=2) {
+            rI[i+0] = aI[i+0] * aI[i+0] * aI[i+0] * aI[i+0];
+            rI[i+1] = aI[i+1] * aI[i+1] * aI[i+1] * aI[i+1];
+        }
+    }
+
+    @Benchmark
+    // Only with "Vectorize", without we get issues with modulo computation of alignment for bI
+    public void bench300_multiple_misaligned_loads() {
+        for (int i = 0; i < COUNT-10; i++) {
+            rI[i] = aI[i] * aI[i] * aI[i] * aI[i] + bI[i+1];
+        }
+    }
+
+    @Benchmark
+    // Only with "Vectorize", without we may confuse aI[5] with aI[4+1] and pack loads in wrong pack
+    public void bench301_multiple_misaligned_loads() {
+        for (int i = 0; i < COUNT-10; i++) {
+            rI[i] = aI[i] * aI[i] * aI[i] * aI[i] + aI[i+1];
+        }
+    }
+
+    @Benchmark
+    // Only with "Vectorize", without we get mix of aI[i] and aI[i-2]
+    public void bench302_multiple_misaligned_loads_and_stores() {
+        for (int i = 2; i < COUNT; i++) {
+            rI[i - 2] = aI[i-2] * aI[i-2] * aI[i-2] * aI[i-2]; // can do this for all iterations
+            rI[i] = aI[i] + 3;                                 // before doing this second line
+        }
+    }
+
+    @Benchmark
+    // Currently does not vectorize:
+    //   hand-unrolled confuses Vectorize -> adjacent loads not from same original node (not even same line)
+    //   multiple unaligned loads confuses non-Vectorize: aI[5+1] confused with aI[4+2] (plus modulo alignment issue)
+    public void bench400_hand_unrolled_misaligned() {
+        for (int i = 0; i < COUNT-10; i+=2) {
+            rI[i+0] = aI[i+1] * aI[i+1] * aI[i+1] * aI[i+1] + aI[i];
+            rI[i+1] = aI[i+2] * aI[i+2] * aI[i+2] * aI[i+2] + aI[i+1];
+        }
+    }
+
+    @Benchmark
+    // Currently does not vectorize:
+    //   hand-unrolled confuses Vectorize -> adjacent loads not from same original node (not even same line)
+    //   non-Vectorize: plus modulo alignment issue
+    public void bench401_hand_unrolled_misaligned() {
+        for (int i = 0; i < COUNT-10; i+=2) {
+            rI[i+0] = aI[i+1] * aI[i+1] * aI[i+1] * aI[i+1] + bI[i];
+            rI[i+1] = aI[i+2] * aI[i+2] * aI[i+2] * aI[i+2] + bI[i+1];
+        }
+    }
+
+    // Concrete variants: each one runs all benchmarks above in a VM forked with
+    // the flags listed in its @Fork annotation.
+
+    @Fork(value = 1, jvmArgsPrepend = {
+        "-XX:+UseSuperWord", "-XX:CompileCommand=Option,*::*,Vectorize"
+    })
+    public static class VectorAlignmentSuperWordWithVectorize extends VectorAlignment {}
+
+    @Fork(value = 1, jvmArgsPrepend = {
+        "-XX:+UseSuperWord", "-XX:+AlignVector"
+    })
+    public static class VectorAlignmentSuperWordAlignVector extends VectorAlignment {}
+
+    @Fork(value = 1, jvmArgsPrepend = {
+        "-XX:+UseSuperWord"
+    })
+    public static class VectorAlignmentSuperWord extends VectorAlignment {}
+
+    @Fork(value = 1, jvmArgsPrepend = {
+        "-XX:-UseSuperWord"
+    })
+    public static class VectorAlignmentNoSuperWord extends VectorAlignment {}
+}