8308606: C2 SuperWord: remove alignment checks when not required

Reviewed-by: fgao, kvn, pli
This commit is contained in:
Emanuel Peter 2023-06-21 06:40:50 +00:00
parent 47d00a4cbe
commit 886ac1c261
5 changed files with 517 additions and 167 deletions

View File

@ -724,9 +724,9 @@ void SuperWord::find_adjacent_refs() {
}
}
if (can_create_pairs(mem_ref, iv_adjustment, align_to_ref_p,
best_align_to_mem_ref, best_iv_adjustment,
align_to_refs)) {
if (mem_ref_has_no_alignment_violation(mem_ref, iv_adjustment, align_to_ref_p,
best_align_to_mem_ref, best_iv_adjustment,
align_to_refs)) {
// Create initial pack pairs of memory operations for which alignment was set.
for (uint i = 0; i < memops.size(); i++) {
Node* s1 = memops.at(i);
@ -836,93 +836,45 @@ void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best
}
#endif
// Check if we can create the pack pairs for mem_ref:
// If required, enforce strict alignment requirements of hardware.
// Else, only enforce alignment within a memory slice, so that there cannot be any
// memory-dependence between different vector "lanes".
bool SuperWord::can_create_pairs(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p,
MemNode* best_align_to_mem_ref, int best_iv_adjustment,
Node_List &align_to_refs) {
bool is_aligned_with_best = memory_alignment(mem_ref, best_iv_adjustment) == 0;
if (vectors_should_be_aligned()) {
// All vectors need to be memory aligned, modulo their vector_width. This is more strict
// than the hardware probably requires. Most hardware at most requires 4-byte alignment.
//
// In the pre-loop, we align best_align_to_mem_ref to its vector_length. To ensure that
// all mem_ref's are memory aligned modulo their vector_width, we only need to check that
// they are all aligned to best_align_to_mem_ref, modulo their vector_width. For that,
// we check the following 3 conditions.
// (1) All packs are aligned with best_align_to_mem_ref.
if (!is_aligned_with_best) {
return false;
}
// (2) All other vectors have vector_size less or equal to that of best_align_to_mem_ref.
int vw = vector_width(mem_ref);
int vw_best = vector_width(best_align_to_mem_ref);
if (vw > vw_best) {
// We only align to vector_width of best_align_to_mem_ref during pre-loop.
// A mem_ref with a larger vector_width might thus not be vector_width aligned.
return false;
}
// (3) Ensure that all vectors have the same invariant. We model memory accesses like this
// address = base + k*iv + constant [+ invar]
// memory_alignment ignores the invariant.
SWPointer p2(best_align_to_mem_ref, this, nullptr, false);
if (!align_to_ref_p.invar_equals(p2)) {
// Do not vectorize memory accesses with different invariants
// if unaligned memory accesses are not allowed.
return false;
}
// If strict memory alignment is required (vectors_should_be_aligned), then check if
// mem_ref is aligned with best_align_to_mem_ref.
bool SuperWord::mem_ref_has_no_alignment_violation(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p,
MemNode* best_align_to_mem_ref, int best_iv_adjustment,
Node_List &align_to_refs) {
if (!vectors_should_be_aligned()) {
// Alignment is not required by the hardware. No violation possible.
return true;
} else {
// Alignment is not required by the hardware.
// However, we need to ensure that the pack for mem_ref is independent, i.e. all members
// of the pack are mutually independent.
if (_do_vector_loop) {
// Wait until combine_packs to check independence of packs. For now we just know that
// the adjacent pairs are independent. This allows us to vectorize when we do not have
// alignment modulo vector_width. For example (forward read):
// for (int i ...) { v[i] = v[i + 1] + 5; }
// The following will be filtered out in combine_packs (forward write):
// for (int i ...) { v[i + 1] = v[i] + 5; }
return true;
}
// If all mem_ref's are modulo vector_width aligned with all other mem_ref's of their
// memory slice, then the VectorLoad / VectorStore regions are either exactly overlapping
// or completely non-overlapping. This ensures that there cannot be memory-dependencies
// between different vector "lanes".
// During SuperWord::filter_packs -> SuperWord::profitable -> SuperWord::is_vector_use,
// we check that all inputs are vectors that match on every element (with some reasonable
// exceptions). This ensures that every "lane" is isomorpic and independent to all other
// "lanes". This allows us to vectorize these cases:
// for (int i ...) { v[i] = v[i] + 5; } // same alignment
// for (int i ...) { v[i] = v[i + 32] + 5; } // alignment modulo vector_width
if (same_memory_slice(mem_ref, best_align_to_mem_ref)) {
return is_aligned_with_best;
} else {
return is_mem_ref_aligned_with_same_memory_slice(mem_ref, iv_adjustment, align_to_refs);
}
}
}
// Check if alignment of mem_ref is consistent with the other packs of the same memory slice
bool SuperWord::is_mem_ref_aligned_with_same_memory_slice(MemNode* mem_ref, int iv_adjustment,
Node_List &align_to_refs) {
for (uint i = 0; i < align_to_refs.size(); i++) {
MemNode* mr = align_to_refs.at(i)->as_Mem();
if (mr != mem_ref &&
same_memory_slice(mr, mem_ref) &&
memory_alignment(mr, iv_adjustment) != 0) {
// mem_ref is misaligned with mr, another ref of the same memory slice.
return false;
}
// All vectors need to be memory aligned, modulo their vector_width. This is more strict
// than the hardware probably requires. Most hardware at most requires 4-byte alignment.
//
// In the pre-loop, we align best_align_to_mem_ref to its vector_length. To ensure that
// all mem_ref's are memory aligned modulo their vector_width, we only need to check that
// they are all aligned to best_align_to_mem_ref, modulo their vector_width. For that,
// we check the following 3 conditions.
// (1) All packs are aligned with best_align_to_mem_ref.
if (memory_alignment(mem_ref, best_iv_adjustment) != 0) {
return false;
}
// (2) All other vectors have vector_size less or equal to that of best_align_to_mem_ref.
int vw = vector_width(mem_ref);
int vw_best = vector_width(best_align_to_mem_ref);
if (vw > vw_best) {
// We only align to vector_width of best_align_to_mem_ref during pre-loop.
// A mem_ref with a larger vector_width might thus not be vector_width aligned.
return false;
}
// (3) Ensure that all vectors have the same invariant. We model memory accesses like this
// address = base + k*iv + constant [+ invar]
// memory_alignment ignores the invariant.
SWPointer p2(best_align_to_mem_ref, this, nullptr, false);
if (!align_to_ref_p.invar_equals(p2)) {
// Do not vectorize memory accesses with different invariants
// if unaligned memory accesses are not allowed.
return false;
}
// No misalignment found.
return true;
}
@ -1901,9 +1853,14 @@ void SuperWord::combine_packs() {
assert(is_power_of_2(max_vlen), "sanity");
uint psize = p1->size();
if (!is_power_of_2(psize)) {
// Skip pack which can't be vector.
// case1: for(...) { a[i] = i; } elements values are different (i+x)
// case2: for(...) { a[i] = b[i+1]; } can't align both, load and store
// We currently only support power-of-2 sizes for vectors.
#ifndef PRODUCT
if (TraceSuperWord) {
tty->cr();
tty->print_cr("WARNING: Removed pack[%d] with size that is not a power of 2:", i);
print_pack(p1);
}
#endif
_packset.at_put(i, nullptr);
continue;
}
@ -1922,28 +1879,41 @@ void SuperWord::combine_packs() {
}
}
if (_do_vector_loop) {
// Since we did not enforce exact alignment of the packsets, we only know that there
// is no dependence with distance 1, because we have checked independent(s1, s2) for
// all adjacent memops. But there could be a dependence of a different distance.
// Hence: remove the pack if there is a dependence.
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
if (p != nullptr) {
Node* dependence = find_dependence(p);
if (dependence != nullptr) {
// We know that the nodes in a pair pack were independent - this gives us independence
// at distance 1. But now that we may have more than 2 nodes in a pack, we need to check
// if they are all mutually independent. If there is a dependence we remove the pack.
// This is better than giving up completely - we can have partial vectorization if some
// are rejected and others still accepted.
//
// Examples with dependence at distance 1 (pack pairs are not created):
// for (int i ...) { v[i + 1] = v[i] + 5; }
// for (int i ...) { v[i] = v[i - 1] + 5; }
//
// Example with independence at distance 1, but dependence at distance 2 (pack pairs are
// created and we need to filter them out now):
// for (int i ...) { v[i + 2] = v[i] + 5; }
// for (int i ...) { v[i] = v[i - 2] + 5; }
//
// Note: dependencies are created when a later load may reference the same memory location
// as an earlier store. This happens in "read backward" or "store forward" cases. On the
// other hand, "read forward" or "store backward" cases do not have such dependencies:
// for (int i ...) { v[i] = v[i + 1] + 5; }
// for (int i ...) { v[i - 1] = v[i] + 5; }
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
if (p != nullptr) {
Node* dependence = find_dependence(p);
if (dependence != nullptr) {
#ifndef PRODUCT
if (TraceSuperWord) {
tty->cr();
tty->print_cr("WARNING: Found dependency.");
tty->print_cr("Cannot vectorize despite compile directive Vectorize.");
dependence->dump();
tty->print_cr("In pack[%d]", i);
print_pack(p);
}
#endif
_packset.at_put(i, nullptr);
if (TraceSuperWord) {
tty->cr();
tty->print_cr("WARNING: Found dependency at distance greater than 1.");
dependence->dump();
tty->print_cr("In pack[%d]", i);
print_pack(p);
}
#endif
_packset.at_put(i, nullptr);
}
}
}
@ -3757,7 +3727,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
#ifndef PRODUCT
if ((TraceSuperWord && Verbose) || is_trace_alignment()) {
tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d", off_rem, off_mod);
tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d (offset = %d)", off_rem, off_mod, offset);
}
#endif
return off_mod;

View File

@ -513,15 +513,11 @@ private:
void find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment);
void print_loop(bool whole);
#endif
// Check if we can create the pack pairs for mem_ref:
// If required, enforce strict alignment requirements of hardware.
// Else, only enforce alignment within a memory slice, so that there cannot be any
// memory-dependence between different vector "lanes".
bool can_create_pairs(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p,
MemNode* best_align_to_mem_ref, int best_iv_adjustment,
Node_List &align_to_refs);
// Check if alignment of mem_ref is consistent with the other packs of the same memory slice.
bool is_mem_ref_aligned_with_same_memory_slice(MemNode* mem_ref, int iv_adjustment, Node_List &align_to_refs);
// If strict memory alignment is required (vectors_should_be_aligned), then check if
// mem_ref is aligned with best_align_to_mem_ref.
bool mem_ref_has_no_alignment_violation(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p,
MemNode* best_align_to_mem_ref, int best_iv_adjustment,
Node_List &align_to_refs);
// Find a memory reference to align the loop induction variable to.
MemNode* find_align_to_ref(Node_List &memops, int &idx);
// Calculate loop's iv adjustment for this memory ops.

View File

@ -25,10 +25,9 @@
* Summary:
* Test SuperWord vectorization with different access offsets
* and various MaxVectorSize values, and +- AlignVector.
* Note: CompileCommand Option Vectorize is enabled.
*
* Note: this test is auto-generated. Please modify / generate with script:
* https://bugs.openjdk.org/browse/JDK-8298935
* https://bugs.openjdk.org/browse/JDK-8308606
*
* Types: int, long, short, char, byte, float, double
* Offsets: 0, -1, 1, -2, 2, -3, 3, -4, 4, -7, 7, -8, 8, -14, 14, -16, 16, -18, 18, -20, 20, -31, 31, -32, 32, -63, 63, -64, 64, -65, 65, -128, 128, -129, 129, -192, 192
@ -91,7 +90,7 @@
/*
* @test id=vanilla-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @library /test/lib /
@ -100,7 +99,7 @@
/*
* @test id=vanilla-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @library /test/lib /
@ -109,7 +108,7 @@
/*
* @test id=sse4-v016-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -120,7 +119,7 @@
/*
* @test id=sse4-v016-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -131,7 +130,7 @@
/*
* @test id=sse4-v008-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -142,7 +141,7 @@
/*
* @test id=sse4-v008-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -153,7 +152,7 @@
/*
* @test id=sse4-v004-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -164,7 +163,7 @@
/*
* @test id=sse4-v004-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -175,7 +174,7 @@
/*
* @test id=sse4-v002-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -186,7 +185,7 @@
/*
* @test id=sse4-v002-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -197,7 +196,7 @@
/*
* @test id=avx1-v032-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -208,7 +207,7 @@
/*
* @test id=avx1-v032-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -219,7 +218,7 @@
/*
* @test id=avx1-v016-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -230,7 +229,7 @@
/*
* @test id=avx1-v016-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -241,7 +240,7 @@
/*
* @test id=avx2-v032-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -252,7 +251,7 @@
/*
* @test id=avx2-v032-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -263,7 +262,7 @@
/*
* @test id=avx2-v016-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -274,7 +273,7 @@
/*
* @test id=avx2-v016-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -285,7 +284,7 @@
/*
* @test id=avx512-v064-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -296,7 +295,7 @@
/*
* @test id=avx512-v064-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -307,7 +306,7 @@
/*
* @test id=avx512-v032-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -318,7 +317,7 @@
/*
* @test id=avx512-v032-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -329,7 +328,7 @@
/*
* @test id=avx512bw-v064-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -340,7 +339,7 @@
/*
* @test id=avx512bw-v064-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -351,7 +350,7 @@
/*
* @test id=avx512bw-v032-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -362,7 +361,7 @@
/*
* @test id=avx512bw-v032-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")
@ -373,7 +372,7 @@
/*
* @test id=vec-v064-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -383,7 +382,7 @@
/*
* @test id=vec-v064-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -393,7 +392,7 @@
/*
* @test id=vec-v032-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -403,7 +402,7 @@
/*
* @test id=vec-v032-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -413,7 +412,7 @@
/*
* @test id=vec-v016-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -423,7 +422,7 @@
/*
* @test id=vec-v016-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -433,7 +432,7 @@
/*
* @test id=vec-v008-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -443,7 +442,7 @@
/*
* @test id=vec-v008-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -453,7 +452,7 @@
/*
* @test id=vec-v004-A
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -463,7 +462,7 @@
/*
* @test id=vec-v004-U
* @bug 8298935
* @bug 8298935 8308606
* @summary Test SuperWord: vector size, offsets, dependencies, alignment.
* @requires vm.compiler2.enabled
* @requires (os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")
@ -1262,7 +1261,6 @@ public class TestDependencyOffsets {
public static void main(String args[]) {
TestFramework framework = new TestFramework(TestDependencyOffsets.class);
framework.addFlags("-XX:-TieredCompilation",
"-XX:CompileCommand=option,compiler.loopopts.superword.TestDependencyOffsets::test*,Vectorize",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestDependencyOffsets::init",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestDependencyOffsets::test*",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestDependencyOffsets::verify",

View File

@ -161,9 +161,11 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner {
}
@Test
// Note that this case cannot be vectorized due to data dependence.
@IR(failOn = {IRNode.STORE_VECTOR})
public int[] indexWithDifferentConstants() {
// No true dependency in read-forward case.
@IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
applyIf = {"AlignVector", "false"},
counts = {IRNode.STORE_VECTOR, ">0"})
public int[] indexWithDifferentConstantsPos() {
int[] res = new int[SIZE];
for (int i = 0; i < SIZE / 4; i++) {
res[i] = ints[i + 1];
@ -171,6 +173,17 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner {
return res;
}
@Test
// Note that this case cannot be vectorized due to data dependence.
@IR(failOn = {IRNode.STORE_VECTOR})
public int[] indexWithDifferentConstantsNeg() {
int[] res = new int[SIZE];
for (int i = 1; i < SIZE / 4; i++) {
res[i] = ints[i - 1];
}
return res;
}
@Test
// Note that this case cannot be vectorized due to data dependence.
@IR(failOn = {IRNode.STORE_VECTOR})
@ -246,10 +259,13 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner {
}
// ---------------- Subword Type Arrays ----------------
@Test
// Note that this case cannot be vectorized due to data dependence.
@IR(failOn = {IRNode.STORE_VECTOR})
public short[] shortArrayWithDependence() {
// No true dependency in read-forward case.
@IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
applyIf = {"AlignVector", "false"},
counts = {IRNode.STORE_VECTOR, ">0"})
public short[] shortArrayWithDependencePos() {
short[] res = new short[SIZE];
System.arraycopy(shorts, 0, res, 0, SIZE);
for (int i = 0; i < SIZE / 2; i++) {
@ -261,7 +277,21 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner {
@Test
// Note that this case cannot be vectorized due to data dependence.
@IR(failOn = {IRNode.STORE_VECTOR})
public char[] charArrayWithDependence() {
public short[] shortArrayWithDependenceNeg() {
short[] res = new short[SIZE];
System.arraycopy(shorts, 0, res, 0, SIZE);
for (int i = 1; i < SIZE / 2; i++) {
res[i] *= shorts[i - 1];
}
return res;
}
@Test
// No true dependency in read-forward case.
@IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
applyIf = {"AlignVector", "false"},
counts = {IRNode.STORE_VECTOR, ">0"})
public char[] charArrayWithDependencePos() {
char[] res = new char[SIZE];
System.arraycopy(chars, 0, res, 0, SIZE);
for (int i = 0; i < SIZE / 2; i++) {
@ -273,7 +303,21 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner {
@Test
// Note that this case cannot be vectorized due to data dependence.
@IR(failOn = {IRNode.STORE_VECTOR})
public byte[] byteArrayWithDependence() {
public char[] charArrayWithDependenceNeg() {
char[] res = new char[SIZE];
System.arraycopy(chars, 0, res, 0, SIZE);
for (int i = 2; i < SIZE / 2; i++) {
res[i] *= chars[i - 2];
}
return res;
}
@Test
// No true dependency in read-forward case.
@IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
applyIf = {"AlignVector", "false"},
counts = {IRNode.STORE_VECTOR, ">0"})
public byte[] byteArrayWithDependencePos() {
byte[] res = new byte[SIZE];
System.arraycopy(bytes, 0, res, 0, SIZE);
for (int i = 0; i < SIZE / 2; i++) {
@ -282,10 +326,25 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner {
return res;
}
@Test
// Note that this case cannot be vectorized due to data dependence.
@IR(failOn = {IRNode.STORE_VECTOR})
public boolean[] booleanArrayWithDependence() {
public byte[] byteArrayWithDependenceNeg() {
byte[] res = new byte[SIZE];
System.arraycopy(bytes, 0, res, 0, SIZE);
for (int i = 3; i < SIZE / 2; i++) {
res[i] *= bytes[i - 3];
}
return res;
}
@Test
// No true dependency in read-forward case.
@IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
applyIf = {"AlignVector", "false"},
counts = {IRNode.STORE_VECTOR, ">0"})
public boolean[] booleanArrayWithDependencePos() {
boolean[] res = new boolean[SIZE];
System.arraycopy(booleans, 0, res, 0, SIZE);
for (int i = 0; i < SIZE / 2; i++) {
@ -294,6 +353,18 @@ public class LoopArrayIndexComputeTest extends VectorizationTestRunner {
return res;
}
@Test
// Note that this case cannot be vectorized due to data dependence.
@IR(failOn = {IRNode.STORE_VECTOR})
public boolean[] booleanArrayWithDependenceNeg() {
boolean[] res = new boolean[SIZE];
System.arraycopy(booleans, 0, res, 0, SIZE);
for (int i = 4; i < SIZE / 2; i++) {
res[i] |= booleans[i - 4];
}
return res;
}
// ---------------- Multiple Operations ----------------
@Test
@IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},

View File

@ -0,0 +1,315 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.vm.compiler;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.*;
import java.util.concurrent.TimeUnit;
import java.util.Random;
/**
 * JMH benchmarks that run simple element-wise array loops whose loads and
 * stores are either at the same index as the store ("control"), shifted by a
 * constant offset ("misaligned"), or hand-unrolled by two.
 *
 * The class is abstract; the nested static subclasses at the bottom re-run
 * every benchmark under a different set of JVM flags (SuperWord with the
 * Vectorize compile command, SuperWord with AlignVector, plain SuperWord,
 * and SuperWord disabled).
 *
 * The loop bodies are intentionally written in this exact shape; do not
 * "simplify" them, since the shape of the loop is what is being measured.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
@Warmup(iterations = 1, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 1, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(value = 1)
public abstract class VectorAlignment {
    // Number of elements in every array below.
    @Param({/*"512", "1024", */ "2048"})
    public int COUNT;

    // For each element type: two input arrays (a*, b*) and one result array (r*).
    private int[] aI;
    private int[] bI;
    private int[] rI;
    private long[] aL;
    private long[] bL;
    private long[] rL;
    private short[] aS;
    private short[] bS;
    private short[] rS;
    private char[] aC;
    private char[] bC;
    private char[] rC;
    private byte[] aB;
    private byte[] bB;
    private byte[] rB;
    private float[] aF;
    private float[] bF;
    private float[] rF;
    private double[] aD;
    private double[] bD;
    private double[] rD;

    // Seed for the random input data.
    @Param("0")
    private int seed;

    // Created in init(): JMH injects @Param values only after the state object
    // has been constructed, so a field initializer would always see seed == 0
    // and silently ignore a seed supplied on the command line.
    private Random r;

    /**
     * Allocates all arrays with COUNT elements and fills the input arrays
     * (a*, b*) with pseudo-random values derived from {@code seed}.
     * The result arrays (r*) are left zero-initialized.
     */
    @Setup
    public void init() {
        r = new Random(seed);

        aI = new int[COUNT];
        bI = new int[COUNT];
        rI = new int[COUNT];
        aL = new long[COUNT];
        bL = new long[COUNT];
        rL = new long[COUNT];
        aS = new short[COUNT];
        bS = new short[COUNT];
        rS = new short[COUNT];
        aC = new char[COUNT];
        bC = new char[COUNT];
        rC = new char[COUNT];
        aB = new byte[COUNT];
        bB = new byte[COUNT];
        rB = new byte[COUNT];
        aF = new float[COUNT];
        bF = new float[COUNT];
        rF = new float[COUNT];
        aD = new double[COUNT];
        bD = new double[COUNT];
        rD = new double[COUNT];

        // The order of the draws below determines the generated data; keep it stable.
        for (int i = 0; i < COUNT; i++) {
            aI[i] = r.nextInt();
            bI[i] = r.nextInt();
            aL[i] = r.nextLong();
            bL[i] = r.nextLong();
            aS[i] = (short) r.nextInt();
            bS[i] = (short) r.nextInt();
            aC[i] = (char) r.nextInt();
            bC[i] = (char) r.nextInt();
            aB[i] = (byte) r.nextInt();
            bB[i] = (byte) r.nextInt();
            aF[i] = r.nextFloat();
            bF[i] = r.nextFloat();
            aD[i] = r.nextDouble();
            bD[i] = r.nextDouble();
        }
    }

    // ---------------- 000/001: control, load and store at the same index ----------------

    @Benchmark
    // Control: should always vectorize with SuperWord
    public void bench000I_control() {
        for (int i = 0; i < COUNT; i++) {
            // Have multiple MUL operations to make loop compute bound (more compute than load/store)
            rI[i] = aI[i] * aI[i] * aI[i] * aI[i];
        }
    }

    @Benchmark
    public void bench000L_control() {
        for (int i = 0; i < COUNT; i++) {
            rL[i] = aL[i] * aL[i] * aL[i] * aL[i];
        }
    }

    @Benchmark
    public void bench000S_control() {
        for (int i = 0; i < COUNT; i++) {
            rS[i] = (short)(aS[i] * aS[i] * aS[i] * aS[i]);
        }
    }

    @Benchmark
    public void bench000C_control() {
        for (int i = 0; i < COUNT; i++) {
            rC[i] = (char)(aC[i] * aC[i] * aC[i] * aC[i]);
        }
    }

    @Benchmark
    public void bench000B_control() {
        for (int i = 0; i < COUNT; i++) {
            rB[i] = (byte)(aB[i] * aB[i] * aB[i] * aB[i]);
        }
    }

    @Benchmark
    public void bench000F_control() {
        for (int i = 0; i < COUNT; i++) {
            rF[i] = aF[i] * aF[i] * aF[i] * aF[i];
        }
    }

    @Benchmark
    public void bench000D_control() {
        for (int i = 0; i < COUNT; i++) {
            rD[i] = aD[i] * aD[i] * aD[i] * aD[i];
        }
    }

    @Benchmark
    // Control: should always vectorize with SuperWord
    public void bench001_control() {
        for (int i = 0; i < COUNT; i++) {
            // Have multiple MUL operations to make loop compute bound (more compute than load/store)
            rI[i] = aI[i] * aI[i] * aI[i] * aI[i] + bI[i];
        }
    }

    // ---------------- 100: load at i+1, store at i (one variant per element type) ----------------

    @Benchmark
    // Vectorizes without AlignVector
    public void bench100I_misaligned_load() {
        for (int i = 0; i < COUNT-1; i++) {
            rI[i] = aI[i+1] * aI[i+1] * aI[i+1] * aI[i+1];
        }
    }

    @Benchmark
    public void bench100L_misaligned_load() {
        for (int i = 0; i < COUNT-1; i++) {
            rL[i] = aL[i+1] * aL[i+1] * aL[i+1] * aL[i+1];
        }
    }

    @Benchmark
    public void bench100S_misaligned_load() {
        for (int i = 0; i < COUNT-1; i++) {
            rS[i] = (short)(aS[i+1] * aS[i+1] * aS[i+1] * aS[i+1]);
        }
    }

    @Benchmark
    public void bench100C_misaligned_load() {
        for (int i = 0; i < COUNT-1; i++) {
            rC[i] = (char)(aC[i+1] * aC[i+1] * aC[i+1] * aC[i+1]);
        }
    }

    @Benchmark
    public void bench100B_misaligned_load() {
        for (int i = 0; i < COUNT-1; i++) {
            rB[i] = (byte)(aB[i+1] * aB[i+1] * aB[i+1] * aB[i+1]);
        }
    }

    @Benchmark
    public void bench100F_misaligned_load() {
        for (int i = 0; i < COUNT-1; i++) {
            rF[i] = aF[i+1] * aF[i+1] * aF[i+1] * aF[i+1];
        }
    }

    @Benchmark
    public void bench100D_misaligned_load() {
        for (int i = 0; i < COUNT-1; i++) {
            rD[i] = aD[i+1] * aD[i+1] * aD[i+1] * aD[i+1];
        }
    }

    // ---------------- 200: hand-unrolled by two, same index for load and store ----------------

    @Benchmark
    // Only without "Vectorize" (confused by hand-unrolling)
    public void bench200_hand_unrolled_aligned() {
        for (int i = 0; i < COUNT-10; i+=2) {
            rI[i+0] = aI[i+0] * aI[i+0] * aI[i+0] * aI[i+0];
            rI[i+1] = aI[i+1] * aI[i+1] * aI[i+1] * aI[i+1];
        }
    }

    // ---------------- 300: several loads/stores at different offsets in one loop ----------------

    @Benchmark
    // Only with "Vectorize", without we get issues with modulo computation of alignment for bI
    public void bench300_multiple_misaligned_loads() {
        for (int i = 0; i < COUNT-10; i++) {
            rI[i] = aI[i] * aI[i] * aI[i] * aI[i] + bI[i+1];
        }
    }

    @Benchmark
    // Only with "Vectorize", without we may confuse aI[5] with aI[4+1] and pack loads in wrong pack
    public void bench301_multiple_misaligned_loads() {
        for (int i = 0; i < COUNT-10; i++) {
            rI[i] = aI[i] * aI[i] * aI[i] * aI[i] + aI[i+1];
        }
    }

    @Benchmark
    // Only with "Vectorize", without we get mix of aI[i] and a[i-2]
    public void bench302_multiple_misaligned_loads_and_stores() {
        for (int i = 2; i < COUNT; i++) {
            rI[i - 2] = aI[i-2] * aI[i-2] * aI[i-2] * aI[i-2]; // can do this for all iterations
            rI[i] = aI[i] + 3;                                 // before doing this second line
        }
    }

    // ---------------- 400: hand-unrolled by two with offset loads ----------------

    @Benchmark
    // Currently does not vectorize:
    //   hand-unrolled confuses Vectorize -> adjacent loads not from same original node (not even same line)
    //   multiple unaligned loads confuses non-Vectorize: aI[5+1] confused with aI[4+2] (plus modulo alignment issue)
    public void bench400_hand_unrolled_misaligned() {
        for (int i = 0; i < COUNT-10; i+=2) {
            rI[i+0] = aI[i+1] * aI[i+1] * aI[i+1] * aI[i+1] + aI[i];
            rI[i+1] = aI[i+2] * aI[i+2] * aI[i+2] * aI[i+2] + aI[i+1];
        }
    }

    @Benchmark
    // Currently does not vectorize:
    //   hand-unrolled confuses Vectorize -> adjacent loads not from same original node (not even same line)
    //   non-Vectorize: plus modulo alignment issue
    public void bench401_hand_unrolled_misaligned() {
        for (int i = 0; i < COUNT-10; i+=2) {
            rI[i+0] = aI[i+1] * aI[i+1] * aI[i+1] * aI[i+1] + bI[i];
            rI[i+1] = aI[i+2] * aI[i+2] * aI[i+2] * aI[i+2] + bI[i+1];
        }
    }

    // ---------------- Fork variants: same benchmarks, different JVM flags ----------------

    // SuperWord enabled, with the "Vectorize" compile command option applied to all methods.
    @Fork(value = 1, jvmArgsPrepend = {
        "-XX:+UseSuperWord", "-XX:CompileCommand=Option,*::*,Vectorize"
    })
    public static class VectorAlignmentSuperWordWithVectorize extends VectorAlignment {}

    // SuperWord enabled, with -XX:+AlignVector.
    @Fork(value = 1, jvmArgsPrepend = {
        "-XX:+UseSuperWord", "-XX:+AlignVector"
    })
    public static class VectorAlignmentSuperWordAlignVector extends VectorAlignment {}

    // SuperWord enabled, no further flags.
    @Fork(value = 1, jvmArgsPrepend = {
        "-XX:+UseSuperWord"
    })
    public static class VectorAlignmentSuperWord extends VectorAlignment {}

    // SuperWord disabled.
    @Fork(value = 1, jvmArgsPrepend = {
        "-XX:-UseSuperWord"
    })
    public static class VectorAlignmentNoSuperWord extends VectorAlignment {}
}