8334431: C2 SuperWord: fix performance regression due to store-to-load-forwarding failures
Reviewed-by: chagedorn, qamai
This commit is contained in:
parent
e11d126a8d
commit
75420e9314
@ -66,6 +66,7 @@ define_pd_global(bool, OptoScheduling, false);
|
|||||||
define_pd_global(bool, OptoBundling, false);
|
define_pd_global(bool, OptoBundling, false);
|
||||||
define_pd_global(bool, OptoRegScheduling, false);
|
define_pd_global(bool, OptoRegScheduling, false);
|
||||||
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
||||||
|
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 8);
|
||||||
define_pd_global(bool, IdealizeClearArrayNode, true);
|
define_pd_global(bool, IdealizeClearArrayNode, true);
|
||||||
|
|
||||||
define_pd_global(intx, ReservedCodeCacheSize, 48*M);
|
define_pd_global(intx, ReservedCodeCacheSize, 48*M);
|
||||||
|
@ -64,6 +64,7 @@ define_pd_global(bool, OptoBundling, false);
|
|||||||
define_pd_global(bool, OptoScheduling, true);
|
define_pd_global(bool, OptoScheduling, true);
|
||||||
define_pd_global(bool, OptoRegScheduling, false);
|
define_pd_global(bool, OptoRegScheduling, false);
|
||||||
define_pd_global(bool, SuperWordLoopUnrollAnalysis, false);
|
define_pd_global(bool, SuperWordLoopUnrollAnalysis, false);
|
||||||
|
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
|
||||||
define_pd_global(bool, IdealizeClearArrayNode, true);
|
define_pd_global(bool, IdealizeClearArrayNode, true);
|
||||||
|
|
||||||
#ifdef _LP64
|
#ifdef _LP64
|
||||||
|
@ -59,6 +59,7 @@ define_pd_global(bool, UseCISCSpill, false);
|
|||||||
define_pd_global(bool, OptoBundling, false);
|
define_pd_global(bool, OptoBundling, false);
|
||||||
define_pd_global(bool, OptoRegScheduling, false);
|
define_pd_global(bool, OptoRegScheduling, false);
|
||||||
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
||||||
|
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
|
||||||
// GL:
|
// GL:
|
||||||
// Detected a problem with unscaled compressed oops and
|
// Detected a problem with unscaled compressed oops and
|
||||||
// narrow_oop_use_complex_address() == false.
|
// narrow_oop_use_complex_address() == false.
|
||||||
|
@ -66,6 +66,7 @@ define_pd_global(bool, OptoScheduling, true);
|
|||||||
define_pd_global(bool, OptoBundling, false);
|
define_pd_global(bool, OptoBundling, false);
|
||||||
define_pd_global(bool, OptoRegScheduling, false);
|
define_pd_global(bool, OptoRegScheduling, false);
|
||||||
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
||||||
|
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
|
||||||
define_pd_global(bool, IdealizeClearArrayNode, true);
|
define_pd_global(bool, IdealizeClearArrayNode, true);
|
||||||
|
|
||||||
define_pd_global(intx, ReservedCodeCacheSize, 48*M);
|
define_pd_global(intx, ReservedCodeCacheSize, 48*M);
|
||||||
|
@ -61,6 +61,7 @@ define_pd_global(bool, OptoBundling, false);
|
|||||||
define_pd_global(bool, OptoScheduling, false);
|
define_pd_global(bool, OptoScheduling, false);
|
||||||
define_pd_global(bool, OptoRegScheduling, false);
|
define_pd_global(bool, OptoRegScheduling, false);
|
||||||
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
||||||
|
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
|
||||||
// On s390x, we can clear the array with a single instruction,
|
// On s390x, we can clear the array with a single instruction,
|
||||||
// so don't idealize it.
|
// so don't idealize it.
|
||||||
define_pd_global(bool, IdealizeClearArrayNode, false);
|
define_pd_global(bool, IdealizeClearArrayNode, false);
|
||||||
|
@ -76,6 +76,7 @@ define_pd_global(bool, OptoScheduling, false);
|
|||||||
define_pd_global(bool, OptoBundling, false);
|
define_pd_global(bool, OptoBundling, false);
|
||||||
define_pd_global(bool, OptoRegScheduling, true);
|
define_pd_global(bool, OptoRegScheduling, true);
|
||||||
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
|
||||||
|
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
|
||||||
define_pd_global(bool, IdealizeClearArrayNode, true);
|
define_pd_global(bool, IdealizeClearArrayNode, true);
|
||||||
|
|
||||||
define_pd_global(uintx, ReservedCodeCacheSize, 48*M);
|
define_pd_global(uintx, ReservedCodeCacheSize, 48*M);
|
||||||
|
@ -355,6 +355,12 @@
|
|||||||
product(bool, SuperWordReductions, true, \
|
product(bool, SuperWordReductions, true, \
|
||||||
"Enable reductions support in superword.") \
|
"Enable reductions support in superword.") \
|
||||||
\
|
\
|
||||||
|
product_pd(uint, SuperWordStoreToLoadForwardingFailureDetection, DIAGNOSTIC, \
|
||||||
|
"if >0, auto-vectorization detects possible store-to-load " \
|
||||||
|
"forwarding failures. The number specifies over how many " \
|
||||||
|
"loop iterations this detection spans.") \
|
||||||
|
range(0, 4096) \
|
||||||
|
\
|
||||||
product(bool, UseCMoveUnconditionally, false, \
|
product(bool, UseCMoveUnconditionally, false, \
|
||||||
"Use CMove (scalar and vector) ignoring profitability test.") \
|
"Use CMove (scalar and vector) ignoring profitability test.") \
|
||||||
\
|
\
|
||||||
|
@ -1868,6 +1868,7 @@ bool SuperWord::schedule_and_apply() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!vtransform.schedule()) { return false; }
|
if (!vtransform.schedule()) { return false; }
|
||||||
|
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
|
||||||
vtransform.apply();
|
vtransform.apply();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -31,7 +31,7 @@
|
|||||||
#include "opto/vectorization.hpp"
|
#include "opto/vectorization.hpp"
|
||||||
|
|
||||||
#ifndef PRODUCT
|
#ifndef PRODUCT
|
||||||
static void print_con_or_idx(const Node* n) {
|
void VPointer::print_con_or_idx(const Node* n) {
|
||||||
if (n == nullptr) {
|
if (n == nullptr) {
|
||||||
tty->print("( 0)");
|
tty->print("( 0)");
|
||||||
} else if (n->is_ConI()) {
|
} else if (n->is_ConI()) {
|
||||||
@ -1369,12 +1369,12 @@ void VPointer::print() const {
|
|||||||
tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);
|
tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);
|
||||||
|
|
||||||
tty->print(" base");
|
tty->print(" base");
|
||||||
print_con_or_idx(_base);
|
VPointer::print_con_or_idx(_base);
|
||||||
|
|
||||||
tty->print(" + offset(%4d)", _offset);
|
tty->print(" + offset(%4d)", _offset);
|
||||||
|
|
||||||
tty->print(" + invar");
|
tty->print(" + invar");
|
||||||
print_con_or_idx(_invar);
|
VPointer::print_con_or_idx(_invar);
|
||||||
|
|
||||||
tty->print_cr(" + scale(%4d) * iv]", _scale);
|
tty->print_cr(" + scale(%4d) * iv]", _scale);
|
||||||
}
|
}
|
||||||
@ -2168,15 +2168,15 @@ void AlignmentSolver::trace_start_solve() const {
|
|||||||
|
|
||||||
// iv = init + pre_iter * pre_stride + main_iter * main_stride
|
// iv = init + pre_iter * pre_stride + main_iter * main_stride
|
||||||
tty->print(" iv = init");
|
tty->print(" iv = init");
|
||||||
print_con_or_idx(_init_node);
|
VPointer::print_con_or_idx(_init_node);
|
||||||
tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)",
|
tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)",
|
||||||
_pre_stride, _main_stride);
|
_pre_stride, _main_stride);
|
||||||
|
|
||||||
// adr = base + offset + invar + scale * iv
|
// adr = base + offset + invar + scale * iv
|
||||||
tty->print(" adr = base");
|
tty->print(" adr = base");
|
||||||
print_con_or_idx(_base);
|
VPointer::print_con_or_idx(_base);
|
||||||
tty->print(" + offset(%d) + invar", _offset);
|
tty->print(" + offset(%d) + invar", _offset);
|
||||||
print_con_or_idx(_invar);
|
VPointer::print_con_or_idx(_invar);
|
||||||
tty->print_cr(" + scale(%d) * iv", _scale);
|
tty->print_cr(" + scale(%d) * iv", _scale);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -870,6 +870,7 @@ class VPointer : public ArenaObj {
|
|||||||
static int cmp_for_sort(const VPointer** p1, const VPointer** p2);
|
static int cmp_for_sort(const VPointer** p1, const VPointer** p2);
|
||||||
|
|
||||||
NOT_PRODUCT( void print() const; )
|
NOT_PRODUCT( void print() const; )
|
||||||
|
NOT_PRODUCT( static void print_con_or_idx(const Node* n); )
|
||||||
|
|
||||||
#ifndef PRODUCT
|
#ifndef PRODUCT
|
||||||
class Tracer {
|
class Tracer {
|
||||||
|
@ -144,6 +144,274 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Three-way comparison step for sort callbacks: returns -1 or 1 from the
// ENCLOSING function if the two operands differ, and falls through to the
// next statement if they are equal.
//
// We use two comparisons, because a subtraction could overflow.
//
// The arguments are parenthesized so that compound expressions (e.g. "x & 1")
// compare as written, and the body is a single do/while statement so that the
// macro behaves like one statement, also under an unbraced if/else.
// Note: each argument may be evaluated twice, so avoid side effects in them.
#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \
  do {                                      \
    if ((a) < (b)) { return -1; }           \
    if ((a) > (b)) { return 1; }            \
  } while (0)
|
||||||
|
|
||||||
|
// Helper-class for VTransformGraph::has_store_to_load_forwarding_failure.
|
||||||
|
// It represents a memory region: [ptr, ptr + memory_size)
|
||||||
|
class VMemoryRegion : public StackObj {
|
||||||
|
private:
|
||||||
|
Node* _base; // ptr = base + offset + invar + scale * iv
|
||||||
|
int _scale;
|
||||||
|
Node* _invar;
|
||||||
|
int _offset;
|
||||||
|
uint _memory_size;
|
||||||
|
bool _is_load; // load or store?
|
||||||
|
uint _schedule_order;
|
||||||
|
|
||||||
|
public:
|
||||||
|
VMemoryRegion() {} // empty constructor for GrowableArray
|
||||||
|
VMemoryRegion(const VPointer& vpointer, int iv_offset, int vector_length, uint schedule_order) :
|
||||||
|
_base(vpointer.base()),
|
||||||
|
_scale(vpointer.scale_in_bytes()),
|
||||||
|
_invar(vpointer.invar()),
|
||||||
|
_offset(vpointer.offset_in_bytes() + _scale * iv_offset),
|
||||||
|
_memory_size(vpointer.memory_size() * vector_length),
|
||||||
|
_is_load(vpointer.mem()->is_Load()),
|
||||||
|
_schedule_order(schedule_order) {}
|
||||||
|
|
||||||
|
Node* base() const { return _base; }
|
||||||
|
int scale() const { return _scale; }
|
||||||
|
Node* invar() const { return _invar; }
|
||||||
|
int offset() const { return _offset; }
|
||||||
|
uint memory_size() const { return _memory_size; }
|
||||||
|
bool is_load() const { return _is_load; }
|
||||||
|
uint schedule_order() const { return _schedule_order; }
|
||||||
|
|
||||||
|
static int cmp_for_sort_by_group(VMemoryRegion* r1, VMemoryRegion* r2) {
|
||||||
|
RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->base()->_idx, r2->base()->_idx);
|
||||||
|
RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->scale(), r2->scale());
|
||||||
|
int r1_invar_idx = r1->invar() == nullptr ? 0 : r1->invar()->_idx;
|
||||||
|
int r2_invar_idx = r2->invar() == nullptr ? 0 : r2->invar()->_idx;
|
||||||
|
RETURN_CMP_VALUE_IF_NOT_EQUAL(r1_invar_idx, r2_invar_idx);
|
||||||
|
return 0; // equal
|
||||||
|
}
|
||||||
|
|
||||||
|
static int cmp_for_sort(VMemoryRegion* r1, VMemoryRegion* r2) {
|
||||||
|
int cmp_group = cmp_for_sort_by_group(r1, r2);
|
||||||
|
if (cmp_group != 0) { return cmp_group; }
|
||||||
|
|
||||||
|
RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->offset(), r2->offset());
|
||||||
|
return 0; // equal
|
||||||
|
}
|
||||||
|
|
||||||
|
enum Aliasing { DIFFERENT_GROUP, BEFORE, EXACT_OVERLAP, PARTIAL_OVERLAP, AFTER };
|
||||||
|
|
||||||
|
Aliasing aliasing(VMemoryRegion& other) {
|
||||||
|
VMemoryRegion* p1 = this;
|
||||||
|
VMemoryRegion* p2 = &other;
|
||||||
|
if (cmp_for_sort_by_group(p1, p2) != 0) { return DIFFERENT_GROUP; }
|
||||||
|
|
||||||
|
jlong offset1 = p1->offset();
|
||||||
|
jlong offset2 = p2->offset();
|
||||||
|
jlong memory_size1 = p1->memory_size();
|
||||||
|
jlong memory_size2 = p2->memory_size();
|
||||||
|
|
||||||
|
if (offset1 >= offset2 + memory_size2) { return AFTER; }
|
||||||
|
if (offset2 >= offset1 + memory_size1) { return BEFORE; }
|
||||||
|
if (offset1 == offset2 && memory_size1 == memory_size2) { return EXACT_OVERLAP; }
|
||||||
|
return PARTIAL_OVERLAP;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef PRODUCT
|
||||||
|
void print() const {
|
||||||
|
tty->print("VMemoryRegion[%s %dbytes, schedule_order(%4d), base",
|
||||||
|
_is_load ? "load " : "store", _memory_size, _schedule_order);
|
||||||
|
VPointer::print_con_or_idx(_base);
|
||||||
|
tty->print(" + offset(%4d)", _offset);
|
||||||
|
tty->print(" + invar");
|
||||||
|
VPointer::print_con_or_idx(_invar);
|
||||||
|
tty->print_cr(" + scale(%4d) * iv]", _scale);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
// Store-to-load-forwarding is a CPU memory optimization, where a load can directly fetch
|
||||||
|
// its value from the store-buffer, rather than from the L1 cache. This is many CPU cycles
|
||||||
|
// faster. However, this optimization comes with some restrictions, depending on the CPU.
|
||||||
|
// Generally, store-to-load-forwarding works if the load and store memory regions match
|
||||||
|
// exactly (same start and width). Generally problematic are partial overlaps - though
|
||||||
|
// some CPUs can handle even some subsets of these cases. We conservatively assume that
|
||||||
|
// all such partial overlaps lead to store-to-load-forwarding failures, which means the
|
||||||
|
// load has to stall until the store goes from the store-buffer into the L1 cache, incurring
|
||||||
|
// a penalty of many CPU cycles.
|
||||||
|
//
|
||||||
|
// Example (with "iteration distance" 2):
|
||||||
|
// for (int i = 10; i < SIZE; i++) {
|
||||||
|
// aI[i] = aI[i - 2] + 1;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// load_4_bytes( ptr + -8)
|
||||||
|
// store_4_bytes(ptr + 0) *
|
||||||
|
// load_4_bytes( ptr + -4) |
|
||||||
|
// store_4_bytes(ptr + 4) | *
|
||||||
|
// load_4_bytes( ptr + 0) <-+ |
|
||||||
|
// store_4_bytes(ptr + 8) |
|
||||||
|
// load_4_bytes( ptr + 4) <---+
|
||||||
|
// store_4_bytes(ptr + 12)
|
||||||
|
// ...
|
||||||
|
//
|
||||||
|
// In the scalar loop, we can forward the stores from 2 iterations back.
|
||||||
|
//
|
||||||
|
// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 2
|
||||||
|
// example. This gives us this machine code:
|
||||||
|
// load_8_bytes( ptr + -8)
|
||||||
|
// store_8_bytes(ptr + 0) |
|
||||||
|
// load_8_bytes( ptr + 0) v
|
||||||
|
// store_8_bytes(ptr + 8) |
|
||||||
|
// load_8_bytes( ptr + 8) v
|
||||||
|
// store_8_bytes(ptr + 16)
|
||||||
|
// ...
|
||||||
|
//
|
||||||
|
// We packed 2 iterations, and the stores can perfectly forward to the loads of
|
||||||
|
// the next 2 iterations.
|
||||||
|
//
|
||||||
|
// Example (with "iteration distance" 3):
|
||||||
|
// for (int i = 10; i < SIZE; i++) {
|
||||||
|
// aI[i] = aI[i - 3] + 1;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// load_4_bytes( ptr + -12)
|
||||||
|
// store_4_bytes(ptr + 0) *
|
||||||
|
// load_4_bytes( ptr + -8) |
|
||||||
|
// store_4_bytes(ptr + 4) |
|
||||||
|
// load_4_bytes( ptr + -4) |
|
||||||
|
// store_4_bytes(ptr + 8) |
|
||||||
|
// load_4_bytes( ptr + 0) <-+
|
||||||
|
// store_4_bytes(ptr + 12)
|
||||||
|
// ...
|
||||||
|
//
|
||||||
|
// In the scalar loop, we can forward the stores from 3 iterations back.
|
||||||
|
//
|
||||||
|
// Unfortunately, vectorization can introduce such store-to-load-forwarding failures.
|
||||||
|
// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 3
|
||||||
|
// example. This gives us this machine code:
|
||||||
|
// load_8_bytes( ptr + -12)
|
||||||
|
// store_8_bytes(ptr + 0) | |
|
||||||
|
// load_8_bytes( ptr + -4) x |
|
||||||
|
// store_8_bytes(ptr + 8) ||
|
||||||
|
// load_8_bytes( ptr + 4) xx <-- partial overlap with 2 stores
|
||||||
|
// store_8_bytes(ptr + 16)
|
||||||
|
// ...
|
||||||
|
//
|
||||||
|
// We see that eventually all loads are dependent on earlier stores, but the values cannot
|
||||||
|
// be forwarded because there is some partial overlap.
|
||||||
|
//
|
||||||
|
// Preferably, we would have some latency-based cost-model that accounts for such forwarding
|
||||||
|
// failures, and decide if vectorization with forwarding failures is still profitable. For
|
||||||
|
// now we go with a simpler heuristic: we simply forbid vectorization if we can PROVE that
|
||||||
|
// there will be a forwarding failure. This approach has at least 2 possible weaknesses:
|
||||||
|
//
|
||||||
|
// (1) There may be forwarding failures in cases where we cannot prove it.
|
||||||
|
// Example:
|
||||||
|
// for (int i = 10; i < SIZE; i++) {
|
||||||
|
// bI[i] = aI[i - 3] + 1;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// We do not know if aI and bI refer to the same array or not. However, it is reasonable
|
||||||
|
// to assume that if we have two different array references, that they most likely refer
|
||||||
|
// to different arrays (i.e. no aliasing), where we would have no forwarding failures.
|
||||||
|
// (2) There could be some loops where vectorization introduces forwarding failures, and thus
|
||||||
|
// the latency of the loop body is high, but this does not matter because it is dominated
|
||||||
|
// by other latency/throughput based costs in the loop body.
|
||||||
|
//
|
||||||
|
// Performance measurements with the JMH benchmark StoreToLoadForwarding.java have indicated
|
||||||
|
// that there is some iteration threshold: if the failure happens between a store and load that
|
||||||
|
// have an iteration distance below this threshold, the latency is the limiting factor, and we
|
||||||
|
// should not vectorize to avoid the latency penalty of store-to-load-forwarding failures. If
|
||||||
|
// the iteration distance is larger than this threshold, the throughput is the limiting factor,
|
||||||
|
// and we should vectorize in these cases to improve throughput.
|
||||||
|
//
|
||||||
|
bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const {
|
||||||
|
if (SuperWordStoreToLoadForwardingFailureDetection == 0) { return false; }
|
||||||
|
|
||||||
|
// Collect all pointers for scalar and vector loads/stores.
|
||||||
|
ResourceMark rm;
|
||||||
|
GrowableArray<VMemoryRegion> memory_regions;
|
||||||
|
|
||||||
|
// To detect store-to-load-forwarding failures at the iteration threshold or below, we
|
||||||
|
// simulate a super-unrolling to reach SuperWordStoreToLoadForwardingFailureDetection
|
||||||
|
// iterations at least. This is a heuristic, and we are not trying to be very precise
|
||||||
|
// with the iteration distance. If we have already unrolled more than the iteration
|
||||||
|
// threshold, i.e. if "SuperWordStoreToLoadForwardingFailureDetection < unrolled_count",
|
||||||
|
// then we simply check if there are any store-to-load-forwarding failures in the unrolled
|
||||||
|
// loop body, which may be at larger distance than the desired threshold. We cannot do any
|
||||||
|
// more fine-grained analysis, because the unrolling has lost the information about the
|
||||||
|
// iteration distance.
|
||||||
|
int simulated_unrolling_count = SuperWordStoreToLoadForwardingFailureDetection;
|
||||||
|
int unrolled_count = vloop_analyzer.vloop().cl()->unrolled_count();
|
||||||
|
uint simulated_super_unrolling_count = MAX2(1, simulated_unrolling_count / unrolled_count);
|
||||||
|
int iv_stride = vloop_analyzer.vloop().iv_stride();
|
||||||
|
int schedule_order = 0;
|
||||||
|
for (uint k = 0; k < simulated_super_unrolling_count; k++) {
|
||||||
|
int iv_offset = k * iv_stride; // virtual super-unrolling
|
||||||
|
for (int i = 0; i < _schedule.length(); i++) {
|
||||||
|
VTransformNode* vtn = _schedule.at(i);
|
||||||
|
if (vtn->is_load_or_store_in_loop()) {
|
||||||
|
const VPointer& p = vtn->vpointer(vloop_analyzer);
|
||||||
|
if (p.valid()) {
|
||||||
|
VTransformVectorNode* vector = vtn->isa_Vector();
|
||||||
|
uint vector_length = vector != nullptr ? vector->nodes().length() : 1;
|
||||||
|
memory_regions.push(VMemoryRegion(p, iv_offset, vector_length, schedule_order++));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort the pointers by group (same base, invar and stride), and then by offset.
|
||||||
|
memory_regions.sort(VMemoryRegion::cmp_for_sort);
|
||||||
|
|
||||||
|
#ifndef PRODUCT
|
||||||
|
if (_trace._verbose) {
|
||||||
|
tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
|
||||||
|
tty->print_cr(" simulated_unrolling_count = %d", simulated_unrolling_count);
|
||||||
|
tty->print_cr(" simulated_super_unrolling_count = %d", simulated_super_unrolling_count);
|
||||||
|
for (int i = 0; i < memory_regions.length(); i++) {
|
||||||
|
VMemoryRegion& region = memory_regions.at(i);
|
||||||
|
region.print();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// For all pairs of pointers in the same group, check if they have a partial overlap.
|
||||||
|
for (int i = 0; i < memory_regions.length(); i++) {
|
||||||
|
VMemoryRegion& region1 = memory_regions.at(i);
|
||||||
|
|
||||||
|
for (int j = i + 1; j < memory_regions.length(); j++) {
|
||||||
|
VMemoryRegion& region2 = memory_regions.at(j);
|
||||||
|
|
||||||
|
const VMemoryRegion::Aliasing aliasing = region1.aliasing(region2);
|
||||||
|
if (aliasing == VMemoryRegion::Aliasing::DIFFERENT_GROUP ||
|
||||||
|
aliasing == VMemoryRegion::Aliasing::BEFORE) {
|
||||||
|
break; // We have reached the next group or pointers that are always after.
|
||||||
|
} else if (aliasing == VMemoryRegion::Aliasing::EXACT_OVERLAP) {
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
assert(aliasing == VMemoryRegion::Aliasing::PARTIAL_OVERLAP, "no other case can happen");
|
||||||
|
if ((region1.is_load() && !region2.is_load() && region1.schedule_order() > region2.schedule_order()) ||
|
||||||
|
(!region1.is_load() && region2.is_load() && region1.schedule_order() < region2.schedule_order())) {
|
||||||
|
// We predict that this leads to a store-to-load-forwarding failure penalty.
|
||||||
|
#ifndef PRODUCT
|
||||||
|
if (_trace._rejections) {
|
||||||
|
tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
|
||||||
|
tty->print_cr(" Partial overlap of store->load. We predict that this leads to");
|
||||||
|
tty->print_cr(" a store-to-load-forwarding failure penalty which makes");
|
||||||
|
tty->print_cr(" vectorization unprofitable. These are the two pointers:");
|
||||||
|
region1.print();
|
||||||
|
region2.print();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
Node* VTransformNode::find_transformed_input(int i, const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
|
Node* VTransformNode::find_transformed_input(int i, const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
|
||||||
Node* n = vnode_idx_to_transformed_node.at(in(i)->_idx);
|
Node* n = vnode_idx_to_transformed_node.at(in(i)->_idx);
|
||||||
assert(n != nullptr, "must find input IR node");
|
assert(n != nullptr, "must find input IR node");
|
||||||
|
@ -66,6 +66,8 @@ class VTransformVectorNode;
|
|||||||
class VTransformElementWiseVectorNode;
|
class VTransformElementWiseVectorNode;
|
||||||
class VTransformBoolVectorNode;
|
class VTransformBoolVectorNode;
|
||||||
class VTransformReductionVectorNode;
|
class VTransformReductionVectorNode;
|
||||||
|
class VTransformLoadVectorNode;
|
||||||
|
class VTransformStoreVectorNode;
|
||||||
|
|
||||||
// Result from VTransformNode::apply
|
// Result from VTransformNode::apply
|
||||||
class VTransformApplyResult {
|
class VTransformApplyResult {
|
||||||
@ -157,6 +159,7 @@ public:
|
|||||||
const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
|
const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
|
||||||
|
|
||||||
bool schedule();
|
bool schedule();
|
||||||
|
bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
|
||||||
void apply_memops_reordering_with_schedule() const;
|
void apply_memops_reordering_with_schedule() const;
|
||||||
void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
|
void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
|
||||||
|
|
||||||
@ -221,6 +224,7 @@ public:
|
|||||||
VTransformGraph& graph() { return _graph; }
|
VTransformGraph& graph() { return _graph; }
|
||||||
|
|
||||||
bool schedule() { return _graph.schedule(); }
|
bool schedule() { return _graph.schedule(); }
|
||||||
|
bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
|
||||||
void apply();
|
void apply();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -310,6 +314,11 @@ public:
|
|||||||
virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() { return nullptr; }
|
virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() { return nullptr; }
|
||||||
virtual VTransformBoolVectorNode* isa_BoolVector() { return nullptr; }
|
virtual VTransformBoolVectorNode* isa_BoolVector() { return nullptr; }
|
||||||
virtual VTransformReductionVectorNode* isa_ReductionVector() { return nullptr; }
|
virtual VTransformReductionVectorNode* isa_ReductionVector() { return nullptr; }
|
||||||
|
virtual VTransformLoadVectorNode* isa_LoadVector() { return nullptr; }
|
||||||
|
virtual VTransformStoreVectorNode* isa_StoreVector() { return nullptr; }
|
||||||
|
|
||||||
|
virtual bool is_load_or_store_in_loop() const { return false; }
|
||||||
|
virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const { ShouldNotReachHere(); }
|
||||||
|
|
||||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const = 0;
|
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const = 0;
|
||||||
@ -333,6 +342,8 @@ public:
|
|||||||
VTransformNode(vtransform, n->req()), _node(n) {}
|
VTransformNode(vtransform, n->req()), _node(n) {}
|
||||||
Node* node() const { return _node; }
|
Node* node() const { return _node; }
|
||||||
virtual VTransformScalarNode* isa_Scalar() override { return this; }
|
virtual VTransformScalarNode* isa_Scalar() override { return this; }
|
||||||
|
virtual bool is_load_or_store_in_loop() const override { return _node->is_Load() || _node->is_Store(); }
|
||||||
|
virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(node()->as_Mem()); }
|
||||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||||
NOT_PRODUCT(virtual const char* name() const override { return "Scalar"; };)
|
NOT_PRODUCT(virtual const char* name() const override { return "Scalar"; };)
|
||||||
@ -347,6 +358,7 @@ public:
|
|||||||
VTransformInputScalarNode(VTransform& vtransform, Node* n) :
|
VTransformInputScalarNode(VTransform& vtransform, Node* n) :
|
||||||
VTransformScalarNode(vtransform, n) {}
|
VTransformScalarNode(vtransform, n) {}
|
||||||
virtual VTransformInputScalarNode* isa_InputScalar() override { return this; }
|
virtual VTransformInputScalarNode* isa_InputScalar() override { return this; }
|
||||||
|
virtual bool is_load_or_store_in_loop() const override { return false; }
|
||||||
NOT_PRODUCT(virtual const char* name() const override { return "InputScalar"; };)
|
NOT_PRODUCT(virtual const char* name() const override { return "InputScalar"; };)
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -472,6 +484,9 @@ public:
|
|||||||
VTransformLoadVectorNode(VTransform& vtransform, uint number_of_nodes) :
|
VTransformLoadVectorNode(VTransform& vtransform, uint number_of_nodes) :
|
||||||
VTransformVectorNode(vtransform, 3, number_of_nodes) {}
|
VTransformVectorNode(vtransform, 3, number_of_nodes) {}
|
||||||
LoadNode::ControlDependency control_dependency() const;
|
LoadNode::ControlDependency control_dependency() const;
|
||||||
|
virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; }
|
||||||
|
virtual bool is_load_or_store_in_loop() const override { return true; }
|
||||||
|
virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); }
|
||||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||||
NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };)
|
NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };)
|
||||||
@ -482,6 +497,9 @@ public:
|
|||||||
// req = 4 -> [ctrl, mem, adr, val]
|
// req = 4 -> [ctrl, mem, adr, val]
|
||||||
VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes) :
|
VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes) :
|
||||||
VTransformVectorNode(vtransform, 4, number_of_nodes) {}
|
VTransformVectorNode(vtransform, 4, number_of_nodes) {}
|
||||||
|
virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; }
|
||||||
|
virtual bool is_load_or_store_in_loop() const override { return true; }
|
||||||
|
virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); }
|
||||||
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
|
||||||
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
|
||||||
NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };)
|
NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };)
|
||||||
|
@ -168,6 +168,9 @@ public class TestAlignVector {
|
|||||||
tests.put("test14aB", () -> { return test14aB(aB.clone()); });
|
tests.put("test14aB", () -> { return test14aB(aB.clone()); });
|
||||||
tests.put("test14bB", () -> { return test14bB(aB.clone()); });
|
tests.put("test14bB", () -> { return test14bB(aB.clone()); });
|
||||||
tests.put("test14cB", () -> { return test14cB(aB.clone()); });
|
tests.put("test14cB", () -> { return test14cB(aB.clone()); });
|
||||||
|
tests.put("test14dB", () -> { return test14dB(aB.clone()); });
|
||||||
|
tests.put("test14eB", () -> { return test14eB(aB.clone()); });
|
||||||
|
tests.put("test14fB", () -> { return test14fB(aB.clone()); });
|
||||||
|
|
||||||
tests.put("test15aB", () -> { return test15aB(aB.clone()); });
|
tests.put("test15aB", () -> { return test15aB(aB.clone()); });
|
||||||
tests.put("test15bB", () -> { return test15bB(aB.clone()); });
|
tests.put("test15bB", () -> { return test15bB(aB.clone()); });
|
||||||
@ -239,6 +242,9 @@ public class TestAlignVector {
|
|||||||
"test14aB",
|
"test14aB",
|
||||||
"test14bB",
|
"test14bB",
|
||||||
"test14cB",
|
"test14cB",
|
||||||
|
"test14dB",
|
||||||
|
"test14eB",
|
||||||
|
"test14fB",
|
||||||
"test15aB",
|
"test15aB",
|
||||||
"test15bB",
|
"test15bB",
|
||||||
"test15cB",
|
"test15cB",
|
||||||
@ -1128,9 +1134,9 @@ public class TestAlignVector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
|
@IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
|
||||||
IRNode.ADD_VB, "> 0",
|
IRNode.ADD_VB, "= 0",
|
||||||
IRNode.STORE_VECTOR, "> 0"},
|
IRNode.STORE_VECTOR, "= 0"},
|
||||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||||
applyIfPlatform = {"64-bit", "true"},
|
applyIfPlatform = {"64-bit", "true"},
|
||||||
applyIf = {"AlignVector", "false"})
|
applyIf = {"AlignVector", "false"})
|
||||||
@ -1143,6 +1149,9 @@ public class TestAlignVector {
|
|||||||
static Object[] test14aB(byte[] a) {
|
static Object[] test14aB(byte[] a) {
|
||||||
// non-power-of-2 stride
|
// non-power-of-2 stride
|
||||||
for (int i = 0; i < RANGE-20; i+=9) {
|
for (int i = 0; i < RANGE-20; i+=9) {
|
||||||
|
// Since the stride is shorter than the vector length, there will always be
|
||||||
|
// partial overlap of loads with previous stores, this leads to failure in
|
||||||
|
// store-to-load-forwarding -> vectorization not profitable.
|
||||||
a[i+0]++;
|
a[i+0]++;
|
||||||
a[i+1]++;
|
a[i+1]++;
|
||||||
a[i+2]++;
|
a[i+2]++;
|
||||||
@ -1164,9 +1173,9 @@ public class TestAlignVector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
|
@IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
|
||||||
IRNode.ADD_VB, "> 0",
|
IRNode.ADD_VB, "= 0",
|
||||||
IRNode.STORE_VECTOR, "> 0"},
|
IRNode.STORE_VECTOR, "= 0"},
|
||||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||||
applyIfPlatform = {"64-bit", "true"},
|
applyIfPlatform = {"64-bit", "true"},
|
||||||
applyIf = {"AlignVector", "false"})
|
applyIf = {"AlignVector", "false"})
|
||||||
@ -1179,6 +1188,9 @@ public class TestAlignVector {
|
|||||||
static Object[] test14bB(byte[] a) {
|
static Object[] test14bB(byte[] a) {
|
||||||
// non-power-of-2 stride
|
// non-power-of-2 stride
|
||||||
for (int i = 0; i < RANGE-20; i+=3) {
|
for (int i = 0; i < RANGE-20; i+=3) {
|
||||||
|
// Since the stride is shorter than the vector length, there will always be
|
||||||
|
// partial overlap of loads with previous stores, this leads to failure in
|
||||||
|
// store-to-load-forwarding -> vectorization not profitable.
|
||||||
a[i+0]++;
|
a[i+0]++;
|
||||||
a[i+1]++;
|
a[i+1]++;
|
||||||
a[i+2]++;
|
a[i+2]++;
|
||||||
@ -1200,9 +1212,9 @@ public class TestAlignVector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
|
@IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
|
||||||
IRNode.ADD_VB, "> 0",
|
IRNode.ADD_VB, "= 0",
|
||||||
IRNode.STORE_VECTOR, "> 0"},
|
IRNode.STORE_VECTOR, "= 0"},
|
||||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||||
applyIfPlatform = {"64-bit", "true"},
|
applyIfPlatform = {"64-bit", "true"},
|
||||||
applyIf = {"AlignVector", "false"})
|
applyIf = {"AlignVector", "false"})
|
||||||
@ -1215,6 +1227,9 @@ public class TestAlignVector {
|
|||||||
static Object[] test14cB(byte[] a) {
|
static Object[] test14cB(byte[] a) {
|
||||||
// non-power-of-2 stride
|
// non-power-of-2 stride
|
||||||
for (int i = 0; i < RANGE-20; i+=5) {
|
for (int i = 0; i < RANGE-20; i+=5) {
|
||||||
|
// Since the stride is shorter than the vector length, there will always be
|
||||||
|
// partial overlap of loads with previous stores, this leads to failure in
|
||||||
|
// store-to-load-forwarding -> vectorization not profitable.
|
||||||
a[i+0]++;
|
a[i+0]++;
|
||||||
a[i+1]++;
|
a[i+1]++;
|
||||||
a[i+2]++;
|
a[i+2]++;
|
||||||
@ -1235,6 +1250,90 @@ public class TestAlignVector {
|
|||||||
return new Object[]{ a };
|
return new Object[]{ a };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
|
||||||
|
IRNode.ADD_VB, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
|
||||||
|
IRNode.STORE_VECTOR, "> 0"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||||
|
applyIfPlatform = {"64-bit", "true"},
|
||||||
|
applyIf = {"AlignVector", "false"})
|
||||||
|
@IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
|
||||||
|
IRNode.ADD_VB, "= 0",
|
||||||
|
IRNode.STORE_VECTOR, "= 0"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||||
|
applyIfPlatform = {"64-bit", "true"},
|
||||||
|
applyIf = {"AlignVector", "true"})
|
||||||
|
static Object[] test14dB(byte[] a) {
|
||||||
|
// non-power-of-2 stride
|
||||||
|
for (int i = 0; i < RANGE-20; i+=9) {
|
||||||
|
a[i+0]++;
|
||||||
|
a[i+1]++;
|
||||||
|
a[i+2]++;
|
||||||
|
a[i+3]++;
|
||||||
|
a[i+4]++;
|
||||||
|
a[i+5]++;
|
||||||
|
a[i+6]++;
|
||||||
|
a[i+7]++;
|
||||||
|
}
|
||||||
|
return new Object[]{ a };
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
|
||||||
|
IRNode.ADD_VB, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
|
||||||
|
IRNode.STORE_VECTOR, "> 0"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||||
|
applyIfPlatform = {"64-bit", "true"},
|
||||||
|
applyIf = {"AlignVector", "false"})
|
||||||
|
@IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
|
||||||
|
IRNode.ADD_VB, "= 0",
|
||||||
|
IRNode.STORE_VECTOR, "= 0"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||||
|
applyIfPlatform = {"64-bit", "true"},
|
||||||
|
applyIf = {"AlignVector", "true"})
|
||||||
|
static Object[] test14eB(byte[] a) {
|
||||||
|
// non-power-of-2 stride
|
||||||
|
for (int i = 0; i < RANGE-32; i+=11) {
|
||||||
|
a[i+0]++;
|
||||||
|
a[i+1]++;
|
||||||
|
a[i+2]++;
|
||||||
|
a[i+3]++;
|
||||||
|
a[i+4]++;
|
||||||
|
a[i+5]++;
|
||||||
|
a[i+6]++;
|
||||||
|
a[i+7]++;
|
||||||
|
}
|
||||||
|
return new Object[]{ a };
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
|
||||||
|
IRNode.ADD_VB, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
|
||||||
|
IRNode.STORE_VECTOR, "> 0"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||||
|
applyIfPlatform = {"64-bit", "true"},
|
||||||
|
applyIf = {"AlignVector", "false"})
|
||||||
|
@IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
|
||||||
|
IRNode.ADD_VB, "= 0",
|
||||||
|
IRNode.STORE_VECTOR, "= 0"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||||
|
applyIfPlatform = {"64-bit", "true"},
|
||||||
|
applyIf = {"AlignVector", "true"})
|
||||||
|
static Object[] test14fB(byte[] a) {
|
||||||
|
// non-power-of-2 stride
|
||||||
|
for (int i = 0; i < RANGE-40; i+=12) {
|
||||||
|
a[i+0]++;
|
||||||
|
a[i+1]++;
|
||||||
|
a[i+2]++;
|
||||||
|
a[i+3]++;
|
||||||
|
a[i+4]++;
|
||||||
|
a[i+5]++;
|
||||||
|
a[i+6]++;
|
||||||
|
a[i+7]++;
|
||||||
|
}
|
||||||
|
return new Object[]{ a };
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
// IR rules difficult because of modulo wrapping with offset after peeling.
|
// IR rules difficult because of modulo wrapping with offset after peeling.
|
||||||
static Object[] test15aB(byte[] a) {
|
static Object[] test15aB(byte[] a) {
|
||||||
|
@ -24,7 +24,7 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* @test
|
* @test
|
||||||
* @bug 8298935
|
* @bug 8298935 8334431
|
||||||
* @summary Writing forward on array creates cyclic dependency
|
* @summary Writing forward on array creates cyclic dependency
|
||||||
* which leads to wrong result, when ignored.
|
* which leads to wrong result, when ignored.
|
||||||
* @library /test/lib /
|
* @library /test/lib /
|
||||||
@ -55,15 +55,30 @@ public class TestCyclicDependency {
|
|||||||
float[] goldF6a = new float[RANGE];
|
float[] goldF6a = new float[RANGE];
|
||||||
int[] goldI6b = new int[RANGE];
|
int[] goldI6b = new int[RANGE];
|
||||||
float[] goldF6b = new float[RANGE];
|
float[] goldF6b = new float[RANGE];
|
||||||
int[] goldI7 = new int[RANGE];
|
int[] goldI7a = new int[RANGE];
|
||||||
float[] goldF7 = new float[RANGE];
|
float[] goldF7a = new float[RANGE];
|
||||||
int[] goldI8 = new int[RANGE];
|
int[] goldI7b = new int[RANGE];
|
||||||
float[] goldF8 = new float[RANGE];
|
float[] goldF7b = new float[RANGE];
|
||||||
|
float[] goldF7b_2 = new float[RANGE];
|
||||||
|
int[] goldI7c = new int[RANGE];
|
||||||
|
float[] goldF7c = new float[RANGE];
|
||||||
|
int[] goldI8a = new int[RANGE];
|
||||||
|
float[] goldF8a = new float[RANGE];
|
||||||
|
int[] goldI8b = new int[RANGE];
|
||||||
|
int[] goldI8b_2 = new int[RANGE];
|
||||||
|
float[] goldF8b = new float[RANGE];
|
||||||
|
int[] goldI8c = new int[RANGE];
|
||||||
|
float[] goldF8c = new float[RANGE];
|
||||||
int[] goldI9 = new int[RANGE];
|
int[] goldI9 = new int[RANGE];
|
||||||
float[] goldF9 = new float[RANGE];
|
float[] goldF9 = new float[RANGE];
|
||||||
|
|
||||||
public static void main(String args[]) {
|
public static void main(String args[]) {
|
||||||
TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*");
|
TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*",
|
||||||
|
"-XX:+IgnoreUnrecognizedVMOptions", "-XX:-AlignVector", "-XX:-VerifyAlignVector");
|
||||||
|
TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*",
|
||||||
|
"-XX:+IgnoreUnrecognizedVMOptions", "-XX:+AlignVector", "-XX:-VerifyAlignVector");
|
||||||
|
TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*",
|
||||||
|
"-XX:+IgnoreUnrecognizedVMOptions", "-XX:+AlignVector", "-XX:+VerifyAlignVector");
|
||||||
}
|
}
|
||||||
|
|
||||||
TestCyclicDependency() {
|
TestCyclicDependency() {
|
||||||
@ -95,12 +110,24 @@ public class TestCyclicDependency {
|
|||||||
// test6b
|
// test6b
|
||||||
init(goldI6b, goldF6b);
|
init(goldI6b, goldF6b);
|
||||||
test6b(goldI6b, goldF6b);
|
test6b(goldI6b, goldF6b);
|
||||||
// test7
|
// test7a
|
||||||
init(goldI7, goldF7);
|
init(goldI7a, goldF7a);
|
||||||
test7(goldI7, goldF7);
|
test7a(goldI7a, goldF7a);
|
||||||
// test8
|
// test7b
|
||||||
init(goldI8, goldF8);
|
init(goldI7b, goldF7b, goldF7b_2);
|
||||||
test8(goldI8, goldF8);
|
test7b(goldI7b, goldF7b, goldF7b_2);
|
||||||
|
// test7c
|
||||||
|
init(goldI7c, goldF7c);
|
||||||
|
test7c(goldI7c, goldF7c, goldF7c);
|
||||||
|
// test8a
|
||||||
|
init(goldI8a, goldF8a);
|
||||||
|
test8a(goldI8a, goldF8a);
|
||||||
|
// test8b
|
||||||
|
init(goldI8b, goldI8b_2, goldF8b);
|
||||||
|
test8b(goldI8b, goldI8b_2, goldF8b);
|
||||||
|
// test8c
|
||||||
|
init(goldI8c, goldF8c);
|
||||||
|
test8c(goldI8c, goldI8c, goldF8c);
|
||||||
// test9
|
// test9
|
||||||
init(goldI9, goldF9);
|
init(goldI9, goldF9);
|
||||||
test9(goldI9, goldF9);
|
test9(goldI9, goldF9);
|
||||||
@ -205,26 +232,74 @@ public class TestCyclicDependency {
|
|||||||
verifyF("test6b", dataF, goldF6b);
|
verifyF("test6b", dataF, goldF6b);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Run(test = "test7")
|
@Run(test = "test7a")
|
||||||
@Warmup(100)
|
@Warmup(100)
|
||||||
public void runTest7() {
|
public void runTest7a() {
|
||||||
int[] dataI = new int[RANGE];
|
int[] dataI = new int[RANGE];
|
||||||
float[] dataF = new float[RANGE];
|
float[] dataF = new float[RANGE];
|
||||||
init(dataI, dataF);
|
init(dataI, dataF);
|
||||||
test7(dataI, dataF);
|
test7a(dataI, dataF);
|
||||||
verifyI("test7", dataI, goldI7);
|
verifyI("test7a", dataI, goldI7a);
|
||||||
verifyF("test7", dataF, goldF7);
|
verifyF("test7a", dataF, goldF7a);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Run(test = "test8")
|
@Run(test = "test7b")
|
||||||
@Warmup(100)
|
@Warmup(100)
|
||||||
public void runTest8() {
|
public void runTest7b() {
|
||||||
|
int[] dataI = new int[RANGE];
|
||||||
|
float[] dataF = new float[RANGE];
|
||||||
|
float[] dataF_2 = new float[RANGE];
|
||||||
|
init(dataI, dataF, dataF_2);
|
||||||
|
test7b(dataI, dataF, dataF_2);
|
||||||
|
verifyI("test7b", dataI, goldI7b);
|
||||||
|
verifyF("test7b", dataF, goldF7b);
|
||||||
|
verifyF("test7b", dataF_2, goldF7b_2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Run(test = "test7c")
|
||||||
|
@Warmup(100)
|
||||||
|
public void runTest7c() {
|
||||||
int[] dataI = new int[RANGE];
|
int[] dataI = new int[RANGE];
|
||||||
float[] dataF = new float[RANGE];
|
float[] dataF = new float[RANGE];
|
||||||
init(dataI, dataF);
|
init(dataI, dataF);
|
||||||
test8(dataI, dataF);
|
test7c(dataI, dataF, dataF);
|
||||||
verifyI("test8", dataI, goldI8);
|
verifyI("test7c", dataI, goldI7c);
|
||||||
verifyF("test8", dataF, goldF8);
|
verifyF("test7c", dataF, goldF7c);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Run(test = "test8a")
|
||||||
|
@Warmup(100)
|
||||||
|
public void runTest8a() {
|
||||||
|
int[] dataI = new int[RANGE];
|
||||||
|
float[] dataF = new float[RANGE];
|
||||||
|
init(dataI, dataF);
|
||||||
|
test8a(dataI, dataF);
|
||||||
|
verifyI("test8a", dataI, goldI8a);
|
||||||
|
verifyF("test8a", dataF, goldF8a);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Run(test = "test8b")
|
||||||
|
@Warmup(100)
|
||||||
|
public void runTest8b() {
|
||||||
|
int[] dataI = new int[RANGE];
|
||||||
|
int[] dataI_2 = new int[RANGE];
|
||||||
|
float[] dataF = new float[RANGE];
|
||||||
|
init(dataI, dataI_2, dataF);
|
||||||
|
test8b(dataI, dataI_2, dataF);
|
||||||
|
verifyI("test8b", dataI, goldI8b);
|
||||||
|
verifyI("test8b", dataI_2, goldI8b_2);
|
||||||
|
verifyF("test8b", dataF, goldF8b);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Run(test = "test8c")
|
||||||
|
@Warmup(100)
|
||||||
|
public void runTest8c() {
|
||||||
|
int[] dataI = new int[RANGE];
|
||||||
|
float[] dataF = new float[RANGE];
|
||||||
|
init(dataI, dataF);
|
||||||
|
test8c(dataI, dataI, dataF);
|
||||||
|
verifyI("test8c", dataI, goldI8c);
|
||||||
|
verifyF("test8c", dataF, goldF8c);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Run(test = "test9")
|
@Run(test = "test9")
|
||||||
@ -328,34 +403,156 @@ public class TestCyclicDependency {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@IR(counts = {IRNode.ADD_VI, "> 0"},
|
@IR(counts = {IRNode.ADD_VI, "= 0",
|
||||||
|
IRNode.ADD_VF, "= 0"},
|
||||||
applyIf = {"AlignVector", "false"},
|
applyIf = {"AlignVector", "false"},
|
||||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
@IR(counts = {IRNode.ADD_VI, "> 0",
|
||||||
|
IRNode.ADD_VF, "= 0"},
|
||||||
|
applyIf = {"AlignVector", "true"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
// Some aarch64 machines have AlignVector == true, like ThunderX2
|
// Some aarch64 machines have AlignVector == true, like ThunderX2
|
||||||
static void test7(int[] dataI, float[] dataF) {
|
static void test7a(int[] dataI, float[] dataF) {
|
||||||
for (int i = 0; i < RANGE - 32; i++) {
|
for (int i = 0; i < RANGE - 32; i++) {
|
||||||
// write forward 32 -> more than vector size -> can vectorize
|
// write forward 32 -> more than vector size -> can vectorize
|
||||||
// write forward 3 -> cannot vectorize
|
|
||||||
// separate types should make decision separately if they vectorize or not
|
|
||||||
int v = dataI[i];
|
int v = dataI[i];
|
||||||
dataI[i + 32] = v + 5;
|
dataI[i + 32] = v + 5;
|
||||||
|
// write forward 3:
|
||||||
|
// AlignVector=true -> cannot vectorize because load and store cannot be both aligned
|
||||||
|
// AlignVector=false -> could vectorize, but would get 2-element vectors where
|
||||||
|
// store-to-load-forwarding fails, because we have store-load
|
||||||
|
// dependencies that have partial overlap.
|
||||||
|
// -> all vectorization cancelled.
|
||||||
float f = dataF[i];
|
float f = dataF[i];
|
||||||
dataF[i + 3] = f + 3.5f;
|
dataF[i + 3] = f + 3.5f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@IR(counts = {IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
|
@IR(counts = {IRNode.ADD_VI, "> 0",
|
||||||
|
IRNode.ADD_VF, IRNode.VECTOR_SIZE + "2", "> 0"},
|
||||||
applyIf = {"AlignVector", "false"},
|
applyIf = {"AlignVector", "false"},
|
||||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
@IR(counts = {IRNode.ADD_VI, "> 0",
|
||||||
|
IRNode.ADD_VF, "= 0"},
|
||||||
|
applyIf = {"AlignVector", "true"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
// Some aarch64 machines have AlignVector == true, like ThunderX2
|
// Some aarch64 machines have AlignVector == true, like ThunderX2
|
||||||
static void test8(int[] dataI, float[] dataF) {
|
static void test7b(int[] dataI, float[] dataF, float[] dataF_2) {
|
||||||
for (int i = 0; i < RANGE - 32; i++) {
|
for (int i = 0; i < RANGE - 32; i++) {
|
||||||
// write forward 32 -> more than vector size -> can vectorize
|
// write forward 32 -> more than vector size -> can vectorize
|
||||||
// write forward 3 -> cannot vectorize
|
int v = dataI[i];
|
||||||
// separate types should make decision separately if they vectorize or not
|
dataI[i + 32] = v + 5;
|
||||||
|
// write forward 3 to different array reference:
|
||||||
|
// AlignVector=true -> cannot vectorize because load and store cannot be both aligned
|
||||||
|
// AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding
|
||||||
|
// failure. But we can only have 2-element vectors in case
|
||||||
|
// the two float-arrays reference the same array.
|
||||||
|
// Note: at runtime the float-arrays are always different.
|
||||||
|
float f = dataF[i];
|
||||||
|
dataF_2[i + 3] = f + 3.5f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.ADD_VI, "> 0",
|
||||||
|
IRNode.ADD_VF, IRNode.VECTOR_SIZE + "2", "> 0"},
|
||||||
|
applyIf = {"AlignVector", "false"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
@IR(counts = {IRNode.ADD_VI, "> 0",
|
||||||
|
IRNode.ADD_VF, "= 0"},
|
||||||
|
applyIf = {"AlignVector", "true"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
// Some aarch64 machines have AlignVector == true, like ThunderX2
|
||||||
|
static void test7c(int[] dataI, float[] dataF, float[] dataF_2) {
|
||||||
|
for (int i = 0; i < RANGE - 32; i++) {
|
||||||
|
// write forward 32 -> more than vector size -> can vectorize
|
||||||
|
int v = dataI[i];
|
||||||
|
dataI[i + 32] = v + 5;
|
||||||
|
// write forward 3 to different array reference:
|
||||||
|
// AlignVector=true -> cannot vectorize because load and store cannot be both aligned
|
||||||
|
// AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding
|
||||||
|
// failure. But we can only have 2-element vectors in case
|
||||||
|
// the two float-arrays reference the same array.
|
||||||
|
// Note: at runtime the float-arrays are always the same.
|
||||||
|
float f = dataF[i];
|
||||||
|
dataF_2[i + 3] = f + 3.5f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.ADD_VI, "= 0",
|
||||||
|
IRNode.ADD_VF, "= 0"},
|
||||||
|
applyIf = {"AlignVector", "false"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
@IR(counts = {IRNode.ADD_VI, "= 0",
|
||||||
|
IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
|
||||||
|
applyIf = {"AlignVector", "true"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
// Some aarch64 machines have AlignVector == true, like ThunderX2
|
||||||
|
static void test8a(int[] dataI, float[] dataF) {
|
||||||
|
for (int i = 0; i < RANGE - 32; i++) {
|
||||||
|
// write forward 3:
|
||||||
|
// AlignVector=true -> cannot vectorize because load and store cannot be both aligned
|
||||||
|
// AlignVector=false -> could vectorize, but would get 2-element vectors where
|
||||||
|
// store-to-load-forwarding fails, because we have store-load
|
||||||
|
// dependencies that have partial overlap.
|
||||||
|
// -> all vectorization cancelled.
|
||||||
int v = dataI[i];
|
int v = dataI[i];
|
||||||
dataI[i + 3] = v + 5;
|
dataI[i + 3] = v + 5;
|
||||||
|
// write forward 32 -> more than vector size -> can vectorize
|
||||||
|
float f = dataF[i];
|
||||||
|
dataF[i + 32] = f + 3.5f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.ADD_VI, IRNode.VECTOR_SIZE + "2", "> 0",
|
||||||
|
IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
|
||||||
|
applyIf = {"AlignVector", "false"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
@IR(counts = {IRNode.ADD_VI, "= 0",
|
||||||
|
IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
|
||||||
|
applyIf = {"AlignVector", "true"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
// Some aarch64 machines have AlignVector == true, like ThunderX2
|
||||||
|
static void test8b(int[] dataI, int[] dataI_2, float[] dataF) {
|
||||||
|
for (int i = 0; i < RANGE - 32; i++) {
|
||||||
|
// write forward 3 to different array reference:
|
||||||
|
// AlignVector=true -> cannot vectorize because load and store cannot be both aligned
|
||||||
|
// AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding
|
||||||
|
// failure. But we can only have 2-element vectors in case
|
||||||
|
// the two int-arrays reference the same array.
|
||||||
|
// Note: at runtime the int-arrays are always different.
|
||||||
|
int v = dataI[i];
|
||||||
|
dataI_2[i + 3] = v + 5;
|
||||||
|
// write forward 32 -> more than vector size -> can vectorize
|
||||||
|
float f = dataF[i];
|
||||||
|
dataF[i + 32] = f + 3.5f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@IR(counts = {IRNode.ADD_VI, IRNode.VECTOR_SIZE + "2", "> 0",
|
||||||
|
IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
|
||||||
|
applyIf = {"AlignVector", "false"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
@IR(counts = {IRNode.ADD_VI, "= 0",
|
||||||
|
IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"},
|
||||||
|
applyIf = {"AlignVector", "true"},
|
||||||
|
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||||
|
// Some aarch64 machines have AlignVector == true, like ThunderX2
|
||||||
|
static void test8c(int[] dataI, int[] dataI_2, float[] dataF) {
|
||||||
|
for (int i = 0; i < RANGE - 32; i++) {
|
||||||
|
// write forward 3 to different array reference:
|
||||||
|
// AlignVector=true -> cannot vectorize because load and store cannot be both aligned
|
||||||
|
// AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding
|
||||||
|
// failure. But we can only have 2-element vectors in case
|
||||||
|
// the two int-arrays reference the same array.
|
||||||
|
// Note: at runtime the int-arrays are always the same.
|
||||||
|
int v = dataI[i];
|
||||||
|
dataI_2[i + 3] = v + 5;
|
||||||
|
// write forward 32 -> more than vector size -> can vectorize
|
||||||
float f = dataF[i];
|
float f = dataF[i];
|
||||||
dataF[i + 32] = f + 3.5f;
|
dataF[i + 32] = f + 3.5f;
|
||||||
}
|
}
|
||||||
@ -380,6 +577,22 @@ public class TestCyclicDependency {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void init(int[] dataI, float[] dataF, float[] dataF_2) {
|
||||||
|
for (int j = 0; j < RANGE; j++) {
|
||||||
|
dataI[j] = j;
|
||||||
|
dataF[j] = j * 0.5f;
|
||||||
|
dataF_2[j] = j * 0.3f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void init(int[] dataI, int[] dataI_2, float[] dataF) {
|
||||||
|
for (int j = 0; j < RANGE; j++) {
|
||||||
|
dataI[j] = j;
|
||||||
|
dataI_2[j] = 3*j - 42;
|
||||||
|
dataF[j] = j * 0.5f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void verifyI(String name, int[] data, int[] gold) {
|
static void verifyI(String name, int[] data, int[] gold) {
|
||||||
for (int i = 0; i < RANGE; i++) {
|
for (int i = 0; i < RANGE; i++) {
|
||||||
if (data[i] != gold[i]) {
|
if (data[i] != gold[i]) {
|
||||||
|
@ -643,6 +643,12 @@ public class TestDependencyOffsets {
|
|||||||
return new ArrayList<Integer>(set);
|
return new ArrayList<Integer>(set);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enum ExpectVectorization {
|
||||||
|
ALWAYS, // -> positive "count" IR rule
|
||||||
|
UNKNOWN, // -> disable IR rule
|
||||||
|
NEVER // -> negative "failOn" IR rule
|
||||||
|
};
|
||||||
|
|
||||||
static record TestDefinition (int id, Type type, int offset) {
|
static record TestDefinition (int id, Type type, int offset) {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -656,18 +662,22 @@ public class TestDependencyOffsets {
|
|||||||
String aliasingComment;
|
String aliasingComment;
|
||||||
String secondArgument;
|
String secondArgument;
|
||||||
String loadFrom;
|
String loadFrom;
|
||||||
|
boolean isSingleArray;
|
||||||
switch (RANDOM.nextInt(3)) {
|
switch (RANDOM.nextInt(3)) {
|
||||||
case 0: // a[i + offset] = a[i]
|
case 0: // a[i + offset] = a[i]
|
||||||
|
isSingleArray = true;
|
||||||
aliasingComment = "single-array";
|
aliasingComment = "single-array";
|
||||||
secondArgument = "a";
|
secondArgument = "a";
|
||||||
loadFrom = "a";
|
loadFrom = "a";
|
||||||
break;
|
break;
|
||||||
case 1: // a[i + offset] = b[i], but a and b alias, i.e. at runtime a == b.
|
case 1: // a[i + offset] = b[i], but a and b alias, i.e. at runtime a == b.
|
||||||
|
isSingleArray = false;
|
||||||
aliasingComment = "aliasing";
|
aliasingComment = "aliasing";
|
||||||
secondArgument = "a";
|
secondArgument = "a";
|
||||||
loadFrom = "b";
|
loadFrom = "b";
|
||||||
break;
|
break;
|
||||||
case 2: // a[i + offset] = b[i], and a and b do not alias, i.e. at runtime a != b.
|
case 2: // a[i + offset] = b[i], and a and b do not alias, i.e. at runtime a != b.
|
||||||
|
isSingleArray = false;
|
||||||
aliasingComment = "non-aliasing";
|
aliasingComment = "non-aliasing";
|
||||||
secondArgument = "b";
|
secondArgument = "b";
|
||||||
loadFrom = "b";
|
loadFrom = "b";
|
||||||
@ -712,7 +722,7 @@ public class TestDependencyOffsets {
|
|||||||
type.name, id, type.name,
|
type.name, id, type.name,
|
||||||
id, id, id, id, secondArgument, id,
|
id, id, id, id, secondArgument, id,
|
||||||
// IR rules
|
// IR rules
|
||||||
generateIRRules(),
|
generateIRRules(isSingleArray),
|
||||||
// test
|
// test
|
||||||
id, type.name, type.name,
|
id, type.name, type.name,
|
||||||
start, end,
|
start, end,
|
||||||
@ -726,7 +736,7 @@ public class TestDependencyOffsets {
|
|||||||
* expect depends on AlignVector and MaxVectorSize, as well as the byteOffset between the load and
|
* expect depends on AlignVector and MaxVectorSize, as well as the byteOffset between the load and
|
||||||
* store.
|
* store.
|
||||||
*/
|
*/
|
||||||
String generateIRRules() {
|
String generateIRRules(boolean isSingleArray) {
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
|
|
||||||
for (CPUMinVectorWidth cm : getCPUMinVectorWidth(type.name)) {
|
for (CPUMinVectorWidth cm : getCPUMinVectorWidth(type.name)) {
|
||||||
@ -744,29 +754,75 @@ public class TestDependencyOffsets {
|
|||||||
// power of two.
|
// power of two.
|
||||||
int infinity = 256; // No vector size is ever larger than this.
|
int infinity = 256; // No vector size is ever larger than this.
|
||||||
int maxVectorWidth = infinity; // no constraint by default
|
int maxVectorWidth = infinity; // no constraint by default
|
||||||
if (0 < byteOffset && byteOffset < maxVectorWidth) {
|
|
||||||
int log2 = 31 - Integer.numberOfLeadingZeros(offset);
|
int log2 = 31 - Integer.numberOfLeadingZeros(offset);
|
||||||
int floorPow2 = 1 << log2;
|
int floorPow2Offset = 1 << log2;
|
||||||
maxVectorWidth = Math.min(maxVectorWidth, floorPow2 * type.size);
|
if (0 < byteOffset && byteOffset < maxVectorWidth) {
|
||||||
builder.append(" // Vectors must have at most " + floorPow2 +
|
maxVectorWidth = Math.min(maxVectorWidth, floorPow2Offset * type.size);
|
||||||
|
builder.append(" // Vectors must have at most " + floorPow2Offset +
|
||||||
" elements: maxVectorWidth = " + maxVectorWidth +
|
" elements: maxVectorWidth = " + maxVectorWidth +
|
||||||
" to avoid cyclic dependency.\n");
|
" to avoid cyclic dependency.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ExpectVectorization expectVectorization = ExpectVectorization.ALWAYS;
|
||||||
|
if (isSingleArray && 0 < offset && offset < 64) {
|
||||||
|
// In a store-forward case at iteration distances below a certain threshold, and if there
|
||||||
|
// is some partial overlap between the expected vector store and some vector load in a later
|
||||||
|
// iteration, we avoid vectorization to avoid the latency penalties of store-to-load
|
||||||
|
// forwarding failure. We only detect these failures in single-array cases.
|
||||||
|
//
|
||||||
|
// Note: we currently never detect store-to-load-forwarding failures beyond 64 iterations,
|
||||||
|
// and so if the offset >= 64, we always expect vectorization.
|
||||||
|
//
|
||||||
|
// The condition for partial overlap:
|
||||||
|
// offset % #elements != 0
|
||||||
|
//
|
||||||
|
// But we do not know #elements exactly, only a range from min/maxVectorWidth.
|
||||||
|
|
||||||
|
int maxElements = maxVectorWidth / type.size;
|
||||||
|
int minElements = minVectorWidth / type.size;
|
||||||
|
boolean sometimesPartialOverlap = offset % maxElements != 0;
|
||||||
|
// If offset % minElements != 0, then offset % #elements != 0 for any larger vector as well.
|
||||||
|
boolean alwaysPartialOverlap = offset % minElements != 0;
|
||||||
|
|
||||||
|
if (alwaysPartialOverlap) {
|
||||||
|
// It is a little tricky to know the exact threshold. On all platforms and in all
|
||||||
|
// unrolling cases, it is between 8 and 64. Hence, we have these 3 cases:
|
||||||
|
if (offset <= 8) {
|
||||||
|
builder.append(" // We always detect store-to-load-forwarding failures -> never vectorize.\n");
|
||||||
|
expectVectorization = ExpectVectorization.NEVER;
|
||||||
|
} else if (offset <= 64) {
|
||||||
|
builder.append(" // Unknown if detect store-to-load-forwarding failures -> maybe disable IR rules.\n");
|
||||||
|
expectVectorization = ExpectVectorization.UNKNOWN;
|
||||||
|
} else {
|
||||||
|
// offset > 64 -> offset too large, expect no store-to-load-failure detection
|
||||||
|
throw new RuntimeException("impossible");
|
||||||
|
}
|
||||||
|
} else if (sometimesPartialOverlap && !alwaysPartialOverlap) {
|
||||||
|
builder.append(" // Partial overlap condition true: sometimes but not always -> maybe disable IR rules.\n");
|
||||||
|
expectVectorization = ExpectVectorization.UNKNOWN;
|
||||||
|
} else {
|
||||||
|
builder.append(" // Partial overlap never happens -> expect vectorization.\n");
|
||||||
|
expectVectorization = ExpectVectorization.ALWAYS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Rule 1: No strict alignment: -XX:-AlignVector
|
// Rule 1: No strict alignment: -XX:-AlignVector
|
||||||
|
ExpectVectorization expectVectorization1 = expectVectorization;
|
||||||
IRRule r1 = new IRRule(type, type.irNode, applyIfCPUFeature);
|
IRRule r1 = new IRRule(type, type.irNode, applyIfCPUFeature);
|
||||||
r1.addApplyIf("\"AlignVector\", \"false\"");
|
r1.addApplyIf("\"AlignVector\", \"false\"");
|
||||||
r1.addApplyIf("\"MaxVectorSize\", \">=" + minVectorWidth + "\"");
|
r1.addApplyIf("\"MaxVectorSize\", \">=" + minVectorWidth + "\"");
|
||||||
|
|
||||||
if (maxVectorWidth < minVectorWidth) {
|
if (maxVectorWidth < minVectorWidth) {
|
||||||
builder.append(" // maxVectorWidth < minVectorWidth -> expect no vectorization.\n");
|
builder.append(" // maxVectorWidth < minVectorWidth -> expect no vectorization.\n");
|
||||||
r1.setNegative();
|
expectVectorization1 = ExpectVectorization.NEVER;
|
||||||
} else if (maxVectorWidth < infinity) {
|
} else if (maxVectorWidth < infinity) {
|
||||||
r1.setSize("min(" + (maxVectorWidth / type.size) + ",max_" + type.name + ")");
|
r1.setSize("min(" + (maxVectorWidth / type.size) + ",max_" + type.name + ")");
|
||||||
}
|
}
|
||||||
|
r1.setExpectVectVectorization(expectVectorization1);
|
||||||
r1.generate(builder);
|
r1.generate(builder);
|
||||||
|
|
||||||
// Rule 2: strict alignment: -XX:+AlignVector
|
// Rule 2: strict alignment: -XX:+AlignVector
|
||||||
|
ExpectVectorization expectVectorization2 = expectVectorization;
|
||||||
IRRule r2 = new IRRule(type, type.irNode, applyIfCPUFeature);
|
IRRule r2 = new IRRule(type, type.irNode, applyIfCPUFeature);
|
||||||
r2.addApplyIf("\"AlignVector\", \"true\"");
|
r2.addApplyIf("\"AlignVector\", \"true\"");
|
||||||
r2.addApplyIf("\"MaxVectorSize\", \">=" + minVectorWidth + "\"");
|
r2.addApplyIf("\"MaxVectorSize\", \">=" + minVectorWidth + "\"");
|
||||||
@ -791,18 +847,23 @@ public class TestDependencyOffsets {
|
|||||||
builder.append(" // byteOffset % awMax == 0 -> always trivially aligned\n");
|
builder.append(" // byteOffset % awMax == 0 -> always trivially aligned\n");
|
||||||
} else if (byteOffset % awMin != 0) {
|
} else if (byteOffset % awMin != 0) {
|
||||||
builder.append(" // byteOffset % awMin != 0 -> can never align -> expect no vectorization.\n");
|
builder.append(" // byteOffset % awMin != 0 -> can never align -> expect no vectorization.\n");
|
||||||
r2.setNegative();
|
expectVectorization2 = ExpectVectorization.NEVER;
|
||||||
} else {
|
} else {
|
||||||
|
if (expectVectorization2 != ExpectVectorization.NEVER) {
|
||||||
builder.append(" // Alignment unknown -> disable IR rule.\n");
|
builder.append(" // Alignment unknown -> disable IR rule.\n");
|
||||||
r2.disable();
|
expectVectorization2 = ExpectVectorization.UNKNOWN;
|
||||||
|
} else {
|
||||||
|
builder.append(" // Alignment unknown -> but already proved no vectorization above.\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (maxVectorWidth < minVectorWidth) {
|
if (maxVectorWidth < minVectorWidth) {
|
||||||
builder.append(" // Not at least 2 elements or 4 bytes -> expect no vectorization.\n");
|
builder.append(" // Not at least 2 elements or 4 bytes -> expect no vectorization.\n");
|
||||||
r2.setNegative();
|
expectVectorization2 = ExpectVectorization.NEVER;
|
||||||
} else if (maxVectorWidth < infinity) {
|
} else if (maxVectorWidth < infinity) {
|
||||||
r2.setSize("min(" + (maxVectorWidth / type.size) + ",max_" + type.name + ")");
|
r2.setSize("min(" + (maxVectorWidth / type.size) + ",max_" + type.name + ")");
|
||||||
}
|
}
|
||||||
|
r2.setExpectVectVectorization(expectVectorization2);
|
||||||
r2.generate(builder);
|
r2.generate(builder);
|
||||||
}
|
}
|
||||||
return builder.toString();
|
return builder.toString();
|
||||||
@ -846,12 +907,12 @@ public class TestDependencyOffsets {
|
|||||||
this.size = size;
|
this.size = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
void setNegative() {
|
void setExpectVectVectorization(ExpectVectorization expectVectorization) {
|
||||||
this.isPositiveRule = false;
|
switch(expectVectorization) {
|
||||||
|
case ExpectVectorization.NEVER -> { this.isPositiveRule = false; }
|
||||||
|
case ExpectVectorization.UNKNOWN -> { this.isEnabled = false; }
|
||||||
|
case ExpectVectorization.ALWAYS -> {}
|
||||||
}
|
}
|
||||||
|
|
||||||
void disable() {
|
|
||||||
this.isEnabled = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void addApplyIf(String constraint) {
|
void addApplyIf(String constraint) {
|
||||||
|
@ -138,8 +138,11 @@ public class LoopCombinedOpTest extends VectorizationTestRunner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"},
|
@IR(applyIfCPUFeatureOr = {"asimd", "true", "sse4.1", "true"},
|
||||||
counts = {IRNode.STORE_VECTOR, ">0"})
|
counts = {IRNode.STORE_VECTOR, ">0"})
|
||||||
|
// With sse2, the MulI does not vectorize. This means we have vectorized stores
|
||||||
|
// to res1, but scalar loads from res1. The store-to-load-forwarding failure
|
||||||
|
// detection catches this and rejects vectorization.
|
||||||
public int[] multipleStores() {
|
public int[] multipleStores() {
|
||||||
int[] res1 = new int[SIZE];
|
int[] res1 = new int[SIZE];
|
||||||
int[] res2 = new int[SIZE];
|
int[] res2 = new int[SIZE];
|
||||||
|
@ -0,0 +1,142 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
package org.openjdk.bench.vm.compiler;
|
||||||
|
|
||||||
|
import org.openjdk.jmh.annotations.*;
|
||||||
|
import org.openjdk.jmh.infra.*;
|
||||||
|
|
||||||
|
import java.lang.invoke.*;
|
||||||
|
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
@BenchmarkMode(Mode.AverageTime)
|
||||||
|
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||||
|
@State(Scope.Thread)
|
||||||
|
@Warmup(iterations = 2, time = 1, timeUnit = TimeUnit.SECONDS)
|
||||||
|
@Measurement(iterations = 3, time = 1, timeUnit = TimeUnit.SECONDS)
|
||||||
|
@Fork(value = 1)
|
||||||
|
public abstract class VectorStoreToLoadForwarding {
|
||||||
|
@Param({"10000"})
|
||||||
|
public int SIZE;
|
||||||
|
|
||||||
|
@Param({ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
|
||||||
|
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
|
||||||
|
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
|
||||||
|
"30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
|
||||||
|
"40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
|
||||||
|
"50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
|
||||||
|
"60", "61", "62", "63", "64", "65", "66", "67", "68", "69",
|
||||||
|
"70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
|
||||||
|
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89",
|
||||||
|
"90", "91", "92", "93", "94", "95", "96", "97", "98", "99",
|
||||||
|
"100", "101", "102", "103", "104", "105", "106", "107", "108", "109",
|
||||||
|
"110", "111", "112", "113", "114", "115", "116", "117", "118", "119",
|
||||||
|
"120", "121", "122", "123", "124", "125", "126", "127", "128", "129"})
|
||||||
|
public int OFFSET;
|
||||||
|
|
||||||
|
// To get compile-time constants for OFFSET
|
||||||
|
static final MutableCallSite MUTABLE_CONSTANT = new MutableCallSite(MethodType.methodType(int.class));
|
||||||
|
static final MethodHandle MUTABLE_CONSTANT_HANDLE = MUTABLE_CONSTANT.dynamicInvoker();
|
||||||
|
|
||||||
|
public int START = 1000;
|
||||||
|
|
||||||
|
private byte[] aB;
|
||||||
|
private short[] aS;
|
||||||
|
private int[] aI;
|
||||||
|
private long[] aL;
|
||||||
|
|
||||||
|
@Param("0")
|
||||||
|
private int seed;
|
||||||
|
private Random r = new Random(seed);
|
||||||
|
|
||||||
|
@Setup
|
||||||
|
public void init() throws Throwable {
|
||||||
|
aB = new byte[SIZE];
|
||||||
|
aS = new short[SIZE];
|
||||||
|
aI = new int[SIZE];
|
||||||
|
aL = new long[SIZE];
|
||||||
|
|
||||||
|
for (int i = START; i < SIZE; i++) {
|
||||||
|
aB[i] = (byte)r.nextInt();
|
||||||
|
aS[i] = (short)r.nextInt();
|
||||||
|
aI[i] = r.nextInt();
|
||||||
|
aL[i] = r.nextLong();
|
||||||
|
}
|
||||||
|
|
||||||
|
MethodHandle constant = MethodHandles.constant(int.class, OFFSET);
|
||||||
|
MUTABLE_CONSTANT.setTarget(constant);
|
||||||
|
}
|
||||||
|
|
||||||
|
@CompilerControl(CompilerControl.Mode.INLINE)
|
||||||
|
private int offset_con() throws Throwable {
|
||||||
|
return (int) MUTABLE_CONSTANT_HANDLE.invokeExact();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void bytes() throws Throwable {
|
||||||
|
int offset = offset_con();
|
||||||
|
for (int i = START; i < SIZE; i++) {
|
||||||
|
aB[i] = (byte)(aB[i - offset] + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void shorts() throws Throwable {
|
||||||
|
int offset = offset_con();
|
||||||
|
for (int i = START; i < SIZE; i++) {
|
||||||
|
aS[i] = (short)(aS[i - offset] + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void ints() throws Throwable {
|
||||||
|
int offset = offset_con();
|
||||||
|
for (int i = START; i < SIZE; i++) {
|
||||||
|
aI[i] = aI[i - offset] + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void longs() throws Throwable {
|
||||||
|
int offset = offset_con();
|
||||||
|
for (int i = START; i < SIZE; i++) {
|
||||||
|
aL[i] = (long)(aL[i - offset] + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Fork(value = 1, jvmArgs = {
|
||||||
|
"-XX:+UseSuperWord"
|
||||||
|
})
|
||||||
|
public static class Default extends VectorStoreToLoadForwarding {}
|
||||||
|
|
||||||
|
@Fork(value = 1, jvmArgs = {
|
||||||
|
"-XX:-UseSuperWord"
|
||||||
|
})
|
||||||
|
public static class NoVectorization extends VectorStoreToLoadForwarding {}
|
||||||
|
|
||||||
|
@Fork(value = 1, jvmArgs = {
|
||||||
|
"-XX:+UseSuperWord", "-XX:+UnlockDiagnosticVMOptions", "-XX:SuperWordStoreToLoadForwardingFailureDetection=0"
|
||||||
|
})
|
||||||
|
public static class NoStoreToLoadForwardFailureDetection extends VectorStoreToLoadForwarding {}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user