diff --git a/src/hotspot/cpu/aarch64/c2_globals_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_globals_aarch64.hpp index 34e6e688abb..e57dab7d1ed 100644 --- a/src/hotspot/cpu/aarch64/c2_globals_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/c2_globals_aarch64.hpp @@ -66,6 +66,7 @@ define_pd_global(bool, OptoScheduling, false); define_pd_global(bool, OptoBundling, false); define_pd_global(bool, OptoRegScheduling, false); define_pd_global(bool, SuperWordLoopUnrollAnalysis, true); +define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 8); define_pd_global(bool, IdealizeClearArrayNode, true); define_pd_global(intx, ReservedCodeCacheSize, 48*M); diff --git a/src/hotspot/cpu/arm/c2_globals_arm.hpp b/src/hotspot/cpu/arm/c2_globals_arm.hpp index 57ed8f11c08..a44a8f649ae 100644 --- a/src/hotspot/cpu/arm/c2_globals_arm.hpp +++ b/src/hotspot/cpu/arm/c2_globals_arm.hpp @@ -64,6 +64,7 @@ define_pd_global(bool, OptoBundling, false); define_pd_global(bool, OptoScheduling, true); define_pd_global(bool, OptoRegScheduling, false); define_pd_global(bool, SuperWordLoopUnrollAnalysis, false); +define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16); define_pd_global(bool, IdealizeClearArrayNode, true); #ifdef _LP64 diff --git a/src/hotspot/cpu/ppc/c2_globals_ppc.hpp b/src/hotspot/cpu/ppc/c2_globals_ppc.hpp index 00a92ff6b62..f45faa21f01 100644 --- a/src/hotspot/cpu/ppc/c2_globals_ppc.hpp +++ b/src/hotspot/cpu/ppc/c2_globals_ppc.hpp @@ -59,6 +59,7 @@ define_pd_global(bool, UseCISCSpill, false); define_pd_global(bool, OptoBundling, false); define_pd_global(bool, OptoRegScheduling, false); define_pd_global(bool, SuperWordLoopUnrollAnalysis, true); +define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16); // GL: // Detected a problem with unscaled compressed oops and // narrow_oop_use_complex_address() == false. diff --git a/src/hotspot/cpu/riscv/c2_globals_riscv.hpp b/src/hotspot/cpu/riscv/c2_globals_riscv.hpp index 53a41665f4b..e9947f9888a 100644 --- a/src/hotspot/cpu/riscv/c2_globals_riscv.hpp +++ b/src/hotspot/cpu/riscv/c2_globals_riscv.hpp @@ -66,6 +66,7 @@ define_pd_global(bool, OptoScheduling, true); define_pd_global(bool, OptoBundling, false); define_pd_global(bool, OptoRegScheduling, false); define_pd_global(bool, SuperWordLoopUnrollAnalysis, true); +define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16); define_pd_global(bool, IdealizeClearArrayNode, true); define_pd_global(intx, ReservedCodeCacheSize, 48*M); diff --git a/src/hotspot/cpu/s390/c2_globals_s390.hpp b/src/hotspot/cpu/s390/c2_globals_s390.hpp index 1de38f100f6..7f780ca63a0 100644 --- a/src/hotspot/cpu/s390/c2_globals_s390.hpp +++ b/src/hotspot/cpu/s390/c2_globals_s390.hpp @@ -61,6 +61,7 @@ define_pd_global(bool, OptoBundling, false); define_pd_global(bool, OptoScheduling, false); define_pd_global(bool, OptoRegScheduling, false); define_pd_global(bool, SuperWordLoopUnrollAnalysis, true); +define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16); // On s390x, we can clear the array with a single instruction, // so don't idealize it. 
define_pd_global(bool, IdealizeClearArrayNode, false); diff --git a/src/hotspot/cpu/x86/c2_globals_x86.hpp b/src/hotspot/cpu/x86/c2_globals_x86.hpp index f7315011e6b..084dde217e4 100644 --- a/src/hotspot/cpu/x86/c2_globals_x86.hpp +++ b/src/hotspot/cpu/x86/c2_globals_x86.hpp @@ -76,6 +76,7 @@ define_pd_global(bool, OptoScheduling, false); define_pd_global(bool, OptoBundling, false); define_pd_global(bool, OptoRegScheduling, true); define_pd_global(bool, SuperWordLoopUnrollAnalysis, true); +define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16); define_pd_global(bool, IdealizeClearArrayNode, true); define_pd_global(uintx, ReservedCodeCacheSize, 48*M); diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp index 45a067a830b..d4b55ec2d8d 100644 --- a/src/hotspot/share/opto/c2_globals.hpp +++ b/src/hotspot/share/opto/c2_globals.hpp @@ -355,6 +355,12 @@ product(bool, SuperWordReductions, true, \ "Enable reductions support in superword.") \ \ + product_pd(uint, SuperWordStoreToLoadForwardingFailureDetection, DIAGNOSTIC, \ + "if >0, auto-vectorization detects possible store-to-load " \ + "forwarding failures. The number specifies over how many " \ + "loop iterations this detection spans.") \ + range(0, 4096) \ + \ product(bool, UseCMoveUnconditionally, false, \ "Use CMove (scalar and vector) ignoring profitability test.") \ \ diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 20c8dfbff17..8000e4fd39e 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1868,6 +1868,7 @@ bool SuperWord::schedule_and_apply() const { } if (!vtransform.schedule()) { return false; } + if (vtransform.has_store_to_load_forwarding_failure()) { return false; } vtransform.apply(); return true; } diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index fc4eaccff5c..4d152189625 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -31,7 +31,7 @@ #include "opto/vectorization.hpp" #ifndef PRODUCT -static void print_con_or_idx(const Node* n) { +void VPointer::print_con_or_idx(const Node* n) { if (n == nullptr) { tty->print("( 0)"); } else if (n->is_ConI()) { @@ -1369,12 +1369,12 @@ void VPointer::print() const { tty->print("adr: %4d, ", _adr != nullptr ? 
_adr->_idx : 0); tty->print(" base"); - print_con_or_idx(_base); + VPointer::print_con_or_idx(_base); tty->print(" + offset(%4d)", _offset); tty->print(" + invar"); - print_con_or_idx(_invar); + VPointer::print_con_or_idx(_invar); tty->print_cr(" + scale(%4d) * iv]", _scale); } @@ -2168,15 +2168,15 @@ void AlignmentSolver::trace_start_solve() const { // iv = init + pre_iter * pre_stride + main_iter * main_stride tty->print(" iv = init"); - print_con_or_idx(_init_node); + VPointer::print_con_or_idx(_init_node); tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)", _pre_stride, _main_stride); // adr = base + offset + invar + scale * iv tty->print(" adr = base"); - print_con_or_idx(_base); + VPointer::print_con_or_idx(_base); tty->print(" + offset(%d) + invar", _offset); - print_con_or_idx(_invar); + VPointer::print_con_or_idx(_invar); tty->print_cr(" + scale(%d) * iv", _scale); } } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index b084edd44b3..98aa3336ded 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -870,6 +870,7 @@ class VPointer : public ArenaObj { static int cmp_for_sort(const VPointer** p1, const VPointer** p2); NOT_PRODUCT( void print() const; ) + NOT_PRODUCT( static void print_con_or_idx(const Node* n); ) #ifndef PRODUCT class Tracer { diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 7c7aca3b90e..d09a4c899f6 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -144,6 +144,274 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const { } #endif +// We use two comparisons, because a subtraction could underflow. +#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \ + if (a < b) { return -1; } \ + if (a > b) { return 1; } + +// Helper-class for VTransformGraph::has_store_to_load_forwarding_failure. +// It represents a memory region: [ptr, ptr + memory_size) +class VMemoryRegion : public StackObj { +private: + Node* _base; // ptr = base + offset + invar + scale * iv + int _scale; + Node* _invar; + int _offset; + uint _memory_size; + bool _is_load; // load or store? + uint _schedule_order; + +public: + VMemoryRegion() {} // empty constructor for GrowableArray + VMemoryRegion(const VPointer& vpointer, int iv_offset, int vector_length, uint schedule_order) : + _base(vpointer.base()), + _scale(vpointer.scale_in_bytes()), + _invar(vpointer.invar()), + _offset(vpointer.offset_in_bytes() + _scale * iv_offset), + _memory_size(vpointer.memory_size() * vector_length), + _is_load(vpointer.mem()->is_Load()), + _schedule_order(schedule_order) {} + + Node* base() const { return _base; } + int scale() const { return _scale; } + Node* invar() const { return _invar; } + int offset() const { return _offset; } + uint memory_size() const { return _memory_size; } + bool is_load() const { return _is_load; } + uint schedule_order() const { return _schedule_order; } + + static int cmp_for_sort_by_group(VMemoryRegion* r1, VMemoryRegion* r2) { + RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->base()->_idx, r2->base()->_idx); + RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->scale(), r2->scale()); + int r1_invar_idx = r1->invar() == nullptr ? 0 : r1->invar()->_idx; + int r2_invar_idx = r2->invar() == nullptr ? 
0 : r2->invar()->_idx; + RETURN_CMP_VALUE_IF_NOT_EQUAL(r1_invar_idx, r2_invar_idx); + return 0; // equal + } + + static int cmp_for_sort(VMemoryRegion* r1, VMemoryRegion* r2) { + int cmp_group = cmp_for_sort_by_group(r1, r2); + if (cmp_group != 0) { return cmp_group; } + + RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->offset(), r2->offset()); + return 0; // equal + } + + enum Aliasing { DIFFERENT_GROUP, BEFORE, EXACT_OVERLAP, PARTIAL_OVERLAP, AFTER }; + + Aliasing aliasing(VMemoryRegion& other) { + VMemoryRegion* p1 = this; + VMemoryRegion* p2 = &other; + if (cmp_for_sort_by_group(p1, p2) != 0) { return DIFFERENT_GROUP; } + + jlong offset1 = p1->offset(); + jlong offset2 = p2->offset(); + jlong memory_size1 = p1->memory_size(); + jlong memory_size2 = p2->memory_size(); + + if (offset1 >= offset2 + memory_size2) { return AFTER; } + if (offset2 >= offset1 + memory_size1) { return BEFORE; } + if (offset1 == offset2 && memory_size1 == memory_size2) { return EXACT_OVERLAP; } + return PARTIAL_OVERLAP; + } + +#ifndef PRODUCT + void print() const { + tty->print("VMemoryRegion[%s %dbytes, schedule_order(%4d), base", + _is_load ? "load " : "store", _memory_size, _schedule_order); + VPointer::print_con_or_idx(_base); + tty->print(" + offset(%4d)", _offset); + tty->print(" + invar"); + VPointer::print_con_or_idx(_invar); + tty->print_cr(" + scale(%4d) * iv]", _scale); + } +#endif +}; + +// Store-to-load-forwarding is a CPU memory optimization, where a load can directly fetch +// its value from the store-buffer, rather than from the L1 cache. This is many CPU cycles +// faster. However, this optimization comes with some restrictions, depending on the CPU. +// Generally, store-to-load-forwarding works if the load and store memory regions match +// exactly (same start and width). Generally problematic are partial overlaps - though +// some CPUs can handle even some subsets of these cases. We conservatively assume that +// all such partial overlaps lead to store-to-load-forwarding failures, which means the +// load has to stall until the store goes from the store-buffer into the L1 cache, incurring +// a penalty of many CPU cycles. +// +// Example (with "iteration distance" 2): +// for (int i = 10; i < SIZE; i++) { +// aI[i] = aI[i - 2] + 1; +// } +// +// load_4_bytes( ptr + -8) +// store_4_bytes(ptr + 0) * +// load_4_bytes( ptr + -4) | +// store_4_bytes(ptr + 4) | * +// load_4_bytes( ptr + 0) <-+ | +// store_4_bytes(ptr + 8) | +// load_4_bytes( ptr + 4) <---+ +// store_4_bytes(ptr + 12) +// ... +// +// In the scalar loop, we can forward the stores from 2 iterations back. +// +// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 2 +// example. This gives us this machine code: +// load_8_bytes( ptr + -8) +// store_8_bytes(ptr + 0) | +// load_8_bytes( ptr + 0) v +// store_8_bytes(ptr + 8) | +// load_8_bytes( ptr + 8) v +// store_8_bytes(ptr + 16) +// ... +// +// We packed 2 iterations, and the stores can perfectly forward to the loads of +// the next 2 iterations. +// +// Example (with "iteration distance" 3): +// for (int i = 10; i < SIZE; i++) { +// aI[i] = aI[i - 3] + 1; +// } +// +// load_4_bytes( ptr + -12) +// store_4_bytes(ptr + 0) * +// load_4_bytes( ptr + -8) | +// store_4_bytes(ptr + 4) | +// load_4_bytes( ptr + -4) | +// store_4_bytes(ptr + 8) | +// load_4_bytes( ptr + 0) <-+ +// store_4_bytes(ptr + 12) +// ... +// +// In the scalar loop, we can forward the stores from 3 iterations back.
+// +// Unfortunately, vectorization can introduce such store-to-load-forwarding failures. +// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 3 +// example. This gives us this machine code: +// load_8_bytes( ptr + -12) +// store_8_bytes(ptr + 0) | | +// load_8_bytes( ptr + -4) x | +// store_8_bytes(ptr + 8) || +// load_8_bytes( ptr + 4) xx <-- partial overlap with 2 stores +// store_8_bytes(ptr + 16) +// ... +// +// We see that eventually all loads are dependent on earlier stores, but the values cannot +// be forwarded because there is some partial overlap. +// +// Preferably, we would have some latency-based cost-model that accounts for such forwarding +// failures, and decide if vectorization with forwarding failures is still profitable. For +// now we go with a simpler heuristic: we simply forbid vectorization if we can PROVE that +// there will be a forwarding failure. This approach has at least 2 possible weaknesses: +// +// (1) There may be forwarding failures in cases where we cannot prove it. +// Example: +// for (int i = 10; i < SIZE; i++) { +// bI[i] = aI[i - 3] + 1; +// } +// +// We do not know if aI and bI refer to the same array or not. However, it is reasonable +// to assume that if we have two different array references, they most likely refer +// to different arrays (i.e. no aliasing), where we would have no forwarding failures. +// (2) There could be some loops where vectorization introduces forwarding failures, and thus +// the latency of the loop body is high, but this does not matter because it is dominated +// by other latency/throughput based costs in the loop body. +// +// Performance measurements with the JMH benchmark VectorStoreToLoadForwarding.java have indicated +// that there is some iteration threshold: if the failure happens between a store and load that +// have an iteration distance below this threshold, the latency is the limiting factor, and we +// should not vectorize to avoid the latency penalty of store-to-load-forwarding failures. If +// the iteration distance is larger than this threshold, the throughput is the limiting factor, +// and we should vectorize in these cases to improve throughput. +// +bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const { + if (SuperWordStoreToLoadForwardingFailureDetection == 0) { return false; } + + // Collect all pointers for scalar and vector loads/stores. + ResourceMark rm; + GrowableArray<VMemoryRegion> memory_regions; + + // To detect store-to-load-forwarding failures at the iteration threshold or below, we + // simulate a super-unrolling to reach SuperWordStoreToLoadForwardingFailureDetection + // iterations at least. This is a heuristic, and we are not trying to be very precise + // with the iteration distance. If we have already unrolled more than the iteration + // threshold, i.e. if "SuperWordStoreToLoadForwardingFailureDetection < unrolled_count", + // then we simply check if there are any store-to-load-forwarding failures in the unrolled + // loop body, which may be at a larger distance than the desired threshold. We cannot do any + // more fine-grained analysis, because the unrolling has lost the information about the + // iteration distance.
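// Illustrative walk-through of the heuristic above (example numbers only, not taken from the
// code): with the x86 default SuperWordStoreToLoadForwardingFailureDetection of 16 and a main
// loop that was already unrolled 4x, we get simulated_super_unrolling_count = MAX2(1, 16 / 4) = 4,
// so the schedule below is virtually replayed with iv_offset = 0, 1*iv_stride, 2*iv_stride and
// 3*iv_stride. For an originally unit-stride loop, iv_stride is 4 after 4x unrolling, so the
// detection window spans about 16 scalar iterations.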
+ int simulated_unrolling_count = SuperWordStoreToLoadForwardingFailureDetection; + int unrolled_count = vloop_analyzer.vloop().cl()->unrolled_count(); + uint simulated_super_unrolling_count = MAX2(1, simulated_unrolling_count / unrolled_count); + int iv_stride = vloop_analyzer.vloop().iv_stride(); + int schedule_order = 0; + for (uint k = 0; k < simulated_super_unrolling_count; k++) { + int iv_offset = k * iv_stride; // virtual super-unrolling + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + if (vtn->is_load_or_store_in_loop()) { + const VPointer& p = vtn->vpointer(vloop_analyzer); + if (p.valid()) { + VTransformVectorNode* vector = vtn->isa_Vector(); + uint vector_length = vector != nullptr ? vector->nodes().length() : 1; + memory_regions.push(VMemoryRegion(p, iv_offset, vector_length, schedule_order++)); + } + } + } + } + + // Sort the pointers by group (same base, invar and stride), and then by offset. + memory_regions.sort(VMemoryRegion::cmp_for_sort); + +#ifndef PRODUCT + if (_trace._verbose) { + tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:"); + tty->print_cr(" simulated_unrolling_count = %d", simulated_unrolling_count); + tty->print_cr(" simulated_super_unrolling_count = %d", simulated_super_unrolling_count); + for (int i = 0; i < memory_regions.length(); i++) { + VMemoryRegion& region = memory_regions.at(i); + region.print(); + } + } +#endif + + // For all pairs of pointers in the same group, check if they have a partial overlap. + for (int i = 0; i < memory_regions.length(); i++) { + VMemoryRegion& region1 = memory_regions.at(i); + + for (int j = i + 1; j < memory_regions.length(); j++) { + VMemoryRegion& region2 = memory_regions.at(j); + + const VMemoryRegion::Aliasing aliasing = region1.aliasing(region2); + if (aliasing == VMemoryRegion::Aliasing::DIFFERENT_GROUP || + aliasing == VMemoryRegion::Aliasing::BEFORE) { + break; // We have reached the next group or pointers that are always after. + } else if (aliasing == VMemoryRegion::Aliasing::EXACT_OVERLAP) { + continue; + } else { + assert(aliasing == VMemoryRegion::Aliasing::PARTIAL_OVERLAP, "no other case can happen"); + if ((region1.is_load() && !region2.is_load() && region1.schedule_order() > region2.schedule_order()) || + (!region1.is_load() && region2.is_load() && region1.schedule_order() < region2.schedule_order())) { + // We predict that this leads to a store-to-load-forwarding failure penalty. +#ifndef PRODUCT + if (_trace._rejections) { + tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:"); + tty->print_cr(" Partial overlap of store->load. We predict that this leads to"); + tty->print_cr(" a store-to-load-forwarding failure penalty which makes"); + tty->print_cr(" vectorization unprofitable. 
These are the two pointers:"); + region1.print(); + region2.print(); + } +#endif + return true; + } + } + } + } + + return false; +} + Node* VTransformNode::find_transformed_input(int i, const GrowableArray& vnode_idx_to_transformed_node) const { Node* n = vnode_idx_to_transformed_node.at(in(i)->_idx); assert(n != nullptr, "must find input IR node"); diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index ee298e7fe72..8ceca318f4a 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -66,6 +66,8 @@ class VTransformVectorNode; class VTransformElementWiseVectorNode; class VTransformBoolVectorNode; class VTransformReductionVectorNode; +class VTransformLoadVectorNode; +class VTransformStoreVectorNode; // Result from VTransformNode::apply class VTransformApplyResult { @@ -157,6 +159,7 @@ public: const GrowableArray& vtnodes() const { return _vtnodes; } bool schedule(); + bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const; void apply_memops_reordering_with_schedule() const; void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const; @@ -221,6 +224,7 @@ public: VTransformGraph& graph() { return _graph; } bool schedule() { return _graph.schedule(); } + bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply(); private: @@ -310,6 +314,11 @@ public: virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() { return nullptr; } virtual VTransformBoolVectorNode* isa_BoolVector() { return nullptr; } virtual VTransformReductionVectorNode* isa_ReductionVector() { return nullptr; } + virtual VTransformLoadVectorNode* isa_LoadVector() { return nullptr; } + virtual VTransformStoreVectorNode* isa_StoreVector() { return nullptr; } + + virtual bool is_load_or_store_in_loop() const { return false; } + virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const { ShouldNotReachHere(); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const = 0; @@ -333,6 +342,8 @@ public: VTransformNode(vtransform, n->req()), _node(n) {} Node* node() const { return _node; } virtual VTransformScalarNode* isa_Scalar() override { return this; } + virtual bool is_load_or_store_in_loop() const override { return _node->is_Load() || _node->is_Store(); } + virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(node()->as_Mem()); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "Scalar"; };) @@ -347,6 +358,7 @@ public: VTransformInputScalarNode(VTransform& vtransform, Node* n) : VTransformScalarNode(vtransform, n) {} virtual VTransformInputScalarNode* isa_InputScalar() override { return this; } + virtual bool is_load_or_store_in_loop() const override { return false; } NOT_PRODUCT(virtual const char* name() const override { return "InputScalar"; };) }; @@ -472,6 +484,9 @@ public: VTransformLoadVectorNode(VTransform& vtransform, uint number_of_nodes) : VTransformVectorNode(vtransform, 3, number_of_nodes) {} LoadNode::ControlDependency control_dependency() const; + virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; } + virtual bool is_load_or_store_in_loop() const 
override { return true; } + virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };) @@ -482,6 +497,9 @@ public: // req = 4 -> [ctrl, mem, adr, val] VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes) : VTransformVectorNode(vtransform, 4, number_of_nodes) {} + virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; } + virtual bool is_load_or_store_in_loop() const override { return true; } + virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java index efd328dc5cc..60d753ee75f 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java @@ -168,6 +168,9 @@ public class TestAlignVector { tests.put("test14aB", () -> { return test14aB(aB.clone()); }); tests.put("test14bB", () -> { return test14bB(aB.clone()); }); tests.put("test14cB", () -> { return test14cB(aB.clone()); }); + tests.put("test14dB", () -> { return test14dB(aB.clone()); }); + tests.put("test14eB", () -> { return test14eB(aB.clone()); }); + tests.put("test14fB", () -> { return test14fB(aB.clone()); }); tests.put("test15aB", () -> { return test15aB(aB.clone()); }); tests.put("test15bB", () -> { return test15bB(aB.clone()); }); @@ -239,6 +242,9 @@ public class TestAlignVector { "test14aB", "test14bB", "test14cB", + "test14dB", + "test14eB", + "test14fB", "test15aB", "test15bB", "test15cB", @@ -1128,9 +1134,9 @@ public class TestAlignVector { } @Test - @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", - IRNode.ADD_VB, "> 0", - IRNode.STORE_VECTOR, "> 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + IRNode.ADD_VB, "= 0", + IRNode.STORE_VECTOR, "= 0"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIfPlatform = {"64-bit", "true"}, applyIf = {"AlignVector", "false"}) @@ -1143,6 +1149,9 @@ public class TestAlignVector { static Object[] test14aB(byte[] a) { // non-power-of-2 stride for (int i = 0; i < RANGE-20; i+=9) { + // Since the stride is shorter than the vector length, there will be always + // partial overlap of loads with previous stores, this leads to failure in + // store-to-load-forwarding -> vectorization not profitable. 
a[i+0]++; a[i+1]++; a[i+2]++; @@ -1164,9 +1173,9 @@ public class TestAlignVector { } @Test - @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", - IRNode.ADD_VB, "> 0", - IRNode.STORE_VECTOR, "> 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + IRNode.ADD_VB, "= 0", + IRNode.STORE_VECTOR, "= 0"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIfPlatform = {"64-bit", "true"}, applyIf = {"AlignVector", "false"}) @@ -1179,6 +1188,9 @@ public class TestAlignVector { static Object[] test14bB(byte[] a) { // non-power-of-2 stride for (int i = 0; i < RANGE-20; i+=3) { + // Since the stride is shorter than the vector length, there will be always + // partial overlap of loads with previous stores, this leads to failure in + // store-to-load-forwarding -> vectorization not profitable. a[i+0]++; a[i+1]++; a[i+2]++; @@ -1200,9 +1212,9 @@ public class TestAlignVector { } @Test - @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", - IRNode.ADD_VB, "> 0", - IRNode.STORE_VECTOR, "> 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + IRNode.ADD_VB, "= 0", + IRNode.STORE_VECTOR, "= 0"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIfPlatform = {"64-bit", "true"}, applyIf = {"AlignVector", "false"}) @@ -1215,6 +1227,9 @@ public class TestAlignVector { static Object[] test14cB(byte[] a) { // non-power-of-2 stride for (int i = 0; i < RANGE-20; i+=5) { + // Since the stride is shorter than the vector length, there will be always + // partial overlap of loads with previous stores, this leads to failure in + // store-to-load-forwarding -> vectorization not profitable. a[i+0]++; a[i+1]++; a[i+2]++; @@ -1235,6 +1250,90 @@ public class TestAlignVector { return new Object[]{ a }; } + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0", + IRNode.ADD_VB, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}) + @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + IRNode.ADD_VB, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "true"}) + static Object[] test14dB(byte[] a) { + // non-power-of-2 stride + for (int i = 0; i < RANGE-20; i+=9) { + a[i+0]++; + a[i+1]++; + a[i+2]++; + a[i+3]++; + a[i+4]++; + a[i+5]++; + a[i+6]++; + a[i+7]++; + } + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0", + IRNode.ADD_VB, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}) + @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + IRNode.ADD_VB, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "true"}) + static Object[] test14eB(byte[] a) { + // non-power-of-2 stride + for (int i = 0; i < RANGE-32; i+=11) { + a[i+0]++; + a[i+1]++; + a[i+2]++; + a[i+3]++; + a[i+4]++; + a[i+5]++; + a[i+6]++; + a[i+7]++; + } + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0", + IRNode.ADD_VB, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", 
"asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}) + @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + IRNode.ADD_VB, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "true"}) + static Object[] test14fB(byte[] a) { + // non-power-of-2 stride + for (int i = 0; i < RANGE-40; i+=12) { + a[i+0]++; + a[i+1]++; + a[i+2]++; + a[i+3]++; + a[i+4]++; + a[i+5]++; + a[i+6]++; + a[i+7]++; + } + return new Object[]{ a }; + } + @Test // IR rules difficult because of modulo wrapping with offset after peeling. static Object[] test15aB(byte[] a) { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestCyclicDependency.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestCyclicDependency.java index 3849f1b05cf..7c6b7c92c37 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestCyclicDependency.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestCyclicDependency.java @@ -24,7 +24,7 @@ /* * @test - * @bug 8298935 + * @bug 8298935 8334431 * @summary Writing forward on array creates cyclic dependency * which leads to wrong result, when ignored. * @library /test/lib / @@ -55,15 +55,30 @@ public class TestCyclicDependency { float[] goldF6a = new float[RANGE]; int[] goldI6b = new int[RANGE]; float[] goldF6b = new float[RANGE]; - int[] goldI7 = new int[RANGE]; - float[] goldF7 = new float[RANGE]; - int[] goldI8 = new int[RANGE]; - float[] goldF8 = new float[RANGE]; + int[] goldI7a = new int[RANGE]; + float[] goldF7a = new float[RANGE]; + int[] goldI7b = new int[RANGE]; + float[] goldF7b = new float[RANGE]; + float[] goldF7b_2 = new float[RANGE]; + int[] goldI7c = new int[RANGE]; + float[] goldF7c = new float[RANGE]; + int[] goldI8a = new int[RANGE]; + float[] goldF8a = new float[RANGE]; + int[] goldI8b = new int[RANGE]; + int[] goldI8b_2 = new int[RANGE]; + float[] goldF8b = new float[RANGE]; + int[] goldI8c = new int[RANGE]; + float[] goldF8c = new float[RANGE]; int[] goldI9 = new int[RANGE]; float[] goldF9 = new float[RANGE]; public static void main(String args[]) { - TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*"); + TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*", + "-XX:+IgnoreUnrecognizedVMOptions", "-XX:-AlignVector", "-XX:-VerifyAlignVector"); + TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*", + "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+AlignVector", "-XX:-VerifyAlignVector"); + TestFramework.runWithFlags("-XX:CompileCommand=compileonly,TestCyclicDependency::test*", + "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+AlignVector", "-XX:+VerifyAlignVector"); } TestCyclicDependency() { @@ -95,12 +110,24 @@ public class TestCyclicDependency { // test6b init(goldI6b, goldF6b); test6b(goldI6b, goldF6b); - // test7 - init(goldI7, goldF7); - test7(goldI7, goldF7); - // test8 - init(goldI8, goldF8); - test8(goldI8, goldF8); + // test7a + init(goldI7a, goldF7a); + test7a(goldI7a, goldF7a); + // test7b + init(goldI7b, goldF7b, goldF7b_2); + test7b(goldI7b, goldF7b, goldF7b_2); + // test7c + init(goldI7c, goldF7c); + test7c(goldI7c, goldF7c, goldF7c); + // test8a + init(goldI8a, goldF8a); + test8a(goldI8a, goldF8a); + // test8b + init(goldI8b, goldI8b_2, goldF8b); + test8b(goldI8b, goldI8b_2, goldF8b); + // test8c + init(goldI8c, goldF8c); + test8c(goldI8c, goldI8c, goldF8c); // test9 init(goldI9, goldF9); test9(goldI9, 
goldF9); @@ -205,26 +232,74 @@ public class TestCyclicDependency { verifyF("test6b", dataF, goldF6b); } - @Run(test = "test7") + @Run(test = "test7a") @Warmup(100) - public void runTest7() { + public void runTest7a() { int[] dataI = new int[RANGE]; float[] dataF = new float[RANGE]; init(dataI, dataF); - test7(dataI, dataF); - verifyI("test7", dataI, goldI7); - verifyF("test7", dataF, goldF7); + test7a(dataI, dataF); + verifyI("test7a", dataI, goldI7a); + verifyF("test7a", dataF, goldF7a); } - @Run(test = "test8") + @Run(test = "test7b") @Warmup(100) - public void runTest8() { + public void runTest7b() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + float[] dataF_2 = new float[RANGE]; + init(dataI, dataF, dataF_2); + test7b(dataI, dataF, dataF_2); + verifyI("test7b", dataI, goldI7b); + verifyF("test7b", dataF, goldF7b); + verifyF("test7b", dataF_2, goldF7b_2); + } + + @Run(test = "test7c") + @Warmup(100) + public void runTest7c() { int[] dataI = new int[RANGE]; float[] dataF = new float[RANGE]; init(dataI, dataF); - test8(dataI, dataF); - verifyI("test8", dataI, goldI8); - verifyF("test8", dataF, goldF8); + test7c(dataI, dataF, dataF); + verifyI("test7c", dataI, goldI7c); + verifyF("test7c", dataF, goldF7c); + } + + @Run(test = "test8a") + @Warmup(100) + public void runTest8a() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test8a(dataI, dataF); + verifyI("test8a", dataI, goldI8a); + verifyF("test8a", dataF, goldF8a); + } + + @Run(test = "test8b") + @Warmup(100) + public void runTest8b() { + int[] dataI = new int[RANGE]; + int[] dataI_2 = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataI_2, dataF); + test8b(dataI, dataI_2, dataF); + verifyI("test8b", dataI, goldI8b); + verifyI("test8b", dataI_2, goldI8b_2); + verifyF("test8b", dataF, goldF8b); + } + + @Run(test = "test8c") + @Warmup(100) + public void runTest8c() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test8c(dataI, dataI, dataF); + verifyI("test8c", dataI, goldI8c); + verifyF("test8c", dataF, goldF8c); } @Run(test = "test9") @@ -328,34 +403,156 @@ public class TestCyclicDependency { } @Test - @IR(counts = {IRNode.ADD_VI, "> 0"}, + @IR(counts = {IRNode.ADD_VI, "= 0", + IRNode.ADD_VF, "= 0"}, applyIf = {"AlignVector", "false"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + @IR(counts = {IRNode.ADD_VI, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIf = {"AlignVector", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // Some aarch64 machines have AlignVector == true, like ThunderX2 - static void test7(int[] dataI, float[] dataF) { + static void test7a(int[] dataI, float[] dataF) { for (int i = 0; i < RANGE - 32; i++) { // write forward 32 -> more than vector size -> can vectorize - // write forward 3 -> cannot vectorize - // separate types should make decision separately if they vectorize or not int v = dataI[i]; dataI[i + 32] = v + 5; + // write forward 3: + // AlignVector=true -> cannot vectorize because load and store cannot be both aligned + // AlignVector=false -> could vectorize, but would get 2-element vectors where + // store-to-load-forwarding fails, because we have store-load + // dependencies that have partial overlap. + // -> all vectorization cancled. 
float f = dataF[i]; dataF[i + 3] = f + 3.5f; } } @Test - @IR(counts = {IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"}, + @IR(counts = {IRNode.ADD_VI, "> 0", + IRNode.ADD_VF, IRNode.VECTOR_SIZE + "2", "> 0"}, applyIf = {"AlignVector", "false"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + @IR(counts = {IRNode.ADD_VI, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIf = {"AlignVector", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // Some aarch64 machines have AlignVector == true, like ThunderX2 - static void test8(int[] dataI, float[] dataF) { + static void test7b(int[] dataI, float[] dataF, float[] dataF_2) { for (int i = 0; i < RANGE - 32; i++) { // write forward 32 -> more than vector size -> can vectorize - // write forward 3 -> cannot vectorize - // separate types should make decision separately if they vectorize or not + int v = dataI[i]; + dataI[i + 32] = v + 5; + // write forward 3 to different array reference: + // AlignVector=true -> cannot vectorize because load and store cannot be both aligned + // AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding + // failure. But we can only have 2-element vectors in case + // the two float-arrays reference the same array. + // Note: at runtime the float-arrays are always different. + float f = dataF[i]; + dataF_2[i + 3] = f + 3.5f; + } + } + + @Test + @IR(counts = {IRNode.ADD_VI, "> 0", + IRNode.ADD_VF, IRNode.VECTOR_SIZE + "2", "> 0"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + @IR(counts = {IRNode.ADD_VI, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIf = {"AlignVector", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Some aarch64 machines have AlignVector == true, like ThunderX2 + static void test7c(int[] dataI, float[] dataF, float[] dataF_2) { + for (int i = 0; i < RANGE - 32; i++) { + // write forward 32 -> more than vector size -> can vectorize + int v = dataI[i]; + dataI[i + 32] = v + 5; + // write forward 3 to different array reference: + // AlignVector=true -> cannot vectorize because load and store cannot be both aligned + // AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding + // failure. But we can only have 2-element vectors in case + // the two float-arrays reference the same array. + // Note: at runtime the float-arrays are always the same. + float f = dataF[i]; + dataF_2[i + 3] = f + 3.5f; + } + } + + @Test + @IR(counts = {IRNode.ADD_VI, "= 0", + IRNode.ADD_VF, "= 0"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + @IR(counts = {IRNode.ADD_VI, "= 0", + IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"}, + applyIf = {"AlignVector", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Some aarch64 machines have AlignVector == true, like ThunderX2 + static void test8a(int[] dataI, float[] dataF) { + for (int i = 0; i < RANGE - 32; i++) { + // write forward 3: + // AlignVector=true -> cannot vectorize because load and store cannot be both aligned + // AlignVector=false -> could vectorize, but would get 2-element vectors where + // store-to-load-forwarding fails, because we have store-load + // dependencies that have partial overlap. + // -> all vectorization cancled. 
int v = dataI[i]; dataI[i + 3] = v + 5; + // write forward 32 -> more than vector size -> can vectorize + float f = dataF[i]; + dataF[i + 32] = f + 3.5f; + } + } + + @Test + @IR(counts = {IRNode.ADD_VI, IRNode.VECTOR_SIZE + "2", "> 0", + IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + @IR(counts = {IRNode.ADD_VI, "= 0", + IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"}, + applyIf = {"AlignVector", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Some aarch64 machines have AlignVector == true, like ThunderX2 + static void test8b(int[] dataI, int[] dataI_2, float[] dataF) { + for (int i = 0; i < RANGE - 32; i++) { + // write forward 3 to different array reference: + // AlignVector=true -> cannot vectorize because load and store cannot be both aligned + // AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding + // failure. But we can only have 2-element vectors in case + // the two int-arrays reference the same array. + // Note: at runtime the int-arrays are always different. + int v = dataI[i]; + dataI_2[i + 3] = v + 5; + // write forward 32 -> more than vector size -> can vectorize + float f = dataF[i]; + dataF[i + 32] = f + 3.5f; + } + } + + @Test + @IR(counts = {IRNode.ADD_VI, IRNode.VECTOR_SIZE + "2", "> 0", + IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + @IR(counts = {IRNode.ADD_VI, "= 0", + IRNode.ADD_VF, IRNode.VECTOR_SIZE + "min(max_int, max_float)", "> 0"}, + applyIf = {"AlignVector", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Some aarch64 machines have AlignVector == true, like ThunderX2 + static void test8c(int[] dataI, int[] dataI_2, float[] dataF) { + for (int i = 0; i < RANGE - 32; i++) { + // write forward 3 to different array reference: + // AlignVector=true -> cannot vectorize because load and store cannot be both aligned + // AlignVector=false -> vectorizes because we cannot prove store-to-load forwarding + // failure. But we can only have 2-element vectors in case + // the two int-arrays reference the same array. + // Note: at runtime the int-arrays are always the same.
+ int v = dataI[i]; + dataI_2[i + 3] = v + 5; + // write forward 32 -> more than vector size -> can vectorize float f = dataF[i]; dataF[i + 32] = f + 3.5f; } @@ -380,6 +577,22 @@ public class TestCyclicDependency { } } + public static void init(int[] dataI, float[] dataF, float[] dataF_2) { + for (int j = 0; j < RANGE; j++) { + dataI[j] = j; + dataF[j] = j * 0.5f; + dataF_2[j] = j * 0.3f; + } + } + + public static void init(int[] dataI, int[] dataI_2, float[] dataF) { + for (int j = 0; j < RANGE; j++) { + dataI[j] = j; + dataI_2[j] = 3*j - 42; + dataF[j] = j * 0.5f; + } + } + static void verifyI(String name, int[] data, int[] gold) { for (int i = 0; i < RANGE; i++) { if (data[i] != gold[i]) { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java index 8e5ac88a27d..cfa19ce385a 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java @@ -643,6 +643,12 @@ public class TestDependencyOffsets { return new ArrayList(set); } + enum ExpectVectorization { + ALWAYS, // -> positive "count" IR rule + UNKNOWN, // -> disable IR rule + NEVER // -> negative "failOn" IR rule + }; + static record TestDefinition (int id, Type type, int offset) { /* @@ -656,18 +662,22 @@ public class TestDependencyOffsets { String aliasingComment; String secondArgument; String loadFrom; + boolean isSingleArray; switch (RANDOM.nextInt(3)) { case 0: // a[i + offset] = a[i] + isSingleArray = true; aliasingComment = "single-array"; secondArgument = "a"; loadFrom = "a"; break; case 1: // a[i + offset] = b[i], but a and b alias, i.e. at runtime a == b. + isSingleArray = false; aliasingComment = "aliasing"; secondArgument = "a"; loadFrom = "b"; break; case 2: // a[i + offset] = b[i], and a and b do not alias, i.e. at runtime a != b. + isSingleArray = false; aliasingComment = "non-aliasing"; secondArgument = "b"; loadFrom = "b"; @@ -712,7 +722,7 @@ public class TestDependencyOffsets { type.name, id, type.name, id, id, id, id, secondArgument, id, // IR rules - generateIRRules(), + generateIRRules(isSingleArray), // test id, type.name, type.name, start, end, @@ -726,7 +736,7 @@ public class TestDependencyOffsets { * expect depends on AlignVector and MaxVectorSize, as well as the byteOffset between the load and * store. */ - String generateIRRules() { + String generateIRRules(boolean isSingleArray) { StringBuilder builder = new StringBuilder(); for (CPUMinVectorWidth cm : getCPUMinVectorWidth(type.name)) { @@ -744,29 +754,75 @@ public class TestDependencyOffsets { // power of two. int infinity = 256; // No vector size is ever larger than this. 
int maxVectorWidth = infinity; // no constraint by default + int log2 = 31 - Integer.numberOfLeadingZeros(offset); + int floorPow2Offset = 1 << log2; if (0 < byteOffset && byteOffset < maxVectorWidth) { - int log2 = 31 - Integer.numberOfLeadingZeros(offset); - int floorPow2 = 1 << log2; - maxVectorWidth = Math.min(maxVectorWidth, floorPow2 * type.size); - builder.append(" // Vectors must have at most " + floorPow2 + + maxVectorWidth = Math.min(maxVectorWidth, floorPow2Offset * type.size); + builder.append(" // Vectors must have at most " + floorPow2Offset + " elements: maxVectorWidth = " + maxVectorWidth + " to avoid cyclic dependency.\n"); } + ExpectVectorization expectVectorization = ExpectVectorization.ALWAYS; + if (isSingleArray && 0 < offset && offset < 64) { + // In a store-forward case at an iteration distance below a certain threshold, where there + // is some partial overlap between the expected vector store and some vector load in a later + // iteration, we avoid vectorization so that we do not pay the latency penalty of a store-to-load + // forwarding failure. We only detect these failures in single-array cases. + // + // Note: we currently never detect store-to-load-forwarding failures beyond 64 iterations, + // and so if the offset >= 64, we always expect vectorization. + // + // The condition for partial overlap: + // offset % #elements != 0 + // + // But we do not know #elements exactly, only a range from min/maxVectorWidth. + + int maxElements = maxVectorWidth / type.size; + int minElements = minVectorWidth / type.size; + boolean sometimesPartialOverlap = offset % maxElements != 0; + // If offset % minElements != 0, then offset % n != 0 also holds for every larger power-of-two + // element count n, i.e. there is a partial overlap for every possible vector length. + boolean alwaysPartialOverlap = offset % minElements != 0; + + if (alwaysPartialOverlap) { + // It is a little tricky to know the exact threshold. On all platforms and in all + // unrolling cases, it is between 8 and 64.
Hence, we have these 3 cases: + if (offset <= 8) { + builder.append(" // We always detect store-to-load-forwarding failures -> never vectorize.\n"); + expectVectorization = ExpectVectorization.NEVER; + } else if (offset <= 64) { + builder.append(" // Unknown if detect store-to-load-forwarding failures -> maybe disable IR rules.\n"); + expectVectorization = ExpectVectorization.UNKNOWN; + } else { + // offset > 64 -> offset too large, expect no store-to-load-failure detection + throw new RuntimeException("impossible"); + } + } else if (sometimesPartialOverlap && !alwaysPartialOverlap) { + builder.append(" // Partial overlap condition true: sometimes but not always -> maybe disable IR rules.\n"); + expectVectorization = ExpectVectorization.UNKNOWN; + } else { + builder.append(" // Partial overlap never happens -> expect vectorization.\n"); + expectVectorization = ExpectVectorization.ALWAYS; + } + } + // Rule 1: No strict alignment: -XX:-AlignVector + ExpectVectorization expectVectorization1 = expectVectorization; IRRule r1 = new IRRule(type, type.irNode, applyIfCPUFeature); r1.addApplyIf("\"AlignVector\", \"false\""); r1.addApplyIf("\"MaxVectorSize\", \">=" + minVectorWidth + "\""); if (maxVectorWidth < minVectorWidth) { builder.append(" // maxVectorWidth < minVectorWidth -> expect no vectorization.\n"); - r1.setNegative(); + expectVectorization1 = ExpectVectorization.NEVER; } else if (maxVectorWidth < infinity) { r1.setSize("min(" + (maxVectorWidth / type.size) + ",max_" + type.name + ")"); } + r1.setExpectVectVectorization(expectVectorization1); r1.generate(builder); // Rule 2: strict alignment: -XX:+AlignVector + ExpectVectorization expectVectorization2 = expectVectorization; IRRule r2 = new IRRule(type, type.irNode, applyIfCPUFeature); r2.addApplyIf("\"AlignVector\", \"true\""); r2.addApplyIf("\"MaxVectorSize\", \">=" + minVectorWidth + "\""); @@ -791,18 +847,23 @@ public class TestDependencyOffsets { builder.append(" // byteOffset % awMax == 0 -> always trivially aligned\n"); } else if (byteOffset % awMin != 0) { builder.append(" // byteOffset % awMin != 0 -> can never align -> expect no vectorization.\n"); - r2.setNegative(); + expectVectorization2 = ExpectVectorization.NEVER; } else { - builder.append(" // Alignment unknown -> disable IR rule.\n"); - r2.disable(); + if (expectVectorization2 != ExpectVectorization.NEVER) { + builder.append(" // Alignment unknown -> disable IR rule.\n"); + expectVectorization2 = ExpectVectorization.UNKNOWN; + } else { + builder.append(" // Alignment unknown -> but already proved no vectorization above.\n"); + } } if (maxVectorWidth < minVectorWidth) { builder.append(" // Not at least 2 elements or 4 bytes -> expect no vectorization.\n"); - r2.setNegative(); + expectVectorization2 = ExpectVectorization.NEVER; } else if (maxVectorWidth < infinity) { r2.setSize("min(" + (maxVectorWidth / type.size) + ",max_" + type.name + ")"); } + r2.setExpectVectVectorization(expectVectorization2); r2.generate(builder); } return builder.toString(); @@ -846,12 +907,12 @@ public class TestDependencyOffsets { this.size = size; } - void setNegative() { - this.isPositiveRule = false; - } - - void disable() { - this.isEnabled = false; + void setExpectVectVectorization(ExpectVectorization expectVectorization) { + switch(expectVectorization) { + case ExpectVectorization.NEVER -> { this.isPositiveRule = false; } + case ExpectVectorization.UNKNOWN -> { this.isEnabled = false; } + case ExpectVectorization.ALWAYS -> {} + } } void addApplyIf(String constraint) { diff --git 
a/test/hotspot/jtreg/compiler/vectorization/runner/LoopCombinedOpTest.java b/test/hotspot/jtreg/compiler/vectorization/runner/LoopCombinedOpTest.java index 16d04102082..8a0715eadfe 100644 --- a/test/hotspot/jtreg/compiler/vectorization/runner/LoopCombinedOpTest.java +++ b/test/hotspot/jtreg/compiler/vectorization/runner/LoopCombinedOpTest.java @@ -138,8 +138,11 @@ public class LoopCombinedOpTest extends VectorizationTestRunner { } @Test - @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse2", "true"}, + @IR(applyIfCPUFeatureOr = {"asimd", "true", "sse4.1", "true"}, counts = {IRNode.STORE_VECTOR, ">0"}) + // With sse2, the MulI does not vectorize. This means we have vectorized stores + // to res1, but scalar loads from res1. The store-to-load-forwarding failure + // detection catches this and rejects vectorization. public int[] multipleStores() { int[] res1 = new int[SIZE]; int[] res2 = new int[SIZE]; diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorStoreToLoadForwarding.java b/test/micro/org/openjdk/bench/vm/compiler/VectorStoreToLoadForwarding.java new file mode 100644 index 00000000000..ac8940ec675 --- /dev/null +++ b/test/micro/org/openjdk/bench/vm/compiler/VectorStoreToLoadForwarding.java @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package org.openjdk.bench.vm.compiler; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.*; + +import java.lang.invoke.*; + +import java.util.concurrent.TimeUnit; +import java.util.Random; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Thread) +@Warmup(iterations = 2, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 3, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(value = 1) +public abstract class VectorStoreToLoadForwarding { + @Param({"10000"}) + public int SIZE; + + @Param({ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", + "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", + "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", + "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", + "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", + "70", "71", "72", "73", "74", "75", "76", "77", "78", "79", + "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", + "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", + "100", "101", "102", "103", "104", "105", "106", "107", "108", "109", + "110", "111", "112", "113", "114", "115", "116", "117", "118", "119", + "120", "121", "122", "123", "124", "125", "126", "127", "128", "129"}) + public int OFFSET; + + // To get compile-time constants for OFFSET + static final MutableCallSite MUTABLE_CONSTANT = new MutableCallSite(MethodType.methodType(int.class)); + static final MethodHandle MUTABLE_CONSTANT_HANDLE = MUTABLE_CONSTANT.dynamicInvoker(); + + public int START = 1000; + + private byte[] aB; + private short[] aS; + private int[] aI; + private long[] aL; + + @Param("0") + private int seed; + private Random r = new Random(seed); + + @Setup + public void init() throws Throwable { + aB = new byte[SIZE]; + aS = new short[SIZE]; + aI = new int[SIZE]; + aL = new long[SIZE]; + + for (int i = START; i < SIZE; i++) { + aB[i] = (byte)r.nextInt(); + aS[i] = (short)r.nextInt(); + aI[i] = r.nextInt(); + aL[i] = r.nextLong(); + } + + MethodHandle constant = MethodHandles.constant(int.class, OFFSET); + MUTABLE_CONSTANT.setTarget(constant); + } + + @CompilerControl(CompilerControl.Mode.INLINE) + private int offset_con() throws Throwable { + return (int) MUTABLE_CONSTANT_HANDLE.invokeExact(); + } + + @Benchmark + public void bytes() throws Throwable { + int offset = offset_con(); + for (int i = START; i < SIZE; i++) { + aB[i] = (byte)(aB[i - offset] + 1); + } + } + + @Benchmark + public void shorts() throws Throwable { + int offset = offset_con(); + for (int i = START; i < SIZE; i++) { + aS[i] = (short)(aS[i - offset] + 1); + } + } + + @Benchmark + public void ints() throws Throwable { + int offset = offset_con(); + for (int i = START; i < SIZE; i++) { + aI[i] = aI[i - offset] + 1; + } + } + + @Benchmark + public void longs() throws Throwable { + int offset = offset_con(); + for (int i = START; i < SIZE; i++) { + aL[i] = (long)(aL[i - offset] + 1); + } + } + + @Fork(value = 1, jvmArgs = { + "-XX:+UseSuperWord" + }) + public static class Default extends VectorStoreToLoadForwarding {} + + @Fork(value = 1, jvmArgs = { + "-XX:-UseSuperWord" + }) + public static class NoVectorization extends VectorStoreToLoadForwarding {} + + @Fork(value = 1, jvmArgs = { + "-XX:+UseSuperWord", "-XX:+UnlockDiagnosticVMOptions", "-XX:SuperWordStoreToLoadForwardingFailureDetection=0" + }) + public static class 
NoStoreToLoadForwardFailureDetection extends VectorStoreToLoadForwarding {} +}
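For a quick manual experiment with the new diagnostic flag outside of the JMH benchmark, a stand-alone loop with the same shape as the "iteration distance 3" example from the vtransform.cpp comment can be used. The sketch below is illustrative only (class name, iteration counts and timing are not part of the patch); the flag itself is the one introduced above and, being a diagnostic flag, has to be unlocked with -XX:+UnlockDiagnosticVMOptions:

public class StoreForwardDemo {
    static final int SIZE = 10_000;

    // Every load reads the value stored 3 iterations earlier. With 2-element int vectors
    // (8 bytes), the load of {a[i-1], a[i]} partially overlaps the earlier vector store to
    // {a[i], a[i+1]}, which is exactly the pattern the new detection rejects.
    static int kernel(int[] a) {
        for (int i = 3; i < SIZE; i++) {
            a[i] = a[i - 3] + 1;
        }
        return a[SIZE - 1];
    }

    public static void main(String[] args) {
        // Compare, for example:
        //   java -XX:+UnlockDiagnosticVMOptions -XX:SuperWordStoreToLoadForwardingFailureDetection=0 StoreForwardDemo
        //   java StoreForwardDemo    (platform default: 16, or 8 on aarch64)
        int[] a = new int[SIZE];
        int result = 0;
        long start = System.nanoTime();
        for (int iter = 0; iter < 20_000; iter++) {
            result += kernel(a);
        }
        long durationMs = (System.nanoTime() - start) / 1_000_000;
        System.out.println("result = " + result + ", time = " + durationMs + " ms");
    }
}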