From f762637be2568f898db25aa6a57c180f1feac3a3 Mon Sep 17 00:00:00 2001
From: Emanuel Peter
Date: Thu, 4 Apr 2024 05:11:59 +0000
Subject: [PATCH] 8326962: C2 SuperWord: cache VPointer

Reviewed-by: chagedorn, kvn
---
 src/hotspot/share/opto/superword.cpp          |  24 ++--
 src/hotspot/share/opto/superword.hpp          |  17 ++-
 .../share/opto/traceAutoVectorizationTag.hpp  |   3 +-
 src/hotspot/share/opto/vectorization.cpp      | 106 ++++++++++++++----
 src/hotspot/share/opto/vectorization.hpp      |  79 +++++++++++--
 5 files changed, 177 insertions(+), 52 deletions(-)

diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index 8bef9d09980..c242e5a4ef7 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -531,13 +531,13 @@ void SuperWord::find_adjacent_refs() {
       set_align_to_ref(align_to_mem_ref);
     }
 
-    VPointer align_to_ref_p(mem_ref, _vloop);
+    const VPointer& align_to_ref_p = vpointer(mem_ref);
     // Set alignment relative to "align_to_ref" for all related memory operations.
     for (int i = memops.size() - 1; i >= 0; i--) {
       MemNode* s = memops.at(i)->as_Mem();
       if (isomorphic(s, mem_ref) &&
            (!_do_vector_loop || same_origin_idx(s, mem_ref))) {
-        VPointer p2(s, _vloop);
+        const VPointer& p2 = vpointer(s);
         if (p2.comparable(align_to_ref_p)) {
           int align = memory_alignment(s, iv_adjustment);
           set_alignment(s, align);
@@ -593,11 +593,11 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
   // Count number of comparable memory ops
   for (uint i = 0; i < memops.size(); i++) {
     MemNode* s1 = memops.at(i)->as_Mem();
-    VPointer p1(s1, _vloop);
+    const VPointer& p1 = vpointer(s1);
     for (uint j = i+1; j < memops.size(); j++) {
       MemNode* s2 = memops.at(j)->as_Mem();
       if (isomorphic(s1, s2)) {
-        VPointer p2(s2, _vloop);
+        const VPointer& p2 = vpointer(s2);
         if (p1.comparable(p2)) {
           (*cmp_ct.adr_at(i))++;
           (*cmp_ct.adr_at(j))++;
@@ -618,7 +618,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
     if (s->is_Store()) {
       int vw = vector_width_in_bytes(s);
       assert(vw > 1, "sanity");
-      VPointer p(s, _vloop);
+      const VPointer& p = vpointer(s);
       if ( cmp_ct.at(j) > max_ct ||
           (cmp_ct.at(j) == max_ct &&
             ( vw > max_vw ||
@@ -641,7 +641,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
       if (s->is_Load()) {
         int vw = vector_width_in_bytes(s);
         assert(vw > 1, "sanity");
-        VPointer p(s, _vloop);
+        const VPointer& p = vpointer(s);
         if ( cmp_ct.at(j) > max_ct ||
             (cmp_ct.at(j) == max_ct &&
               ( vw > max_vw ||
@@ -714,7 +714,7 @@ int SuperWord::get_vw_bytes_special(MemNode* s) {
 //---------------------------get_iv_adjustment---------------------------
 // Calculate loop's iv adjustment for this memory ops.
 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
-  VPointer align_to_ref_p(mem_ref, _vloop);
+  const VPointer& align_to_ref_p = vpointer(mem_ref);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale = align_to_ref_p.scale_in_bytes();
   int elt_size = align_to_ref_p.memory_size();
@@ -875,8 +875,8 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) const {
 
   // Adjacent memory references must have the same base, be comparable
   // and have the correct distance between them.
-  VPointer p1(s1->as_Mem(), _vloop);
-  VPointer p2(s2->as_Mem(), _vloop);
+  const VPointer& p1 = vpointer(s1->as_Mem());
+  const VPointer& p2 = vpointer(s2->as_Mem());
   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
   return diff == data_size(s1);
@@ -1637,7 +1637,7 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac
   assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs");
 
   const MemNode* mem_ref = pack->at(0)->as_Mem();
-  VPointer mem_ref_p(mem_ref, _vloop);
+  const VPointer& mem_ref_p = vpointer(mem_ref);
   const CountedLoopEndNode* pre_end = _vloop.pre_loop_end();
   assert(pre_end->stride_is_con(), "pre loop stride is constant");
 
@@ -3310,7 +3310,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
     tty->print("SuperWord::memory_alignment within a vector memory reference for %d: ", s->_idx); s->dump();
   }
 #endif
-  VPointer p(s, _vloop);
+  const VPointer& p = vpointer(s);
   if (!p.valid()) {
     NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: VPointer p invalid, return bottom_align");)
     return bottom_align;
@@ -3413,7 +3413,7 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
   Node* orig_limit = pre_opaq->original_loop_limit();
   assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, "");
 
-  VPointer align_to_ref_p(align_to_ref, _vloop);
+  const VPointer& align_to_ref_p = vpointer(align_to_ref);
   assert(align_to_ref_p.valid(), "sanity");
 
   // For the main-loop, we want the address of align_to_ref to be memory aligned
diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp
index e4f2e30052c..7d1ba1131f3 100644
--- a/src/hotspot/share/opto/superword.hpp
+++ b/src/hotspot/share/opto/superword.hpp
@@ -425,7 +425,7 @@ class SuperWord : public ResourceObj {
   // Decide if loop can eventually be vectorized, and what unrolling factor is required.
   static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor);
 
-  // VLoop Accessors
+  // VLoop accessors
   PhaseIdealLoop* phase() const { return _vloop.phase(); }
   PhaseIterGVN& igvn() const { return _vloop.phase()->igvn(); }
   IdealLoopTree* lpt() const { return _vloop.lpt(); }
@@ -434,7 +434,7 @@ class SuperWord : public ResourceObj {
   int iv_stride() const { return cl()->stride_con(); }
   bool in_bb(const Node* n) const { return _vloop.in_bb(n); }
 
-  // VLoopReductions Accessors
+  // VLoopReductions accessors
   bool is_marked_reduction(const Node* n) const {
     return _vloop_analyzer.reductions().is_marked_reduction(n);
   }
@@ -443,12 +443,12 @@ class SuperWord : public ResourceObj {
     return _vloop_analyzer.reductions().is_marked_reduction_pair(n1, n2);
   }
 
-  // VLoopMemorySlices Accessors
+  // VLoopMemorySlices accessors
   bool same_memory_slice(MemNode* n1, MemNode* n2) const {
     return _vloop_analyzer.memory_slices().same_memory_slice(n1, n2);
   }
 
-  // VLoopBody Accessors
+  // VLoopBody accessors
   const GrowableArray<Node*>& body() const {
     return _vloop_analyzer.body().body();
   }
@@ -457,7 +457,7 @@ class SuperWord : public ResourceObj {
     return _vloop_analyzer.body().bb_idx(n);
   }
 
-  // VLoopTypes Accessors
+  // VLoopTypes accessors
   const Type* velt_type(Node* n) const {
     return _vloop_analyzer.types().velt_type(n);
   }
@@ -482,7 +482,7 @@ class SuperWord : public ResourceObj {
     return _vloop_analyzer.types().vector_width_in_bytes(n);
   }
 
-  // VLoopDependencyGraph Accessors
+  // VLoopDependencyGraph accessors
   const VLoopDependencyGraph& dependency_graph() const {
     return _vloop_analyzer.dependency_graph();
   }
@@ -495,6 +495,11 @@ class SuperWord : public ResourceObj {
     return _vloop_analyzer.dependency_graph().mutually_independent(nodes);
   }
 
+  // VLoopVPointer accessors
+  const VPointer& vpointer(const MemNode* mem) const {
+    return _vloop_analyzer.vpointers().vpointer(mem);
+  }
+
 #ifndef PRODUCT
   // TraceAutoVectorization and TraceSuperWord
   bool is_trace_superword_alignment() const {
diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
index e04664caba1..b5e818efae3 100644
--- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
+++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
@@ -29,12 +29,13 @@
 #include "utilities/stringUtils.hpp"
 
 #define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \
-  flags(POINTER_ANALYSIS, "Trace VPointer") \
+  flags(POINTER_ANALYSIS, "Trace VPointer (verbose)") \
   flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \
   flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \
   flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
   flags(BODY, "Trace VLoopBody") \
   flags(TYPES, "Trace VLoopTypes") \
+  flags(POINTERS, "Trace VLoopPointers") \
   flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
   flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \
   flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_refs") \
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index 72e43d5148e..0ad87256b27 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -31,6 +31,19 @@
 #include "opto/rootnode.hpp"
 #include "opto/vectorization.hpp"
 
+#ifndef PRODUCT
+static void print_con_or_idx(const Node* n) {
+  if (n == nullptr) {
+    tty->print("( 0)");
+  } else if (n->is_ConI()) {
+    jint val = n->as_ConI()->get_int();
+    tty->print("(%4d)", val);
+  } else {
+    tty->print("[%4d]", n->_idx);
+  }
+}
+#endif
+
 bool VLoop::check_preconditions() {
 #ifndef PRODUCT
   if (is_trace_preconditions()) {
@@ -161,11 +174,62 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
 
   _types.compute_vector_element_type();
 
+  _vpointers.compute_vpointers();
+
   _dependency_graph.construct();
 
   return VStatus::make_success();
 }
 
+void VLoopVPointers::compute_vpointers() {
+  count_vpointers();
+  allocate_vpointers_array();
+  compute_and_cache_vpointers();
+  NOT_PRODUCT( if (_vloop.is_trace_vpointers()) { print(); } )
+}
+
+void VLoopVPointers::count_vpointers() {
+  _vpointers_length = 0;
+  _body.for_each_mem([&] (const MemNode* mem, int bb_idx) {
+    _vpointers_length++;
+  });
+}
+
+void VLoopVPointers::allocate_vpointers_array() {
+  uint bytes = _vpointers_length * sizeof(VPointer);
+  _vpointers = (VPointer*)_arena->Amalloc(bytes);
+}
+
+void VLoopVPointers::compute_and_cache_vpointers() {
+  int pointers_idx = 0;
+  _body.for_each_mem([&] (const MemNode* mem, int bb_idx) {
+    // Placement new: construct directly into the array.
+    ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop);
+    _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx);
+    pointers_idx++;
+  });
+}
+
+const VPointer& VLoopVPointers::vpointer(const MemNode* mem) const {
+  assert(mem != nullptr && _vloop.in_bb(mem), "only mem in loop");
+  int bb_idx = _body.bb_idx(mem);
+  int pointers_idx = _bb_idx_to_vpointer.at(bb_idx);
+  assert(0 <= pointers_idx && pointers_idx < _vpointers_length, "valid range");
+  return _vpointers[pointers_idx];
+}
+
+#ifndef PRODUCT
+void VLoopVPointers::print() const {
+  tty->print_cr("\nVLoopVPointers::print:");
+
+  _body.for_each_mem([&] (const MemNode* mem, int bb_idx) {
+    const VPointer& p = vpointer(mem);
+    tty->print(" ");
+    p.print();
+  });
+}
+#endif
+
 // Construct the dependency graph:
 // - Data-dependencies: implicit (taken from C2 node inputs).
 // - Memory-dependencies:
@@ -193,7 +257,7 @@ void VLoopDependencyGraph::construct() {
       MemNode* n1 = slice_nodes.at(j);
       memory_pred_edges.clear();
 
-      VPointer p1(n1, _vloop);
+      const VPointer& p1 = _vpointers.vpointer(n1);
       // For all memory nodes before it, check if we need to add a memory edge.
       for (int k = slice_nodes.length() - 1; k > j; k--) {
         MemNode* n2 = slice_nodes.at(k);
@@ -201,7 +265,7 @@ void VLoopDependencyGraph::construct() {
         // Ignore Load-Load dependencies:
         if (n1->is_Load() && n2->is_Load()) { continue; }
 
-        VPointer p2(n2, _vloop);
+        const VPointer& p2 = _vpointers.vpointer(n2);
         if (!VPointer::not_equal(p1.cmp(p2))) {
           // Possibly overlapping memory
           memory_pred_edges.append(_body.bb_idx(n2));
@@ -723,19 +787,24 @@ void VPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
   _invar = register_if_new(add);
 }
 
-// Function for printing the fields of a VPointer
-void VPointer::print() {
 #ifndef PRODUCT
-  tty->print("base: [%d] adr: [%d] scale: %d offset: %d",
-             _base != nullptr ? _base->_idx : 0,
-             _adr != nullptr ? _adr->_idx : 0,
-             _scale, _offset);
-  if (_invar != nullptr) {
-    tty->print(" invar: [%d]", _invar->_idx);
-  }
-  tty->cr();
-#endif
+// Function for printing the fields of a VPointer
+void VPointer::print() const {
+  tty->print("VPointer[mem: %4d %10s, ", _mem->_idx, _mem->Name());
+  tty->print("base: %4d, ", _base != nullptr ? _base->_idx : 0);
+  tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);
+
+  tty->print(" base");
+  print_con_or_idx(_base);
+
+  tty->print(" + offset(%4d)", _offset);
+
+  tty->print(" + invar");
+  print_con_or_idx(_invar);
+
+  tty->print_cr(" + scale(%4d) * iv]", _scale);
 }
+#endif
 
 // Following are functions for tracing VPointer match
 #ifndef PRODUCT
@@ -1502,17 +1571,6 @@ AlignmentSolution* AlignmentSolver::solve() const {
 }
 
 #ifdef ASSERT
-static void print_con_or_idx(const Node* n) {
-  if (n == nullptr) {
-    tty->print("(0)");
-  } else if (n->is_ConI()) {
-    jint val = n->as_ConI()->get_int();
-    tty->print("(%d)", val);
-  } else {
-    tty->print("[%d]", n->_idx);
-  }
-}
-
 void AlignmentSolver::trace_start_solve() const {
   if (is_trace()) {
     tty->print(" vector mem_ref:");
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index acc6bbf475d..9dc029efb6b 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -33,6 +33,8 @@
 // Code in this file and the vectorization.cpp contains shared logics and
 // utilities for C2's loop auto-vectorization.
 
+class VPointer;
+
 class VStatus : public StackObj {
 private:
   const char* _failure_reason;
@@ -154,6 +156,10 @@ public:
     return _vtrace.is_trace(TraceAutoVectorizationTag::DEPENDENCY_GRAPH);
   }
 
+  bool is_trace_vpointers() const {
+    return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
+  }
+
   bool is_trace_pointer_analysis() const {
     return _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
   }
@@ -356,6 +362,16 @@ public:
     return _body_idx.at(n->_idx);
   }
 
+  template<typename Callback>
+  void for_each_mem(Callback callback) const {
+    for (int i = 0; i < _body.length(); i++) {
+      MemNode* mem = _body.at(i)->isa_Mem();
+      if (mem != nullptr && _vloop.in_bb(mem)) {
+        callback(mem, i);
+      }
+    }
+  }
+
 private:
   void set_bb_idx(Node* n, int i) {
     _body_idx.at_put_grow(n->_idx, i);
@@ -445,6 +461,45 @@ private:
   const Type* container_type(Node* n) const;
 };
 
+// Submodule of VLoopAnalyzer.
+// We compute and cache the VPointer for every load and store.
+class VLoopVPointers : public StackObj {
+private:
+  Arena* _arena;
+  const VLoop& _vloop;
+  const VLoopBody& _body;
+
+  // Array of cached pointers
+  VPointer* _vpointers;
+  int _vpointers_length;
+
+  // Map bb_idx -> index in _vpointers. -1 if not mapped.
+  GrowableArray<int> _bb_idx_to_vpointer;
+
+public:
+  VLoopVPointers(Arena* arena,
+                 const VLoop& vloop,
+                 const VLoopBody& body) :
+    _arena(arena),
+    _vloop(vloop),
+    _body(body),
+    _vpointers(nullptr),
+    _bb_idx_to_vpointer(arena,
+                        vloop.estimated_body_length(),
+                        vloop.estimated_body_length(),
+                        -1) {}
+  NONCOPYABLE(VLoopVPointers);
+
+  void compute_vpointers();
+  const VPointer& vpointer(const MemNode* mem) const;
+  NOT_PRODUCT( void print() const; )
+
+private:
+  void count_vpointers();
+  void allocate_vpointers_array();
+  void compute_and_cache_vpointers();
+};
+
 // Submodule of VLoopAnalyzer.
 // The dependency graph is used to determine if nodes are independent, and can thus potentially
 // be executed in parallel. That is a prerequisite for packing nodes into vector operations.
@@ -461,6 +516,7 @@ private:
   const VLoop& _vloop;
   const VLoopBody& _body;
   const VLoopMemorySlices& _memory_slices;
+  const VLoopVPointers& _vpointers;
 
   // bb_idx -> DependenceNode*
   GrowableArray<DependencyNode*> _dependency_nodes;
@@ -472,11 +528,13 @@ public:
   VLoopDependencyGraph(Arena* arena,
                        const VLoop& vloop,
                        const VLoopBody& body,
-                       const VLoopMemorySlices& memory_slices) :
+                       const VLoopMemorySlices& memory_slices,
+                       const VLoopVPointers& pointers) :
     _arena(arena),
     _vloop(vloop),
     _body(body),
     _memory_slices(memory_slices),
+    _vpointers(pointers),
     _dependency_nodes(arena,
                       vloop.estimated_body_length(),
                       vloop.estimated_body_length(),
@@ -570,6 +628,7 @@ private:
   VLoopMemorySlices _memory_slices;
   VLoopBody _body;
   VLoopTypes _types;
+  VLoopVPointers _vpointers;
   VLoopDependencyGraph _dependency_graph;
 
 public:
@@ -581,7 +640,8 @@ public:
     _memory_slices   (&_arena, vloop),
     _body            (&_arena, vloop, vshared),
     _types           (&_arena, vloop, _body),
-    _dependency_graph(&_arena, vloop, _body, _memory_slices)
+    _vpointers       (&_arena, vloop, _body),
+    _dependency_graph(&_arena, vloop, _body, _memory_slices, _vpointers)
   {
     _success = setup_submodules();
   }
@@ -595,6 +655,7 @@ public:
   const VLoopMemorySlices& memory_slices() const { return _memory_slices; }
   const VLoopBody& body() const { return _body; }
   const VLoopTypes& types() const { return _types; }
+  const VLoopVPointers& vpointers() const { return _vpointers; }
   const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; }
 
 private:
@@ -678,7 +739,7 @@ class VPointer : public ArenaObj {
   int invar_factor() const;
 
   // Comparable?
-  bool invar_equals(VPointer& q) {
+  bool invar_equals(const VPointer& q) const {
     assert(_debug_invar == NodeSentinel || q._debug_invar == NodeSentinel ||
            (_invar == q._invar) == (_debug_invar == q._debug_invar &&
                                     _debug_invar_scale == q._debug_invar_scale &&
@@ -686,7 +747,7 @@ class VPointer : public ArenaObj {
     return _invar == q._invar;
   }
 
-  int cmp(VPointer& q) {
+  int cmp(const VPointer& q) const {
     if (valid() && q.valid() &&
         (_adr == q._adr || (_base == _adr && q._base == q._adr)) &&
         _scale == q._scale && invar_equals(q)) {
@@ -698,7 +759,7 @@ class VPointer : public ArenaObj {
     }
   }
 
-  bool overlap_possible_with_any_in(Node_List* p) {
+  bool overlap_possible_with_any_in(const Node_List* p) const {
     for (uint k = 0; k < p->size(); k++) {
       MemNode* mem = p->at(k)->as_Mem();
       VPointer p_mem(mem, _vloop);
@@ -712,14 +773,14 @@ class VPointer : public ArenaObj {
     return false;
   }
 
-  bool not_equal(VPointer& q) { return not_equal(cmp(q)); }
-  bool equal(VPointer& q) { return equal(cmp(q)); }
-  bool comparable(VPointer& q) { return comparable(cmp(q)); }
+  bool not_equal(const VPointer& q) const { return not_equal(cmp(q)); }
+  bool equal(const VPointer& q) const { return equal(cmp(q)); }
+  bool comparable(const VPointer& q) const { return comparable(cmp(q)); }
   static bool not_equal(int cmp) { return cmp <= NotEqual; }
   static bool equal(int cmp) { return cmp == Equal; }
   static bool comparable(int cmp) { return cmp < NotComparable; }
 
-  void print();
+  NOT_PRODUCT( void print() const; )
 
 #ifndef PRODUCT
   class Tracer {