8326962: C2 SuperWord: cache VPointer

Reviewed-by: chagedorn, kvn
This commit is contained in:
Emanuel Peter 2024-04-04 05:11:59 +00:00
parent 2931458711
commit f762637be2
5 changed files with 177 additions and 52 deletions

View File

@ -531,13 +531,13 @@ void SuperWord::find_adjacent_refs() {
set_align_to_ref(align_to_mem_ref);
}
VPointer align_to_ref_p(mem_ref, _vloop);
const VPointer& align_to_ref_p = vpointer(mem_ref);
// Set alignment relative to "align_to_ref" for all related memory operations.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* s = memops.at(i)->as_Mem();
if (isomorphic(s, mem_ref) &&
(!_do_vector_loop || same_origin_idx(s, mem_ref))) {
VPointer p2(s, _vloop);
const VPointer& p2 = vpointer(s);
if (p2.comparable(align_to_ref_p)) {
int align = memory_alignment(s, iv_adjustment);
set_alignment(s, align);
@ -593,11 +593,11 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
// Count number of comparable memory ops
for (uint i = 0; i < memops.size(); i++) {
MemNode* s1 = memops.at(i)->as_Mem();
VPointer p1(s1, _vloop);
const VPointer& p1 = vpointer(s1);
for (uint j = i+1; j < memops.size(); j++) {
MemNode* s2 = memops.at(j)->as_Mem();
if (isomorphic(s1, s2)) {
VPointer p2(s2, _vloop);
const VPointer& p2 = vpointer(s2);
if (p1.comparable(p2)) {
(*cmp_ct.adr_at(i))++;
(*cmp_ct.adr_at(j))++;
@ -618,7 +618,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
if (s->is_Store()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
VPointer p(s, _vloop);
const VPointer& p = vpointer(s);
if ( cmp_ct.at(j) > max_ct ||
(cmp_ct.at(j) == max_ct &&
( vw > max_vw ||
@ -641,7 +641,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
if (s->is_Load()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
VPointer p(s, _vloop);
const VPointer& p = vpointer(s);
if ( cmp_ct.at(j) > max_ct ||
(cmp_ct.at(j) == max_ct &&
( vw > max_vw ||
@ -714,7 +714,7 @@ int SuperWord::get_vw_bytes_special(MemNode* s) {
//---------------------------get_iv_adjustment---------------------------
// Calculate loop's iv adjustment for this memory ops.
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
VPointer align_to_ref_p(mem_ref, _vloop);
const VPointer& align_to_ref_p = vpointer(mem_ref);
int offset = align_to_ref_p.offset_in_bytes();
int scale = align_to_ref_p.scale_in_bytes();
int elt_size = align_to_ref_p.memory_size();
@ -875,8 +875,8 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) const {
// Adjacent memory references must have the same base, be comparable
// and have the correct distance between them.
VPointer p1(s1->as_Mem(), _vloop);
VPointer p2(s2->as_Mem(), _vloop);
const VPointer& p1 = vpointer(s1->as_Mem());
const VPointer& p2 = vpointer(s2->as_Mem());
if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
return diff == data_size(s1);
@ -1637,7 +1637,7 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac
assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs");
const MemNode* mem_ref = pack->at(0)->as_Mem();
VPointer mem_ref_p(mem_ref, _vloop);
const VPointer& mem_ref_p = vpointer(mem_ref);
const CountedLoopEndNode* pre_end = _vloop.pre_loop_end();
assert(pre_end->stride_is_con(), "pre loop stride is constant");
@ -3310,7 +3310,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
tty->print("SuperWord::memory_alignment within a vector memory reference for %d: ", s->_idx); s->dump();
}
#endif
VPointer p(s, _vloop);
const VPointer& p = vpointer(s);
if (!p.valid()) {
NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: VPointer p invalid, return bottom_align");)
return bottom_align;
@ -3413,7 +3413,7 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
Node* orig_limit = pre_opaq->original_loop_limit();
assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, "");
VPointer align_to_ref_p(align_to_ref, _vloop);
const VPointer& align_to_ref_p = vpointer(align_to_ref);
assert(align_to_ref_p.valid(), "sanity");
// For the main-loop, we want the address of align_to_ref to be memory aligned

View File

@ -425,7 +425,7 @@ class SuperWord : public ResourceObj {
// Decide if loop can eventually be vectorized, and what unrolling factor is required.
static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor);
// VLoop Accessors
// VLoop accessors
// Forwards to the underlying VLoop: returns _vloop.phase().
PhaseIdealLoop* phase() const { return _vloop.phase(); }
// The PhaseIterGVN reached through the VLoop's phase.
PhaseIterGVN& igvn() const { return _vloop.phase()->igvn(); }
// Forwards to the underlying VLoop: returns _vloop.lpt().
IdealLoopTree* lpt() const { return _vloop.lpt(); }
@ -434,7 +434,7 @@ class SuperWord : public ResourceObj {
// The constant stride reported by cl() (via stride_con()).
int iv_stride() const { return cl()->stride_con(); }
// Forwards to the underlying VLoop: returns _vloop.in_bb(n).
bool in_bb(const Node* n) const { return _vloop.in_bb(n); }
// VLoopReductions Accessors
// VLoopReductions accessors
// Asks the analyzer's reductions submodule whether n is a marked reduction.
bool is_marked_reduction(const Node* n) const {
return _vloop_analyzer.reductions().is_marked_reduction(n);
}
@ -443,12 +443,12 @@ class SuperWord : public ResourceObj {
return _vloop_analyzer.reductions().is_marked_reduction_pair(n1, n2);
}
// VLoopMemorySlices Accessors
// VLoopMemorySlices accessors
// Asks the analyzer's memory-slices submodule whether n1 and n2 are in the same memory slice.
bool same_memory_slice(MemNode* n1, MemNode* n2) const {
return _vloop_analyzer.memory_slices().same_memory_slice(n1, n2);
}
// VLoopBody Accessors
// VLoopBody accessors
// The node array held by the analyzer's body submodule.
const GrowableArray<Node*>& body() const {
return _vloop_analyzer.body().body();
}
@ -457,7 +457,7 @@ class SuperWord : public ResourceObj {
return _vloop_analyzer.body().bb_idx(n);
}
// VLoopTypes Accessors
// VLoopTypes accessors
// Forwards to the analyzer's types submodule: returns types().velt_type(n).
const Type* velt_type(Node* n) const {
return _vloop_analyzer.types().velt_type(n);
}
@ -482,7 +482,7 @@ class SuperWord : public ResourceObj {
return _vloop_analyzer.types().vector_width_in_bytes(n);
}
// VLoopDependencyGraph Accessors
// VLoopDependencyGraph accessors
// The dependency graph submodule owned by the analyzer.
const VLoopDependencyGraph& dependency_graph() const {
return _vloop_analyzer.dependency_graph();
}
@ -495,6 +495,11 @@ class SuperWord : public ResourceObj {
return _vloop_analyzer.dependency_graph().mutually_independent(nodes);
}
// VLoopVPointer accessors
// Returns the VPointer that the analyzer's vpointers submodule holds for mem,
// as a const reference (no new VPointer is constructed here).
const VPointer& vpointer(const MemNode* mem) const {
return _vloop_analyzer.vpointers().vpointer(mem);
}
#ifndef PRODUCT
// TraceAutoVectorization and TraceSuperWord
bool is_trace_superword_alignment() const {

View File

@ -29,12 +29,13 @@
#include "utilities/stringUtils.hpp"
#define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \
flags(POINTER_ANALYSIS, "Trace VPointer") \
flags(POINTER_ANALYSIS, "Trace VPointer (verbose)") \
flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \
flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
flags(BODY, "Trace VLoopBody") \
flags(TYPES, "Trace VLoopTypes") \
flags(POINTERS, "Trace VLoopPointers") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_refs") \

View File

@ -31,6 +31,19 @@
#include "opto/rootnode.hpp"
#include "opto/vectorization.hpp"
#ifndef PRODUCT
static void print_con_or_idx(const Node* n) {
if (n == nullptr) {
tty->print("( 0)");
} else if (n->is_ConI()) {
jint val = n->as_ConI()->get_int();
tty->print("(%4d)", val);
} else {
tty->print("[%4d]", n->_idx);
}
}
#endif
bool VLoop::check_preconditions() {
#ifndef PRODUCT
if (is_trace_preconditions()) {
@ -161,11 +174,62 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
_types.compute_vector_element_type();
_vpointers.compute_vpointers();
_dependency_graph.construct();
return VStatus::make_success();
}
void VLoopVPointers::compute_vpointers() {
count_vpointers();
allocate_vpointers_array();
compute_and_cache_vpointers();
NOT_PRODUCT( if (_vloop.is_trace_vpointers()) { print(); } )
}
// Determine how many VPointers are needed: one per node visited by
// _body.for_each_mem(). The result is stored in _vpointers_length.
void VLoopVPointers::count_vpointers() {
  int num_mem_nodes = 0;
  _body.for_each_mem([&] (const MemNode*, int) {
    ++num_mem_nodes;
  });
  _vpointers_length = num_mem_nodes;
}
void VLoopVPointers::allocate_vpointers_array() {
uint bytes = _vpointers_length * sizeof(VPointer);
_vpointers = (VPointer*)_arena->Amalloc(bytes);
}
// For every memory node visited by _body.for_each_mem(): construct its
// VPointer in the next free slot of _vpointers and remember, per bb_idx,
// which slot was used.
void VLoopVPointers::compute_and_cache_vpointers() {
  int next_slot = 0;
  _body.for_each_mem([&] (const MemNode* mem_node, int body_idx) {
    VPointer* const slot_addr = _vpointers + next_slot;
    // Placement new: construct in place, directly inside the array.
    ::new (slot_addr) VPointer(mem_node, _vloop);
    _bb_idx_to_vpointer.at_put(body_idx, next_slot);
    ++next_slot;
  });
}
// Look up the cached VPointer of mem. mem must be non-null and inside the
// loop (checked by assert); the lookup goes bb_idx -> slot -> _vpointers[slot].
const VPointer& VLoopVPointers::vpointer(const MemNode* mem) const {
  assert(mem != nullptr && _vloop.in_bb(mem), "only mem in loop");
  const int slot = _bb_idx_to_vpointer.at(_body.bb_idx(mem));
  assert(0 <= slot && slot < _vpointers_length, "valid range");
  return _vpointers[slot];
}
#ifndef PRODUCT
// Trace output: print a header line, then the cached VPointer of every
// memory node visited by _body.for_each_mem().
void VLoopVPointers::print() const {
  tty->print_cr("\nVLoopVPointers::print:");

  auto print_one = [&] (const MemNode* mem_node, int /* body_idx */) {
    const VPointer& cached = vpointer(mem_node);
    tty->print(" ");
    cached.print();
  };
  _body.for_each_mem(print_one);
}
#endif
// Construct the dependency graph:
// - Data-dependencies: implicit (taken from C2 node inputs).
// - Memory-dependencies:
@ -193,7 +257,7 @@ void VLoopDependencyGraph::construct() {
MemNode* n1 = slice_nodes.at(j);
memory_pred_edges.clear();
VPointer p1(n1, _vloop);
const VPointer& p1 = _vpointers.vpointer(n1);
// For all memory nodes before it, check if we need to add a memory edge.
for (int k = slice_nodes.length() - 1; k > j; k--) {
MemNode* n2 = slice_nodes.at(k);
@ -201,7 +265,7 @@ void VLoopDependencyGraph::construct() {
// Ignore Load-Load dependencies:
if (n1->is_Load() && n2->is_Load()) { continue; }
VPointer p2(n2, _vloop);
const VPointer& p2 = _vpointers.vpointer(n2);
if (!VPointer::not_equal(p1.cmp(p2))) {
// Possibly overlapping memory
memory_pred_edges.append(_body.bb_idx(n2));
@ -723,19 +787,24 @@ void VPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
_invar = register_if_new(add);
}
// Function for printing the fields of a VPointer
void VPointer::print() {
#ifndef PRODUCT
tty->print("base: [%d] adr: [%d] scale: %d offset: %d",
_base != nullptr ? _base->_idx : 0,
_adr != nullptr ? _adr->_idx : 0,
_scale, _offset);
if (_invar != nullptr) {
tty->print(" invar: [%d]", _invar->_idx);
// Function for printing the fields of a VPointer
void VPointer::print() const {
tty->print("VPointer[mem: %4d %10s, ", _mem->_idx, _mem->Name());
tty->print("base: %4d, ", _base != nullptr ? _base->_idx : 0);
tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);
tty->print(" base");
print_con_or_idx(_base);
tty->print(" + offset(%4d)", _offset);
tty->print(" + invar");
print_con_or_idx(_invar);
tty->print_cr(" + scale(%4d) * iv]", _scale);
}
tty->cr();
#endif
}
// Following are functions for tracing VPointer match
#ifndef PRODUCT
@ -1502,17 +1571,6 @@ AlignmentSolution* AlignmentSolver::solve() const {
}
#ifdef ASSERT
// Print a short tag for n on tty: "(0)" for a null node, the integer value in
// parentheses for an int constant, and the node index in brackets otherwise.
static void print_con_or_idx(const Node* n) {
if (n == nullptr) {
tty->print("(0)");
} else if (n->is_ConI()) {
// Integer constant: show its value rather than the node index.
jint val = n->as_ConI()->get_int();
tty->print("(%d)", val);
} else {
tty->print("[%d]", n->_idx);
}
}
void AlignmentSolver::trace_start_solve() const {
if (is_trace()) {
tty->print(" vector mem_ref:");

View File

@ -33,6 +33,8 @@
// The code in this file and in vectorization.cpp contains shared logic and
// utilities for C2's loop auto-vectorization.
class VPointer;
class VStatus : public StackObj {
private:
const char* _failure_reason;
@ -154,6 +156,10 @@ public:
return _vtrace.is_trace(TraceAutoVectorizationTag::DEPENDENCY_GRAPH);
}
// True if the POINTERS tag is enabled in _vtrace.
bool is_trace_vpointers() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
}
// True if the POINTER_ANALYSIS tag is enabled in _vtrace.
bool is_trace_pointer_analysis() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
}
@ -356,6 +362,16 @@ public:
return _body_idx.at(n->_idx);
}
template<typename Callback>
void for_each_mem(Callback callback) const {
for (int i = 0; i < _body.length(); i++) {
MemNode* mem = _body.at(i)->isa_Mem();
if (mem != nullptr && _vloop.in_bb(mem)) {
callback(mem, i);
}
}
}
private:
void set_bb_idx(Node* n, int i) {
_body_idx.at_put_grow(n->_idx, i);
@ -445,6 +461,45 @@ private:
const Type* container_type(Node* n) const;
};
// Submodule of VLoopAnalyzer.
// We compute and cache the VPointer for every load and store, so that they
// can be looked up with vpointer() instead of being constructed again.
class VLoopVPointers : public StackObj {
private:
  Arena*                   _arena;
  const VLoop&             _vloop;
  const VLoopBody&         _body;

  // Array of cached pointers
  VPointer* _vpointers;
  // Number of elements in _vpointers; 0 until compute_vpointers() has run.
  int _vpointers_length;

  // Map bb_idx -> index in _vpointers. -1 if not mapped.
  GrowableArray<int> _bb_idx_to_vpointer;

public:
  VLoopVPointers(Arena* arena,
                 const VLoop& vloop,
                 const VLoopBody& body) :
    _arena(arena),
    _vloop(vloop),
    _body(body),
    _vpointers(nullptr),
    // Initialize explicitly: vpointer() reads this in its range assert, and it
    // must not hold an indeterminate value if compute_vpointers() never ran.
    _vpointers_length(0),
    _bb_idx_to_vpointer(arena,
                        vloop.estimated_body_length(),
                        vloop.estimated_body_length(),
                        -1) {}
  NONCOPYABLE(VLoopVPointers);

  // Count the memory nodes, allocate the array and construct all VPointers.
  void compute_vpointers();
  // Cached VPointer of mem; mem must be a non-null memory node in the loop.
  const VPointer& vpointer(const MemNode* mem) const;
  NOT_PRODUCT( void print() const; )

private:
  void count_vpointers();
  void allocate_vpointers_array();
  void compute_and_cache_vpointers();
};
// Submodule of VLoopAnalyzer.
// The dependency graph is used to determine if nodes are independent, and can thus potentially
// be executed in parallel. That is a prerequisite for packing nodes into vector operations.
@ -461,6 +516,7 @@ private:
const VLoop& _vloop;
const VLoopBody& _body;
const VLoopMemorySlices& _memory_slices;
const VLoopVPointers& _vpointers;
// bb_idx -> DependencyNode*
GrowableArray<DependencyNode*> _dependency_nodes;
@ -472,11 +528,13 @@ public:
VLoopDependencyGraph(Arena* arena,
const VLoop& vloop,
const VLoopBody& body,
const VLoopMemorySlices& memory_slices) :
const VLoopMemorySlices& memory_slices,
const VLoopVPointers& pointers) :
_arena(arena),
_vloop(vloop),
_body(body),
_memory_slices(memory_slices),
_vpointers(pointers),
_dependency_nodes(arena,
vloop.estimated_body_length(),
vloop.estimated_body_length(),
@ -570,6 +628,7 @@ private:
VLoopMemorySlices _memory_slices;
VLoopBody _body;
VLoopTypes _types;
VLoopVPointers _vpointers;
VLoopDependencyGraph _dependency_graph;
public:
@ -581,7 +640,8 @@ public:
_memory_slices (&_arena, vloop),
_body (&_arena, vloop, vshared),
_types (&_arena, vloop, _body),
_dependency_graph(&_arena, vloop, _body, _memory_slices)
_vpointers (&_arena, vloop, _body),
_dependency_graph(&_arena, vloop, _body, _memory_slices, _vpointers)
{
_success = setup_submodules();
}
@ -595,6 +655,7 @@ public:
// Read-only access to the memory-slices submodule.
const VLoopMemorySlices& memory_slices() const { return _memory_slices; }
// Read-only access to the body submodule.
const VLoopBody& body() const { return _body; }
// Read-only access to the types submodule.
const VLoopTypes& types() const { return _types; }
// Read-only access to the submodule holding the cached VPointers.
const VLoopVPointers& vpointers() const { return _vpointers; }
// Read-only access to the dependency graph submodule.
const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; }
private:
@ -678,7 +739,7 @@ class VPointer : public ArenaObj {
int invar_factor() const;
// Comparable?
bool invar_equals(VPointer& q) {
bool invar_equals(const VPointer& q) const {
assert(_debug_invar == NodeSentinel || q._debug_invar == NodeSentinel ||
(_invar == q._invar) == (_debug_invar == q._debug_invar &&
_debug_invar_scale == q._debug_invar_scale &&
@ -686,7 +747,7 @@ class VPointer : public ArenaObj {
return _invar == q._invar;
}
int cmp(VPointer& q) {
int cmp(const VPointer& q) const {
if (valid() && q.valid() &&
(_adr == q._adr || (_base == _adr && q._base == q._adr)) &&
_scale == q._scale && invar_equals(q)) {
@ -698,7 +759,7 @@ class VPointer : public ArenaObj {
}
}
bool overlap_possible_with_any_in(Node_List* p) {
bool overlap_possible_with_any_in(const Node_List* p) const {
for (uint k = 0; k < p->size(); k++) {
MemNode* mem = p->at(k)->as_Mem();
VPointer p_mem(mem, _vloop);
@ -712,14 +773,14 @@ class VPointer : public ArenaObj {
return false;
}
bool not_equal(VPointer& q) { return not_equal(cmp(q)); }
bool equal(VPointer& q) { return equal(cmp(q)); }
bool comparable(VPointer& q) { return comparable(cmp(q)); }
bool not_equal(const VPointer& q) const { return not_equal(cmp(q)); }
bool equal(const VPointer& q) const { return equal(cmp(q)); }
bool comparable(const VPointer& q) const { return comparable(cmp(q)); }
// Interprets a cmp() result: true if it is at most NotEqual.
static bool not_equal(int cmp) { return cmp <= NotEqual; }
// Interprets a cmp() result: true if it is exactly Equal.
static bool equal(int cmp) { return cmp == Equal; }
// Interprets a cmp() result: true if it is strictly below NotComparable.
static bool comparable(int cmp) { return cmp < NotComparable; }
void print();
NOT_PRODUCT( void print() const; )
#ifndef PRODUCT
class Tracer {