8333258: C2: high memory usage in PhaseCFG::insert_anti_dependences()

Reviewed-by: kvn, epeter
Roland Westrelin 2024-09-18 07:07:45 +00:00
parent d23c59e408
commit 5381f553ad
3 changed files with 11020 additions and 41 deletions


@@ -562,6 +562,103 @@ bool PhaseCFG::unrelated_load_in_store_null_block(Node* store, Node* load) {
return false;
}
class DefUseMemStatesQueue : public StackObj {
private:
class DefUsePair : public StackObj {
private:
Node* _def; // memory state
Node* _use; // use of the memory state that also modifies the memory state
public:
DefUsePair(Node* def, Node* use) :
_def(def), _use(use) {
}
DefUsePair() :
_def(nullptr), _use(nullptr) {
}
Node* def() const {
return _def;
}
Node* use() const {
return _use;
}
};
GrowableArray<DefUsePair> _queue;
GrowableArray<MergeMemNode*> _worklist_visited; // visited mergemem nodes
bool already_enqueued(Node* def_mem, PhiNode* use_phi) const {
// def_mem is one of the inputs of use_phi and at least one input of use_phi is
// not def_mem. It's however possible that use_phi has def_mem as input multiple
// times. If that happens, use_phi is recorded as a use of def_mem multiple
// times as well. When PhaseCFG::insert_anti_dependences() goes over
// uses of def_mem and enqueues them for processing, use_phi would then be
// enqueued for processing multiple times when it only needs to be
// processed once. The code below checks if use_phi as a use of def_mem was
// already enqueued to avoid redundant processing of use_phi.
int j = _queue.length()-1;
// If there are any uses of def_mem already enqueued, they were enqueued
// last (all uses of def_mem are processed in one go).
for (; j >= 0; j--) {
const DefUsePair& def_use_pair = _queue.at(j);
if (def_use_pair.def() != def_mem) {
// We're done with the uses of def_mem
break;
}
if (def_use_pair.use() == use_phi) {
return true;
}
}
#ifdef ASSERT
for (; j >= 0; j--) {
const DefUsePair& def_use_pair = _queue.at(j);
assert(def_use_pair.def() != def_mem, "Should be done with the uses of def_mem");
}
#endif
return false;
}
public:
DefUseMemStatesQueue(ResourceArea* area) {
}
void push(Node* def_mem_state, Node* use_mem_state) {
if (use_mem_state->is_MergeMem()) {
// Be sure we don't get into combinatorial problems.
if (!_worklist_visited.append_if_missing(use_mem_state->as_MergeMem())) {
return; // already on work list; do not repeat
}
} else if (use_mem_state->is_Phi()) {
// A Phi could have the same mem as input multiple times. If that's the case, we don't need to enqueue it
// more than once. We otherwise allow phis to be repeated; they can merge two relevant states.
if (already_enqueued(def_mem_state, use_mem_state->as_Phi())) {
return;
}
}
_queue.push(DefUsePair(def_mem_state, use_mem_state));
}
bool is_nonempty() const {
return _queue.is_nonempty();
}
Node* top_def() const {
return _queue.top().def();
}
Node* top_use() const {
return _queue.top().use();
}
void pop() {
_queue.pop();
}
};
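The class above bundles what used to be three parallel Node_Lists (worklist_mem, worklist_store, worklist_visited) into a single worklist of (def, use) memory-state pairs. A condensed sketch of how the rewritten loop further down in this diff drives it (names taken from the patch, traversal details elided; illustration only, not part of the change):

    DefUseMemStatesQueue worklist_def_use_mem_states(area);
    // Seed with the load's memory input; the root of the search has no def.
    worklist_def_use_mem_states.push(nullptr, initial_mem);
    while (worklist_def_use_mem_states.is_nonempty()) {
      Node* def_mem_state = worklist_def_use_mem_states.top_def();
      Node* use_mem_state = worklist_def_use_mem_states.top_use();
      worklist_def_use_mem_states.pop();
      // For internal tree nodes (initial_mem, MergeMem), each interesting
      // output is pushed as a new (def, use) pair; push() itself filters
      // duplicate MergeMem and Phi entries:
      //   worklist_def_use_mem_states.push(def_mem_state, out);
    }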
//--------------------------insert_anti_dependences---------------------------
// A load may need to witness memory that nearby stores can overwrite.
// For each nearby store, either insert an "anti-dependence" edge
@@ -579,6 +676,7 @@ bool PhaseCFG::unrelated_load_in_store_null_block(Node* store, Node* load) {
// preserve anti-dependences. The caller may also hoist the load
// above the LCA, if it is not the early block.
Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
ResourceMark rm;
assert(load->needs_anti_dependence_check(), "must be a load of some sort");
assert(LCA != nullptr, "");
DEBUG_ONLY(Block* LCA_orig = LCA);
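The ResourceMark added at the top of insert_anti_dependences() is what helps bound the memory: whatever the search allocates in the thread's resource arena (including the queue above) is released when the mark goes out of scope at the end of the call. A minimal sketch of that HotSpot pattern, under the assumption of a default, resource-arena-backed GrowableArray (illustration only):

    {
      ResourceMark rm;          // remember the resource arena's current position
      GrowableArray<int> tmp;   // element storage comes from the resource arena
      for (int i = 0; i < 1000; i++) {
        tmp.push(i);
      }
    }                           // leaving the scope frees everything allocated after rm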
@@ -625,10 +723,8 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
early = memory_early_block(load, early, this);
}
ResourceArea *area = Thread::current()->resource_area();
Node_List worklist_mem(area); // prior memory state to store
Node_List worklist_store(area); // possible-def to explore
Node_List worklist_visited(area); // visited mergemem nodes
ResourceArea* area = Thread::current()->resource_area();
DefUseMemStatesQueue worklist_def_use_mem_states(area); // prior memory state to store and possible-def to explore
Node_List non_early_stores(area); // all relevant stores outside of early
bool must_raise_LCA = false;
@@ -644,60 +740,56 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
// The relevant stores "nearby" the load consist of a tree rooted
// at initial_mem, with internal nodes of type MergeMem.
// Therefore, the branches visited by the worklist are of this form:
// initial_mem -> (MergeMem ->)* store
// initial_mem -> (MergeMem ->)* Memory state modifying node
// Memory state modifying nodes include Store and Phi nodes and any node for which needs_anti_dependence_check()
// returns false.
// The anti-dependence constraints apply only to the fringe of this tree.
Node* initial_mem = load->in(MemNode::Memory);
worklist_store.push(initial_mem);
worklist_visited.push(initial_mem);
worklist_mem.push(nullptr);
while (worklist_store.size() > 0) {
worklist_def_use_mem_states.push(nullptr, initial_mem);
while (worklist_def_use_mem_states.is_nonempty()) {
// Examine a nearby store to see if it might interfere with our load.
Node* mem = worklist_mem.pop();
Node* store = worklist_store.pop();
uint op = store->Opcode();
Node* def_mem_state = worklist_def_use_mem_states.top_def();
Node* use_mem_state = worklist_def_use_mem_states.top_use();
worklist_def_use_mem_states.pop();
uint op = use_mem_state->Opcode();
assert(!use_mem_state->needs_anti_dependence_check(), "no loads");
// MergeMems do not directly have anti-deps.
// Treat them as internal nodes in a forward tree of memory states,
// the leaves of which are each a 'possible-def'.
if (store == initial_mem // root (exclusive) of tree we are searching
if (use_mem_state == initial_mem // root (exclusive) of tree we are searching
|| op == Op_MergeMem // internal node of tree we are searching
) {
mem = store; // It's not a possibly interfering store.
if (store == initial_mem)
def_mem_state = use_mem_state; // It's not a possibly interfering store.
if (use_mem_state == initial_mem)
initial_mem = nullptr; // only process initial memory once
for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
store = mem->fast_out(i);
if (store->is_MergeMem()) {
// Be sure we don't get into combinatorial problems.
// (Allow phis to be repeated; they can merge two relevant states.)
uint j = worklist_visited.size();
for (; j > 0; j--) {
if (worklist_visited.at(j-1) == store) break;
}
if (j > 0) continue; // already on work list; do not repeat
worklist_visited.push(store);
for (DUIterator_Fast imax, i = def_mem_state->fast_outs(imax); i < imax; i++) {
use_mem_state = def_mem_state->fast_out(i);
if (use_mem_state->needs_anti_dependence_check()) {
// use_mem_state is also a kind of load (i.e. needs_anti_dependence_check() returns true), and it is not a memory
// state modifying node (Store, Phi or MergeMem). Hence, load can't be anti-dependent on this node.
continue;
}
worklist_mem.push(mem);
worklist_store.push(store);
worklist_def_use_mem_states.push(def_mem_state, use_mem_state);
}
continue;
}
if (op == Op_MachProj || op == Op_Catch) continue;
if (store->needs_anti_dependence_check()) continue; // not really a store
// Compute the alias index. Loads and stores with different alias
// indices do not need anti-dependence edges. Wide MemBar's are
// anti-dependent on everything (except immutable memories).
const TypePtr* adr_type = store->adr_type();
const TypePtr* adr_type = use_mem_state->adr_type();
if (!C->can_alias(adr_type, load_alias_idx)) continue;
// Most slow-path runtime calls do NOT modify Java memory, but
// they can block and so write Raw memory.
if (store->is_Mach()) {
MachNode* mstore = store->as_Mach();
if (use_mem_state->is_Mach()) {
MachNode* mstore = use_mem_state->as_Mach();
if (load_alias_idx != Compile::AliasIdxRaw) {
// Check for call into the runtime using the Java calling
// convention (and from there into a wrapper); it has no
@@ -736,10 +828,10 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
// or else observe that 'store' is all the way up in the
// earliest legal block for 'load'. In the latter case,
// immediately insert an anti-dependence edge.
Block* store_block = get_block_for_node(store);
Block* store_block = get_block_for_node(use_mem_state);
assert(store_block != nullptr, "unused killing projections skipped above");
if (store->is_Phi()) {
if (use_mem_state->is_Phi()) {
// Loop-phis need to raise load before input. (Other phis are treated
// as store below.)
//
@@ -754,8 +846,8 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
// Do not assert(store_block != early, "Phi merging memory after access")
// PhiNode may be at start of block 'early' with backedge to 'early'
DEBUG_ONLY(bool found_match = false);
for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
if (store->in(j) == mem) { // Found matching input?
for (uint j = PhiNode::Input, jmax = use_mem_state->req(); j < jmax; j++) {
if (use_mem_state->in(j) == def_mem_state) { // Found matching input?
DEBUG_ONLY(found_match = true);
Block* pred_block = get_block_for_node(store_block->pred(j));
if (pred_block != early) {
@@ -781,21 +873,21 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
// will find him on the non_early_stores list and stick him
// with a precedence edge.
// (But, don't bother if LCA is already raised all the way.)
if (LCA != early && !unrelated_load_in_store_null_block(store, load)) {
if (LCA != early && !unrelated_load_in_store_null_block(use_mem_state, load)) {
store_block->set_raise_LCA_mark(load_index);
must_raise_LCA = true;
non_early_stores.push(store);
non_early_stores.push(use_mem_state);
}
} else {
// Found a possibly-interfering store in the load's 'early' block.
// This means 'load' cannot sink at all in the dominator tree.
// Add an anti-dep edge, and squeeze 'load' into the highest block.
assert(store != load->find_exact_control(load->in(0)), "dependence cycle found");
assert(use_mem_state != load->find_exact_control(load->in(0)), "dependence cycle found");
if (verify) {
assert(store->find_edge(load) != -1 || unrelated_load_in_store_null_block(store, load),
assert(use_mem_state->find_edge(load) != -1 || unrelated_load_in_store_null_block(use_mem_state, load),
"missing precedence edge");
} else {
store->add_prec(load);
use_mem_state->add_prec(load);
}
LCA = early;
// This turns off the process of gathering non_early_stores.

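For readers skimming the diff: the duplicate filtering in already_enqueued() relies on all uses of a given def being pushed consecutively, so a scan from the tail of the queue can stop as soon as it sees a different def. A standalone illustration of that tail-scan idea, using std::vector in place of GrowableArray (hypothetical types; not HotSpot code):

    #include <vector>

    struct DefUsePair { const void* def; const void* use; };

    // Returns true if (def, use) already sits in the tail run of entries for
    // this def. Correct because uses of one def are enqueued back to back.
    static bool already_enqueued(const std::vector<DefUsePair>& queue,
                                 const void* def, const void* use) {
      for (auto j = queue.size(); j-- > 0; ) {
        if (queue[j].def != def) {
          break;                 // past the run of entries for this def
        }
        if (queue[j].use == use) {
          return true;           // duplicate; caller should not push again
        }
      }
      return false;
    }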
File diff suppressed because it is too large.

File diff suppressed because it is too large.