8258894: C2: Forbid GCM to move stores into loops

Prevent GCM from placing memory-writing nodes (such as stores) into loops deeper
than their home loop (determined by their control input). Such placements are
invalid, as they cause memory definitions to interfere and risk miscompilation.
This change complements JDK-8255763, which only addresses
invalid placements in irreducible CFGs.

Add control input to stores in generated stubs to ensure that all memory-writing
nodes have control inputs from which their home block can be derived.

Add a battery of simplified fuzzer test cases where, before this change, GCM
moves stores into deeper loops.
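
The shape of the problem can be illustrated with a small example in the spirit
of the added test cases (this sketch is illustrative only, it is not one of
them, and the class name is made up). The increment below compiles to a store
whose control input, and therefore home block, lies before the loop; a valid
GCM placement for that store is its home loop or an ancestor of it, never the
deeper loop body, where its memory definition would interfere with the loop's
memory state:

    // Illustrative sketch, not part of the added jtreg test.
    class StoreSinkExample {
        static int intCounter;

        static int run() {
            intCounter++;                      // store's home block: before the loop
            int acc = 0;
            for (int i = 0; i < 50; i++) {
                acc += (i % 2 == 0) ? i : -i;  // deeper loop: invalid placement for the store
            }
            return acc;
        }
    }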

Reviewed-by: thartmann, kvn
Author:    Roberto Castañeda Lozano
Date:      2021-01-27 15:08:39 +00:00
Committer: Tobias Hartmann
Parent:    ac276bb394
Commit:    f353fcf256
7 changed files with 233 additions and 46 deletions

@@ -1,5 +1,5 @@
/*
-* Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -1207,9 +1207,26 @@ void PhaseCFG::dump_headers() {
}
}
}
+#endif // !PRODUCT
+#ifdef ASSERT
+void PhaseCFG::verify_memory_writer_placement(const Block* b, const Node* n) const {
+if (!n->is_memory_writer()) {
+return;
+}
+CFGLoop* home_or_ancestor = find_block_for_node(n->in(0))->_loop;
+bool found = false;
+do {
+if (b->_loop == home_or_ancestor) {
+found = true;
+break;
+}
+home_or_ancestor = home_or_ancestor->parent();
+} while (home_or_ancestor != NULL);
+assert(found, "block b is not in n's home loop or an ancestor of it");
+}
void PhaseCFG::verify() const {
-#ifdef ASSERT
// Verify sane CFG
for (uint i = 0; i < number_of_blocks(); i++) {
Block* block = get_block(i);
@@ -1221,6 +1238,7 @@ void PhaseCFG::verify() const {
if (j >= 1 && n->is_Mach() && n->as_Mach()->ideal_Opcode() == Op_CreateEx) {
assert(j == 1 || block->get_node(j-1)->is_Phi(), "CreateEx must be first instruction in block");
}
+verify_memory_writer_placement(block, n);
if (n->needs_anti_dependence_check()) {
verify_anti_dependences(block, n);
}
@@ -1263,9 +1281,8 @@ void PhaseCFG::verify() const {
assert(block->_num_succs == 2, "Conditional branch must have two targets");
}
}
-#endif
}
-#endif
+#endif // ASSERT
UnionFind::UnionFind( uint max ) : _cnt(max), _max(max), _indices(NEW_RESOURCE_ARRAY(uint,max)) {
Copy::zero_to_bytes( _indices, sizeof(uint)*max );
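
The rule that verify_memory_writer_placement() enforces can be restated
independently of the HotSpot data structures: a block is a valid placement for
a memory writer only if the block's loop is the writer's home loop (the loop
of its control input's block) or an ancestor of that loop. A minimal
standalone model of that check, using hypothetical Loop and PlacementRule
classes rather than C2's CFGLoop:

    // Hypothetical model of the "home loop or ancestor" rule; not HotSpot code.
    final class Loop {
        final Loop parent;  // null for the root loop
        final int depth;    // the root loop has depth 0

        Loop(Loop parent) {
            this.parent = parent;
            this.depth = (parent == null) ? 0 : parent.depth + 1;
        }
    }

    final class PlacementRule {
        // A candidate block is a valid placement for a memory writer whose
        // control input lives in homeLoop iff the candidate's loop is homeLoop
        // or one of its ancestors, i.e. the candidate is not in a deeper loop.
        static boolean isValidPlacement(Loop candidateLoop, Loop homeLoop) {
            for (Loop l = homeLoop; l != null; l = l.parent) {
                if (l == candidateLoop) {
                    return true;
                }
            }
            return false;
        }
    }

The changes to gcm.cpp below express the same restriction as a loop-depth
comparison against the home block's loop.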

@@ -1,5 +1,5 @@
/*
-* Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -620,11 +620,15 @@ class PhaseCFG : public Phase {
// Debugging print of CFG
void dump( ) const; // CFG only
void _dump_cfg( const Node *end, VectorSet &visited ) const;
-void verify() const;
void dump_headers();
#else
bool trace_opto_pipelining() const { return false; }
#endif
+// Check that block b is in the home loop (or an ancestor) of n, if n is a
+// memory writer.
+void verify_memory_writer_placement(const Block* b, const Node* n) const NOT_DEBUG_RETURN;
+void verify() const NOT_DEBUG_RETURN;
};
@@ -719,6 +723,7 @@ class CFGLoop : public CFGElement {
double trip_count() const { return 1.0 / _exit_prob; }
virtual bool is_loop() { return true; }
int id() { return _id; }
+int depth() { return _depth; }
#ifndef PRODUCT
void dump( ) const;

@@ -2748,7 +2748,7 @@ void Compile::Code_Gen() {
print_method(PHASE_GLOBAL_CODE_MOTION, 2);
NOT_PRODUCT( verify_graph_edges(); )
-debug_only( cfg.verify(); )
+cfg.verify();
}
PhaseChaitin regalloc(unique(), cfg, matcher, false);

@@ -1,5 +1,5 @@
/*
-* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -1163,6 +1163,14 @@ Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
if (mach && LCA == root_block)
break;
+if (self->is_memory_writer() &&
+(LCA->_loop->depth() > early->_loop->depth())) {
+// LCA is an invalid placement for a memory writer: choosing it would
+// cause memory interference, as illustrated in schedule_late().
+continue;
+}
+verify_memory_writer_placement(LCA, self);
uint start_lat = get_latency_for_node(LCA->head());
uint end_idx = LCA->end_idx();
uint end_lat = get_latency_for_node(LCA->get_node(end_idx));
@@ -1250,6 +1258,17 @@ void PhaseCFG::schedule_late(VectorSet &visited, Node_Stack &stack) {
if( self->pinned() ) // Pinned in block?
continue;
+#ifdef ASSERT
+// Assert that memory writers (e.g. stores) have a "home" block (the block
+// given by their control input), and that this block corresponds to their
+// earliest possible placement. This guarantees that
+// hoist_to_cheaper_block() will always have at least one valid choice.
+if (self->is_memory_writer()) {
+assert(find_block_for_node(self->in(0)) == early,
+"The home of a memory writer must also be its earliest placement");
+}
+#endif
MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
if (mach) {
switch (mach->ideal_Opcode()) {
@@ -1274,13 +1293,12 @@ void PhaseCFG::schedule_late(VectorSet &visited, Node_Stack &stack) {
default:
break;
}
-if (C->has_irreducible_loop() && self->bottom_type()->has_memory()) {
-// If the CFG is irreducible, keep memory-writing nodes as close as
-// possible to their original block (given by the control input). This
-// prevents PhaseCFG::hoist_to_cheaper_block() from placing such nodes
-// into descendants of their original loop, as in the following example:
+if (C->has_irreducible_loop() && self->is_memory_writer()) {
+// If the CFG is irreducible, place memory writers in their home block.
+// This prevents hoist_to_cheaper_block() from accidentally placing such
+// nodes into deeper loops, as in the following example:
//
-// Original placement of store in B1 (loop L1):
+// Home placement of store in B1 (loop L1):
//
// B1 (L1):
// m1 <- ..
@@ -1301,12 +1319,16 @@ void PhaseCFG::schedule_late(VectorSet &visited, Node_Stack &stack) {
// B3 (L1):
// .. <- .. m2, ..
//
// This "hoist inversion" can happen due to CFGLoop::compute_freq()'s
// inaccurate estimation of frequencies for irreducible CFGs, which can
// lead to for example assigning B1 and B3 a higher frequency than B2.
// This "hoist inversion" can happen due to different factors such as
// inaccurate estimation of frequencies for irreducible CFGs, and loops
// with always-taken exits in reducible CFGs. In the reducible case,
// hoist inversion is prevented by discarding invalid blocks (those in
// deeper loops than the home block). In the irreducible case, the
// invalid blocks cannot be identified due to incomplete loop nesting
// information, hence a conservative solution is taken.
#ifndef PRODUCT
if (trace_opto_pipelining()) {
tty->print_cr("# Irreducible loops: schedule in earliest block B%d:",
tty->print_cr("# Irreducible loops: schedule in home block B%d:",
early->_pre_order);
self->dump();
}
@@ -1359,6 +1381,16 @@ void PhaseCFG::schedule_late(VectorSet &visited, Node_Stack &stack) {
return;
}
+if (self->is_memory_writer()) {
+// If the LCA of a memory writer is a descendant of its home loop, hoist
+// it into a valid placement.
+while (LCA->_loop->depth() > early->_loop->depth()) {
+LCA = LCA->_idom;
+}
+assert(LCA != NULL, "a valid LCA must exist");
+verify_memory_writer_placement(LCA, self);
+}
// If there is no opportunity to hoist, then we're done.
// In stress mode, try to hoist even the single operations.
bool try_to_hoist = StressGCM || (LCA != early);
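
The "hoist inversion" scenario described in the comments above also has a
reducible counterpart, which the new depth check rejects. The sketch below is
illustrative only (it is not one of the added test cases, and the names are
made up): the inner loop exits on its first iteration, so frequency estimates
can make a block inside it look cheaper than the store's home block in the
outer loop, which is what previously tempted hoist_to_cheaper_block() to sink
the store into the deeper loop:

    // Illustrative sketch, not part of the added jtreg test.
    class HoistInversionExample {
        static long longCounter;

        static void run() {
            for (int i = 0; i < 100; i++) {
                longCounter += 1;      // home block: in the outer loop only
                for (int j = 0; j < 100; j++) {
                    if (i >= 0) {      // always-taken exit: the inner loop runs
                        break;         // at most one iteration
                    }
                }
            }
        }
    }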

@@ -101,7 +101,7 @@ void GraphKit::gen_stub(address C_function,
//
Node *adr_sp = basic_plus_adr(top(), thread, in_bytes(JavaThread::last_Java_sp_offset()));
Node *last_sp = frameptr();
-store_to_memory(NULL, adr_sp, last_sp, T_ADDRESS, NoAlias, MemNode::unordered);
+store_to_memory(control(), adr_sp, last_sp, T_ADDRESS, NoAlias, MemNode::unordered);
// Set _thread_in_native
// The order of stores into TLS is critical! Setting _thread_in_native MUST
@@ -221,12 +221,12 @@ void GraphKit::gen_stub(address C_function,
//-----------------------------
// Clear last_Java_sp
-store_to_memory(NULL, adr_sp, null(), T_ADDRESS, NoAlias, MemNode::unordered);
+store_to_memory(control(), adr_sp, null(), T_ADDRESS, NoAlias, MemNode::unordered);
// Clear last_Java_pc
-store_to_memory(NULL, adr_last_Java_pc, null(), T_ADDRESS, NoAlias, MemNode::unordered);
+store_to_memory(control(), adr_last_Java_pc, null(), T_ADDRESS, NoAlias, MemNode::unordered);
#if (defined(IA64) && !defined(AIX))
Node* adr_last_Java_fp = basic_plus_adr(top(), thread, in_bytes(JavaThread::last_Java_fp_offset()));
-store_to_memory(NULL, adr_last_Java_fp, null(), T_ADDRESS, NoAlias, MemNode::unordered);
+store_to_memory(control(), adr_last_Java_fp, null(), T_ADDRESS, NoAlias, MemNode::unordered);
#endif
// For is-fancy-jump, the C-return value is also the branch target
@@ -237,7 +237,7 @@ void GraphKit::gen_stub(address C_function,
Node* vm_result = make_load(NULL, adr, TypeOopPtr::BOTTOM, T_OBJECT, NoAlias, MemNode::unordered);
map()->set_req(TypeFunc::Parms, vm_result); // vm_result passed as result
// clear thread-local-storage(tls)
-store_to_memory(NULL, adr, null(), T_ADDRESS, NoAlias, MemNode::unordered);
+store_to_memory(control(), adr, null(), T_ADDRESS, NoAlias, MemNode::unordered);
}
//-----------------------------

@@ -1148,6 +1148,9 @@ public:
virtual int cisc_operand() const { return AdlcVMDeps::Not_cisc_spillable; }
bool is_cisc_alternate() const { return (_flags & Flag_is_cisc_alternate) != 0; }
+// Whether this is a memory-writing machine node.
+bool is_memory_writer() const { return is_Mach() && bottom_type()->has_memory(); }
//----------------- Printing, etc
#ifndef PRODUCT
private:

@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -27,20 +27,27 @@ import jdk.test.lib.Asserts;
/**
* @test
-* @bug 8255763
-* @summary Tests GCM's store placement for reducible and irreducible CFGs.
+* @bug 8255763 8258894
+* @summary Tests GCM's store placement in different scenarios (regular and OSR
+* compilations, reducible and irreducible CFGs).
* @library /test/lib /
-* @run main/othervm -Xbatch compiler.codegen.TestGCMStorePlacement reducible
-* @run main/othervm -Xbatch compiler.codegen.TestGCMStorePlacement irreducible
+* @run main/othervm -Xbatch compiler.codegen.TestGCMStorePlacement regularReducible1
+* @run main/othervm -Xcomp -XX:-TieredCompilation -XX:CompileOnly=compiler.codegen.TestGCMStorePlacement:: compiler.codegen.TestGCMStorePlacement regularReducible2
+* @run main/othervm -Xcomp -XX:CompileOnly=compiler.codegen.TestGCMStorePlacement:: compiler.codegen.TestGCMStorePlacement regularReducible3
+* @run main/othervm -Xcomp -XX:CompileOnly=compiler.codegen.TestGCMStorePlacement:: compiler.codegen.TestGCMStorePlacement regularReducible4
+* @run main/othervm -Xcomp -XX:-TieredCompilation -XX:CompileOnly=compiler.codegen.TestGCMStorePlacement:: compiler.codegen.TestGCMStorePlacement osrReducible1
+* @run main/othervm -Xcomp -XX:-TieredCompilation -XX:CompileOnly=compiler.codegen.TestGCMStorePlacement:: compiler.codegen.TestGCMStorePlacement osrReducible2
+* @run main/othervm -Xbatch compiler.codegen.TestGCMStorePlacement osrIrreducible1
*/
public class TestGCMStorePlacement {
-static int counter;
+static int intCounter;
+static long longCounter;
-// Reducible case: counter++ should not be placed into the loop.
-static void testReducible() {
-counter++;
+static void testRegularReducible1() {
+// This should not be placed into the loop.
+intCounter++;
int acc = 0;
for (int i = 0; i < 50; i++) {
if (i % 2 == 0) {
@@ -50,9 +57,104 @@ public class TestGCMStorePlacement {
return;
}
-// Irreducible case (due to OSR compilation): counter++ should not be placed
-// outside its switch case block.
-static void testIrreducible() {
static void testRegularReducible2() {
int i;
for (i = 22; i < 384; i++)
longCounter += 1;
switch (i % 9) {
case 49:
int i17 = 70;
while (i17 > 0) {
longCounter = 42;
switch (9) {
case 97:
break;
case 11398:
for (int i18 = 1; ; );
default:
}
}
}
}
static void testRegularReducible3() {
int i = 0, i23, i27 = 184;
for (int i21 = 0; i21 < 100; i21++) {
i23 = 1;
longCounter += 1;
while (++i23 < 190)
switch (i23 % 10) {
case 86:
continue;
case 0:
i += 76.854F;
for (; i27 < 1; i27++);
}
}
}
static void testRegularReducible4() {
int i16 = 0, i17;
longCounter += 1;
while (++i16 < 100) {
i17 = 0;
while (++i17 < 200) {
switch ((i16 * 5) + 123) {
case 129:
break;
case 149:
break;
default:
}
}
}
}
public static void bar() {
int[] a = new int[0];
long sum = 0;
for (int j = 0; j < 0; j++) {
sum += (a[j] / (j + 1) + a[j] % (j + 1));
}
}
static void foo() {
bar();
}
static void testOsrReducible1() {
int count = 100;
for (int i = 0; i < 100; i++) {
for (int j = 0; j < 100; j++) {
foo();
try {
count = (100000 / count);
} catch (Exception e) {}
switch (0) {
case 0:
for (int k = 0; k < 2; k++) {
count = 0;
}
longCounter += 1;
}
}
}
}
static void testOsrReducible2() {
System.out.println();
boolean cond = false;
for (int i = 0; i < 100; i++) {
for (int j = 0; j < 100; j++) {
intCounter = 42;
if (cond) {
break;
}
for (int k = 0; k < 2; k++);
}
}
}
static void testOsrIrreducible1() {
for (int i = 0; i < 30; i++) {
switch (i % 3) {
case 0:
@@ -63,7 +165,8 @@
break;
}
}
-counter++;
+// This should not be placed outside the "case 0" block.
+intCounter++;
break;
case 1:
break;
@@ -76,19 +179,46 @@
public static void main(String[] args) {
switch (args[0]) {
case "reducible":
// Cause a regular C2 compilation of testReducible.
case "regularReducible1":
for (int i = 0; i < 100_000; i++) {
-counter = 0;
-testReducible();
-Asserts.assertEQ(counter, 1);
+intCounter = 0;
+testRegularReducible1();
+Asserts.assertEQ(intCounter, 1);
}
break;
case "irreducible":
// Cause an OSR C2 compilation of testIrreducible.
counter = 0;
testIrreducible();
Asserts.assertEQ(counter, 10);
case "regularReducible2":
longCounter = 0;
testRegularReducible2();
Asserts.assertEQ(longCounter, 362L);
break;
case "regularReducible3":
for (int i = 0; i < 10; i++) {
longCounter = 0;
testRegularReducible3();
Asserts.assertEQ(longCounter, 100L);
}
break;
case "regularReducible4":
for (int i = 0; i < 10; i++) {
longCounter = 0;
testRegularReducible4();
Asserts.assertEQ(longCounter, 1L);
}
break;
case "osrReducible1":
longCounter = 0;
testOsrReducible1();
Asserts.assertEQ(longCounter, 10000L);
break;
case "osrReducible2":
intCounter = 0;
testOsrReducible2();
Asserts.assertEQ(intCounter, 42);
break;
case "osrIrreducible1":
intCounter = 0;
testOsrIrreducible1();
Asserts.assertEQ(intCounter, 10);
break;
default:
System.out.println("invalid mode");