8131330: G1CollectedHeap::verify_dirty_young_list fails with assert

Use assembly loop to avoid compiler optimization into memset

Reviewed-by: ecaspole, tschatzl
This commit is contained in:
Kim Barrett 2015-08-31 13:06:01 -04:00
parent 673798137b
commit 15196341a5
9 changed files with 341 additions and 71 deletions

View File

@ -0,0 +1,159 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "gc/shared/memset_with_concurrent_readers.hpp"
#include "runtime/prefetch.inline.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
// An implementation of memset, for use when there may be concurrent
// readers of the region being stored into.
//
// We can't use the standard library memset if it is implemented using
// block initializing stores. Doing so can result in concurrent readers
// seeing spurious zeros.
//
// We can't use the obvious C/C++ for-loop, because the compiler may
// recognize the idiomatic loop and optimize it into a call to the
// standard library memset; we've seen exactly this happen with, for
// example, Solaris Studio 12.3. Hence the use of inline assembly
// code, hiding loops from the compiler's optimizer.
//
// We don't attempt to use the standard library memset when it is safe
// to do so. We could conservatively do so by detecting the presence
// of block initializing stores (VM_Version::has_blk_init()), but the
// implementation provided here should be sufficient.
inline void fill_subword(void* start, void* end, int value) {
STATIC_ASSERT(BytesPerWord == 8);
assert(pointer_delta(end, start, 1) < BytesPerWord, "precondition");
// Dispatch on (end - start).
void* pc;
__asm__ volatile(
// offset := (7 - (end - start)) + 3
// 3 instructions from rdpc to DISPATCH
" sub %[offset], %[end], %[offset]\n\t" // offset := start - end
" sllx %[offset], 2, %[offset]\n\t" // scale offset for instruction size of 4
" add %[offset], 40, %[offset]\n\t" // offset += 10 * instruction size
" rd %pc, %[pc]\n\t" // dispatch on scaled offset
" jmpl %[pc]+%[offset], %g0\n\t"
" nop\n\t"
// DISPATCH: no direct reference, but without it the store block may be elided.
"1:\n\t"
" stb %[value], [%[end]-7]\n\t" // end[-7] = value
" stb %[value], [%[end]-6]\n\t"
" stb %[value], [%[end]-5]\n\t"
" stb %[value], [%[end]-4]\n\t"
" stb %[value], [%[end]-3]\n\t"
" stb %[value], [%[end]-2]\n\t"
" stb %[value], [%[end]-1]\n\t" // end[-1] = value
: /* no outputs */
[pc] "&=r" (pc) // temp
: [offset] "&+r" (start),
[end] "r" (end),
[value] "r" (value)
: "memory");
}
void memset_with_concurrent_readers(void* to, int value, size_t size) {
Prefetch::write(to, 0);
void* end = static_cast<char*>(to) + size;
if (size >= BytesPerWord) {
// Fill any partial word prefix.
uintx* aligned_to = static_cast<uintx*>(align_ptr_up(to, BytesPerWord));
fill_subword(to, aligned_to, value);
// Compute fill word.
STATIC_ASSERT(BitsPerByte == 8);
STATIC_ASSERT(BitsPerWord == 64);
uintx xvalue = value & 0xff;
xvalue |= (xvalue << 8);
xvalue |= (xvalue << 16);
xvalue |= (xvalue << 32);
uintx* aligned_end = static_cast<uintx*>(align_ptr_down(end, BytesPerWord));
assert(aligned_to <= aligned_end, "invariant");
// for ( ; aligned_to < aligned_end; ++aligned_to) {
// *aligned_to = xvalue;
// }
uintptr_t temp;
__asm__ volatile(
// Unroll loop x8.
" sub %[aend], %[ato], %[temp]\n\t"
" cmp %[temp], 56\n\t" // cc := (aligned_end - aligned_to) > 7 words
" ba %xcc, 2f\n\t" // goto TEST always
" sub %[aend], 56, %[temp]\n\t" // limit := aligned_end - 7 words
// LOOP:
"1:\n\t" // unrolled x8 store loop top
" cmp %[temp], %[ato]\n\t" // cc := limit > (next) aligned_to
" stx %[xvalue], [%[ato]-64]\n\t" // store 8 words, aligned_to pre-incremented
" stx %[xvalue], [%[ato]-56]\n\t"
" stx %[xvalue], [%[ato]-48]\n\t"
" stx %[xvalue], [%[ato]-40]\n\t"
" stx %[xvalue], [%[ato]-32]\n\t"
" stx %[xvalue], [%[ato]-24]\n\t"
" stx %[xvalue], [%[ato]-16]\n\t"
" stx %[xvalue], [%[ato]-8]\n\t"
// TEST:
"2:\n\t"
" bgu,a %xcc, 1b\n\t" // goto LOOP if more than 7 words remaining
" add %[ato], 64, %[ato]\n\t" // aligned_to += 8, for next iteration
// Fill remaining < 8 full words.
// Dispatch on (aligned_end - aligned_to).
// offset := (7 - (aligned_end - aligned_to)) + 3
// 3 instructions from rdpc to DISPATCH
" sub %[ato], %[aend], %[ato]\n\t" // offset := aligned_to - aligned_end
" srax %[ato], 1, %[ato]\n\t" // scale offset for instruction size of 4
" add %[ato], 40, %[ato]\n\t" // offset += 10 * instruction size
" rd %pc, %[temp]\n\t" // dispatch on scaled offset
" jmpl %[temp]+%[ato], %g0\n\t"
" nop\n\t"
// DISPATCH: no direct reference, but without it the store block may be elided.
"3:\n\t"
" stx %[xvalue], [%[aend]-56]\n\t" // aligned_end[-7] = xvalue
" stx %[xvalue], [%[aend]-48]\n\t"
" stx %[xvalue], [%[aend]-40]\n\t"
" stx %[xvalue], [%[aend]-32]\n\t"
" stx %[xvalue], [%[aend]-24]\n\t"
" stx %[xvalue], [%[aend]-16]\n\t"
" stx %[xvalue], [%[aend]-8]\n\t" // aligned_end[-1] = xvalue
: /* no outputs */
[temp] "&=r" (temp)
: [ato] "&+r" (aligned_to),
[aend] "r" (aligned_end),
[xvalue] "r" (xvalue)
: "cc", "memory");
to = aligned_end; // setup for suffix
}
// Fill any partial word suffix. Also the prefix if size < BytesPerWord.
fill_subword(to, end, value);
}
#endif // INCLUDE_ALL_GCS

View File

@ -85,27 +85,6 @@ void VM_Version::initialize() {
_supports_cx8 = has_v9();
_supports_atomic_getset4 = true; // swap instruction
// There are Fujitsu Sparc64 CPUs which support blk_init as well so
// we have to take this check out of the 'is_niagara()' block below.
if (has_blk_init()) {
// When using CMS or G1, we cannot use memset() in BOT updates
// because the sun4v/CMT version in libc_psr uses BIS which
// exposes "phantom zeros" to concurrent readers. See 6948537.
if (FLAG_IS_DEFAULT(UseMemSetInBOT) && (UseConcMarkSweepGC || UseG1GC)) {
FLAG_SET_DEFAULT(UseMemSetInBOT, false);
}
// Issue a stern warning if the user has explicitly set
// UseMemSetInBOT (it is known to cause issues), but allow
// use for experimentation and debugging.
if (UseConcMarkSweepGC || UseG1GC) {
if (UseMemSetInBOT) {
assert(!FLAG_IS_DEFAULT(UseMemSetInBOT), "Error");
warning("Experimental flag -XX:+UseMemSetInBOT is known to cause instability"
" on sun4v; please understand that you are using at your own risk!");
}
}
}
if (is_niagara()) {
// Indirect branch is the same cost as direct
if (FLAG_IS_DEFAULT(UseInlineCaches)) {

View File

@ -27,6 +27,7 @@
#include "gc/g1/g1BlockOffsetTable.hpp"
#include "gc/g1/heapRegion.hpp"
#include "gc/shared/memset_with_concurrent_readers.hpp"
#include "gc/shared/space.hpp"
inline HeapWord* G1BlockOffsetTable::block_start(const void* addr) {
@ -68,15 +69,7 @@ void G1BlockOffsetSharedArray::set_offset_array(size_t left, size_t right, u_cha
check_index(right, "right index out of range");
assert(left <= right, "indexes out of order");
size_t num_cards = right - left + 1;
if (UseMemSetInBOT) {
memset(&_offset_array[left], offset, num_cards);
} else {
size_t i = left;
const size_t end = i + num_cards;
for (; i < end; i++) {
_offset_array[i] = offset;
}
}
memset_with_concurrent_readers(&_offset_array[left], offset, num_cards);
}
// Variant of index_for that does not check the index for validity.

View File

@ -27,6 +27,7 @@
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#include "gc/g1/satbQueue.hpp"
#include "gc/shared/memset_with_concurrent_readers.hpp"
#include "oops/oop.inline.hpp"
#include "runtime/atomic.inline.hpp"
#include "runtime/mutexLocker.hpp"
@ -108,15 +109,7 @@ void G1SATBCardTableModRefBS::g1_mark_as_young(const MemRegion& mr) {
jbyte *const first = byte_for(mr.start());
jbyte *const last = byte_after(mr.last());
// Below we may use an explicit loop instead of memset() because on
// certain platforms memset() can give concurrent readers phantom zeros.
if (UseMemSetInBOT) {
memset(first, g1_young_gen, last - first);
} else {
for (jbyte* i = first; i < last; i++) {
*i = g1_young_gen;
}
}
memset_with_concurrent_readers(first, g1_young_gen, last - first);
}
#ifndef PRODUCT

View File

@ -25,9 +25,12 @@
#ifndef SHARE_VM_GC_SHARED_BLOCKOFFSETTABLE_HPP
#define SHARE_VM_GC_SHARED_BLOCKOFFSETTABLE_HPP
#include "gc/shared/memset_with_concurrent_readers.hpp"
#include "memory/memRegion.hpp"
#include "memory/virtualspace.hpp"
#include "runtime/globals.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/macros.hpp"
// The CollectedHeap type requires subtypes to implement a method
// "block_start". For some subtypes, notably generational
@ -126,6 +129,19 @@ class BlockOffsetSharedArray: public CHeapObj<mtGC> {
VirtualSpace _vs;
u_char* _offset_array; // byte array keeping backwards offsets
void fill_range(size_t start, size_t num_cards, u_char offset) {
void* start_ptr = &_offset_array[start];
#if INCLUDE_ALL_GCS
// If collector is concurrent, special handling may be needed.
assert(!UseG1GC, "Shouldn't be here when using G1");
if (UseConcMarkSweepGC) {
memset_with_concurrent_readers(start_ptr, offset, num_cards);
return;
}
#endif // INCLUDE_ALL_GCS
memset(start_ptr, offset, num_cards);
}
protected:
// Bounds checking accessors:
// For performance these have to devolve to array accesses in product builds.
@ -160,20 +176,7 @@ class BlockOffsetSharedArray: public CHeapObj<mtGC> {
assert(left < right, "Heap addresses out of order");
size_t num_cards = pointer_delta(right, left) >> LogN_words;
// Below, we may use an explicit loop instead of memset()
// because on certain platforms memset() can give concurrent
// readers "out-of-thin-air," phantom zeros; see 6948537.
if (UseMemSetInBOT) {
memset(&_offset_array[index_for(left)], offset, num_cards);
} else {
size_t i = index_for(left);
const size_t end = i + num_cards;
for (; i < end; i++) {
// Elided until CR 6977974 is fixed properly.
// assert(!reducing || _offset_array[i] >= offset, "Not reducing");
_offset_array[i] = offset;
}
}
fill_range(index_for(left), num_cards, offset);
}
void set_offset_array(size_t left, size_t right, u_char offset, bool reducing = false) {
@ -182,20 +185,7 @@ class BlockOffsetSharedArray: public CHeapObj<mtGC> {
assert(left <= right, "indexes out of order");
size_t num_cards = right - left + 1;
// Below, we may use an explicit loop instead of memset
// because on certain platforms memset() can give concurrent
// readers "out-of-thin-air," phantom zeros; see 6948537.
if (UseMemSetInBOT) {
memset(&_offset_array[left], offset, num_cards);
} else {
size_t i = left;
const size_t end = i + num_cards;
for (; i < end; i++) {
// Elided until CR 6977974 is fixed properly.
// assert(!reducing || _offset_array[i] >= offset, "Not reducing");
_offset_array[i] = offset;
}
}
fill_range(left, num_cards, offset);
}
void check_offset_array(size_t index, HeapWord* high, HeapWord* low) const {

View File

@ -0,0 +1,104 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include <string.h>
#include "gc/shared/memset_with_concurrent_readers.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/macros.hpp"
#include "utilities/ostream.hpp"
#if INCLUDE_ALL_GCS
// Unit test
#ifdef ASSERT
static unsigned line_byte(const char* line, size_t i) {
return unsigned(line[i]) & 0xFF;
}
// Verify memset_with_concurrent_readers mimics memset.
// We don't attempt to verify the concurrent reader case.
void test_memset_with_concurrent_readers() {
const size_t chunk_size = 8 * BytesPerWord;
const unsigned chunk_count = 4;
const size_t block_size = (chunk_count + 4) * chunk_size;
char block[block_size];
char clear_block[block_size];
char set_block[block_size];
// block format:
// 0: unused leading chunk
// 1: chunk written from start index to end of chunk
// ... nchunks fully written chunks
// N: chunk written from start of chunk to end index
// N+1: unused trailing chunk
const int clear_value = 0;
const int set_value = 0xAC;
memset(clear_block, clear_value, block_size);
memset(set_block, set_value, block_size);
for (unsigned nchunks = 0; nchunks <= chunk_count; ++nchunks) {
for (size_t start = 1; start <= chunk_size; ++start) {
for (size_t end = 0; end <= chunk_size; ++end) {
size_t set_start = chunk_size + start;
size_t set_end = (2 + nchunks) * chunk_size + end;
size_t set_size = set_end - set_start;
memset(block, clear_value, block_size);
memset_with_concurrent_readers(&block[set_start], set_value, set_size);
bool head_clear = !memcmp(clear_block, block, set_start);
bool middle_set = !memcmp(set_block, block + set_start, set_size);
bool tail_clear = !memcmp(clear_block, block + set_end, block_size - set_end);
if (!(head_clear && middle_set && tail_clear)) {
tty->print_cr("*** memset_with_concurrent_readers failed: "
"set start " SIZE_FORMAT ", set end " SIZE_FORMAT,
set_start, set_end);
for (unsigned chunk = 0; chunk < (block_size / chunk_size); ++chunk) {
for (unsigned line = 0; line < (chunk_size / BytesPerWord); ++line) {
const char* lp = &block[chunk * chunk_size + line * BytesPerWord];
tty->print_cr("%d,%d: %2x %2x %2x %2x %2x %2x %2x %2x",
chunk, line,
line_byte(lp, 0), line_byte(lp, 1),
line_byte(lp, 2), line_byte(lp, 3),
line_byte(lp, 4), line_byte(lp, 5),
line_byte(lp, 6), line_byte(lp, 7));
}
}
assert(head_clear, "leading byte not clear");
assert(middle_set, "memset byte not set");
assert(tail_clear, "trailing bye not clear");
}
}
}
}
}
#endif // end unit test
#endif // INCLUDE_ALL_GCS

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef SRC_SHARE_VM_GC_SHARED_MEMSETWITHCONCURRENTREADERS_HPP
#define SRC_SHARE_VM_GC_SHARED_MEMSETWITHCONCURRENTREADERS_HPP
#include <stddef.h>
#include <string.h>
#include "utilities/macros.hpp"
// Only used by concurrent collectors.
#if INCLUDE_ALL_GCS
// Fill a block of memory with value, like memset, but with the
// understanding that there may be concurrent readers of that memory.
void memset_with_concurrent_readers(void* to, int value, size_t size);
#ifdef TARGET_ARCH_sparc
// SPARC requires special handling. See SPARC-specific definition.
#else
// All others just use memset.
inline void memset_with_concurrent_readers(void* to, int value, size_t size) {
::memset(to, value, size);
}
#endif // End of target dispatch.
#endif // INCLUDE_ALL_GCS
#endif // include guard

View File

@ -3869,6 +3869,7 @@ void TestG1BiasedArray_test();
void TestBufferingOopClosure_test();
void TestCodeCacheRemSet_test();
void FreeRegionList_test();
void test_memset_with_concurrent_readers();
#endif
void execute_internal_vm_tests() {
@ -3910,6 +3911,7 @@ void execute_internal_vm_tests() {
if (UseG1GC) {
run_unit_test(FreeRegionList_test());
}
run_unit_test(test_memset_with_concurrent_readers());
#endif
tty->print_cr("All internal VM tests passed");
}

View File

@ -638,10 +638,6 @@ public:
experimental(bool, AlwaysSafeConstructors, false, \
"Force safe construction, as if all fields are final.") \
\
/* Temporary: See 6948537 */ \
experimental(bool, UseMemSetInBOT, true, \
"(Unstable) uses memset in BOT updates in GC code") \
\
diagnostic(bool, UnlockDiagnosticVMOptions, trueInDebug, \
"Enable normal processing of flags relating to field diagnostics")\
\