diff --git a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp index 99a7eebf4c4..8ec62553086 100644 --- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp +++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp @@ -1587,6 +1587,185 @@ class StubGenerator: public StubCodeGenerator { return start; } + // + // Generate stub for disjoint short fill. If "aligned" is true, the + // "to" address is assumed to be heapword aligned. + // + // Arguments for generated stub: + // to: O0 + // value: O1 + // count: O2 treated as signed + // + address generate_fill(BasicType t, bool aligned, const char* name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + const Register to = O0; // source array address + const Register value = O1; // fill value + const Register count = O2; // elements count + // O3 is used as a temp register + + assert_clean_int(count, O3); // Make sure 'count' is clean int. 
+ + Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte; + Label L_fill_2_bytes, L_fill_4_bytes, L_fill_32_bytes; + + int shift = -1; + switch (t) { + case T_BYTE: + shift = 2; + break; + case T_SHORT: + shift = 1; + break; + case T_INT: + shift = 0; + break; + default: ShouldNotReachHere(); + } + + BLOCK_COMMENT("Entry:"); + + if (t == T_BYTE) { + // Zero extend value + __ and3(value, 0xff, value); + __ sllx(value, 8, O3); + __ or3(value, O3, value); + } + if (t == T_SHORT) { + // Zero extend value + __ sethi(0xffff0000, O3); + __ andn(value, O3, value); + } + if (t == T_BYTE || t == T_SHORT) { + __ sllx(value, 16, O3); + __ or3(value, O3, value); + } + + __ cmp(count, 2<andcc(count, 1<nop(); + __ stb(value, to, 0); + __ inc(to, 1); + __ dec(count, 1); + __ BIND(L_skip_align1); + } + // Two bytes misalignment happens only for byte and short (char) arrays + __ andcc(to, 2, G0); + __ br(Assembler::zero, false, Assembler::pt, L_skip_align2); + __ delayed()->nop(); + __ sth(value, to, 0); + __ inc(to, 2); + __ dec(count, 1 << (shift - 1)); + __ BIND(L_skip_align2); + } +#ifdef _LP64 + if (!aligned) { +#endif + // align to 8 bytes, we know we are 4 byte aligned to start + __ andcc(to, 7, G0); + __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes); + __ delayed()->nop(); + __ stw(value, to, 0); + __ inc(to, 4); + __ dec(count, 1 << shift); + __ BIND(L_fill_32_bytes); +#ifdef _LP64 + } +#endif + + Label L_check_fill_8_bytes; + // Fill 32-byte chunks + __ subcc(count, 8 << shift, count); + __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes); + __ delayed()->nop(); + + if (t == T_INT) { + // Zero extend value + __ srl(value, 0, value); + } + if (t == T_BYTE || t == T_SHORT || t == T_INT) { + __ sllx(value, 32, O3); + __ or3(value, O3, value); + } + + Label L_fill_32_bytes_loop; + __ align(16); + __ BIND(L_fill_32_bytes_loop); + + __ stx(value, to, 0); + __ stx(value, to, 8); + __ stx(value, to, 16); + __ stx(value, to, 24); + + __ subcc(count, 
8 << shift, count); + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop); + __ delayed()->add(to, 32, to); + + __ BIND(L_check_fill_8_bytes); + __ addcc(count, 8 << shift, count); + __ brx(Assembler::zero, false, Assembler::pn, L_exit); + __ delayed()->subcc(count, 1 << (shift + 1), count); + __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes); + __ delayed()->andcc(count, 1<add(to, 8, to); + + // fill trailing 4 bytes + __ andcc(count, 1<andcc(count, 1<<(shift-1), G0); + } else { + __ delayed()->nop(); + } + __ stw(value, to, 0); + if (t == T_BYTE || t == T_SHORT) { + __ inc(to, 4); + // fill trailing 2 bytes + __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches + __ BIND(L_fill_2_bytes); + __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte); + __ delayed()->andcc(count, 1, count); + __ sth(value, to, 0); + if (t == T_BYTE) { + __ inc(to, 2); + // fill trailing byte + __ andcc(count, 1, count); // in delay slot of branches + __ BIND(L_fill_byte); + __ brx(Assembler::zero, false, Assembler::pt, L_exit); + __ delayed()->nop(); + __ stb(value, to, 0); + } else { + __ BIND(L_fill_byte); + } + } else { + __ BIND(L_fill_2_bytes); + } + __ BIND(L_exit); + __ retl(); + __ delayed()->mov(G0, O0); // return 0 + return start; + } + // // Generate stub for conjoint short copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. 
@@ -2855,6 +3034,13 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy"); StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy"); StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy"); + + StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); + StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); + StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); + StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); + StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); + StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); } void generate_initial() { diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp index bb7517f533d..43e6a982c66 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp @@ -8767,6 +8767,186 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist bind(DONE); } +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#else +#define BLOCK_COMMENT(str) block_comment(str) +#endif + +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") +void MacroAssembler::generate_fill(BasicType t, bool aligned, + Register to, Register value, Register count, + Register rtmp, XMMRegister xtmp) { + assert_different_registers(to, value, count, rtmp); + Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte; + Label L_fill_2_bytes, L_fill_4_bytes; + + int shift = -1; + switch (t) { + case T_BYTE: + shift = 2; + break; + case T_SHORT: + shift = 1; + break; + case T_INT: + shift = 0; + break; + default: ShouldNotReachHere(); + } + + if (t == T_BYTE) { + andl(value, 0xff); + movl(rtmp, value); + shll(rtmp, 8); + orl(value, rtmp); + } + if (t == T_SHORT) { + andl(value, 0xffff); + } + 
if (t == T_BYTE || t == T_SHORT) { + movl(rtmp, value); + shll(rtmp, 16); + orl(value, rtmp); + } + + cmpl(count, 2<= 2, "supported cpu only" ); + Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; + // Fill 32-byte chunks + movdl(xtmp, value); + pshufd(xtmp, xtmp, 0); + + subl(count, 8 << shift); + jcc(Assembler::less, L_check_fill_8_bytes); + align(16); + + BIND(L_fill_32_bytes_loop); + + if (UseUnalignedLoadStores) { + movdqu(Address(to, 0), xtmp); + movdqu(Address(to, 16), xtmp); + } else { + movq(Address(to, 0), xtmp); + movq(Address(to, 8), xtmp); + movq(Address(to, 16), xtmp); + movq(Address(to, 24), xtmp); + } + + addptr(to, 32); + subl(count, 8 << shift); + jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); + BIND(L_check_fill_8_bytes); + addl(count, 8 << shift); + jccb(Assembler::zero, L_exit); + jmpb(L_fill_8_bytes); + + // + // length is too short, just fill qwords + // + BIND(L_fill_8_bytes_loop); + movq(Address(to, 0), xtmp); + addptr(to, 8); + BIND(L_fill_8_bytes); + subl(count, 1 << (shift + 1)); + jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); + } + } + // fill trailing 4 bytes + BIND(L_fill_4_bytes); + testl(count, 1< Input and output aligned on a HeapWord == 8-byte boundary // ignored @@ -2712,6 +2732,13 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy"); StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy"); + StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); + StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); + StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); + StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); + StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); + StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); + // We don't 
generate specialized code for HeapWord-aligned source // arrays, so just use the code we've already generated StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy; diff --git a/hotspot/src/share/vm/asm/codeBuffer.cpp b/hotspot/src/share/vm/asm/codeBuffer.cpp index cae51ddb893..d7e4bb54d3b 100644 --- a/hotspot/src/share/vm/asm/codeBuffer.cpp +++ b/hotspot/src/share/vm/asm/codeBuffer.cpp @@ -143,13 +143,6 @@ void CodeBuffer::initialize_oop_recorder(OopRecorder* r) { void CodeBuffer::initialize_section_size(CodeSection* cs, csize_t size) { assert(cs != &_insts, "insts is the memory provider, not the consumer"); -#ifdef ASSERT - for (int n = (int)SECT_INSTS+1; n < (int)SECT_LIMIT; n++) { - CodeSection* prevCS = code_section(n); - if (prevCS == cs) break; - assert(!prevCS->is_allocated(), "section allocation must be in reverse order"); - } -#endif csize_t slop = CodeSection::end_slop(); // margin between sections int align = cs->alignment(); assert(is_power_of_2(align), "sanity"); @@ -199,13 +192,13 @@ void CodeBuffer::set_blob(BufferBlob* blob) { _total_start = start; _total_size = end - start; } else { - #ifdef ASSERT +#ifdef ASSERT // Clean out dangling pointers. 
_total_start = badAddress; + _consts._start = _consts._end = badAddress; _insts._start = _insts._end = badAddress; _stubs._start = _stubs._end = badAddress; - _consts._start = _consts._end = badAddress; - #endif //ASSERT +#endif //ASSERT } } @@ -221,9 +214,9 @@ const char* CodeBuffer::code_section_name(int n) { return NULL; #else //PRODUCT switch (n) { + case SECT_CONSTS: return "consts"; case SECT_INSTS: return "insts"; case SECT_STUBS: return "stubs"; - case SECT_CONSTS: return "consts"; default: return NULL; } #endif //PRODUCT @@ -445,12 +438,11 @@ void CodeBuffer::compute_final_layout(CodeBuffer* dest) const { const CodeSection* prev_cs = NULL; CodeSection* prev_dest_cs = NULL; - for (int n = 0; n < (int)SECT_LIMIT; n++) { + + for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) { // figure compact layout of each section const CodeSection* cs = code_section(n); - address cstart = cs->start(); - address cend = cs->end(); - csize_t csize = cend - cstart; + csize_t csize = cs->size(); CodeSection* dest_cs = dest->code_section(n); if (!cs->is_empty()) { @@ -463,7 +455,7 @@ void CodeBuffer::compute_final_layout(CodeBuffer* dest) const { prev_dest_cs->_limit += padding; } #ifdef ASSERT - if (prev_cs != NULL && prev_cs->is_frozen() && n < SECT_CONSTS) { + if (prev_cs != NULL && prev_cs->is_frozen() && n < (SECT_LIMIT - 1)) { // Make sure the ends still match up. 
// This is important because a branch in a frozen section // might target code in a following section, via a Label, @@ -492,22 +484,18 @@ void CodeBuffer::compute_final_layout(CodeBuffer* dest) const { assert(dest->verify_section_allocation(), "final configuration works"); } -csize_t CodeBuffer::total_offset_of(address addr) const { - csize_t code_size_so_far = 0; - for (int n = 0; n < (int)SECT_LIMIT; n++) { - const CodeSection* cs = code_section(n); - if (!cs->is_empty()) { - code_size_so_far = cs->align_at_start(code_size_so_far); +csize_t CodeBuffer::total_offset_of(CodeSection* cs) const { + csize_t size_so_far = 0; + for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) { + const CodeSection* cur_cs = code_section(n); + if (!cur_cs->is_empty()) { + size_so_far = cur_cs->align_at_start(size_so_far); } - if (cs->contains2(addr)) { - return code_size_so_far + (addr - cs->start()); + if (cur_cs->index() == cs->index()) { + return size_so_far; } - code_size_so_far += cs->size(); + size_so_far += cur_cs->size(); } -#ifndef PRODUCT - tty->print_cr("Dangling address " PTR_FORMAT " in:", addr); - ((CodeBuffer*)this)->print(); -#endif ShouldNotReachHere(); return -1; } @@ -533,7 +521,7 @@ csize_t CodeBuffer::copy_relocations_to(CodeBlob* dest) const { csize_t code_end_so_far = 0; csize_t code_point_so_far = 0; - for (int n = 0; n < (int)SECT_LIMIT; n++) { + for (int n = (int) SECT_FIRST; n < (int)SECT_LIMIT; n++) { // pull relocs out of each section const CodeSection* cs = code_section(n); assert(!(cs->is_empty() && cs->locs_count() > 0), "sanity"); @@ -635,11 +623,14 @@ void CodeBuffer::copy_code_to(CodeBlob* dest_blob) { ICache::invalidate_range(dest_blob->code_begin(), dest_blob->code_size()); } -// Move all my code into another code buffer. -// Consult applicable relocs to repair embedded addresses. +// Move all my code into another code buffer. Consult applicable +// relocs to repair embedded addresses. 
The layout in the destination +// CodeBuffer is different to the source CodeBuffer: the destination +// CodeBuffer gets the final layout (consts, insts, stubs in order of +// ascending address). void CodeBuffer::relocate_code_to(CodeBuffer* dest) const { DEBUG_ONLY(address dest_end = dest->_total_start + dest->_total_size); - for (int n = 0; n < (int)SECT_LIMIT; n++) { + for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) { // pull code out of each section const CodeSection* cs = code_section(n); if (cs->is_empty()) continue; // skip trivial section @@ -681,20 +672,19 @@ csize_t CodeBuffer::figure_expanded_capacities(CodeSection* which_cs, csize_t* new_capacity) { csize_t new_total_cap = 0; - int prev_n = -1; - for (int n = 0; n < (int)SECT_LIMIT; n++) { + for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) { const CodeSection* sect = code_section(n); if (!sect->is_empty()) { - // Compute initial padding; assign it to the previous non-empty guy. - // Cf. compute_final_layout. + // Compute initial padding; assign it to the previous section, + // even if it's empty (e.g. consts section can be empty). + // Cf. compute_final_layout csize_t padding = sect->align_at_start(new_total_cap) - new_total_cap; if (padding != 0) { new_total_cap += padding; - assert(prev_n >= 0, "sanity"); - new_capacity[prev_n] += padding; + assert(n - 1 >= SECT_FIRST, "sanity"); + new_capacity[n - 1] += padding; } - prev_n = n; } csize_t exp = sect->size(); // 100% increase @@ -774,11 +764,11 @@ void CodeBuffer::expand(CodeSection* which_cs, csize_t amount) { this->_before_expand = bxp; // Give each section its required (expanded) capacity. 
- for (int n = (int)SECT_LIMIT-1; n >= SECT_INSTS; n--) { + for (int n = (int)SECT_LIMIT-1; n >= SECT_FIRST; n--) { CodeSection* cb_sect = cb.code_section(n); CodeSection* this_sect = code_section(n); if (new_capacity[n] == 0) continue; // already nulled out - if (n > SECT_INSTS) { + if (n != SECT_INSTS) { cb.initialize_section_size(cb_sect, new_capacity[n]); } assert(cb_sect->capacity() >= new_capacity[n], "big enough"); @@ -844,17 +834,22 @@ bool CodeBuffer::verify_section_allocation() { assert(tstart >= _blob->content_begin(), "sanity"); assert(tend <= _blob->content_end(), "sanity"); } - address tcheck = tstart; // advancing pointer to verify disjointness - for (int n = 0; n < (int)SECT_LIMIT; n++) { + // Verify disjointness. + for (int n = (int) SECT_FIRST; n < (int) SECT_LIMIT; n++) { CodeSection* sect = code_section(n); - if (!sect->is_allocated()) continue; - assert(sect->start() >= tcheck, "sanity"); - tcheck = sect->start(); - assert((intptr_t)tcheck % sect->alignment() == 0 + if (!sect->is_allocated() || sect->is_empty()) continue; + assert((intptr_t)sect->start() % sect->alignment() == 0 || sect->is_empty() || _blob == NULL, "start is aligned"); - assert(sect->end() >= tcheck, "sanity"); - assert(sect->end() <= tend, "sanity"); + for (int m = (int) SECT_FIRST; m < (int) SECT_LIMIT; m++) { + CodeSection* other = code_section(m); + if (!other->is_allocated() || other == sect) continue; + assert(!other->contains(sect->start() ), "sanity"); + // limit is an exclusive address and can be the start of another + // section. 
+ assert(!other->contains(sect->limit() - 1), "sanity"); + } + assert(sect->end() <= tend, "sanity"); } return true; } diff --git a/hotspot/src/share/vm/asm/codeBuffer.hpp b/hotspot/src/share/vm/asm/codeBuffer.hpp index 6e6897b0678..16880dcfadd 100644 --- a/hotspot/src/share/vm/asm/codeBuffer.hpp +++ b/hotspot/src/share/vm/asm/codeBuffer.hpp @@ -289,10 +289,12 @@ class CodeBuffer: public StackObj { public: typedef int csize_t; // code size type; would be size_t except for history enum { - // Here is the list of all possible sections, in order of ascending address. + // Here is the list of all possible sections. The order reflects + // the final layout. + SECT_FIRST = 0, + SECT_CONSTS = SECT_FIRST, // Non-instruction data: Floats, jump tables, etc. SECT_INSTS, // Executable instructions. SECT_STUBS, // Outbound trampolines for supporting call sites. - SECT_CONSTS, // Non-instruction data: Floats, jump tables, etc. SECT_LIMIT, SECT_NONE = -1 }; @@ -304,9 +306,9 @@ class CodeBuffer: public StackObj { const char* _name; + CodeSection _consts; // constants, jump tables CodeSection _insts; // instructions (the main section) CodeSection _stubs; // stubs (call site support), deopt, exception handling - CodeSection _consts; // constants, jump tables CodeBuffer* _before_expand; // dead buffer, from before the last expansion @@ -334,9 +336,9 @@ class CodeBuffer: public StackObj { } void initialize(address code_start, csize_t code_size) { + _consts.initialize_outer(this, SECT_CONSTS); _insts.initialize_outer(this, SECT_INSTS); _stubs.initialize_outer(this, SECT_STUBS); - _consts.initialize_outer(this, SECT_CONSTS); _total_start = code_start; _total_size = code_size; // Initialize the main section: @@ -414,16 +416,16 @@ class CodeBuffer: public StackObj { // construction. 
void initialize(csize_t code_size, csize_t locs_size); + CodeSection* consts() { return &_consts; } CodeSection* insts() { return &_insts; } CodeSection* stubs() { return &_stubs; } - CodeSection* consts() { return &_consts; } - // present sections in order; return NULL at end; insts is #0, etc. + // present sections in order; return NULL at end; consts is #0, etc. CodeSection* code_section(int n) { - // This makes the slightly questionable but portable assumption that - // the various members (_insts, _stubs, etc.) are adjacent in the - // layout of CodeBuffer. - CodeSection* cs = &_insts + n; + // This makes the slightly questionable but portable assumption + // that the various members (_consts, _insts, _stubs, etc.) are + // adjacent in the layout of CodeBuffer. + CodeSection* cs = &_consts + n; assert(cs->index() == n || !cs->is_allocated(), "sanity"); return cs; } @@ -484,9 +486,9 @@ class CodeBuffer: public StackObj { // CodeBlob). csize_t total_content_size() const; - // combined offset (relative to start of insts) of given address, - // as eventually found in the final CodeBlob - csize_t total_offset_of(address addr) const; + // Combined offset (relative to start of first section) of given + // section, as eventually found in the final CodeBlob. 
+ csize_t total_offset_of(CodeSection* cs) const; // allocated size of all relocation data, including index, rounded up csize_t total_relocation_size() const; diff --git a/hotspot/src/share/vm/code/codeBlob.cpp b/hotspot/src/share/vm/code/codeBlob.cpp index fa98026c95e..823eedd498d 100644 --- a/hotspot/src/share/vm/code/codeBlob.cpp +++ b/hotspot/src/share/vm/code/codeBlob.cpp @@ -92,7 +92,7 @@ CodeBlob::CodeBlob( _header_size = header_size; _relocation_size = round_to(cb->total_relocation_size(), oopSize); _content_offset = align_code_offset(header_size + _relocation_size); - _code_offset = _content_offset + cb->total_offset_of(cb->insts()->start()); + _code_offset = _content_offset + cb->total_offset_of(cb->insts()); _data_offset = _content_offset + round_to(cb->total_content_size(), oopSize); assert(_data_offset <= size, "codeBlob is too small"); diff --git a/hotspot/src/share/vm/code/nmethod.cpp b/hotspot/src/share/vm/code/nmethod.cpp index 0526f43552e..e2de56852ff 100644 --- a/hotspot/src/share/vm/code/nmethod.cpp +++ b/hotspot/src/share/vm/code/nmethod.cpp @@ -87,9 +87,9 @@ struct nmethod_stats_struct { int nmethod_count; int total_size; int relocation_size; + int consts_size; int insts_size; int stub_size; - int consts_size; int scopes_data_size; int scopes_pcs_size; int dependencies_size; @@ -101,9 +101,9 @@ struct nmethod_stats_struct { nmethod_count += 1; total_size += nm->size(); relocation_size += nm->relocation_size(); + consts_size += nm->consts_size(); insts_size += nm->insts_size(); stub_size += nm->stub_size(); - consts_size += nm->consts_size(); oops_size += nm->oops_size(); scopes_data_size += nm->scopes_data_size(); scopes_pcs_size += nm->scopes_pcs_size(); @@ -116,9 +116,9 @@ struct nmethod_stats_struct { tty->print_cr("Statistics for %d bytecoded nmethods:", nmethod_count); if (total_size != 0) tty->print_cr(" total in heap = %d", total_size); if (relocation_size != 0) tty->print_cr(" relocation = %d", relocation_size); + if (consts_size != 0) 
tty->print_cr(" constants = %d", consts_size); if (insts_size != 0) tty->print_cr(" main code = %d", insts_size); if (stub_size != 0) tty->print_cr(" stub code = %d", stub_size); - if (consts_size != 0) tty->print_cr(" constants = %d", consts_size); if (oops_size != 0) tty->print_cr(" oops = %d", oops_size); if (scopes_data_size != 0) tty->print_cr(" scopes data = %d", scopes_data_size); if (scopes_pcs_size != 0) tty->print_cr(" scopes pcs = %d", scopes_pcs_size); @@ -404,9 +404,9 @@ void nmethod::add_handler_for_exception_and_pc(Handle exception, address pc, add int nmethod::total_size() const { return + consts_size() + insts_size() + stub_size() + - consts_size() + scopes_data_size() + scopes_pcs_size() + handler_table_size() + @@ -789,13 +789,17 @@ nmethod::nmethod( _orig_pc_offset = orig_pc_offset; // Section offsets - _consts_offset = content_offset() + code_buffer->total_offset_of(code_buffer->consts()->start()); - _stub_offset = content_offset() + code_buffer->total_offset_of(code_buffer->stubs()->start()); + _consts_offset = content_offset() + code_buffer->total_offset_of(code_buffer->consts()); + _stub_offset = content_offset() + code_buffer->total_offset_of(code_buffer->stubs()); // Exception handler and deopt handler are in the stub section _exception_offset = _stub_offset + offsets->value(CodeOffsets::Exceptions); _deoptimize_offset = _stub_offset + offsets->value(CodeOffsets::Deopt); - _deoptimize_mh_offset = _stub_offset + offsets->value(CodeOffsets::DeoptMH); + if (has_method_handle_invokes()) { + _deoptimize_mh_offset = _stub_offset + offsets->value(CodeOffsets::DeoptMH); + } else { + _deoptimize_mh_offset = -1; + } if (offsets->value(CodeOffsets::UnwindHandler) != -1) { _unwind_handler_offset = code_offset() + offsets->value(CodeOffsets::UnwindHandler); } else { @@ -885,9 +889,9 @@ void nmethod::log_new_nmethod() const { xtty->print(" address='" INTPTR_FORMAT "'", (intptr_t) this); LOG_OFFSET(xtty, relocation); + LOG_OFFSET(xtty, consts); 
LOG_OFFSET(xtty, insts); LOG_OFFSET(xtty, stub); - LOG_OFFSET(xtty, consts); LOG_OFFSET(xtty, scopes_data); LOG_OFFSET(xtty, scopes_pcs); LOG_OFFSET(xtty, dependencies); @@ -2336,6 +2340,10 @@ void nmethod::print() const { relocation_begin(), relocation_end(), relocation_size()); + if (consts_size () > 0) tty->print_cr(" constants [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d", + consts_begin(), + consts_end(), + consts_size()); if (insts_size () > 0) tty->print_cr(" main code [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d", insts_begin(), insts_end(), @@ -2344,10 +2352,6 @@ void nmethod::print() const { stub_begin(), stub_end(), stub_size()); - if (consts_size () > 0) tty->print_cr(" constants [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d", - consts_begin(), - consts_end(), - consts_size()); if (oops_size () > 0) tty->print_cr(" oops [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d", oops_begin(), oops_end(), @@ -2372,10 +2376,6 @@ void nmethod::print() const { nul_chk_table_begin(), nul_chk_table_end(), nul_chk_table_size()); - if (oops_size () > 0) tty->print_cr(" oops [" INTPTR_FORMAT "," INTPTR_FORMAT "] = %d", - oops_begin(), - oops_end(), - oops_size()); } void nmethod::print_code() { diff --git a/hotspot/src/share/vm/code/nmethod.hpp b/hotspot/src/share/vm/code/nmethod.hpp index ff37f96b888..2be8d08b455 100644 --- a/hotspot/src/share/vm/code/nmethod.hpp +++ b/hotspot/src/share/vm/code/nmethod.hpp @@ -143,8 +143,8 @@ class nmethod : public CodeBlob { #ifdef HAVE_DTRACE_H int _trap_offset; #endif // def HAVE_DTRACE_H - int _stub_offset; int _consts_offset; + int _stub_offset; int _oops_offset; // offset to where embedded oop table begins (inside data) int _scopes_data_offset; int _scopes_pcs_offset; @@ -336,16 +336,16 @@ class nmethod : public CodeBlob { bool is_compiled_by_shark() const; // boundaries for different parts - address insts_begin () const { return code_begin(); } + address consts_begin () const { return header_begin() + _consts_offset ; } + address consts_end 
() const { return header_begin() + code_offset() ; } + address insts_begin () const { return header_begin() + code_offset() ; } address insts_end () const { return header_begin() + _stub_offset ; } + address stub_begin () const { return header_begin() + _stub_offset ; } + address stub_end () const { return header_begin() + _oops_offset ; } address exception_begin () const { return header_begin() + _exception_offset ; } address deopt_handler_begin () const { return header_begin() + _deoptimize_offset ; } address deopt_mh_handler_begin() const { return header_begin() + _deoptimize_mh_offset ; } address unwind_handler_begin () const { return _unwind_handler_offset != -1 ? (header_begin() + _unwind_handler_offset) : NULL; } - address stub_begin () const { return header_begin() + _stub_offset ; } - address stub_end () const { return header_begin() + _consts_offset ; } - address consts_begin () const { return header_begin() + _consts_offset ; } - address consts_end () const { return header_begin() + _oops_offset ; } oop* oops_begin () const { return (oop*) (header_begin() + _oops_offset) ; } oop* oops_end () const { return (oop*) (header_begin() + _scopes_data_offset) ; } @@ -361,9 +361,9 @@ class nmethod : public CodeBlob { address nul_chk_table_end () const { return header_begin() + _nmethod_end_offset ; } // Sizes + int consts_size () const { return consts_end () - consts_begin (); } int insts_size () const { return insts_end () - insts_begin (); } int stub_size () const { return stub_end () - stub_begin (); } - int consts_size () const { return consts_end () - consts_begin (); } int oops_size () const { return (address) oops_end () - (address) oops_begin (); } int scopes_data_size () const { return scopes_data_end () - scopes_data_begin (); } int scopes_pcs_size () const { return (intptr_t) scopes_pcs_end () - (intptr_t) scopes_pcs_begin (); } @@ -374,9 +374,9 @@ class nmethod : public CodeBlob { int total_size () const; // Containment + bool consts_contains (address 
addr) const { return consts_begin () <= addr && addr < consts_end (); } bool insts_contains (address addr) const { return insts_begin () <= addr && addr < insts_end (); } bool stub_contains (address addr) const { return stub_begin () <= addr && addr < stub_end (); } - bool consts_contains (address addr) const { return consts_begin () <= addr && addr < consts_end (); } bool oops_contains (oop* addr) const { return oops_begin () <= addr && addr < oops_end (); } bool scopes_data_contains (address addr) const { return scopes_data_begin () <= addr && addr < scopes_data_end (); } bool scopes_pcs_contains (PcDesc* addr) const { return scopes_pcs_begin () <= addr && addr < scopes_pcs_end (); } diff --git a/hotspot/src/share/vm/code/relocInfo.cpp b/hotspot/src/share/vm/code/relocInfo.cpp index 7df9e47e01b..a21f34642e4 100644 --- a/hotspot/src/share/vm/code/relocInfo.cpp +++ b/hotspot/src/share/vm/code/relocInfo.cpp @@ -128,7 +128,16 @@ void RelocIterator::initialize(nmethod* nm, address begin, address limit) { _code = nm; _current = nm->relocation_begin() - 1; _end = nm->relocation_end(); - _addr = (address) nm->code_begin(); + _addr = nm->content_begin(); + + // Initialize code sections. 
+ _section_start[CodeBuffer::SECT_CONSTS] = nm->consts_begin(); + _section_start[CodeBuffer::SECT_INSTS ] = nm->insts_begin() ; + _section_start[CodeBuffer::SECT_STUBS ] = nm->stub_begin() ; + + _section_end [CodeBuffer::SECT_CONSTS] = nm->consts_end() ; + _section_end [CodeBuffer::SECT_INSTS ] = nm->insts_end() ; + _section_end [CodeBuffer::SECT_STUBS ] = nm->stub_end() ; assert(!has_current(), "just checking"); assert(begin == NULL || begin >= nm->code_begin(), "in bounds"); @@ -146,9 +155,11 @@ RelocIterator::RelocIterator(CodeSection* cs, address begin, address limit) { _code = NULL; // Not cb->blob(); CodeBuffer* cb = cs->outer(); - assert((int)SECT_LIMIT == CodeBuffer::SECT_LIMIT, "my copy must be equal"); - for (int n = 0; n < (int)SECT_LIMIT; n++) { - _section_start[n] = cb->code_section(n)->start(); + assert((int) SECT_LIMIT == CodeBuffer::SECT_LIMIT, "my copy must be equal"); + for (int n = (int) CodeBuffer::SECT_FIRST; n < (int) CodeBuffer::SECT_LIMIT; n++) { + CodeSection* cs = cb->code_section(n); + _section_start[n] = cs->start(); + _section_end [n] = cs->end(); } assert(!has_current(), "just checking"); @@ -166,6 +177,12 @@ struct RelocIndexEntry { }; +bool RelocIterator::addr_in_const() const { + const int n = CodeBuffer::SECT_CONSTS; + return section_start(n) <= addr() && addr() < section_end(n); +} + + static inline int num_cards(int code_size) { return (code_size-1) / indexCardSize; } @@ -360,31 +377,12 @@ void RelocIterator::advance_over_prefix() { } -address RelocIterator::compute_section_start(int n) const { -// This routine not only computes a section start, but also -// memoizes it for later. 
-#define CACHE ((RelocIterator*)this)->_section_start[n] - CodeBlob* cb = code(); - guarantee(cb != NULL, "must have a code blob"); - if (n == CodeBuffer::SECT_INSTS) - return CACHE = cb->code_begin(); - assert(cb->is_nmethod(), "only nmethods have these sections"); - nmethod* nm = (nmethod*) cb; - address res = NULL; - switch (n) { - case CodeBuffer::SECT_STUBS: - res = nm->stub_begin(); - break; - case CodeBuffer::SECT_CONSTS: - res = nm->consts_begin(); - break; - default: - ShouldNotReachHere(); +void RelocIterator::initialize_misc() { + set_has_current(false); + for (int i = (int) CodeBuffer::SECT_FIRST; i < (int) CodeBuffer::SECT_LIMIT; i++) { + _section_start[i] = NULL; // these will be lazily computed, if needed + _section_end [i] = NULL; } - assert(nm->contains(res) || res == nm->code_end(), "tame pointer"); - CACHE = res; - return res; -#undef CACHE } diff --git a/hotspot/src/share/vm/code/relocInfo.hpp b/hotspot/src/share/vm/code/relocInfo.hpp index 7bb1887376b..116a097f395 100644 --- a/hotspot/src/share/vm/code/relocInfo.hpp +++ b/hotspot/src/share/vm/code/relocInfo.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2008, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -502,8 +502,7 @@ class RelocationHolder VALUE_OBJ_CLASS_SPEC { // } class RelocIterator : public StackObj { - enum { SECT_CONSTS = 2, - SECT_LIMIT = 3 }; // must be equal to CodeBuffer::SECT_LIMIT + enum { SECT_LIMIT = 3 }; // must be equal to CodeBuffer::SECT_LIMIT, checked in ctor friend class Relocation; friend class relocInfo; // for change_reloc_info_for_address only typedef relocInfo::relocType relocType; @@ -521,6 +520,7 @@ class RelocIterator : public StackObj { // Base addresses needed to compute targets of section_word_type relocs. 
address _section_start[SECT_LIMIT]; + address _section_end [SECT_LIMIT]; void set_has_current(bool b) { _datalen = !b ? -1 : 0; @@ -540,14 +540,7 @@ class RelocIterator : public StackObj { void advance_over_prefix(); // helper method - void initialize_misc() { - set_has_current(false); - for (int i = 0; i < SECT_LIMIT; i++) { - _section_start[i] = NULL; // these will be lazily computed, if needed - } - } - - address compute_section_start(int n) const; // out-of-line helper + void initialize_misc(); void initialize(nmethod* nm, address begin, address limit); @@ -598,11 +591,15 @@ class RelocIterator : public StackObj { bool has_current() const { return _datalen >= 0; } void set_addr(address addr) { _addr = addr; } - bool addr_in_const() const { return addr() >= section_start(SECT_CONSTS); } + bool addr_in_const() const; address section_start(int n) const { - address res = _section_start[n]; - return (res != NULL) ? res : compute_section_start(n); + assert(_section_start[n], "must be initialized"); + return _section_start[n]; + } + address section_end(int n) const { + assert(_section_end[n], "must be initialized"); + return _section_end[n]; } // The address points to the affected displacement part of the instruction. 
diff --git a/hotspot/src/share/vm/includeDB_compiler2 b/hotspot/src/share/vm/includeDB_compiler2 index a231dcb4f7f..1e1ab5aa0ba 100644 --- a/hotspot/src/share/vm/includeDB_compiler2 +++ b/hotspot/src/share/vm/includeDB_compiler2 @@ -625,6 +625,7 @@ loopTransform.cpp divnode.hpp loopTransform.cpp loopnode.hpp loopTransform.cpp mulnode.hpp loopTransform.cpp rootnode.hpp +loopTransform.cpp runtime.hpp loopTransform.cpp subnode.hpp loopUnswitch.cpp allocation.inline.hpp diff --git a/hotspot/src/share/vm/opto/addnode.cpp b/hotspot/src/share/vm/opto/addnode.cpp index 5f2332a14f6..b6d073d5b9e 100644 --- a/hotspot/src/share/vm/opto/addnode.cpp +++ b/hotspot/src/share/vm/opto/addnode.cpp @@ -705,6 +705,9 @@ int AddPNode::unpack_offsets(Node* elements[], int length) { } addr = addr->in(AddPNode::Address); } + if (addr != base) { + return -1; + } return count; } diff --git a/hotspot/src/share/vm/opto/c2_globals.hpp b/hotspot/src/share/vm/opto/c2_globals.hpp index e07e7fdec2e..9b35d5be86a 100644 --- a/hotspot/src/share/vm/opto/c2_globals.hpp +++ b/hotspot/src/share/vm/opto/c2_globals.hpp @@ -157,6 +157,12 @@ develop(bool, TraceLoopPredicate, false, \ "Trace generation of loop predicates") \ \ + product(bool, OptimizeFill, false, \ + "convert fill/copy loops into intrinsic") \ + \ + develop(bool, TraceOptimizeFill, false, \ + "print detailed information about fill conversion") \ + \ develop(bool, OptoCoalesce, true, \ "Use Conservative Copy Coalescing in the Register Allocator") \ \ diff --git a/hotspot/src/share/vm/opto/loopTransform.cpp b/hotspot/src/share/vm/opto/loopTransform.cpp index 31daf7d8fef..94d74d3a828 100644 --- a/hotspot/src/share/vm/opto/loopTransform.cpp +++ b/hotspot/src/share/vm/opto/loopTransform.cpp @@ -2049,11 +2049,18 @@ bool IdealLoopTree::is_range_check_if(IfNode *iff, PhaseIdealLoop *phase, Invari if (cmp->Opcode() != Op_CmpU ) { return false; } - if (cmp->in(2)->Opcode() != Op_LoadRange) { - return false; + Node* range = cmp->in(2); + if 
(range->Opcode() != Op_LoadRange) { + const TypeInt* tint = phase->_igvn.type(range)->isa_int(); + if (!OptimizeFill || tint == NULL || tint->empty() || tint->_lo < 0) { + // Allow predication on positive values that aren't LoadRanges. + // This allows optimization of loops where the length of the + // array is a known value and doesn't need to be loaded back + // from the array. + return false; + } } - LoadRangeNode* lr = (LoadRangeNode*)cmp->in(2); - if (!invar.is_invariant(lr)) { // loadRange must be invariant + if (!invar.is_invariant(range)) { return false; } Node *iv = _head->as_CountedLoop()->phi(); @@ -2248,9 +2255,9 @@ bool PhaseIdealLoop::loop_predication_impl(IdealLoopTree *loop) { const Node* cmp = bol->in(1)->as_Cmp(); Node* idx = cmp->in(1); assert(!invar.is_invariant(idx), "index is variant"); - assert(cmp->in(2)->Opcode() == Op_LoadRange, "must be"); - Node* ld_rng = cmp->in(2); // LoadRangeNode - assert(invar.is_invariant(ld_rng), "load range must be invariant"); + assert(cmp->in(2)->Opcode() == Op_LoadRange || OptimizeFill, "must be"); + Node* rng = cmp->in(2); + assert(invar.is_invariant(rng), "range must be invariant"); int scale = 1; Node* offset = zero; bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset); @@ -2271,21 +2278,21 @@ bool PhaseIdealLoop::loop_predication_impl(IdealLoopTree *loop) { // Perform cloning to keep Invariance state correct since the // late schedule will place invariant things in the loop. 
- ld_rng = invar.clone(ld_rng, ctrl); + rng = invar.clone(rng, ctrl); if (offset && offset != zero) { assert(invar.is_invariant(offset), "offset must be loop invariant"); offset = invar.clone(offset, ctrl); } // Test the lower bound - Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, false); + Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, false); IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If(); _igvn.hash_delete(lower_bound_iff); lower_bound_iff->set_req(1, lower_bound_bol); if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx); // Test the upper bound - Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, true); + Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, true); IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If(); _igvn.hash_delete(upper_bound_iff); upper_bound_iff->set_req(1, upper_bound_bol); @@ -2366,3 +2373,348 @@ bool IdealLoopTree::loop_predication( PhaseIdealLoop *phase) { return hoisted; } + + +// Process all the loops in the loop tree and replace any fill +// patterns with an intrinsic version. +bool PhaseIdealLoop::do_intrinsify_fill() { + bool changed = false; + for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) { + IdealLoopTree* lpt = iter.current(); + changed |= intrinsify_fill(lpt); + } + return changed; +} + + +// Examine an inner loop looking for a single store of an invariant +// value in a unit stride loop. +bool PhaseIdealLoop::match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value, + Node*& shift, Node*& con) { + const char* msg = NULL; + Node* msg_node = NULL; + + store_value = NULL; + con = NULL; + shift = NULL; + + // Process the loop looking for stores. If there are multiple + // stores or extra control flow give up at this point. 
+ CountedLoopNode* head = lpt->_head->as_CountedLoop(); + for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) { + Node* n = lpt->_body.at(i); + if (n->outcnt() == 0) continue; // Ignore dead + if (n->is_Store()) { + if (store != NULL) { + msg = "multiple stores"; + break; + } + int opc = n->Opcode(); + if (opc == Op_StoreP || opc == Op_StoreN || opc == Op_StoreCM) { + msg = "oop fills not handled"; + break; + } + Node* value = n->in(MemNode::ValueIn); + if (!lpt->is_invariant(value)) { + msg = "variant store value"; + } + store = n; + store_value = value; + } else if (n->is_If() && n != head->loopexit()) { + msg = "extra control flow"; + msg_node = n; + } + } + + if (store == NULL) { + // No store in loop + return false; + } + + if (msg == NULL && head->stride_con() != 1) { + // could handle negative strides too + if (head->stride_con() < 0) { + msg = "negative stride"; + } else { + msg = "non-unit stride"; + } + } + + if (msg == NULL && !store->in(MemNode::Address)->is_AddP()) { + msg = "can't handle store address"; + msg_node = store->in(MemNode::Address); + } + + // Make sure there is an appropriate fill routine + BasicType t = store->as_Mem()->memory_type(); + const char* fill_name; + if (msg == NULL && + StubRoutines::select_fill_function(t, false, fill_name) == NULL) { + msg = "unsupported store"; + msg_node = store; + } + + if (msg != NULL) { +#ifndef PRODUCT + if (TraceOptimizeFill) { + tty->print_cr("not fill intrinsic candidate: %s", msg); + if (msg_node != NULL) msg_node->dump(); + } +#endif + return false; + } + + // Make sure the address expression can be handled. It should be + // head->phi * elsize + con. head->phi might have a ConvI2L. 
+ Node* elements[4]; + Node* conv = NULL; + int count = store->in(MemNode::Address)->as_AddP()->unpack_offsets(elements, ARRAY_SIZE(elements)); + for (int e = 0; e < count; e++) { + Node* n = elements[e]; + if (n->is_Con() && con == NULL) { + con = n; + } else if (n->Opcode() == Op_LShiftX && shift == NULL) { + Node* value = n->in(1); +#ifdef _LP64 + if (value->Opcode() == Op_ConvI2L) { + conv = value; + value = value->in(1); + } +#endif + if (value != head->phi()) { + msg = "unhandled shift in address"; + } else { + shift = n; + assert(type2aelembytes(store->as_Mem()->memory_type(), true) == 1 << shift->in(2)->get_int(), "scale should match"); + } + } else if (n->Opcode() == Op_ConvI2L && conv == NULL) { + if (n->in(1) == head->phi()) { + conv = n; + } else { + msg = "unhandled input to ConvI2L"; + } + } else if (n == head->phi()) { + // no shift, check below for allowed cases + } else { + msg = "unhandled node in address"; + msg_node = n; + } + } + + if (count == -1) { + msg = "malformed address expression"; + msg_node = store; + } + + // byte sized items won't have a shift + if (msg == NULL && shift == NULL && t != T_BYTE && t != T_BOOLEAN) { + msg = "can't find shift"; + msg_node = store; + } + + if (msg != NULL) { +#ifndef PRODUCT + if (TraceOptimizeFill) { + tty->print_cr("not fill intrinsic: %s", msg); + if (msg_node != NULL) msg_node->dump(); + } +#endif + return false; + } + + // Now make sure all the other nodes in the loop can be handled + VectorSet ok(Thread::current()->resource_area()); + + // store related values are ok + ok.set(store->_idx); + ok.set(store->in(MemNode::Memory)->_idx); + + // Loop structure is ok + ok.set(head->_idx); + ok.set(head->loopexit()->_idx); + ok.set(head->phi()->_idx); + ok.set(head->incr()->_idx); + ok.set(head->loopexit()->cmp_node()->_idx); + ok.set(head->loopexit()->in(1)->_idx); + + // Address elements are ok + if (con) ok.set(con->_idx); + if (shift) ok.set(shift->_idx); + if (conv) ok.set(conv->_idx); + + for (uint i 
= 0; msg == NULL && i < lpt->_body.size(); i++) { + Node* n = lpt->_body.at(i); + if (n->outcnt() == 0) continue; // Ignore dead + if (ok.test(n->_idx)) continue; + // Backedge projection is ok + if (n->is_IfTrue() && n->in(0) == head->loopexit()) continue; + if (!n->is_AddP()) { + msg = "unhandled node"; + msg_node = n; + break; + } + } + + // Make sure no unexpected values are used outside the loop + for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) { + Node* n = lpt->_body.at(i); + // These values can be replaced with other nodes if they are used + // outside the loop. + if (n == store || n == head->loopexit() || n == head->incr()) continue; + for (SimpleDUIterator iter(n); iter.has_next(); iter.next()) { + Node* use = iter.get(); + if (!lpt->_body.contains(use)) { + msg = "node is used outside loop"; + // lpt->_body.dump(); + msg_node = n; + break; + } + } + } + +#ifdef ASSERT + if (TraceOptimizeFill) { + if (msg != NULL) { + tty->print_cr("no fill intrinsic: %s", msg); + if (msg_node != NULL) msg_node->dump(); + } else { + tty->print_cr("fill intrinsic for:"); + } + store->dump(); + if (Verbose) { + lpt->_body.dump(); + } + } +#endif + + return msg == NULL; +} + + + +bool PhaseIdealLoop::intrinsify_fill(IdealLoopTree* lpt) { + // Only for counted inner loops + if (!lpt->is_counted() || !lpt->is_inner()) { + return false; + } + + // Must have constant stride + CountedLoopNode* head = lpt->_head->as_CountedLoop(); + if (!head->stride_is_con() || !head->is_normal_loop()) { + return false; + } + + // Check that the body only contains a store of a loop invariant + // value that is indexed by the loop phi. + Node* store = NULL; + Node* store_value = NULL; + Node* shift = NULL; + Node* offset = NULL; + if (!match_fill_loop(lpt, store, store_value, shift, offset)) { + return false; + } + + // Now replace the whole loop body by a call to a fill routine that + // covers the same region as the loop. 
+ Node* base = store->in(MemNode::Address)->as_AddP()->in(AddPNode::Base); + + // Build an expression for the beginning of the copy region + Node* index = head->init_trip(); +#ifdef _LP64 + index = new (C, 2) ConvI2LNode(index); + _igvn.register_new_node_with_optimizer(index); +#endif + if (shift != NULL) { + // byte arrays don't require a shift but others do. + index = new (C, 3) LShiftXNode(index, shift->in(2)); + _igvn.register_new_node_with_optimizer(index); + } + index = new (C, 4) AddPNode(base, base, index); + _igvn.register_new_node_with_optimizer(index); + Node* from = new (C, 4) AddPNode(base, index, offset); + _igvn.register_new_node_with_optimizer(from); + // Compute the number of elements to copy + Node* len = new (C, 3) SubINode(head->limit(), head->init_trip()); + _igvn.register_new_node_with_optimizer(len); + + BasicType t = store->as_Mem()->memory_type(); + bool aligned = false; + if (offset != NULL && head->init_trip()->is_Con()) { + int element_size = type2aelembytes(t); + aligned = (offset->find_intptr_t_type()->get_con() + head->init_trip()->get_int() * element_size) % HeapWordSize == 0; + } + + // Build a call to the fill routine + const char* fill_name; + address fill = StubRoutines::select_fill_function(t, aligned, fill_name); + assert(fill != NULL, "what?"); + + // Convert float/double to int/long for fill routines + if (t == T_FLOAT) { + store_value = new (C, 2) MoveF2INode(store_value); + _igvn.register_new_node_with_optimizer(store_value); + } else if (t == T_DOUBLE) { + store_value = new (C, 2) MoveD2LNode(store_value); + _igvn.register_new_node_with_optimizer(store_value); + } + + Node* mem_phi = store->in(MemNode::Memory); + Node* result_ctrl; + Node* result_mem; + const TypeFunc* call_type = OptoRuntime::array_fill_Type(); + int size = call_type->domain()->cnt(); + CallLeafNode *call = new (C, size) CallLeafNoFPNode(call_type, fill, + fill_name, TypeAryPtr::get_array_body_type(t)); + call->init_req(TypeFunc::Parms+0, from); + 
+ call->init_req(TypeFunc::Parms+1, store_value); + call->init_req(TypeFunc::Parms+2, len); + call->init_req( TypeFunc::Control, head->init_control()); + call->init_req( TypeFunc::I_O , C->top() ) ; // does no i/o + call->init_req( TypeFunc::Memory , mem_phi->in(LoopNode::EntryControl) ); + call->init_req( TypeFunc::ReturnAdr, C->start()->proj_out(TypeFunc::ReturnAdr) ); + call->init_req( TypeFunc::FramePtr, C->start()->proj_out(TypeFunc::FramePtr) ); + _igvn.register_new_node_with_optimizer(call); + result_ctrl = new (C, 1) ProjNode(call,TypeFunc::Control); + _igvn.register_new_node_with_optimizer(result_ctrl); + result_mem = new (C, 1) ProjNode(call,TypeFunc::Memory); + _igvn.register_new_node_with_optimizer(result_mem); + + // If this fill is tightly coupled to an allocation and overwrites + // the whole body, allow it to take over the zeroing. + AllocateNode* alloc = AllocateNode::Ideal_allocation(base, this); + if (alloc != NULL && alloc->is_AllocateArray()) { + Node* length = alloc->as_AllocateArray()->Ideal_length(); + if (head->limit() == length && + head->init_trip() == _igvn.intcon(0)) { + if (TraceOptimizeFill) { + tty->print_cr("Eliminated zeroing in allocation"); + } + alloc->maybe_set_complete(&_igvn); + } else { +#ifdef ASSERT + if (TraceOptimizeFill) { + tty->print_cr("filling array but bounds don't match"); + alloc->dump(); + head->init_trip()->dump(); + head->limit()->dump(); + length->dump(); + } +#endif + } + } + + // Redirect the old control and memory edges that are outside the loop. + Node* exit = head->loopexit()->proj_out(0); + _igvn.replace_node(exit, result_ctrl); + _igvn.replace_node(store, result_mem); + // Any uses of the increment outside of the loop become the loop limit. + _igvn.replace_node(head->incr(), head->limit()); + + // Disconnect the head from the loop. 
+ for (uint i = 0; i < lpt->_body.size(); i++) { + Node* n = lpt->_body.at(i); + _igvn.replace_node(n, C->top()); + } + + return true; +} diff --git a/hotspot/src/share/vm/opto/loopnode.cpp b/hotspot/src/share/vm/opto/loopnode.cpp index df9224af021..17277fa572a 100644 --- a/hotspot/src/share/vm/opto/loopnode.cpp +++ b/hotspot/src/share/vm/opto/loopnode.cpp @@ -1673,6 +1673,12 @@ void PhaseIdealLoop::build_and_optimize(bool do_split_ifs, bool do_loop_pred) { _ltree_root->_child->loop_predication(this); } + if (OptimizeFill && UseLoopPredicate && C->has_loops() && !C->major_progress()) { + if (do_intrinsify_fill()) { + C->set_major_progress(); + } + } + // Perform iteration-splitting on inner loops. Split iterations to avoid // range checks or one-shot null checks. diff --git a/hotspot/src/share/vm/opto/loopnode.hpp b/hotspot/src/share/vm/opto/loopnode.hpp index 0fd4c6bbc50..3b3b42053a0 100644 --- a/hotspot/src/share/vm/opto/loopnode.hpp +++ b/hotspot/src/share/vm/opto/loopnode.hpp @@ -937,6 +937,12 @@ public: // same block. Split thru the Region. void do_split_if( Node *iff ); + // Conversion of fill/copy patterns into intrinsic versions + bool do_intrinsify_fill(); + bool intrinsify_fill(IdealLoopTree* lpt); + bool match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value, + Node*& shift, Node*& offset); + private: // Return a type based on condition control flow const TypeInt* filtered_type( Node *n, Node* n_ctrl); diff --git a/hotspot/src/share/vm/opto/memnode.cpp b/hotspot/src/share/vm/opto/memnode.cpp index 9b6e26f9883..9297e4e224c 100644 --- a/hotspot/src/share/vm/opto/memnode.cpp +++ b/hotspot/src/share/vm/opto/memnode.cpp @@ -1547,8 +1547,8 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const { adr->is_AddP() && off != Type::OffsetBot) { // For constant Strings treat the fields as compile time constants. 
Node* base = adr->in(AddPNode::Base); - if (base->Opcode() == Op_ConP) { - const TypeOopPtr* t = phase->type(base)->isa_oopptr(); + const TypeOopPtr* t = phase->type(base)->isa_oopptr(); + if (t != NULL && t->singleton()) { ciObject* string = t->const_oop(); ciConstant constant = string->as_instance()->field_value_by_offset(off); if (constant.basic_type() == T_INT) { diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp index e56b139d6bb..389c3f9ee7f 100644 --- a/hotspot/src/share/vm/opto/runtime.cpp +++ b/hotspot/src/share/vm/opto/runtime.cpp @@ -645,6 +645,22 @@ const TypeFunc* OptoRuntime::generic_arraycopy_Type() { } +const TypeFunc* OptoRuntime::array_fill_Type() { + // create input type (domain) + const Type** fields = TypeTuple::fields(3); + fields[TypeFunc::Parms+0] = TypePtr::NOTNULL; + fields[TypeFunc::Parms+1] = TypeInt::INT; + fields[TypeFunc::Parms+2] = TypeInt::INT; + const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms + 3, fields); + + // create result type + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; // void + const TypeTuple *range = TypeTuple::make(TypeFunc::Parms, fields); + + return TypeFunc::make(domain, range); +} + //------------- Interpreter state access for on stack replacement const TypeFunc* OptoRuntime::osr_end_Type() { // create input type (domain) diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp index 9c930426bf9..c5053853060 100644 --- a/hotspot/src/share/vm/opto/runtime.hpp +++ b/hotspot/src/share/vm/opto/runtime.hpp @@ -260,6 +260,8 @@ private: static const TypeFunc* generic_arraycopy_Type(); static const TypeFunc* slow_arraycopy_Type(); // the full routine + static const TypeFunc* array_fill_Type(); + // leaf on stack replacement interpreter accessor types static const TypeFunc* osr_end_Type(); diff --git a/hotspot/src/share/vm/opto/type.cpp b/hotspot/src/share/vm/opto/type.cpp index 7f95984273f..284159974ae 100644 --- 
a/hotspot/src/share/vm/opto/type.cpp +++ b/hotspot/src/share/vm/opto/type.cpp @@ -314,7 +314,7 @@ void Type::Initialize_shared(Compile* current) { mreg2type[Op_RegL] = TypeLong::LONG; mreg2type[Op_RegFlags] = TypeInt::CC; - TypeAryPtr::RANGE = TypeAryPtr::make( TypePtr::BotPTR, TypeAry::make(Type::BOTTOM,TypeInt::POS), current->env()->Object_klass(), false, arrayOopDesc::length_offset_in_bytes()); + TypeAryPtr::RANGE = TypeAryPtr::make( TypePtr::BotPTR, TypeAry::make(Type::BOTTOM,TypeInt::POS), NULL /* current->env()->Object_klass() */, false, arrayOopDesc::length_offset_in_bytes()); TypeAryPtr::NARROWOOPS = TypeAryPtr::make(TypePtr::BotPTR, TypeAry::make(TypeNarrowOop::BOTTOM, TypeInt::POS), NULL /*ciArrayKlass::make(o)*/, false, Type::OffsetBot); @@ -3369,7 +3369,7 @@ const Type *TypeAryPtr::xmeet( const Type *t ) const { tary = TypeAry::make(Type::BOTTOM, tary->_size); } } - bool xk; + bool xk = false; switch (tap->ptr()) { case AnyNull: case TopPTR: @@ -3391,9 +3391,10 @@ const Type *TypeAryPtr::xmeet( const Type *t ) const { o = tap->const_oop(); xk = true; } else { - xk = this->_klass_is_exact; + // Only precise for identical arrays + xk = this->_klass_is_exact && (klass() == tap->klass()); } - return TypeAryPtr::make( ptr, o, tary, tap->_klass, xk, off, instance_id ); + return TypeAryPtr::make( ptr, o, tary, lazy_klass, xk, off, instance_id ); } case NotNull: case BotPTR: @@ -3683,12 +3684,10 @@ int TypeKlassPtr::hash(void) const { } -//------------------------------klass------------------------------------------ -// Return the defining klass for this class -ciKlass* TypeAryPtr::klass() const { - if( _klass ) return _klass; // Return cached value, if possible - - // Oops, need to compute _klass and cache it +//----------------------compute_klass------------------------------------------ +// Compute the defining klass for this class +ciKlass* TypeAryPtr::compute_klass(DEBUG_ONLY(bool verify)) const { + // Compute _klass based on element type. 
ciKlass* k_ary = NULL; const TypeInstPtr *tinst; const TypeAryPtr *tary; @@ -3715,11 +3714,39 @@ ciKlass* TypeAryPtr::klass() const { } else { // Cannot compute array klass directly from basic type, // since subtypes of TypeInt all have basic type T_INT. +#ifdef ASSERT + if (verify && el->isa_int()) { + // Check simple cases when verifying klass. + BasicType bt = T_ILLEGAL; + if (el == TypeInt::BYTE) { + bt = T_BYTE; + } else if (el == TypeInt::SHORT) { + bt = T_SHORT; + } else if (el == TypeInt::CHAR) { + bt = T_CHAR; + } else if (el == TypeInt::INT) { + bt = T_INT; + } else { + return _klass; // just return specified klass + } + return ciTypeArrayKlass::make(bt); + } +#endif assert(!el->isa_int(), "integral arrays must be pre-equipped with a class"); // Compute array klass directly from basic type k_ary = ciTypeArrayKlass::make(el->basic_type()); } + return k_ary; +} + +//------------------------------klass------------------------------------------ +// Return the defining klass for this class +ciKlass* TypeAryPtr::klass() const { + if( _klass ) return _klass; // Return cached value, if possible + + // Oops, need to compute _klass and cache it + ciKlass* k_ary = compute_klass(); if( this != TypeAryPtr::OOPS ) { // The _klass field acts as a cache of the underlying diff --git a/hotspot/src/share/vm/opto/type.hpp b/hotspot/src/share/vm/opto/type.hpp index a922307dc7f..e244fa279ce 100644 --- a/hotspot/src/share/vm/opto/type.hpp +++ b/hotspot/src/share/vm/opto/type.hpp @@ -831,11 +831,30 @@ class TypeInstPtr : public TypeOopPtr { //------------------------------TypeAryPtr------------------------------------- // Class of Java array pointers class TypeAryPtr : public TypeOopPtr { - TypeAryPtr( PTR ptr, ciObject* o, const TypeAry *ary, ciKlass* k, bool xk, int offset, int instance_id ) : TypeOopPtr(AryPtr,ptr,k,xk,o,offset, instance_id), _ary(ary) {}; + TypeAryPtr( PTR ptr, ciObject* o, const TypeAry *ary, ciKlass* k, bool xk, int offset, int instance_id ) : 
TypeOopPtr(AryPtr,ptr,k,xk,o,offset, instance_id), _ary(ary) { +#ifdef ASSERT + if (k != NULL) { + // Verify that specified klass and TypeAryPtr::klass() follow the same rules. + ciKlass* ck = compute_klass(true); + if (UseNewCode || k != ck) { + this->dump(); tty->cr(); + tty->print(" k: "); + k->print(); tty->cr(); + tty->print("ck: "); + if (ck != NULL) ck->print(); + else tty->print(""); + tty->cr(); + assert(false, "unexpected TypeAryPtr::_klass"); + } + } +#endif + } virtual bool eq( const Type *t ) const; virtual int hash() const; // Type specific hashing const TypeAry *_ary; // Array we point into + ciKlass* compute_klass(DEBUG_ONLY(bool verify = false)) const; + public: // Accessors ciKlass* klass() const; diff --git a/hotspot/src/share/vm/runtime/arguments.cpp b/hotspot/src/share/vm/runtime/arguments.cpp index 80c004d5034..6a15d99fe7e 100644 --- a/hotspot/src/share/vm/runtime/arguments.cpp +++ b/hotspot/src/share/vm/runtime/arguments.cpp @@ -1513,6 +1513,9 @@ void Arguments::set_aggressive_opts_flags() { if (AggressiveOpts && FLAG_IS_DEFAULT(OptimizeStringConcat)) { FLAG_SET_DEFAULT(OptimizeStringConcat, true); } + if (AggressiveOpts && FLAG_IS_DEFAULT(OptimizeFill)) { + FLAG_SET_DEFAULT(OptimizeFill, true); + } #endif if (AggressiveOpts) { diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp index 66bd611255d..d87c8bfae04 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.cpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp @@ -97,6 +97,15 @@ address StubRoutines::_checkcast_arraycopy = NULL; address StubRoutines::_unsafe_arraycopy = NULL; address StubRoutines::_generic_arraycopy = NULL; + +address StubRoutines::_jbyte_fill; +address StubRoutines::_jshort_fill; +address StubRoutines::_jint_fill; +address StubRoutines::_arrayof_jbyte_fill; +address StubRoutines::_arrayof_jshort_fill; +address StubRoutines::_arrayof_jint_fill; + + double (* StubRoutines::_intrinsic_log )(double) = NULL; double (* 
StubRoutines::_intrinsic_log10 )(double) = NULL; double (* StubRoutines::_intrinsic_exp )(double) = NULL; @@ -193,6 +202,46 @@ void StubRoutines::initialize2() { #undef TEST_ARRAYCOPY +#define TEST_FILL(type) \ + if (_##type##_fill != NULL) { \ + union { \ + double d; \ + type body[96]; \ + } s; \ + \ + int v = 32; \ + for (int offset = -2; offset <= 2; offset++) { \ + for (int i = 0; i < 96; i++) { \ + s.body[i] = 1; \ + } \ + type* start = s.body + 8 + offset; \ + for (int aligned = 0; aligned < 2; aligned++) { \ + if (aligned) { \ + if (((intptr_t)start) % HeapWordSize == 0) { \ + ((void (*)(type*, int, int))StubRoutines::_arrayof_##type##_fill)(start, v, 80); \ + } else { \ + continue; \ + } \ + } else { \ + ((void (*)(type*, int, int))StubRoutines::_##type##_fill)(start, v, 80); \ + } \ + for (int i = 0; i < 96; i++) { \ + if (i < (8 + offset) || i >= (88 + offset)) { \ + assert(s.body[i] == 1, "what?"); \ + } else { \ + assert(s.body[i] == 32, "what?"); \ + } \ + } \ + } \ + } \ + } \ + + TEST_FILL(jbyte); + TEST_FILL(jshort); + TEST_FILL(jint); + +#undef TEST_FILL + #define TEST_COPYRTN(type) \ test_arraycopy_func(CAST_FROM_FN_PTR(address, Copy::conjoint_##type##s_atomic), sizeof(type)); \ test_arraycopy_func(CAST_FROM_FN_PTR(address, Copy::arrayof_conjoint_##type##s), (int)MAX2(sizeof(HeapWord), sizeof(type))) @@ -313,3 +362,39 @@ JRT_LEAF(void, StubRoutines::arrayof_oop_copy(HeapWord* src, HeapWord* dest, siz Copy::arrayof_conjoint_oops(src, dest, count); gen_arraycopy_barrier((oop *) dest, count); JRT_END + + +address StubRoutines::select_fill_function(BasicType t, bool aligned, const char* &name) { +#define RETURN_STUB(xxx_fill) { \ + name = #xxx_fill; \ + return StubRoutines::xxx_fill(); } + + switch (t) { + case T_BYTE: + case T_BOOLEAN: + if (!aligned) RETURN_STUB(jbyte_fill); + RETURN_STUB(arrayof_jbyte_fill); + case T_CHAR: + case T_SHORT: + if (!aligned) RETURN_STUB(jshort_fill); + RETURN_STUB(arrayof_jshort_fill); + case T_INT: + case T_FLOAT: + 
if (!aligned) RETURN_STUB(jint_fill); + RETURN_STUB(arrayof_jint_fill); + case T_DOUBLE: + case T_LONG: + case T_ARRAY: + case T_OBJECT: + case T_NARROWOOP: + case T_ADDRESS: + // Currently unsupported + return NULL; + + default: + ShouldNotReachHere(); + return NULL; + } + +#undef RETURN_STUB +} diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp index f09d5ae1b8f..6b51e316bd8 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.hpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp @@ -148,6 +148,13 @@ class StubRoutines: AllStatic { static address _unsafe_arraycopy; static address _generic_arraycopy; + static address _jbyte_fill; + static address _jshort_fill; + static address _jint_fill; + static address _arrayof_jbyte_fill; + static address _arrayof_jshort_fill; + static address _arrayof_jint_fill; + // These are versions of the java.lang.Math methods which perform // the same operations as the intrinsic version. They are used for // constant folding in the compiler to ensure equivalence. 
If the @@ -259,6 +266,16 @@ class StubRoutines: AllStatic { static address unsafe_arraycopy() { return _unsafe_arraycopy; } static address generic_arraycopy() { return _generic_arraycopy; } + static address jbyte_fill() { return _jbyte_fill; } + static address jshort_fill() { return _jshort_fill; } + static address jint_fill() { return _jint_fill; } + static address arrayof_jbyte_fill() { return _arrayof_jbyte_fill; } + static address arrayof_jshort_fill() { return _arrayof_jshort_fill; } + static address arrayof_jint_fill() { return _arrayof_jint_fill; } + + static address select_fill_function(BasicType t, bool aligned, const char* &name); + + static double intrinsic_log(double d) { assert(_intrinsic_log != NULL, "must be defined"); return _intrinsic_log(d); diff --git a/hotspot/src/share/vm/utilities/globalDefinitions.hpp b/hotspot/src/share/vm/utilities/globalDefinitions.hpp index 97d96c21f12..2b4746a334b 100644 --- a/hotspot/src/share/vm/utilities/globalDefinitions.hpp +++ b/hotspot/src/share/vm/utilities/globalDefinitions.hpp @@ -529,7 +529,7 @@ extern int _type2aelembytes[T_CONFLICT+1]; // maps a BasicType to nof bytes used #ifdef ASSERT extern int type2aelembytes(BasicType t, bool allow_address = false); // asserts #else -inline int type2aelembytes(BasicType t) { return _type2aelembytes[t]; } +inline int type2aelembytes(BasicType t, bool allow_address = false) { return _type2aelembytes[t]; } #endif