Merge
commit 740031d711 (hotspot)
@@ -1575,6 +1575,35 @@ void Assembler::movdqa(Address dst, XMMRegister src) {
   emit_operand(src, dst);
 }
 
+void Assembler::movdqu(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  emit_byte(0xF3);
+  prefix(src, dst);
+  emit_byte(0x0F);
+  emit_byte(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  emit_byte(0xF3);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_byte(0x0F);
+  emit_byte(0x6F);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::movdqu(Address dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  emit_byte(0xF3);
+  prefix(dst, src);
+  emit_byte(0x0F);
+  emit_byte(0x7F);
+  emit_operand(src, dst);
+}
+
 // Uses zero extension on 64bit
 
 void Assembler::movl(Register dst, int32_t imm32) {
@@ -1055,6 +1055,11 @@ private:
   void movdqa(XMMRegister dst, Address src);
   void movdqa(XMMRegister dst, XMMRegister src);
 
+  // Move Unaligned Double Quadword
+  void movdqu(Address dst, XMMRegister src);
+  void movdqu(XMMRegister dst, Address src);
+  void movdqu(XMMRegister dst, XMMRegister src);
+
   void movl(Register dst, int32_t imm32);
   void movl(Address dst, int32_t imm32);
   void movl(Register dst, Register src);
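
For reference, the byte sequences emitted by the three overloads above correspond to the two MOVDQU opcodes in the Intel instruction set reference (a summary drawn from the manual, not from the patch itself):

    // F3 0F 6F /r   movdqu xmm, xmm/m128   -- the two load overloads
    // F3 0F 7F /r   movdqu xmm/m128, xmm   -- the store overload
    // In the register-register form, emit_byte(0xC0 | encode) builds the ModRM
    // byte: mod = 11 (register-direct) with both register numbers packed in
    // by prefixq_and_encode().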
@@ -791,6 +791,69 @@ class StubGenerator: public StubCodeGenerator {
     }
   }
 
+
+  // Copy 64 bytes chunks
+  //
+  // Inputs:
+  //   from        - source array address
+  //   to_from     - destination array address - from
+  //   qword_count - 8-bytes element count, negative
+  //
+  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
+    assert( UseSSE >= 2, "supported cpu only" );
+    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    // Copy 64-byte chunks
+    __ jmpb(L_copy_64_bytes);
+    __ align(16);
+  __ BIND(L_copy_64_bytes_loop);
+
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, 0));
+      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movdqu(xmm1, Address(from, 16));
+      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+      __ movdqu(xmm2, Address(from, 32));
+      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+      __ movdqu(xmm3, Address(from, 48));
+      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+
+    } else {
+      __ movq(xmm0, Address(from, 0));
+      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
+      __ movq(xmm1, Address(from, 8));
+      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
+      __ movq(xmm2, Address(from, 16));
+      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
+      __ movq(xmm3, Address(from, 24));
+      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
+      __ movq(xmm4, Address(from, 32));
+      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
+      __ movq(xmm5, Address(from, 40));
+      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
+      __ movq(xmm6, Address(from, 48));
+      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
+      __ movq(xmm7, Address(from, 56));
+      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
+    }
+
+    __ addl(from, 64);
+  __ BIND(L_copy_64_bytes);
+    __ subl(qword_count, 8);
+    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+    __ addl(qword_count, 8);
+    __ jccb(Assembler::zero, L_exit);
+    //
+    // length is too short, just copy qwords
+    //
+  __ BIND(L_copy_8_bytes);
+    __ movq(xmm0, Address(from, 0));
+    __ movq(Address(from, to_from, Address::times_1), xmm0);
+    __ addl(from, 8);
+    __ decrement(qword_count);
+    __ jcc(Assembler::greater, L_copy_8_bytes);
+  __ BIND(L_exit);
+  }
+
   // Copy 64 bytes chunks
   //
   // Inputs:
@@ -799,6 +862,7 @@ class StubGenerator: public StubCodeGenerator {
   //   qword_count - 8-bytes element count, negative
   //
   void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
     assert( VM_Version::supports_mmx(), "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
@@ -876,7 +940,7 @@ class StubGenerator: public StubCodeGenerator {
     __ subptr(to, from); // to --> to_from
     __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
     __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
-    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
+    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
       // align source address at 4 bytes address boundary
       if (t == T_BYTE) {
         // One byte misalignment happens only for byte arrays
@@ -906,20 +970,26 @@ class StubGenerator: public StubCodeGenerator {
       __ mov(count, rax);      // restore 'count'
       __ jmpb(L_copy_2_bytes); // all dwords were copied
     } else {
-      // align to 8 bytes, we know we are 4 byte aligned to start
-      __ testptr(from, 4);
-      __ jccb(Assembler::zero, L_copy_64_bytes);
-      __ movl(rax, Address(from, 0));
-      __ movl(Address(from, to_from, Address::times_1, 0), rax);
-      __ addptr(from, 4);
-      __ subl(count, 1<<shift);
+      if (!UseUnalignedLoadStores) {
+        // align to 8 bytes, we know we are 4 byte aligned to start
+        __ testptr(from, 4);
+        __ jccb(Assembler::zero, L_copy_64_bytes);
+        __ movl(rax, Address(from, 0));
+        __ movl(Address(from, to_from, Address::times_1, 0), rax);
+        __ addptr(from, 4);
+        __ subl(count, 1<<shift);
+      }
     __ BIND(L_copy_64_bytes);
       __ mov(rax, count);
       __ shrl(rax, shift+1); // 8 bytes chunk count
       //
       // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
       //
-      mmx_copy_forward(from, to_from, rax);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, rax);
+      } else {
+        mmx_copy_forward(from, to_from, rax);
+      }
     }
     // copy tailing dword
   __ BIND(L_copy_4_bytes);
@@ -1069,13 +1139,20 @@ class StubGenerator: public StubCodeGenerator {
     __ align(16);
     // Move 8 bytes
   __ BIND(L_copy_8_bytes_loop);
-    __ movq(mmx0, Address(from, count, sf, 0));
-    __ movq(Address(to, count, sf, 0), mmx0);
+    if (UseXMMForArrayCopy) {
+      __ movq(xmm0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), xmm0);
+    } else {
+      __ movq(mmx0, Address(from, count, sf, 0));
+      __ movq(Address(to, count, sf, 0), mmx0);
+    }
   __ BIND(L_copy_8_bytes);
     __ subl(count, 2<<shift);
    __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
    __ addl(count, 2<<shift);
-    __ emms();
+    if (!UseXMMForArrayCopy) {
+      __ emms();
+    }
   }
   __ BIND(L_copy_4_bytes);
    // copy prefix qword
@@ -1143,7 +1220,11 @@ class StubGenerator: public StubCodeGenerator {
 
     __ subptr(to, from); // to --> to_from
     if (VM_Version::supports_mmx()) {
-      mmx_copy_forward(from, to_from, count);
+      if (UseXMMForArrayCopy) {
+        xmm_copy_forward(from, to_from, count);
+      } else {
+        mmx_copy_forward(from, to_from, count);
+      }
     } else {
       __ jmpb(L_copy_8_bytes);
       __ align(16);
@@ -1196,8 +1277,13 @@ class StubGenerator: public StubCodeGenerator {
     __ align(16);
   __ BIND(L_copy_8_bytes_loop);
     if (VM_Version::supports_mmx()) {
-      __ movq(mmx0, Address(from, count, Address::times_8));
-      __ movq(Address(to, count, Address::times_8), mmx0);
+      if (UseXMMForArrayCopy) {
+        __ movq(xmm0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), xmm0);
+      } else {
+        __ movq(mmx0, Address(from, count, Address::times_8));
+        __ movq(Address(to, count, Address::times_8), mmx0);
+      }
     } else {
       __ fild_d(Address(from, count, Address::times_8));
       __ fistp_d(Address(to, count, Address::times_8));
@@ -1206,7 +1292,7 @@ class StubGenerator: public StubCodeGenerator {
     __ decrement(count);
     __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
 
-    if (VM_Version::supports_mmx()) {
+    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
      __ emms();
    }
    inc_copy_counter_np(T_LONG);
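
Throughout these stubs, to_from holds the destination address minus the source address, so a single incremented register addresses both arrays: Address(from, to_from, times_1, off) resolves to from + (to - from) + off, i.e. to + off. A minimal C++ sketch of the same trick (hypothetical helper names, not HotSpot code):

    #include <cstring>

    void copy_qwords(char* from, char* to, long qword_count) {
      long to_from = to - from;                     // computed once, like "__ subptr(to, from)"
      while (qword_count-- > 0) {
        long long v;
        std::memcpy(&v, from, sizeof v);            // load  Address(from, 0)
        std::memcpy(from + to_from, &v, sizeof v);  // store Address(from, to_from, times_1)
        from += 8;                                  // one "addl(from, 8)" advances both sides
      }
    }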
@@ -1251,6 +1251,7 @@ class StubGenerator: public StubCodeGenerator {
     }
   }
 
+
   // Copy big chunks forward
   //
   // Inputs:
@@ -1268,14 +1269,22 @@ class StubGenerator: public StubCodeGenerator {
     Label L_loop;
     __ align(16);
   __ BIND(L_loop);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
-    __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
-    __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
-    __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
-    __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
-    __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
+      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
+
+    } else {
+      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
+      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
+      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
+      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
+      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
+      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
+      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
+      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+    }
   __ BIND(L_copy_32_bytes);
     __ addptr(qword_count, 4);
     __ jcc(Assembler::lessEqual, L_loop);
@@ -1301,14 +1310,22 @@ class StubGenerator: public StubCodeGenerator {
     Label L_loop;
     __ align(16);
   __ BIND(L_loop);
-    __ movq(to, Address(from, qword_count, Address::times_8, 24));
-    __ movq(Address(dest, qword_count, Address::times_8, 24), to);
-    __ movq(to, Address(from, qword_count, Address::times_8, 16));
-    __ movq(Address(dest, qword_count, Address::times_8, 16), to);
-    __ movq(to, Address(from, qword_count, Address::times_8,  8));
-    __ movq(Address(dest, qword_count, Address::times_8,  8), to);
-    __ movq(to, Address(from, qword_count, Address::times_8,  0));
-    __ movq(Address(dest, qword_count, Address::times_8,  0), to);
+    if(UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
+      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
+      __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
+      __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
+
+    } else {
+      __ movq(to, Address(from, qword_count, Address::times_8, 24));
+      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
+      __ movq(to, Address(from, qword_count, Address::times_8, 16));
+      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
+      __ movq(to, Address(from, qword_count, Address::times_8,  8));
+      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
+      __ movq(to, Address(from, qword_count, Address::times_8,  0));
+      __ movq(Address(dest, qword_count, Address::times_8,  0), to);
+    }
   __ BIND(L_copy_32_bytes);
     __ subptr(qword_count, 4);
     __ jcc(Assembler::greaterEqual, L_loop);
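
With UseUnalignedLoadStores, each 32-byte chunk moves with two 16-byte movdqu pairs instead of four 8-byte movq pairs, halving the loads and stores per iteration. A rough standalone C++ equivalent using SSE2 intrinsics (illustrative only; the real stubs index with a qword count that runs toward zero):

    #include <emmintrin.h>   // SSE2 intrinsics
    #include <cstddef>

    void copy32_chunks(const char* src, char* dst, size_t n) {
      size_t off = 0;
      for (; off + 32 <= n; off += 32) {
        __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + off));       // movdqu load
        __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + off + 16));  // movdqu load
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off), a);                     // movdqu store
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off + 16), b);                // movdqu store
      }
      for (; off < n; off++) dst[off] = src[off];  // tail, byte-wise for simplicity
    }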
@@ -242,9 +242,11 @@ void VM_Version::get_processor_features() {
   _supports_cx8 = supports_cmpxchg8();
   // if the OS doesn't support SSE, we can't use this feature even if the HW does
   if( !os::supports_sse())
-    _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4|CPU_SSE4A);
-  if (UseSSE < 4)
-    _cpuFeatures &= ~CPU_SSE4;
+    _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2);
+  if (UseSSE < 4) {
+    _cpuFeatures &= ~CPU_SSE4_1;
+    _cpuFeatures &= ~CPU_SSE4_2;
+  }
   if (UseSSE < 3) {
     _cpuFeatures &= ~CPU_SSE3;
     _cpuFeatures &= ~CPU_SSSE3;
@@ -261,7 +263,7 @@ void VM_Version::get_processor_features() {
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
               cores_per_cpu(), threads_per_core(),
               cpu_family(), _model, _stepping,
               (supports_cmov() ? ", cmov" : ""),
@@ -272,7 +274,8 @@ void VM_Version::get_processor_features() {
               (supports_sse2() ? ", sse2" : ""),
               (supports_sse3() ? ", sse3" : ""),
               (supports_ssse3()? ", ssse3": ""),
-              (supports_sse4() ? ", sse4" : ""),
+              (supports_sse4_1() ? ", sse4.1" : ""),
+              (supports_sse4_2() ? ", sse4.2" : ""),
               (supports_mmx_ext() ? ", mmxext" : ""),
               (supports_3dnow() ? ", 3dnow" : ""),
               (supports_3dnow2() ? ", 3dnowext" : ""),
@@ -285,7 +288,7 @@ void VM_Version::get_processor_features() {
   // older Pentiums which do not support it.
   if( UseSSE > 4 ) UseSSE=4;
   if( UseSSE < 0 ) UseSSE=0;
-  if( !supports_sse4() ) // Drop to 3 if no SSE4 support
+  if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
     UseSSE = MIN2((intx)3,UseSSE);
   if( !supports_sse3() ) // Drop to 2 if no SSE3 support
     UseSSE = MIN2((intx)2,UseSSE);
@@ -375,6 +378,14 @@ void VM_Version::get_processor_features() {
         MaxLoopPad = 11;
       }
 #endif // COMPILER2
+      if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
+        UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
+      }
+      if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
+        if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
+          UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
+        }
+      }
     }
   }
 
@@ -413,7 +424,7 @@ void VM_Version::get_processor_features() {
 
 #ifndef PRODUCT
   if (PrintMiscellaneous && Verbose) {
-    tty->print_cr("Logical CPUs per package: %u",
+    tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
     tty->print_cr("UseSSE=%d",UseSSE);
     tty->print("Allocation: ");
@@ -68,9 +68,9 @@ public:
                cmpxchg16: 1,
                         : 4,
                dca      : 1,
-                        : 4,
-               popcnt   : 1,
-                        : 8;
+               sse4_1   : 1,
+               sse4_2   : 1,
+                        : 11;
     } bits;
   };
 
@@ -177,8 +177,9 @@ protected:
     CPU_SSE2 = (1 << 7),
     CPU_SSE3 = (1 << 8), // sse3 comes from cpuid 1 (ECX)
     CPU_SSSE3= (1 << 9),
-    CPU_SSE4 = (1 <<10),
-    CPU_SSE4A= (1 <<11)
+    CPU_SSE4A= (1 <<10),
+    CPU_SSE4_1 = (1 << 11),
+    CPU_SSE4_2 = (1 << 12)
   } cpuFeatureFlags;
 
   // cpuid information block.  All info derived from executing cpuid with
@@ -240,22 +241,14 @@ protected:
   static CpuidInfo _cpuid_info;
 
   // Extractors and predicates
-  static bool is_extended_cpu_family() {
-    const uint32_t Extended_Cpu_Family = 0xf;
-    return _cpuid_info.std_cpuid1_rax.bits.family == Extended_Cpu_Family;
-  }
   static uint32_t extended_cpu_family() {
     uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family;
-    if (is_extended_cpu_family()) {
-      result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
-    }
+    result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
     return result;
   }
   static uint32_t extended_cpu_model() {
     uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model;
-    if (is_extended_cpu_family()) {
-      result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
-    }
+    result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
     return result;
   }
   static uint32_t cpu_stepping() {
@@ -293,6 +286,10 @@ protected:
       result |= CPU_SSSE3;
     if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0)
       result |= CPU_SSE4A;
+    if (_cpuid_info.std_cpuid1_rcx.bits.sse4_1 != 0)
+      result |= CPU_SSE4_1;
+    if (_cpuid_info.std_cpuid1_rcx.bits.sse4_2 != 0)
+      result |= CPU_SSE4_2;
     return result;
   }
 
@@ -380,7 +377,8 @@ public:
   static bool supports_sse2()   { return (_cpuFeatures & CPU_SSE2) != 0; }
   static bool supports_sse3()   { return (_cpuFeatures & CPU_SSE3) != 0; }
   static bool supports_ssse3()  { return (_cpuFeatures & CPU_SSSE3)!= 0; }
-  static bool supports_sse4()   { return (_cpuFeatures & CPU_SSE4) != 0; }
+  static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; }
+  static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; }
   //
   // AMD features
   //
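
Dropping the is_extended_cpu_family() guard works because CPUID keeps ext_family zero unless family == 0xf, while ext_model also extends family-6 parts, which the old guard ignored (the reserved extended fields read as zero elsewhere in practice). A worked example, assuming a 45 nm Core 2 part with CPUID display signature 06_17h, i.e. exactly the kind of newer Intel CPU the movdqu default above targets:

    #include <cstdint>
    #include <cassert>

    int main() {
      uint32_t family = 6, ext_family = 0;  // CPUID.1:EAX[11:8] and [27:20]
      uint32_t model  = 7, ext_model  = 1;  // CPUID.1:EAX[7:4]  and [19:16]
      assert(family + ext_family == 6);            // extended_cpu_family()
      assert((model | (ext_model << 4)) == 0x17);  // extended_cpu_model(): 0x17, not 7
      return 0;
    }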
|
@ -186,8 +186,10 @@ void VM_Version::get_processor_features() {
|
||||
if (!VM_Version::supports_sse2()) {
|
||||
vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported");
|
||||
}
|
||||
if (UseSSE < 4)
|
||||
_cpuFeatures &= ~CPU_SSE4;
|
||||
if (UseSSE < 4) {
|
||||
_cpuFeatures &= ~CPU_SSE4_1;
|
||||
_cpuFeatures &= ~CPU_SSE4_2;
|
||||
}
|
||||
if (UseSSE < 3) {
|
||||
_cpuFeatures &= ~CPU_SSE3;
|
||||
_cpuFeatures &= ~CPU_SSSE3;
|
||||
@ -204,7 +206,7 @@ void VM_Version::get_processor_features() {
|
||||
}
|
||||
|
||||
char buf[256];
|
||||
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
cores_per_cpu(), threads_per_core(),
|
||||
cpu_family(), _model, _stepping,
|
||||
(supports_cmov() ? ", cmov" : ""),
|
||||
@ -215,7 +217,8 @@ void VM_Version::get_processor_features() {
|
||||
(supports_sse2() ? ", sse2" : ""),
|
||||
(supports_sse3() ? ", sse3" : ""),
|
||||
(supports_ssse3()? ", ssse3": ""),
|
||||
(supports_sse4() ? ", sse4" : ""),
|
||||
(supports_sse4_1() ? ", sse4.1" : ""),
|
||||
(supports_sse4_2() ? ", sse4.2" : ""),
|
||||
(supports_mmx_ext() ? ", mmxext" : ""),
|
||||
(supports_3dnow() ? ", 3dnow" : ""),
|
||||
(supports_3dnow2() ? ", 3dnowext" : ""),
|
||||
@ -228,7 +231,7 @@ void VM_Version::get_processor_features() {
|
||||
// older Pentiums which do not support it.
|
||||
if( UseSSE > 4 ) UseSSE=4;
|
||||
if( UseSSE < 0 ) UseSSE=0;
|
||||
if( !supports_sse4() ) // Drop to 3 if no SSE4 support
|
||||
if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
|
||||
UseSSE = MIN2((intx)3,UseSSE);
|
||||
if( !supports_sse3() ) // Drop to 2 if no SSE3 support
|
||||
UseSSE = MIN2((intx)2,UseSSE);
|
||||
@ -314,6 +317,14 @@ void VM_Version::get_processor_features() {
|
||||
MaxLoopPad = 11;
|
||||
}
|
||||
#endif // COMPILER2
|
||||
if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
|
||||
UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
|
||||
}
|
||||
if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
|
||||
if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
|
||||
UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -355,7 +366,7 @@ void VM_Version::get_processor_features() {
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (PrintMiscellaneous && Verbose) {
|
||||
tty->print_cr("Logical CPUs per package: %u",
|
||||
tty->print_cr("Logical CPUs per core: %u",
|
||||
logical_processors_per_package());
|
||||
tty->print_cr("UseSSE=%d",UseSSE);
|
||||
tty->print("Allocation: ");
|
||||
|
@@ -68,9 +68,9 @@ public:
                cmpxchg16: 1,
                         : 4,
                dca      : 1,
-                        : 4,
-               popcnt   : 1,
-                        : 8;
+               sse4_1   : 1,
+               sse4_2   : 1,
+                        : 11;
     } bits;
   };
 
@@ -177,8 +177,9 @@ protected:
     CPU_SSE2 = (1 << 7),
     CPU_SSE3 = (1 << 8),
     CPU_SSSE3= (1 << 9),
-    CPU_SSE4 = (1 <<10),
-    CPU_SSE4A= (1 <<11)
+    CPU_SSE4A= (1 <<10),
+    CPU_SSE4_1 = (1 << 11),
+    CPU_SSE4_2 = (1 << 12)
   } cpuFeatureFlags;
 
   // cpuid information block.  All info derived from executing cpuid with
@@ -240,22 +241,14 @@ protected:
   static CpuidInfo _cpuid_info;
 
   // Extractors and predicates
-  static bool is_extended_cpu_family() {
-    const uint32_t Extended_Cpu_Family = 0xf;
-    return _cpuid_info.std_cpuid1_eax.bits.family == Extended_Cpu_Family;
-  }
   static uint32_t extended_cpu_family() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family;
-    if (is_extended_cpu_family()) {
-      result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
-    }
+    result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
     return result;
   }
   static uint32_t extended_cpu_model() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model;
-    if (is_extended_cpu_family()) {
-      result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
-    }
+    result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
     return result;
   }
   static uint32_t cpu_stepping() {
@@ -293,6 +286,10 @@ protected:
       result |= CPU_SSSE3;
     if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0)
       result |= CPU_SSE4A;
+    if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0)
+      result |= CPU_SSE4_1;
+    if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0)
+      result |= CPU_SSE4_2;
     return result;
   }
 
@@ -380,7 +377,8 @@ public:
   static bool supports_sse2()   { return (_cpuFeatures & CPU_SSE2) != 0; }
   static bool supports_sse3()   { return (_cpuFeatures & CPU_SSE3) != 0; }
   static bool supports_ssse3()  { return (_cpuFeatures & CPU_SSSE3)!= 0; }
-  static bool supports_sse4()   { return (_cpuFeatures & CPU_SSE4) != 0; }
+  static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; }
+  static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; }
   //
   // AMD features
   //
@@ -4810,6 +4810,16 @@ operand immL0() %{
   interface(CONST_INTER);
 %}
 
+// Long Immediate -1
+operand immL_M1() %{
+  predicate( n->get_long() == -1L );
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // Long immediate from 0 to 127.
 // Used for a shorter form of long mul by 10.
 operand immL_127() %{
@@ -8621,6 +8631,18 @@ instruct xorI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
   ins_pipe( ialu_reg_reg );
 %}
 
+// Xor Register with Immediate -1
+instruct xorI_eReg_im1(eRegI dst, immI_M1 imm) %{
+  match(Set dst (XorI dst imm));
+
+  size(2);
+  format %{ "NOT $dst" %}
+  ins_encode %{
+    __ notl($dst$$Register);
+  %}
+  ins_pipe( ialu_reg );
+%}
+
 // Xor Register with Immediate
 instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (XorI dst src));
@@ -8938,6 +8960,18 @@ instruct xorl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
   ins_pipe( ialu_reg_reg_long );
 %}
 
+// Xor Long Register with Immediate -1
+instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{
+  match(Set dst (XorL dst imm));
+  format %{ "NOT $dst.lo\n\t"
+            "NOT $dst.hi" %}
+  ins_encode %{
+    __ notl($dst$$Register);
+    __ notl(HIGH_FROM_LOW($dst$$Register));
+  %}
+  ins_pipe( ialu_reg_long );
+%}
+
 // Xor Long Register with Immediate
 instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
   match(Set dst (XorL dst src));
@@ -9309,6 +9309,17 @@ instruct xorI_rReg(rRegI dst, rRegI src, rFlagsReg cr)
   ins_pipe(ialu_reg_reg);
 %}
 
+// Xor Register with Immediate -1
+instruct xorI_rReg_im1(rRegI dst, immI_M1 imm) %{
+  match(Set dst (XorI dst imm));
+
+  format %{ "not $dst" %}
+  ins_encode %{
+    __ notl($dst$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
 // Xor Register with Immediate
 instruct xorI_rReg_imm(rRegI dst, immI src, rFlagsReg cr)
 %{
@@ -9529,6 +9540,17 @@ instruct xorL_rReg(rRegL dst, rRegL src, rFlagsReg cr)
   ins_pipe(ialu_reg_reg);
 %}
 
+// Xor Register with Immediate -1
+instruct xorL_rReg_im1(rRegL dst, immL_M1 imm) %{
+  match(Set dst (XorL dst imm));
+
+  format %{ "notq $dst" %}
+  ins_encode %{
+    __ notq($dst$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
 // Xor Register with Immediate
 instruct xorL_rReg_imm(rRegL dst, immL32 src, rFlagsReg cr)
 %{
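
All four new patterns rest on the identity x ^ -1 == ~x, so the matcher can replace a two-operand XOR against an all-ones immediate with the shorter one-operand NOT (notl/notq; the 32-bit long form complements each half separately). A quick standalone check:

    #include <cstdint>
    #include <cassert>

    int main() {
      uint32_t x = 0x12345678u;
      assert((x ^ 0xFFFFFFFFu) == ~x);   // the xorI_*_im1 patterns (notl)
      uint64_t y = 0x123456789abcdef0ull;
      assert((y ^ ~0ull) == ~y);         // the xorL_rReg_im1 pattern (notq)
      return 0;
    }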
@@ -99,6 +99,7 @@ heapDumper.cpp ostream.hpp
 heapDumper.cpp                          reflectionUtils.hpp
 heapDumper.cpp                          symbolTable.hpp
 heapDumper.cpp                          systemDictionary.hpp
+heapDumper.cpp                          threadService.hpp
 heapDumper.cpp                          universe.hpp
 heapDumper.cpp                          vframe.hpp
 heapDumper.cpp                          vmGCOperations.hpp
@@ -100,7 +100,7 @@ public:
 
   enum {
     vtbl_list_size = 16, // number of entries in the shared space vtable list.
-    num_virtuals = 100   // number of virtual methods in Klass (or
+    num_virtuals = 200   // number of virtual methods in Klass (or
                          // subclass) objects, or greater.
   };
 
@@ -818,6 +818,40 @@ static void print_contents() {
 // across the space while doing this, as that causes the vtables to be
 // patched, undoing our useful work.  Instead, iterate to make a list,
 // then use the list to do the fixing.
+//
+// Our constructed vtables:
+// Dump time:
+//  1. init_self_patching_vtbl_list: table of pointers to current virtual method addrs
+//  2. generate_vtable_methods: create jump table, appended to above vtbl_list
+//  3. PatchKlassVtables: for Klass list, patch the vtable entry to point to jump table
+//     rather than to current vtbl
+// Table layout: NOTE FIXED SIZE
+//   1. vtbl pointers
+//   2. #Klass X #virtual methods per Klass
+//   1 entry for each, in the order:
+//      Klass1:method1 entry, Klass1:method2 entry, ... Klass1:method<num_virtuals> entry
+//      Klass2:method1 entry, Klass2:method2 entry, ... Klass2:method<num_virtuals> entry
+//      ...
+//      Klass<vtbl_list_size>:method1 entry, Klass<vtbl_list_size>:method2 entry,
+//          ... Klass<vtbl_list_size>:method<num_virtuals> entry
+// Sample entry: (Sparc):
+//      save(sp, -256, sp)
+//      ba,pt common_code
+//      mov XXX, %L0       %L0 gets: Klass index <<8 + method index (note: max method index 255)
+//
+// Restore time:
+//   1. initialize_oops: reserve space for table
+//   2. init_self_patching_vtbl_list: update pointers to NEW virtual method addrs in text
+//
+// Execution time:
+//   First virtual method call for any object of these Klass types:
+//   1. object->klass->klass_part
+//   2. vtable entry for that klass_part points to the jump table entries
+//   3. branches to common_code with %O0/klass_part, %L0: Klass index <<8 + method index
+//   4. common_code:
+//      Get address of new vtbl pointer for this Klass from updated table
+//      Update new vtbl pointer in the Klass: future virtual calls go direct
+//      Jump to method, using new vtbl pointer and method index
 
 class PatchKlassVtables: public ObjectClosure {
  private:
@@ -475,8 +475,8 @@ jint objArrayKlass::compute_modifier_flags(TRAPS) const {
     assert(Universe::is_bootstrapping(), "partial objArray only at startup");
     return JVM_ACC_ABSTRACT | JVM_ACC_FINAL | JVM_ACC_PUBLIC;
   }
-  // Recurse down the element list
-  jint element_flags = Klass::cast(element_klass())->compute_modifier_flags(CHECK_0);
+  // Return the flags of the bottom element type.
+  jint element_flags = Klass::cast(bottom_klass())->compute_modifier_flags(CHECK_0);
 
   return (element_flags & (JVM_ACC_PUBLIC | JVM_ACC_PRIVATE | JVM_ACC_PROTECTED))
                         | (JVM_ACC_ABSTRACT | JVM_ACC_FINAL);
@@ -156,7 +156,8 @@ Node *AddNode::Ideal(PhaseGVN *phase, bool can_reshape) {
   if( add1_op == this_op && !con_right ) {
     Node *a12 = add1->in(2);
     const Type *t12 = phase->type( a12 );
-    if( t12->singleton() && t12 != Type::TOP && (add1 != add1->in(1)) ) {
+    if( t12->singleton() && t12 != Type::TOP && (add1 != add1->in(1)) &&
+        !(add1->in(1)->is_Phi() && add1->in(1)->as_Phi()->is_tripcount()) ) {
       assert(add1->in(1) != this, "dead loop in AddNode::Ideal");
       add2 = add1->clone();
       add2->set_req(2, in(2));
@@ -173,7 +174,8 @@ Node *AddNode::Ideal(PhaseGVN *phase, bool can_reshape) {
   if( add2_op == this_op && !con_left ) {
     Node *a22 = add2->in(2);
     const Type *t22 = phase->type( a22 );
-    if( t22->singleton() && t22 != Type::TOP && (add2 != add2->in(1)) ) {
+    if( t22->singleton() && t22 != Type::TOP && (add2 != add2->in(1)) &&
+        !(add2->in(1)->is_Phi() && add2->in(1)->as_Phi()->is_tripcount()) ) {
       assert(add2->in(1) != this, "dead loop in AddNode::Ideal");
       Node *addx = add2->clone();
       addx->set_req(1, in(1));
@@ -225,34 +227,63 @@ const Type *AddNode::add_of_identity( const Type *t1, const Type *t2 ) const {
 //=============================================================================
 //------------------------------Idealize---------------------------------------
 Node *AddINode::Ideal(PhaseGVN *phase, bool can_reshape) {
-  int op1 = in(1)->Opcode();
-  int op2 = in(2)->Opcode();
+  Node* in1 = in(1);
+  Node* in2 = in(2);
+  int op1 = in1->Opcode();
+  int op2 = in2->Opcode();
   // Fold (con1-x)+con2 into (con1+con2)-x
+  if ( op1 == Op_AddI && op2 == Op_SubI ) {
+    // Swap edges to try optimizations below
+    in1 = in2;
+    in2 = in(1);
+    op1 = op2;
+    op2 = in2->Opcode();
+  }
   if( op1 == Op_SubI ) {
-    const Type *t_sub1 = phase->type( in(1)->in(1) );
-    const Type *t_2    = phase->type( in(2)       );
+    const Type *t_sub1 = phase->type( in1->in(1) );
+    const Type *t_2    = phase->type( in2        );
     if( t_sub1->singleton() && t_2->singleton() && t_sub1 != Type::TOP && t_2 != Type::TOP )
       return new (phase->C, 3) SubINode(phase->makecon( add_ring( t_sub1, t_2 ) ),
-                              in(1)->in(2) );
+                              in1->in(2) );
     // Convert "(a-b)+(c-d)" into "(a+c)-(b+d)"
     if( op2 == Op_SubI ) {
       // Check for dead cycle: d = (a-b)+(c-d)
-      assert( in(1)->in(2) != this && in(2)->in(2) != this,
+      assert( in1->in(2) != this && in2->in(2) != this,
               "dead loop in AddINode::Ideal" );
       Node *sub  = new (phase->C, 3) SubINode(NULL, NULL);
-      sub->init_req(1, phase->transform(new (phase->C, 3) AddINode(in(1)->in(1), in(2)->in(1) ) ));
-      sub->init_req(2, phase->transform(new (phase->C, 3) AddINode(in(1)->in(2), in(2)->in(2) ) ));
+      sub->init_req(1, phase->transform(new (phase->C, 3) AddINode(in1->in(1), in2->in(1) ) ));
+      sub->init_req(2, phase->transform(new (phase->C, 3) AddINode(in1->in(2), in2->in(2) ) ));
       return sub;
     }
+    // Convert "(a-b)+(b+c)" into "(a+c)"
+    if( op2 == Op_AddI && in1->in(2) == in2->in(1) ) {
+      assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddINode::Ideal");
+      return new (phase->C, 3) AddINode(in1->in(1), in2->in(2));
+    }
+    // Convert "(a-b)+(c+b)" into "(a+c)"
+    if( op2 == Op_AddI && in1->in(2) == in2->in(2) ) {
+      assert(in1->in(1) != this && in2->in(1) != this,"dead loop in AddINode::Ideal");
+      return new (phase->C, 3) AddINode(in1->in(1), in2->in(1));
+    }
+    // Convert "(a-b)+(b-c)" into "(a-c)"
+    if( op2 == Op_SubI && in1->in(2) == in2->in(1) ) {
+      assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddINode::Ideal");
+      return new (phase->C, 3) SubINode(in1->in(1), in2->in(2));
+    }
+    // Convert "(a-b)+(c-a)" into "(c-b)"
+    if( op2 == Op_SubI && in1->in(1) == in2->in(2) ) {
+      assert(in1->in(2) != this && in2->in(1) != this,"dead loop in AddINode::Ideal");
+      return new (phase->C, 3) SubINode(in2->in(1), in1->in(2));
+    }
   }
 
   // Convert "x+(0-y)" into "(x-y)"
-  if( op2 == Op_SubI && phase->type(in(2)->in(1)) == TypeInt::ZERO )
-    return new (phase->C, 3) SubINode(in(1), in(2)->in(2) );
+  if( op2 == Op_SubI && phase->type(in2->in(1)) == TypeInt::ZERO )
+    return new (phase->C, 3) SubINode(in1, in2->in(2) );
 
   // Convert "(0-y)+x" into "(x-y)"
-  if( op1 == Op_SubI && phase->type(in(1)->in(1)) == TypeInt::ZERO )
-    return new (phase->C, 3) SubINode( in(2), in(1)->in(2) );
+  if( op1 == Op_SubI && phase->type(in1->in(1)) == TypeInt::ZERO )
+    return new (phase->C, 3) SubINode( in2, in1->in(2) );
 
   // Convert (x>>>z)+y into (x+(y<<z))>>>z for small constant z and y.
   // Helps with array allocation math constant folding
@@ -266,15 +297,15 @@ Node *AddINode::Ideal(PhaseGVN *phase, bool can_reshape) {
   // Have not observed cases where type information exists to support
   // positive y and (x <= -(y << z))
   if( op1 == Op_URShiftI && op2 == Op_ConI &&
-      in(1)->in(2)->Opcode() == Op_ConI ) {
-    jint z = phase->type( in(1)->in(2) )->is_int()->get_con() & 0x1f; // only least significant 5 bits matter
-    jint y = phase->type( in(2) )->is_int()->get_con();
+      in1->in(2)->Opcode() == Op_ConI ) {
+    jint z = phase->type( in1->in(2) )->is_int()->get_con() & 0x1f; // only least significant 5 bits matter
+    jint y = phase->type( in2 )->is_int()->get_con();
 
     if( z < 5 && -5 < y && y < 0 ) {
-      const Type *t_in11 = phase->type(in(1)->in(1));
+      const Type *t_in11 = phase->type(in1->in(1));
       if( t_in11 != Type::TOP && (t_in11->is_int()->_lo >= -(y << z)) ) {
-        Node *a = phase->transform( new (phase->C, 3) AddINode( in(1)->in(1), phase->intcon(y<<z) ) );
-        return new (phase->C, 3) URShiftINode( a, in(1)->in(2) );
+        Node *a = phase->transform( new (phase->C, 3) AddINode( in1->in(1), phase->intcon(y<<z) ) );
+        return new (phase->C, 3) URShiftINode( a, in1->in(2) );
       }
     }
   }
@@ -328,39 +359,73 @@ const Type *AddINode::add_ring( const Type *t0, const Type *t1 ) const {
 //=============================================================================
 //------------------------------Idealize---------------------------------------
 Node *AddLNode::Ideal(PhaseGVN *phase, bool can_reshape) {
-  int op1 = in(1)->Opcode();
-  int op2 = in(2)->Opcode();
+  Node* in1 = in(1);
+  Node* in2 = in(2);
+  int op1 = in1->Opcode();
+  int op2 = in2->Opcode();
+  // Fold (con1-x)+con2 into (con1+con2)-x
+  if ( op1 == Op_AddL && op2 == Op_SubL ) {
+    // Swap edges to try optimizations below
+    in1 = in2;
+    in2 = in(1);
+    op1 = op2;
+    op2 = in2->Opcode();
+  }
   // Fold (con1-x)+con2 into (con1+con2)-x
   if( op1 == Op_SubL ) {
-    const Type *t_sub1 = phase->type( in(1)->in(1) );
-    const Type *t_2    = phase->type( in(2)       );
+    const Type *t_sub1 = phase->type( in1->in(1) );
+    const Type *t_2    = phase->type( in2        );
     if( t_sub1->singleton() && t_2->singleton() && t_sub1 != Type::TOP && t_2 != Type::TOP )
       return new (phase->C, 3) SubLNode(phase->makecon( add_ring( t_sub1, t_2 ) ),
-                              in(1)->in(2) );
+                              in1->in(2) );
     // Convert "(a-b)+(c-d)" into "(a+c)-(b+d)"
     if( op2 == Op_SubL ) {
       // Check for dead cycle: d = (a-b)+(c-d)
-      assert( in(1)->in(2) != this && in(2)->in(2) != this,
+      assert( in1->in(2) != this && in2->in(2) != this,
               "dead loop in AddLNode::Ideal" );
       Node *sub  = new (phase->C, 3) SubLNode(NULL, NULL);
-      sub->init_req(1, phase->transform(new (phase->C, 3) AddLNode(in(1)->in(1), in(2)->in(1) ) ));
-      sub->init_req(2, phase->transform(new (phase->C, 3) AddLNode(in(1)->in(2), in(2)->in(2) ) ));
+      sub->init_req(1, phase->transform(new (phase->C, 3) AddLNode(in1->in(1), in2->in(1) ) ));
+      sub->init_req(2, phase->transform(new (phase->C, 3) AddLNode(in1->in(2), in2->in(2) ) ));
       return sub;
     }
+    // Convert "(a-b)+(b+c)" into "(a+c)"
+    if( op2 == Op_AddL && in1->in(2) == in2->in(1) ) {
+      assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddLNode::Ideal");
+      return new (phase->C, 3) AddLNode(in1->in(1), in2->in(2));
+    }
+    // Convert "(a-b)+(c+b)" into "(a+c)"
+    if( op2 == Op_AddL && in1->in(2) == in2->in(2) ) {
+      assert(in1->in(1) != this && in2->in(1) != this,"dead loop in AddLNode::Ideal");
+      return new (phase->C, 3) AddLNode(in1->in(1), in2->in(1));
+    }
+    // Convert "(a-b)+(b-c)" into "(a-c)"
+    if( op2 == Op_SubL && in1->in(2) == in2->in(1) ) {
+      assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddLNode::Ideal");
+      return new (phase->C, 3) SubLNode(in1->in(1), in2->in(2));
+    }
+    // Convert "(a-b)+(c-a)" into "(c-b)"
+    if( op2 == Op_SubL && in1->in(1) == in2->in(2) ) {
+      assert(in1->in(2) != this && in2->in(1) != this,"dead loop in AddLNode::Ideal");
+      return new (phase->C, 3) SubLNode(in2->in(1), in1->in(2));
+    }
   }
 
   // Convert "x+(0-y)" into "(x-y)"
-  if( op2 == Op_SubL && phase->type(in(2)->in(1)) == TypeLong::ZERO )
-    return new (phase->C, 3) SubLNode(in(1), in(2)->in(2) );
+  if( op2 == Op_SubL && phase->type(in2->in(1)) == TypeLong::ZERO )
+    return new (phase->C, 3) SubLNode( in1, in2->in(2) );
+
+  // Convert "(0-y)+x" into "(x-y)"
+  if( op1 == Op_SubL && phase->type(in1->in(1)) == TypeLong::ZERO )
+    return new (phase->C, 3) SubLNode( in2, in1->in(2) );
 
   // Convert "X+X+X+X+X...+X+Y" into "k*X+Y" or really convert "X+(X+Y)"
   // into "(X<<1)+Y" and let shift-folding happen.
   if( op2 == Op_AddL &&
-      in(2)->in(1) == in(1) &&
+      in2->in(1) == in1 &&
       op1 != Op_ConL &&
       0 ) {
-    Node *shift = phase->transform(new (phase->C, 3) LShiftLNode(in(1),phase->intcon(1)));
-    return new (phase->C, 3) AddLNode(shift,in(2)->in(2));
+    Node *shift = phase->transform(new (phase->C, 3) LShiftLNode(in1,phase->intcon(1)));
+    return new (phase->C, 3) AddLNode(shift,in2->in(2));
   }
 
   return AddNode::Ideal(phase, can_reshape);
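
The four new Convert rewrites are plain ring identities, so they stay valid under Java's wrapping two's-complement int and long arithmetic (everything holds modulo 2^32 or 2^64). A quick sanity check in ordinary C++:

    #include <cassert>

    int main() {
      int a = 17, b = -4, c = 9;
      assert((a - b) + (b + c) == a + c);  // "(a-b)+(b+c)" -> "(a+c)"
      assert((a - b) + (c + b) == a + c);  // "(a-b)+(c+b)" -> "(a+c)"
      assert((a - b) + (b - c) == a - c);  // "(a-b)+(b-c)" -> "(a-c)"
      assert((a - b) + (c - a) == c - b);  // "(a-b)+(c-a)" -> "(c-b)"
      return 0;
    }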
@@ -1817,6 +1817,12 @@ Node *PhiNode::Ideal(PhaseGVN *phase, bool can_reshape) {
   return progress;              // Return any progress
 }
 
+//------------------------------is_tripcount-----------------------------------
+bool PhiNode::is_tripcount() const {
+  return (in(0) != NULL && in(0)->is_CountedLoop() &&
+          in(0)->as_CountedLoop()->phi() == this);
+}
+
 //------------------------------out_RegMask------------------------------------
 const RegMask &PhiNode::in_RegMask(uint i) const {
   return i ? out_RegMask() : RegMask::Empty;
@@ -1832,9 +1838,7 @@ const RegMask &PhiNode::out_RegMask() const {
 #ifndef PRODUCT
 void PhiNode::dump_spec(outputStream *st) const {
   TypeNode::dump_spec(st);
-  if (in(0) != NULL &&
-      in(0)->is_CountedLoop() &&
-      in(0)->as_CountedLoop()->phi() == this) {
+  if (is_tripcount()) {
     st->print(" #tripcount");
   }
 }
@@ -162,6 +162,8 @@ public:
     return NULL;  // not a copy!
   }
 
+  bool is_tripcount() const;
+
   // Determine a unique non-trivial input, if any.
   // Ignore casts if it helps.  Return NULL on failure.
   Node* unique_input(PhaseTransform *phase);
@@ -110,10 +110,13 @@ static Node *transform_int_divide( PhaseGVN *phase, Node *dividend, jint divisor
   } else if( dividend->Opcode() == Op_AndI ) {
     // An AND mask of sufficient size clears the low bits and
     // I can avoid rounding.
-    const TypeInt *andconi = phase->type( dividend->in(2) )->isa_int();
-    if( andconi && andconi->is_con(-d) ) {
-      dividend = dividend->in(1);
-      needs_rounding = false;
+    const TypeInt *andconi_t = phase->type( dividend->in(2) )->isa_int();
+    if( andconi_t && andconi_t->is_con() ) {
+      jint andconi = andconi_t->get_con();
+      if( andconi < 0 && is_power_of_2(-andconi) && (-andconi) >= d ) {
+        dividend = dividend->in(1);
+        needs_rounding = false;
+      }
     }
   }
 
@@ -316,10 +319,13 @@ static Node *transform_long_divide( PhaseGVN *phase, Node *dividend, jlong divisor
   } else if( dividend->Opcode() == Op_AndL ) {
     // An AND mask of sufficient size clears the low bits and
     // I can avoid rounding.
-    const TypeLong *andconl = phase->type( dividend->in(2) )->isa_long();
-    if( andconl && andconl->is_con(-d)) {
-      dividend = dividend->in(1);
-      needs_rounding = false;
+    const TypeLong *andconl_t = phase->type( dividend->in(2) )->isa_long();
+    if( andconl_t && andconl_t->is_con() ) {
+      jlong andconl = andconl_t->get_con();
+      if( andconl < 0 && is_power_of_2_long(-andconl) && (-andconl) >= d ) {
+        dividend = dividend->in(1);
+        needs_rounding = false;
+      }
     }
   }
 
@@ -704,11 +710,18 @@ const Type *DivDNode::Value( PhaseTransform *phase ) const {
   if( t2 == TypeD::ONE )
     return t1;
 
-  // If divisor is a constant and not zero, divide them numbers
-  if( t1->base() == Type::DoubleCon &&
-      t2->base() == Type::DoubleCon &&
-      t2->getd() != 0.0 ) // could be negative zero
-    return TypeD::make( t1->getd()/t2->getd() );
+#if defined(IA32)
+  if (!phase->C->method()->is_strict())
+    // Can't trust native compilers to properly fold strict double
+    // division with round-to-zero on this platform.
+#endif
+  {
+    // If divisor is a constant and not zero, divide them numbers
+    if( t1->base() == Type::DoubleCon &&
+        t2->base() == Type::DoubleCon &&
+        t2->getd() != 0.0 ) // could be negative zero
+      return TypeD::make( t1->getd()/t2->getd() );
+  }
 
   // If the dividend is a constant zero
   // Note: if t1 and t2 are zero then result is NaN (JVMS page 213)
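
The relaxed guard generalizes the old is_con(-d) test: any negative power-of-two mask whose magnitude is at least the power-of-two divisor d clears at least log2(d) low bits, so the dividend is an exact multiple of d and the signed-division rounding fix-up can be skipped. A small worked example with made-up values:

    #include <cassert>

    int main() {
      int d = 8;                  // power-of-two divisor
      int x = -100;
      int masked = x & -32;       // mask is negative, a power of two, and 32 >= d
      assert(masked == -128);     // low five bits cleared
      assert(masked % d == 0);    // exact multiple of 8, so no rounding correction
      assert(masked / d == -16);  // division of an exact multiple is already exact
      return 0;
    }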
@@ -679,6 +679,10 @@ void PhaseIdealLoop::insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_
   CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
   post_head->set_post_loop(main_head);
 
+  // Reduce the post-loop trip count.
+  CountedLoopEndNode* post_end = old_new[main_end ->_idx]->as_CountedLoopEnd();
+  post_end->_prob = PROB_FAIR;
+
   // Build the main-loop normal exit.
   IfFalseNode *new_main_exit = new (C, 1) IfFalseNode(main_end);
   _igvn.register_new_node_with_optimizer( new_main_exit );
@@ -748,6 +752,9 @@ void PhaseIdealLoop::insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_
   pre_head->set_pre_loop(main_head);
   Node *pre_incr = old_new[incr->_idx];
 
+  // Reduce the pre-loop trip count.
+  pre_end->_prob = PROB_FAIR;
+
   // Find the pre-loop normal exit.
   Node* pre_exit = pre_end->proj_out(false);
   assert( pre_exit->Opcode() == Op_IfFalse, "" );
@@ -767,8 +774,8 @@ void PhaseIdealLoop::insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_
   register_new_node( min_cmp , new_pre_exit );
   register_new_node( min_bol , new_pre_exit );
 
-  // Build the IfNode
-  IfNode *min_iff = new (C, 2) IfNode( new_pre_exit, min_bol, PROB_FAIR, COUNT_UNKNOWN );
+  // Build the IfNode (assume the main-loop is executed always).
+  IfNode *min_iff = new (C, 2) IfNode( new_pre_exit, min_bol, PROB_ALWAYS, COUNT_UNKNOWN );
   _igvn.register_new_node_with_optimizer( min_iff );
   set_idom(min_iff, new_pre_exit, dd_main_head);
   set_loop(min_iff, loop->_parent);
@@ -1583,10 +1590,10 @@ bool IdealLoopTree::policy_do_remove_empty_loop( PhaseIdealLoop *phase ) {
 
 //=============================================================================
 //------------------------------iteration_split_impl---------------------------
-void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ) {
+bool IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ) {
   // Check and remove empty loops (spam micro-benchmarks)
   if( policy_do_remove_empty_loop(phase) )
-    return;                     // Here we removed an empty loop
+    return true;                // Here we removed an empty loop
 
   bool should_peel = policy_peeling(phase); // Should we peel?
 
@@ -1596,7 +1603,8 @@ void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_
   // This removes loop-invariant tests (usually null checks).
   if( !_head->is_CountedLoop() ) { // Non-counted loop
     if (PartialPeelLoop && phase->partial_peel(this, old_new)) {
-      return;
+      // Partial peel succeeded so terminate this round of loop opts
+      return false;
     }
     if( should_peel ) {            // Should we peel?
 #ifndef PRODUCT
@@ -1606,14 +1614,14 @@ void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_
     } else if( should_unswitch ) {
       phase->do_unswitching(this, old_new);
     }
-    return;
+    return true;
   }
   CountedLoopNode *cl = _head->as_CountedLoop();
 
-  if( !cl->loopexit() ) return; // Ignore various kinds of broken loops
+  if( !cl->loopexit() ) return true; // Ignore various kinds of broken loops
 
   // Do nothing special to pre- and post- loops
-  if( cl->is_pre_loop() || cl->is_post_loop() ) return;
+  if( cl->is_pre_loop() || cl->is_post_loop() ) return true;
 
   // Compute loop trip count from profile data
   compute_profile_trip_cnt(phase);
@@ -1626,11 +1634,11 @@ void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_
       // Here we did some unrolling and peeling.  Eventually we will
       // completely unroll this loop and it will no longer be a loop.
       phase->do_maximally_unroll(this,old_new);
-      return;
+      return true;
     }
     if (should_unswitch) {
       phase->do_unswitching(this, old_new);
-      return;
+      return true;
     }
   }
 
@@ -1691,14 +1699,16 @@ void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_
     if( should_peel )           // Might want to peel but do nothing else
       phase->do_peeling(this,old_new);
   }
+  return true;
 }
 
 //=============================================================================
 //------------------------------iteration_split--------------------------------
-void IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) {
+bool IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) {
   // Recursively iteration split nested loops
-  if( _child ) _child->iteration_split( phase, old_new );
+  if( _child && !_child->iteration_split( phase, old_new ))
+    return false;
 
   // Clean out prior deadwood
   DCE_loop_body();
@@ -1720,7 +1730,9 @@ void IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new )
       _allow_optimizations &&
       !tail()->is_top() ) {     // Also ignore the occasional dead backedge
     if (!_has_call) {
-      iteration_split_impl( phase, old_new );
+      if (!iteration_split_impl( phase, old_new )) {
+        return false;
+      }
     } else if (policy_unswitching(phase)) {
       phase->do_unswitching(this, old_new);
     }
@@ -1729,5 +1741,7 @@ void IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new )
   // Minor offset re-organization to remove loop-fallout uses of
   // trip counter.
   if( _head->is_CountedLoop() ) phase->reorg_offsets( this );
-  if( _next ) _next->iteration_split( phase, old_new );
+  if( _next && !_next->iteration_split( phase, old_new ))
+    return false;
+  return true;
 }
@@ -325,12 +325,14 @@ public:
   // Returns TRUE if loop tree is structurally changed.
   bool beautify_loops( PhaseIdealLoop *phase );
 
-  // Perform iteration-splitting on inner loops.  Split iterations to avoid
-  // range checks or one-shot null checks.
-  void iteration_split( PhaseIdealLoop *phase, Node_List &old_new );
+  // Perform iteration-splitting on inner loops.  Split iterations to
+  // avoid range checks or one-shot null checks.  Returns false if the
+  // current round of loop opts should stop.
+  bool iteration_split( PhaseIdealLoop *phase, Node_List &old_new );
 
-  // Driver for various flavors of iteration splitting
-  void iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new );
+  // Driver for various flavors of iteration splitting.  Returns false
+  // if the current round of loop opts should stop.
+  bool iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new );
 
   // Given dominators, try to find loops with calls that must always be
   // executed (call dominates loop tail).  These loops do not need non-call
@@ -1903,9 +1903,6 @@ void PhaseIdealLoop::clone_for_use_outside_loop( IdealLoopTree *loop, Node* n, N
       // Use in a phi is considered a use in the associated predecessor block
       use_c = use->in(0)->in(j);
     }
-    if (use_c->is_CountedLoop()) {
-      use_c = use_c->in(LoopNode::EntryControl);
-    }
     set_ctrl(n_clone, use_c);
     assert(!loop->is_member(get_loop(use_c)), "should be outside loop");
     get_loop(use_c)->_body.push(n_clone);
@@ -152,6 +152,14 @@ const Type *MulNode::Value( PhaseTransform *phase ) const {
   if( t1 == Type::BOTTOM || t2 == Type::BOTTOM )
     return bottom_type();
 
+#if defined(IA32)
+  // Can't trust native compilers to properly fold strict double
+  // multiplication with round-to-zero on this platform.
+  if (op == Op_MulD && phase->C->method()->is_strict()) {
+    return TypeD::DOUBLE;
+  }
+#endif
+
   return mul_ring(t1,t2);            // Local flavor of type multiplication
 }
 
@@ -360,7 +368,7 @@ const Type *MulFNode::mul_ring(const Type *t0, const Type *t1) const {
 // Compute the product type of two double ranges into this node.
 const Type *MulDNode::mul_ring(const Type *t0, const Type *t1) const {
   if( t0 == Type::DOUBLE || t1 == Type::DOUBLE ) return Type::DOUBLE;
-  // We must be adding 2 double constants.
+  // We must be multiplying 2 double constants.
   return TypeD::make( t0->getd() * t1->getd() );
 }
 
|
@ -1320,7 +1320,8 @@ public:
|
||||
Node *pop() {
|
||||
if( _clock_index >= size() ) _clock_index = 0;
|
||||
Node *b = at(_clock_index);
|
||||
map( _clock_index++, Node_List::pop());
|
||||
map( _clock_index, Node_List::pop());
|
||||
if (size() != 0) _clock_index++; // Always start from 0
|
||||
_in_worklist >>= b->_idx;
|
||||
return b;
|
||||
}
|
||||
|
@@ -34,7 +34,7 @@ static bool is_single_register(uint x) {
 #endif
 }
 
-//------------------------------may_be_copy_of_callee-----------------------------
+//---------------------------may_be_copy_of_callee-----------------------------
 // Check to see if we can possibly be a copy of a callee-save value.
 bool PhaseChaitin::may_be_copy_of_callee( Node *def ) const {
   // Short circuit if there are no callee save registers
@@ -225,6 +225,20 @@ int PhaseChaitin::elide_copy( Node *n, int k, Block *current_block, Node_List &v
 
   // Scan all registers to see if this value is around already
   for( uint reg = 0; reg < (uint)_max_reg; reg++ ) {
+    if (reg == (uint)nk_reg) {
+      // Found ourselves so check if there is only one user of this
+      // copy and keep on searching for a better copy if so.
+      bool ignore_self = true;
+      x = n->in(k);
+      DUIterator_Fast imax, i = x->fast_outs(imax);
+      Node* first = x->fast_out(i); i++;
+      while (i < imax && ignore_self) {
+        Node* use = x->fast_out(i); i++;
+        if (use != first) ignore_self = false;
+      }
+      if (ignore_self) continue;
+    }
+
     Node *vv = value[reg];
     if( !single ) {             // Doubles check for aligned-adjacent pair
       if( (reg&1)==0 ) continue; // Wrong half of a pair
@@ -206,6 +206,14 @@ Node *SubINode::Ideal(PhaseGVN *phase, bool can_reshape){
   if( op1 == Op_AddI && op2 == Op_AddI && in1->in(2) == in2->in(2) )
     return new (phase->C, 3) SubINode( in1->in(1), in2->in(1) );
 
+  // Convert "(A+X) - (X+B)" into "A - B"
+  if( op1 == Op_AddI && op2 == Op_AddI && in1->in(2) == in2->in(1) )
+    return new (phase->C, 3) SubINode( in1->in(1), in2->in(2) );
+
+  // Convert "(X+A) - (B+X)" into "A - B"
+  if( op1 == Op_AddI && op2 == Op_AddI && in1->in(1) == in2->in(2) )
+    return new (phase->C, 3) SubINode( in1->in(2), in2->in(1) );
+
   // Convert "A-(B-C)" into (A+C)-B", since add is commutative and generally
   // nicer to optimize than subtract.
   if( op2 == Op_SubI && in2->outcnt() == 1) {
@@ -997,6 +997,12 @@ class CommandLineFlags {
   product(bool, UseXmmI2F, false,                                           \
           "Use SSE2 CVTDQ2PS instruction to convert Integer to Float")      \
                                                                             \
+  product(bool, UseXMMForArrayCopy, false,                                  \
+          "Use SSE2 MOVQ instruction for Arraycopy")                        \
+                                                                            \
+  product(bool, UseUnalignedLoadStores, false,                              \
+          "Use SSE2 MOVDQU instruction for Arraycopy")                      \
+                                                                            \
   product(intx, FieldsAllocationStyle, 1,                                   \
           "0 - type based with oops first, 1 - with oops last")             \
                                                                             \
@@ -2555,7 +2561,7 @@ class CommandLineFlags {
   develop(intx, MaxRecursiveInlineLevel, 1,                                 \
           "maximum number of nested recursive calls that are inlined")      \
                                                                             \
-  develop(intx, InlineSmallCode, 1000,                                      \
+  product(intx, InlineSmallCode, 1000,                                      \
           "Only inline already compiled methods if their code size is "     \
           "less than this")                                                 \
                                                                             \
@ -343,7 +343,8 @@ typedef enum {
|
||||
|
||||
// Default stack trace ID (used for dummy HPROF_TRACE record)
|
||||
enum {
|
||||
STACK_TRACE_ID = 1
|
||||
STACK_TRACE_ID = 1,
|
||||
INITIAL_CLASS_COUNT = 200
|
||||
};
|
||||
|
||||
|
||||
@ -408,6 +409,7 @@ class DumpWriter : public StackObj {
|
||||
void write_u8(u8 x);
|
||||
void write_objectID(oop o);
|
||||
void write_classID(Klass* k);
|
||||
void write_id(u4 x);
|
||||
};
|
||||
|
||||
DumpWriter::DumpWriter(const char* path) {
|
||||
@ -548,6 +550,14 @@ void DumpWriter::write_objectID(oop o) {
#endif
}

void DumpWriter::write_id(u4 x) {
#ifdef _LP64
  write_u8((u8) x);
#else
  write_u4(x);
#endif
}

// We use java mirror as the class ID
void DumpWriter::write_classID(Klass* k) {
  write_objectID(k->java_mirror());
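As an aside, a standalone sketch (not the commit's code) of why write_id() widens on _LP64: HPROF declares a single "identifier size" in its header (oopSize in this dumper), and every ID field must be emitted with exactly that byte width, so a 4-byte serial number is zero-extended to 8 bytes on a 64-bit VM. Big-endian byte order is assumed, as in the HPROF format.

#include <cstdint>
#include <cstdio>

// Emit v as a big-endian field of exactly 'width' bytes.
static void write_be(FILE* f, uint64_t v, int width) {
  for (int shift = (width - 1) * 8; shift >= 0; shift -= 8)
    fputc((int)((v >> shift) & 0xFF), f);
}

int main() {
  FILE* f = fopen("ids.bin", "wb");
  if (f == NULL) return 1;
  int id_width = (int)sizeof(void*);     // stands in for oopSize
  write_be(f, 1 /* e.g. STACK_TRACE_ID */, id_width);
  fclose(f);
  return 0;
}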
@ -596,6 +606,8 @@ class DumperSupport : AllStatic {
  static void dump_object_array(DumpWriter* writer, objArrayOop array);
  // creates HPROF_GC_PRIM_ARRAY_DUMP record for the given type array
  static void dump_prim_array(DumpWriter* writer, typeArrayOop array);
  // create HPROF_FRAME record for the given method and bci
  static void dump_stack_frame(DumpWriter* writer, int frame_serial_num, int class_serial_num, methodOop m, int bci);
};

// write a header of the given type
@ -1070,6 +1082,29 @@ void DumperSupport::dump_prim_array(DumpWriter* writer, typeArrayOop array) {
  }
}

// create a HPROF_FRAME record of the given methodOop and bci
void DumperSupport::dump_stack_frame(DumpWriter* writer,
                                     int frame_serial_num,
                                     int class_serial_num,
                                     methodOop m,
                                     int bci) {
  int line_number;
  if (m->is_native()) {
    line_number = -3;  // native frame
  } else {
    line_number = m->line_number_from_bci(bci);
  }

  write_header(writer, HPROF_FRAME, 4*oopSize + 2*sizeof(u4));
  writer->write_id(frame_serial_num);       // frame serial number
  writer->write_objectID(m->name());        // method's name
  writer->write_objectID(m->signature());   // method's signature

  assert(Klass::cast(m->method_holder())->oop_is_instance(), "not instanceKlass");
  writer->write_objectID(instanceKlass::cast(m->method_holder())->source_file_name());  // source file name
  writer->write_u4(class_serial_num);       // class serial number
  writer->write_u4((u4) line_number);       // line number
}

// Support class used to generate HPROF_UTF8 records from the entries in the
// SymbolTable.
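As an aside, a worked check of the record length passed to write_header() above: the HPROF_FRAME body is four IDs (frame serial, method name, signature, source file name) plus two u4 fields (class serial number, line number); assuming LP64 with oopSize == 8 that is 4*8 + 2*4 = 40 bytes.

#include <cstdio>

int main() {
  const int oopSize = (int)sizeof(void*);   // 8 on a 64-bit VM
  const int u4size  = 4;
  printf("HPROF_FRAME body: %d bytes\n", 4*oopSize + 2*u4size);  // 40 on LP64
  return 0;
}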
@ -1104,12 +1139,15 @@ class JNILocalsDumper : public OopClosure {
 private:
  DumpWriter* _writer;
  u4 _thread_serial_num;
  int _frame_num;
  DumpWriter* writer() const { return _writer; }
 public:
  JNILocalsDumper(DumpWriter* writer, u4 thread_serial_num) {
    _writer = writer;
    _thread_serial_num = thread_serial_num;
    _frame_num = -1;  // default - empty stack
  }
  void set_frame_number(int n) { _frame_num = n; }
  void do_oop(oop* obj_p);
  void do_oop(narrowOop* obj_p) { ShouldNotReachHere(); }
};
@ -1122,7 +1160,7 @@ void JNILocalsDumper::do_oop(oop* obj_p) {
    writer()->write_u1(HPROF_GC_ROOT_JNI_LOCAL);
    writer()->write_objectID(o);
    writer()->write_u4(_thread_serial_num);
    writer()->write_u4((u4)-1);  // empty
    writer()->write_u4((u4)_frame_num);
  }
}

@ -1269,6 +1307,9 @@ class VM_HeapDumper : public VM_GC_Operation {
  bool _gc_before_heap_dump;
  bool _is_segmented_dump;
  jlong _dump_start;
  GrowableArray<Klass*>* _klass_map;
  ThreadStackTrace** _stack_traces;
  int _num_threads;

  // accessors
  DumpWriter* writer() const { return _writer; }
@ -1291,9 +1332,16 @@ class VM_HeapDumper : public VM_GC_Operation {
  static void do_basic_type_array_class_dump(klassOop k);

  // HPROF_GC_ROOT_THREAD_OBJ records
  void do_thread(JavaThread* thread, u4 thread_serial_num);
  int do_thread(JavaThread* thread, u4 thread_serial_num);
  void do_threads();

  void add_class_serial_number(Klass* k, int serial_num) {
    _klass_map->at_put_grow(serial_num, k);
  }

  // HPROF_TRACE and HPROF_FRAME records
  void dump_stack_traces();

  // writes a HPROF_HEAP_DUMP or HPROF_HEAP_DUMP_SEGMENT record
  void write_dump_header();

@ -1313,6 +1361,18 @@ class VM_HeapDumper : public VM_GC_Operation {
    _gc_before_heap_dump = gc_before_heap_dump;
    _is_segmented_dump = false;
    _dump_start = (jlong)-1;
    _klass_map = new (ResourceObj::C_HEAP) GrowableArray<Klass*>(INITIAL_CLASS_COUNT, true);
    _stack_traces = NULL;
    _num_threads = 0;
  }
  ~VM_HeapDumper() {
    if (_stack_traces != NULL) {
      for (int i=0; i < _num_threads; i++) {
        delete _stack_traces[i];
      }
      FREE_C_HEAP_ARRAY(ThreadStackTrace*, _stack_traces);
    }
    delete _klass_map;
  }

  VMOp_Type type() const { return VMOp_HeapDumper; }
@ -1436,6 +1496,9 @@ void VM_HeapDumper::do_load_class(klassOop k) {
  Klass* klass = Klass::cast(k);
  writer->write_classID(klass);

  // add the klassOop and class serial number pair
  dumper->add_class_serial_number(klass, class_serial_num);

  writer->write_u4(STACK_TRACE_ID);

  // class name ID
@ -1465,15 +1528,15 @@ void VM_HeapDumper::do_basic_type_array_class_dump(klassOop k) {
// Walk the stack of the given thread.
// Dumps a HPROF_GC_ROOT_JAVA_FRAME record for each local
// Dumps a HPROF_GC_ROOT_JNI_LOCAL record for each JNI local
void VM_HeapDumper::do_thread(JavaThread* java_thread, u4 thread_serial_num) {
//
// It returns the number of Java frames in this thread stack
int VM_HeapDumper::do_thread(JavaThread* java_thread, u4 thread_serial_num) {
  JNILocalsDumper blk(writer(), thread_serial_num);

  oop threadObj = java_thread->threadObj();
  assert(threadObj != NULL, "sanity check");

  // JNI locals for the top frame
  java_thread->active_handles()->oops_do(&blk);

  int stack_depth = 0;
  if (java_thread->has_last_Java_frame()) {

    // vframes are resource allocated
@ -1484,13 +1547,14 @@ void VM_HeapDumper::do_thread(JavaThread* java_thread, u4 thread_serial_num) {
    RegisterMap reg_map(java_thread);
    frame f = java_thread->last_frame();
    vframe* vf = vframe::new_vframe(&f, &reg_map, java_thread);
    frame* last_entry_frame = NULL;

    while (vf != NULL) {
      blk.set_frame_number(stack_depth);
      if (vf->is_java_frame()) {

        // java frame (interpreted, compiled, ...)
        javaVFrame *jvf = javaVFrame::cast(vf);

        if (!(jvf->method()->is_native())) {
          StackValueCollection* locals = jvf->locals();
          for (int slot=0; slot<locals->size(); slot++) {
@ -1501,44 +1565,61 @@ void VM_HeapDumper::do_thread(JavaThread* java_thread, u4 thread_serial_num) {
              writer()->write_u1(HPROF_GC_ROOT_JAVA_FRAME);
              writer()->write_objectID(o);
              writer()->write_u4(thread_serial_num);
              writer()->write_u4((u4)-1);  // empty
              writer()->write_u4((u4) stack_depth);
            }
          }
        }
        } else {
          // native frame
          if (stack_depth == 0) {
            // JNI locals for the top frame.
            java_thread->active_handles()->oops_do(&blk);
          } else {
            if (last_entry_frame != NULL) {
              // JNI locals for the entry frame
              assert(last_entry_frame->is_entry_frame(), "checking");
              last_entry_frame->entry_frame_call_wrapper()->handles()->oops_do(&blk);
            }
          }
        }
      } else {
        // increment only for Java frames
        stack_depth++;
        last_entry_frame = NULL;

      } else {
        // externalVFrame - if it's an entry frame then report any JNI locals
        // as roots
        // as roots when we find the corresponding native javaVFrame
        frame* fr = vf->frame_pointer();
        assert(fr != NULL, "sanity check");
        if (fr->is_entry_frame()) {
          fr->entry_frame_call_wrapper()->handles()->oops_do(&blk);
          last_entry_frame = fr;
        }
      }

      vf = vf->sender();
    }
  } else {
    // no last java frame but there may be JNI locals
    java_thread->active_handles()->oops_do(&blk);
  }
  return stack_depth;
}

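As an aside, a standalone sketch (hypothetical VFrame type, not HotSpot's) of the depth bookkeeping in the hunk above: only Java vframes advance stack_depth, so JNI locals found in native or entry frames are attributed to the depth of the enclosing Java frame.

#include <cassert>

struct VFrame { bool is_java; VFrame* sender; };

static int count_java_frames(VFrame* vf) {
  int stack_depth = 0;
  for (; vf != nullptr; vf = vf->sender) {
    if (vf->is_java) stack_depth++;   // external vframes don't count
  }
  return stack_depth;
}

int main() {
  VFrame entry  = { false, nullptr };   // entry frame: not counted
  VFrame callee = { true,  &entry  };
  VFrame top    = { true,  &callee };
  assert(count_java_frames(&top) == 2);
  return 0;
}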
// write a HPROF_GC_ROOT_THREAD_OBJ record for each java thread. Then walk
// the stack so that locals and JNI locals are dumped.
void VM_HeapDumper::do_threads() {
  u4 thread_serial_num = 0;
  for (JavaThread* thread = Threads::first(); thread != NULL ; thread = thread->next()) {
  for (int i=0; i < _num_threads; i++) {
    JavaThread* thread = _stack_traces[i]->thread();
    oop threadObj = thread->threadObj();
    if (threadObj != NULL && !thread->is_exiting() && !thread->is_hidden_from_external_view()) {
      ++thread_serial_num;

      writer()->write_u1(HPROF_GC_ROOT_THREAD_OBJ);
      writer()->write_objectID(threadObj);
      writer()->write_u4(thread_serial_num);
      writer()->write_u4(STACK_TRACE_ID);

      do_thread(thread, thread_serial_num);
    }
    u4 thread_serial_num = i+1;
    u4 stack_serial_num = thread_serial_num + STACK_TRACE_ID;
    writer()->write_u1(HPROF_GC_ROOT_THREAD_OBJ);
    writer()->write_objectID(threadObj);
    writer()->write_u4(thread_serial_num);  // thread number
    writer()->write_u4(stack_serial_num);   // stack trace serial number
    int num_frames = do_thread(thread, thread_serial_num);
    assert(num_frames == _stack_traces[i]->get_stack_depth(),
           "total number of Java frames not matched");
  }
}

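As an aside, a worked check of the numbering in the new loop: with STACK_TRACE_ID == 1 reserved for the dummy trace, thread i (0-based) gets thread serial i+1 and stack trace serial i+2, so the two serial spaces never collide.

#include <cassert>

int main() {
  const unsigned STACK_TRACE_ID = 1;
  for (unsigned i = 0; i < 3; i++) {
    unsigned thread_serial_num = i + 1;
    unsigned stack_serial_num  = thread_serial_num + STACK_TRACE_ID;
    assert(stack_serial_num == i + 2);
  }
  return 0;
}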
@ -1547,16 +1628,16 @@ void VM_HeapDumper::do_threads() {
// records:
//
//  HPROF_HEADER
//  HPROF_TRACE
//  [HPROF_UTF8]*
//  [HPROF_LOAD_CLASS]*
//  [[HPROF_FRAME]*|HPROF_TRACE]*
//  [HPROF_GC_CLASS_DUMP]*
//  HPROF_HEAP_DUMP
//
// The HPROF_TRACE record after the header is the "dummy trace" record which
// does not include any frames. Other records which require a stack trace ID
// will specify the trace ID of this record (1). It also means we can run HAT
// without needing the -stack false option.
// The HPROF_TRACE records represent the stack traces where the heap dump
// is generated and a "dummy trace" record which does not include
// any frames. The dummy trace record is referenced as the
// unknown object alloc site.
//
// The HPROF_HEAP_DUMP record has a length followed by sub-records. To allow
// the heap dump to be generated in a single pass we remember the position of
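To make the new record layout concrete (a worked example, not from the commit): a dump with two threads of hypothetical stack depths 2 and 1 would carry the dummy HPROF_TRACE with serial 1, then HPROF_FRAME records 1-2 followed by an HPROF_TRACE with serial 2 for the first thread, then HPROF_FRAME record 3 followed by an HPROF_TRACE with serial 3 for the second.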
@ -1592,12 +1673,6 @@ void VM_HeapDumper::doit() {
  writer()->write_u4(oopSize);
  writer()->write_u8(os::javaTimeMillis());

  // HPROF_TRACE record without any frames
  DumperSupport::write_header(writer(), HPROF_TRACE, 3*sizeof(u4));
  writer()->write_u4(STACK_TRACE_ID);
  writer()->write_u4(0);  // thread number
  writer()->write_u4(0);  // frame count

  // HPROF_UTF8 records
  SymbolTableDumper sym_dumper(writer());
  SymbolTable::oops_do(&sym_dumper);
@ -1606,6 +1681,10 @@ void VM_HeapDumper::doit() {
  SystemDictionary::classes_do(&do_load_class);
  Universe::basic_type_classes_do(&do_load_class);

  // write HPROF_FRAME and HPROF_TRACE records;
  // this must be called after _klass_map is built while iterating the classes above.
  dump_stack_traces();

  // write HPROF_HEAP_DUMP or HPROF_HEAP_DUMP_SEGMENT
  write_dump_header();

@ -1646,6 +1725,47 @@ void VM_HeapDumper::doit() {
  end_of_dump();
}

void VM_HeapDumper::dump_stack_traces() {
  // write a HPROF_TRACE record without any frames to be referenced as object alloc sites
  DumperSupport::write_header(writer(), HPROF_TRACE, 3*sizeof(u4));
  writer()->write_u4((u4) STACK_TRACE_ID);
  writer()->write_u4(0);  // thread number
  writer()->write_u4(0);  // frame count

  _stack_traces = NEW_C_HEAP_ARRAY(ThreadStackTrace*, Threads::number_of_threads());
  int frame_serial_num = 0;
  for (JavaThread* thread = Threads::first(); thread != NULL ; thread = thread->next()) {
    oop threadObj = thread->threadObj();
    if (threadObj != NULL && !thread->is_exiting() && !thread->is_hidden_from_external_view()) {
      // dump thread stack trace
      ThreadStackTrace* stack_trace = new ThreadStackTrace(thread, false);
      stack_trace->dump_stack_at_safepoint(-1);
      _stack_traces[_num_threads++] = stack_trace;

      // write HPROF_FRAME records for this thread's stack trace
      int depth = stack_trace->get_stack_depth();
      int thread_frame_start = frame_serial_num;
      for (int j=0; j < depth; j++) {
        StackFrameInfo* frame = stack_trace->stack_frame_at(j);
        methodOop m = frame->method();
        int class_serial_num = _klass_map->find(Klass::cast(m->method_holder()));
        // the class serial number starts from 1
        assert(class_serial_num > 0, "class not found");
        DumperSupport::dump_stack_frame(writer(), ++frame_serial_num, class_serial_num, m, frame->bci());
      }

      // write HPROF_TRACE record for one thread
      DumperSupport::write_header(writer(), HPROF_TRACE, 3*sizeof(u4) + depth*oopSize);
      int stack_serial_num = _num_threads + STACK_TRACE_ID;
      writer()->write_u4(stack_serial_num);   // stack trace serial number
      writer()->write_u4((u4) _num_threads);  // thread serial number
      writer()->write_u4(depth);              // frame count
      for (int j=1; j <= depth; j++) {
        writer()->write_id(thread_frame_start + j);
      }
    }
  }
}

// dump the heap to given path.
int HeapDumper::dump(const char* path) {
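As an aside, a sketch of the global frame numbering in dump_stack_traces() above: frame serials keep increasing across threads, and each per-thread HPROF_TRACE references the contiguous run [thread_frame_start+1 .. thread_frame_start+depth]. The depths here are hypothetical.

#include <cassert>

int main() {
  int frame_serial_num = 0;
  int depths[] = { 3, 2 };                  // two threads
  for (int t = 0; t < 2; t++) {
    int thread_frame_start = frame_serial_num;
    for (int j = 0; j < depths[t]; j++)
      ++frame_serial_num;                   // one HPROF_FRAME per Java frame
    // the trace record then writes IDs thread_frame_start+1 .. +depths[t]
    assert(frame_serial_num == thread_frame_start + depths[t]);
  }
  assert(frame_serial_num == 5);            // serials 1..5, globally unique
  return 0;
}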
@ -242,6 +242,7 @@ class ThreadStackTrace : public CHeapObj {
  ThreadStackTrace(JavaThread* thread, bool with_locked_monitors);
  ~ThreadStackTrace();

  JavaThread* thread() { return _thread; }
  StackFrameInfo* stack_frame_at(int i) { return _frames->at(i); }
  int get_stack_depth() { return _depth; }

@ -29,6 +29,8 @@
 */

public class Test6700047 {
    static byte[] dummy = new byte[256];

    public static void main(String[] args) {
        for (int i = 0; i < 100000; i++) {
            intToLeftPaddedAsciiBytes();

@ -53,6 +55,7 @@ public class Test6700047 {
        if (offset > 0) {
            for(int j = 0; j < offset; j++) {
                result++;
                dummy[i] = 0;
            }
        }
        return result;