8052081: Optimize code generated by C2 for Intel's Atom processor
Allow vectorization and the CRC32 intrinsic to run on Atom. Enable UseFPUForSpilling by default on x86. Reviewed-by: roland
commit 7764490363
parent 97512f58ec
assembler_x86.cpp

@@ -3853,6 +3853,15 @@ void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+// Carry-Less Multiplication Quadword
+void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
+  assert(VM_Version::supports_clmul(), "");
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
+  emit_int8(0x44);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8((unsigned char)mask);
+}
+
 // Carry-Less Multiplication Quadword
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
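A note on the instruction wired up here: PCLMULQDQ (66 0F 3A 44 /r ib, exactly the bytes emitted above) multiplies two 64-bit operands as polynomials over GF(2), i.e. multiplication without carries. A minimal software model of one such multiply — the helper name is made up for illustration, it is not part of the patch:

#include <cstdint>
#include <cstdio>

// Software model of one PCLMULQDQ lane: multiply two 64-bit values as
// polynomials over GF(2). hi:lo together hold the 128-bit product.
static void clmul64(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
  uint64_t h = 0, l = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {
      l ^= a << i;                    // low half of a * x^i
      if (i != 0) h ^= a >> (64 - i); // bits shifted past bit 63
    }
  }
  *hi = h;
  *lo = l;
}

int main() {
  uint64_t hi, lo;
  // (x^63 + 1) * (x + 1) = x^64 + x^63 + x + 1
  clmul64(0x8000000000000001ULL, 0x3ULL, &hi, &lo);
  printf("%016llx%016llx\n", (unsigned long long)hi, (unsigned long long)lo);
  return 0;
}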
assembler_x86.hpp

@@ -1837,6 +1837,7 @@ private:
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
 
   // Carry-Less Multiplication Quadword
+  void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
   void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
 
   // AVX instruction which is used to clear upper 128 bits of YMM registers and
macroAssembler_x86.cpp

@@ -7316,17 +7316,34 @@ void MacroAssembler::update_byte_crc32(Register crc, Register val, Register tabl
  * Fold 128-bit data chunk
  */
 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
-  vpclmulhdq(xtmp, xK, xcrc); // [123:64]
-  vpclmulldq(xcrc, xK, xcrc); // [63:0]
-  vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
-  pxor(xcrc, xtmp);
+  if (UseAVX > 0) {
+    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
+    vpclmulldq(xcrc, xK, xcrc); // [63:0]
+    vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
+    pxor(xcrc, xtmp);
+  } else {
+    movdqa(xtmp, xcrc);
+    pclmulhdq(xtmp, xK); // [123:64]
+    pclmulldq(xcrc, xK); // [63:0]
+    pxor(xcrc, xtmp);
+    movdqu(xtmp, Address(buf, offset));
+    pxor(xcrc, xtmp);
+  }
 }
 
 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
-  vpclmulhdq(xtmp, xK, xcrc);
-  vpclmulldq(xcrc, xK, xcrc);
-  pxor(xcrc, xbuf);
-  pxor(xcrc, xtmp);
+  if (UseAVX > 0) {
+    vpclmulhdq(xtmp, xK, xcrc);
+    vpclmulldq(xcrc, xK, xcrc);
+    pxor(xcrc, xbuf);
+    pxor(xcrc, xtmp);
+  } else {
+    movdqa(xtmp, xcrc);
+    pclmulhdq(xtmp, xK);
+    pclmulldq(xcrc, xK);
+    pxor(xcrc, xbuf);
+    pxor(xcrc, xtmp);
+  }
 }
 
 /**
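How the fold works: each step carry-less multiplies the 128-bit CRC state by a pair of folding constants held in xK and XORs in the next 128-bit chunk of the buffer. A rough standalone model of the AVX-path arithmetic using the standard PCLMULQDQ intrinsic — a sketch only; the real constants come from the StubRoutines::x86::crc_by128_masks_addr() table:

#include <emmintrin.h>  // SSE2: _mm_xor_si128
#include <wmmintrin.h>  // _mm_clmulepi64_si128 (compile with -mpclmul)

// One fold step mirroring fold_128bit_crc32: the state xcrc is multiplied
// by the folding constants in xK and XORed with the next 128 input bits.
static __m128i fold_step(__m128i xcrc, __m128i xK, __m128i chunk) {
  __m128i hi = _mm_clmulepi64_si128(xK, xcrc, 0x11); // upper64(xK) x upper64(xcrc)
  __m128i lo = _mm_clmulepi64_si128(xK, xcrc, 0x00); // lower64(xK) x lower64(xcrc)
  return _mm_xor_si128(_mm_xor_si128(lo, chunk), hi);
}

The new SSE branch computes the same thing; it only needs the extra movdqa copies because the two-operand pclmulqdq form destroys its first source, whereas the three-operand AVX form does not.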
@@ -7444,9 +7461,17 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
   // Fold 128 bits in xmm1 down into 32 bits in crc register.
   BIND(L_fold_128b);
   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
-  vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
-  vpand(xmm3, xmm0, xmm2, false /* vector256 */);
-  vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
+  if (UseAVX > 0) {
+    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
+    vpand(xmm3, xmm0, xmm2, false /* vector256 */);
+    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
+  } else {
+    movdqa(xmm2, xmm0);
+    pclmulqdq(xmm2, xmm1, 0x1);
+    movdqa(xmm3, xmm0);
+    pand(xmm3, xmm2);
+    pclmulqdq(xmm0, xmm3, 0x1);
+  }
   psrldq(xmm1, 8);
   psrldq(xmm2, 4);
   pxor(xmm0, xmm1);
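The psrldq shifts in this tail operate on whole bytes, not bits, which is easy to trip over when reading the reduction. A small demonstration with the matching intrinsic:

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

// psrldq (_mm_srli_si128) shifts the full 128-bit register right by N
// BYTES; psrldq xmm, 8 therefore moves the high quadword into the low one.
int main() {
  __m128i v = _mm_set_epi64x((int64_t)0x1122334455667788LL,
                             (int64_t)0x99aabbccddeeff00LL);
  __m128i s = _mm_srli_si128(v, 8); // high qword drops into the low qword
  uint64_t out[2];
  _mm_storeu_si128((__m128i*)out, s);
  printf("%016llx %016llx\n", (unsigned long long)out[1],
                              (unsigned long long)out[0]);
  return 0;
}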
macroAssembler_x86.hpp

@@ -966,6 +966,16 @@ public:
   void mulss(XMMRegister dst, Address src) { Assembler::mulss(dst, src); }
   void mulss(XMMRegister dst, AddressLiteral src);
 
+  // Carry-Less Multiplication Quadword
+  void pclmulldq(XMMRegister dst, XMMRegister src) {
+    // 0x00 - multiply lower 64 bits [0:63]
+    Assembler::pclmulqdq(dst, src, 0x00);
+  }
+  void pclmulhdq(XMMRegister dst, XMMRegister src) {
+    // 0x11 - multiply upper 64 bits [64:127]
+    Assembler::pclmulqdq(dst, src, 0x11);
+  }
+
   void sqrtsd(XMMRegister dst, XMMRegister src) { Assembler::sqrtsd(dst, src); }
   void sqrtsd(XMMRegister dst, Address src) { Assembler::sqrtsd(dst, src); }
   void sqrtsd(XMMRegister dst, AddressLiteral src);
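The imm8 values used by these helpers follow the PCLMULQDQ selector encoding: bit 0 picks which quadword of the first source participates, bit 4 picks the quadword of the second. All four combinations, written with the standard intrinsic for reference (only 0x00 and 0x11 are needed by this patch):

#include <wmmintrin.h>  // _mm_clmulepi64_si128

// The four PCLMULQDQ selectors (imm8 bit 0 = qword of a, bit 4 = qword of b).
static __m128i clmul_ll(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x00); } // low  x low
static __m128i clmul_hl(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x01); } // high x low
static __m128i clmul_lh(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x10); } // low  x high
static __m128i clmul_hh(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x11); } // high x high

pclmulldq and pclmulhdq above correspond to clmul_ll and clmul_hh respectively.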
vm_version_x86.cpp

@@ -559,7 +559,7 @@ void VM_Version::get_processor_features() {
     FLAG_SET_DEFAULT(UseCLMUL, false);
   }
 
-  if (UseCLMUL && (UseAVX > 0) && (UseSSE > 2)) {
+  if (UseCLMUL && (UseSSE > 2)) {
     if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
       UseCRC32Intrinsics = true;
     }
@@ -805,6 +805,21 @@ void VM_Version::get_processor_features() {
         }
       }
     }
+    if ((cpu_family() == 0x06) &&
+        ((extended_cpu_model() == 0x36) || // Centerton
+         (extended_cpu_model() == 0x37) || // Silvermont
+         (extended_cpu_model() == 0x4D))) {
+#ifdef COMPILER2
+      if (FLAG_IS_DEFAULT(OptoScheduling)) {
+        OptoScheduling = true;
+      }
+#endif
+      if (supports_sse4_2()) { // Silvermont
+        if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
+          UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
+        }
+      }
+    }
   }
 
   // Use count leading zeros count instruction if available.
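cpu_family() and extended_cpu_model() decode CPUID leaf 1; for family 0x06 the 4-bit extended-model field is prepended to the 4-bit model field. A standalone sketch of the same check using GCC/Clang's cpuid.h — the extended-family field is ignored for brevity, and 0x4D is, as far as I know, the Silvermont-based Avoton/Rangeley server part (the hunk itself leaves it uncommented):

#include <cpuid.h>
#include <cstdio>

// Decode family/model from CPUID leaf 1, the same fields that
// cpu_family()/extended_cpu_model() abstract in VM_Version.
int main() {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;
  unsigned family = (eax >> 8) & 0xF;
  unsigned model  = (eax >> 4) & 0xF;
  if (family == 0x6 || family == 0xF) {
    model |= ((eax >> 16) & 0xF) << 4; // fold in the extended-model bits
  }
  bool is_atom = (family == 0x06) &&
                 (model == 0x36 ||  // Centerton
                  model == 0x37 ||  // Silvermont
                  model == 0x4D);   // Silvermont-based server SoC
  printf("family=0x%02x model=0x%02x atom=%d\n", family, model, is_atom);
  return 0;
}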
@@ -892,23 +907,25 @@ void VM_Version::get_processor_features() {
   AllocatePrefetchDistance = allocate_prefetch_distance();
   AllocatePrefetchStyle    = allocate_prefetch_style();
 
-  if( is_intel() && cpu_family() == 6 && supports_sse3() ) {
-    if( AllocatePrefetchStyle == 2 ) { // watermark prefetching on Core
+  if (is_intel() && cpu_family() == 6 && supports_sse3()) {
+    if (AllocatePrefetchStyle == 2) { // watermark prefetching on Core
 #ifdef _LP64
       AllocatePrefetchDistance = 384;
 #else
       AllocatePrefetchDistance = 320;
 #endif
     }
-    if( supports_sse4_2() && supports_ht() ) { // Nehalem based cpus
+    if (supports_sse4_2() && supports_ht()) { // Nehalem based cpus
       AllocatePrefetchDistance = 192;
       AllocatePrefetchLines = 4;
+    }
 #ifdef COMPILER2
-      if (AggressiveOpts && FLAG_IS_DEFAULT(UseFPUForSpilling)) {
+    if (supports_sse4_2()) {
+      if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
         FLAG_SET_DEFAULT(UseFPUForSpilling, true);
       }
-#endif
     }
+#endif
   }
   assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value");
 
lcm.cpp

@@ -464,7 +464,9 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
         iop == Op_CreateEx ||   // Create-exception must start block
         iop == Op_CheckCastPP
         ) {
-      worklist.map(i,worklist.pop());
+      // select the node n
+      // remove n from worklist and retain the order of remaining nodes
+      worklist.remove((uint)i);
       return n;
     }
 
@@ -550,7 +552,9 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
   assert(idx >= 0, "index should be set");
   Node *n = worklist[(uint)idx];      // Get the winner
 
-  worklist.map((uint)idx, worklist.pop());     // Compress worklist
+  // select the node n
+  // remove n from worklist and retain the order of remaining nodes
+  worklist.remove((uint)idx);
   return n;
 }
 
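Why this matters: map(i, worklist.pop()) deletes in O(1) by swapping the last element into slot i, which scrambles the worklist order; remove() shifts the tail down and preserves it. With OptoScheduling now enabled on Atom, a stable order presumably avoids arbitrary tie-breaks between equally ready nodes. The difference in miniature, with std::vector standing in for the HotSpot Node_List:

#include <vector>
#include <cstdio>

// Swap-with-last removal: O(1), but the last element jumps to position i.
static void remove_swap(std::vector<int>& v, size_t i) {
  v[i] = v.back();
  v.pop_back();
}

// Shifting removal: O(n), but the relative order of survivors is kept.
static void remove_shift(std::vector<int>& v, size_t i) {
  v.erase(v.begin() + i);
}

int main() {
  std::vector<int> a = {10, 20, 30, 40}, b = a;
  remove_swap(a, 1);   // a becomes {10, 40, 30}
  remove_shift(b, 1);  // b becomes {10, 30, 40}
  for (int x : a) printf("%d ", x);
  printf("/ ");
  for (int x : b) printf("%d ", x);
  printf("\n");
  return 0;
}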
superword.cpp

@@ -1378,6 +1378,20 @@ void SuperWord::output() {
       if (n->is_Load()) {
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);
+        SWPointer p1(n->as_Mem(), this);
+        // Identify the memory dependency for the new loadVector node by
+        // walking up through memory chain.
+        // This is done to give flexibility to the new loadVector node so that
+        // it can move above independent storeVector nodes.
+        while (mem->is_StoreVector()) {
+          SWPointer p2(mem->as_Mem(), this);
+          int cmp = p1.cmp(p2);
+          if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
+            mem = mem->in(MemNode::Memory);
+          } else {
+            break; // dependent memory
+          }
+        }
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
         vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
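The new loop picks a memory state for the vector load by walking up the memory chain past StoreVector nodes that SWPointer proves (or, for non-comparable pointers, assumes) independent, stopping at the first dependent one; this lets the LoadVector schedule above unrelated vector stores. A simplified model of the walk, with stand-in types for the Node and SWPointer machinery:

#include <cstdio>

// Simplified model of the walk added in SuperWord::output(): start from the
// load's current memory state and step past StoreVector nodes that provably
// do not overlap the load.
struct Node {
  Node* mem_in;          // next memory state up the chain
  bool  is_store_vector;
  int   lo, hi;          // crude [lo, hi) address-range stand-in
};

// Stand-in for the SWPointer::cmp test: ranges that do not intersect.
static bool independent(const Node* a, const Node* b) {
  return a->hi <= b->lo || b->hi <= a->lo;
}

static Node* pick_memory_state(const Node* load, Node* mem) {
  while (mem->is_store_vector && independent(load, mem)) {
    mem = mem->mem_in;   // independent store: keep walking up
  }
  return mem;            // stop at the first (possibly) dependent node
}

int main() {
  Node root = { nullptr, false, 0,  0  };  // pre-loop memory state
  Node st1  = { &root,   true,  0,  16 };  // store to [0,16)
  Node st2  = { &st1,    true,  64, 80 };  // store to [64,80)
  Node load = { nullptr, false, 0,  16 };  // load from [0,16)
  // st2 is independent of the load and gets skipped; st1 overlaps it.
  printf("%s\n", pick_memory_state(&load, &st2) == &st1 ? "stops at st1" : "?");
  return 0;
}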